
Commit: new matcher

raulcf committed Feb 20, 2017
1 parent bdba82d commit 95e7a87
Showing 3 changed files with 99 additions and 117 deletions.
138 changes: 34 additions & 104 deletions modelstore/elasticstore.py
@@ -137,15 +137,6 @@ def get_all_fields_with(self, attrs):
scroll_id = res['_scroll_id'] # update the scroll_id
client.clear_scroll(scroll_id=scroll_id)

def peek_values(self, field, num_values):
"""
Reads sample values for the given field
:param field: The field from which to read values
:param num_values: The number of values to read
:return: A list with the sample values read for field
"""
print("TODO")

def search_keywords(self, keywords, elasticfieldname, max_hits=15):
"""
Performs a search query on elastic_field_name to match the provided keywords
@@ -186,6 +177,40 @@ def search_keywords(self, keywords, elasticfieldname, max_hits=15):
el['_source']['columnName'], el['_score'])
yield data

def fuzzy_keyword_match(self, keywords, max_hits=15):
"""
Performs a fuzzy search query on the text index to match the provided keywords
:param keywords: the keywords to match approximately
:param max_hits: maximum number of returned objects
:return: a generator of Hit objects for documents that approximately match the keywords
"""
filter_path = ['hits.hits._source.id',
'hits.hits._score',
'hits.total',
'hits.hits._source.dbName',
'hits.hits._source.sourceName',
'hits.hits._source.columnName']
index = "text"
query_body = {
"from": 0, "size": max_hits,
"query": {
"match": {
"text": {
"query": keywords,
"fuzziness": "AUTO"
}
}
}
}
res = client.search(index=index, body=query_body,
filter_path=filter_path)
if res['hits']['total'] == 0:
return  # bare return: this is a generator, so 'return []' would never reach the caller
for el in res['hits']['hits']:
data = Hit(el['_source']['id'], el['_source']['dbName'], el['_source']['sourceName'],
el['_source']['columnName'], el['_score'])
yield data

def get_all_fields_entities(self):
"""
Retrieves all fields and entities from the store
@@ -671,98 +696,3 @@ def _current_time(self):

if __name__ == "__main__":
print("Elastic Store")

"""
def get_all_text_fields(self):
query_body = {
"query": {"bool": {"filter": [{"term": {"dataType": "T"}}]}}}
res = client.search(index='profile', body=query_body, scroll="10m",
filter_path=['_scroll_id',
'hits.hits._id',
'hits.total',
'hits.hits._source.sourceName',
'hits.hits._source.columnName',
'hits.hits._source.totalValues',
'hits.hits._source.uniqueValues']
)
scroll_id = res['_scroll_id']
remaining = res['hits']['total']
while remaining > 0:
hits = res['hits']['hits']
for h in hits:
id_source_and_file_name = (h['_id'], h['_source']['sourceName'], h['_source']['columnName'],
h['_source']['totalValues'], h['_source']['uniqueValues'])
yield id_source_and_file_name
remaining -= 1
res = client.scroll(scroll="3m", scroll_id=scroll_id,
filter_path=['_scroll_id',
'hits.hits._id',
'hits.hits._source.sourceName',
'hits.hits._source.columnName',
'hits.hits._source.totalValues',
'hits.hits._source.uniqueValues']
)
scroll_id = res['_scroll_id'] # update the scroll_id
client.clear_scroll(scroll_id=scroll_id)
def get_fields_text_index(self):
'''
Reads all fields, described as (id, source_name, field_name) from the store (text index).
:return: a list of all fields with the form (id, source_name, field_name)
'''
body = {"query": {"match_all": {}}}
res = client.search(index='text', body=body, scroll="10m",
filter_path=['_scroll_id',
'hits.hits._id',
'hits.total',
'hits.hits._source.id',
'hits.hits._source.sourceName',
'hits.hits._source.columnName']
)
scroll_id = res['_scroll_id']
remaining = res['hits']['total']
while remaining > 0:
hits = res['hits']['hits']
for h in hits:
rawid_id_source_and_file_name = (h['_id'], h['_source']['id'],
h['_source']['sourceName'], h['_source']['columnName'])
yield rawid_id_source_and_file_name
remaining -= 1
res = client.scroll(scroll="3m", scroll_id=scroll_id,
filter_path=['_scroll_id',
'hits.hits._id',
'hits.hits._source.id',
'hits.hits._source.sourceName',
'hits.hits._source.columnName']
)
scroll_id = res['_scroll_id'] # update the scroll_id
client.clear_scroll(scroll_id=scroll_id)
def get_all_fields_of_source(self, source_name):
body = {"query": {"match": {"sourceName": source_name}}}
res = client.search(index='profile', body=body, scroll="10m",
filter_path=['_scroll_id',
'hits.hits._id',
'hits.total',
'hits.hits._source.sourceName',
'hits.hits._source.columnName']
)
scroll_id = res['_scroll_id']
remaining = res['hits']['total']
while remaining > 0:
hits = res['hits']['hits']
for h in hits:
hit = Hit(h['_id'], h['_source']['sourceName'],
h['_source']['columnName'], -1)
yield hit
remaining -= 1
res = client.scroll(scroll="3m", scroll_id=scroll_id,
filter_path=['_scroll_id',
'hits.hits._id',
'hits.hits._source.sourceName',
'hits.hits._source.columnName']
)
scroll_id = res['_scroll_id'] # update the scroll_id
client.clear_scroll(scroll_id=scroll_id)
"""
53 changes: 41 additions & 12 deletions ontomatch/matcher_lib.py
@@ -8,6 +8,8 @@
from datasketch import MinHash, MinHashLSH
from knowledgerepr.networkbuilder import LSHRandomProjectionsIndex
from dataanalysis import dataanalysis as da
import operator
import numpy as np


class MatchingType(Enum):
@@ -445,20 +447,47 @@ def find_sem_coh_matchings(self):
matchings.append(match)
return matchings

cutoff_likely_match_threshold = 0.4  # skip hierarchies where too large a fraction of classes attract candidate fields (noise)
min_relevance_score = 0.2  # minimum elastic score for a fuzzy hit to count as evidence
scoring_threshold = 0.4  # fraction of classes that must support a field for it to become a matching
min_classes = 50  # only consider hierarchies with at least this many classes

def find_hierarchy_content_fuzzy():

# access class names

# query elastic for fuzzy matches

# keep matches in structure

# consolidate from time to time

# return most promising matches

return
def find_hierarchy_content_fuzzy(kr_handlers, store):
matchings = []
# access class names, per hierarchical level (matching hierarchy by hierarchy is the working assumption)
for kr_name, kr in kr_handlers.items():
ch = kr.class_hierarchy
for ch_name, ch_classes in ch:
if len(ch_classes) < min_classes: # do this only for longer hierarchies
continue
# query elastic for fuzzy matches
matching_evidence = defaultdict(int)
for class_id, class_name in ch_classes:
matches = store.fuzzy_keyword_match(class_name)
keys_in_matches = set()
for m in matches:
# record
if m.score > min_relevance_score:
key = (m.db_name, m.source_name, m.field_name)
keys_in_matches.add(key)
for k in keys_in_matches:
matching_evidence[k] += 1
num_classes = len(ch_classes)
num_potential_matches = len(matching_evidence)
cutoff_likely_match = float(num_potential_matches) / num_classes  # force float division
if cutoff_likely_match > cutoff_likely_match_threshold:  # too many candidate fields relative to classes: likely noise, skip this hierarchy
continue
sorted_matching_evidence = sorted(matching_evidence.items(), key=operator.itemgetter(1), reverse=True)
# a perfect match would score 1
for key, value in sorted_matching_evidence:
score = float(value) / num_classes
if score > scoring_threshold:
match = (key, (kr_name, ch_name))
matchings.append(match)
else:
break  # ordered by evidence, so once one score fails the threshold, none of the rest will pass
return matchings

if __name__ == "__main__":
print("Matcher lib")
25 changes: 24 additions & 1 deletion ontomatch/ss_api.py
Expand Up @@ -224,7 +224,7 @@ def find_matchings(self):
# L7: [Attribute names] -> [class names] (content - fuzzy naming)
print("Finding L7 matchings...")
st = time.time()
l7_matchings = matcherlib.find_hierarchy_content_fuzzy()
l7_matchings = matcherlib.find_hierarchy_content_fuzzy(self.kr_handlers, self.store_client)
print("Finding L7 matchings...OK, " + str(len(l7_matchings)) + " found")
et = time.time()
print("Took: " + str(et - st))
@@ -498,11 +498,34 @@ def test_find_semantic_sim():
if sim > 0.4:
print(str(cl) + " -> " + str(sim))

def test_fuzzy(path_to_serialized_model):
# Deserialize model
network = fieldnetwork.deserialize_network(path_to_serialized_model)
# Create client
store_client = StoreHandler()

# Retrieve indexes
schema_sim_index = io.deserialize_object(path_to_serialized_model + 'schema_sim_index.pkl')
content_sim_index = io.deserialize_object(path_to_serialized_model + 'content_sim_index.pkl')

# Create ontomatch api
om = SSAPI(network, store_client, schema_sim_index, content_sim_index)
# Load parsed ontology
om.add_krs([("efo", "cache_onto/efo.pkl")], parsed=True)

matchings = matcherlib.find_hierarchy_content_fuzzy(om.kr_handlers, store_client)

for m in matchings:
print(m)

if __name__ == "__main__":

#test_find_semantic_sim()
#exit()

test_fuzzy("../models/chembl21/")
exit()

test("../models/chembl21/")
exit()
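
Each matching produced by find_hierarchy_content_fuzzy pairs a field key with the hierarchy that accumulated the evidence, so test_fuzzy prints tuples of this shape (the concrete values below are illustrative, not taken from a real run):

# one matching: ((db_name, source_name, field_name), (kr_name, ch_name))
# e.g. (('chembl21', 'assays', 'assay_type'), ('efo', 'assay'))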

