
Commit: new matcher

raulcf committed Feb 20, 2017
1 parent bdba82d commit 95e7a87
Showing 3 changed files with 99 additions and 117 deletions.
138 changes: 34 additions & 104 deletions modelstore/elasticstore.py
@@ -137,15 +137,6 @@ def get_all_fields_with(self, attrs):
scroll_id = res['_scroll_id'] # update the scroll_id
client.clear_scroll(scroll_id=scroll_id)

def peek_values(self, field, num_values):
"""
Reads sample values for the given field
:param field: The field from which to read values
:param num_values: The number of values to read
:return: A list with the sample values read for field
"""
print("TODO")

def search_keywords(self, keywords, elasticfieldname, max_hits=15):
"""
Performs a search query on elastic_field_name to match the provided keywords
@@ -186,6 +177,40 @@ def search_keywords(self, keywords, elasticfieldname, max_hits=15):
el['_source']['columnName'], el['_score'])
yield data

def fuzzy_keyword_match(self, keywords, max_hits=15):
"""
Performs a fuzzy search query on the text index to match the provided keywords
:param keywords: the keywords to match approximately
:param max_hits: maximum number of returned objects
:return: a generator of Hit objects for documents that approximately match the keywords
"""
filter_path = ['hits.hits._source.id',
'hits.hits._score',
'hits.total',
'hits.hits._source.dbName',
'hits.hits._source.sourceName',
'hits.hits._source.columnName']
index = "text"
query_body = {
"from": 0, "size": max_hits,
"query": {
"match": {
"text": {
"query": keywords,
"fuzziness": "AUTO"
}
}
}
}
res = client.search(index=index, body=query_body,
filter_path=filter_path)
if res['hits']['total'] == 0:
return  # bare return: this is a generator, so 'return []' would never reach the caller
for el in res['hits']['hits']:
data = Hit(el['_source']['id'], el['_source']['dbName'], el['_source']['sourceName'],
el['_source']['columnName'], el['_score'])
yield data

def get_all_fields_entities(self):
"""
Retrieves all fields and entities from the store
@@ -671,98 +696,3 @@ def _current_time(self):

if __name__ == "__main__":
print("Elastic Store")

"""
def get_all_text_fields(self):
query_body = {
"query": {"bool": {"filter": [{"term": {"dataType": "T"}}]}}}
res = client.search(index='profile', body=query_body, scroll="10m",
filter_path=['_scroll_id',
'hits.hits._id',
'hits.total',
'hits.hits._source.sourceName',
'hits.hits._source.columnName',
'hits.hits._source.totalValues',
'hits.hits._source.uniqueValues']
)
scroll_id = res['_scroll_id']
remaining = res['hits']['total']
while remaining > 0:
hits = res['hits']['hits']
for h in hits:
id_source_and_file_name = (h['_id'], h['_source']['sourceName'], h['_source']['columnName'],
h['_source']['totalValues'], h['_source']['uniqueValues'])
yield id_source_and_file_name
remaining -= 1
res = client.scroll(scroll="3m", scroll_id=scroll_id,
filter_path=['_scroll_id',
'hits.hits._id',
'hits.hits._source.sourceName',
'hits.hits._source.columnName',
'hits.hits._source.totalValues',
'hits.hits._source.uniqueValues']
)
scroll_id = res['_scroll_id'] # update the scroll_id
client.clear_scroll(scroll_id=scroll_id)
def get_fields_text_index(self):
'''
Reads all fields, described as (id, source_name, field_name) from the store (text index).
:return: a list of all fields with the form (id, source_name, field_name)
'''
body = {"query": {"match_all": {}}}
res = client.search(index='text', body=body, scroll="10m",
filter_path=['_scroll_id',
'hits.hits._id',
'hits.total',
'hits.hits._source.id',
'hits.hits._source.sourceName',
'hits.hits._source.columnName']
)
scroll_id = res['_scroll_id']
remaining = res['hits']['total']
while remaining > 0:
hits = res['hits']['hits']
for h in hits:
rawid_id_source_and_file_name = (h['_id'], h['_source']['id'],
h['_source']['sourceName'], h['_source']['columnName'])
yield rawid_id_source_and_file_name
remaining -= 1
res = client.scroll(scroll="3m", scroll_id=scroll_id,
filter_path=['_scroll_id',
'hits.hits._id',
'hits.hits._source.id',
'hits.hits._source.sourceName',
'hits.hits._source.columnName']
)
scroll_id = res['_scroll_id'] # update the scroll_id
client.clear_scroll(scroll_id=scroll_id)
def get_all_fields_of_source(self, source_name):
body = {"query": {"match": {"sourceName": source_name}}}
res = client.search(index='profile', body=body, scroll="10m",
filter_path=['_scroll_id',
'hits.hits._id',
'hits.total',
'hits.hits._source.sourceName',
'hits.hits._source.columnName']
)
scroll_id = res['_scroll_id']
remaining = res['hits']['total']
while remaining > 0:
hits = res['hits']['hits']
for h in hits:
hit = Hit(h['_id'], h['_source']['sourceName'],
h['_source']['columnName'], -1)
yield hit
remaining -= 1
res = client.scroll(scroll="3m", scroll_id=scroll_id,
filter_path=['_scroll_id',
'hits.hits._id',
'hits.hits._source.sourceName',
'hits.hits._source.columnName']
)
scroll_id = res['_scroll_id'] # update the scroll_id
client.clear_scroll(scroll_id=scroll_id)
"""
53 changes: 41 additions & 12 deletions ontomatch/matcher_lib.py
@@ -8,6 +8,8 @@
from datasketch import MinHash, MinHashLSH
from knowledgerepr.networkbuilder import LSHRandomProjectionsIndex
from dataanalysis import dataanalysis as da
import operator
import numpy as np


class MatchingType(Enum):
@@ -445,20 +447,47 @@ def find_sem_coh_matchings(self):
matchings.append(match)
return matchings

cutoff_likely_match_threshold = 0.4  # skip hierarchies where too large a fraction of classes attract candidate fields (noise)
min_relevance_score = 0.2  # minimum elastic score for a fuzzy hit to count as evidence
scoring_threshold = 0.4  # fraction of classes that must support a field for it to become a matching
min_classes = 50  # only consider hierarchies with at least this many classes

def find_hierarchy_content_fuzzy():

# access class names

# query elastic for fuzzy matches

# keep matches in structure

# consolidate from time to time

# return most promising matches

return
def find_hierarchy_content_fuzzy(kr_handlers, store):
matchings = []
# access class names, per hierarchical level (matching hierarchy by hierarchy is the working assumption)
for kr_name, kr in kr_handlers.items():
ch = kr.class_hierarchy
for ch_name, ch_classes in ch:
if len(ch_classes) < min_classes: # do this only for longer hierarchies
continue
# query elastic for fuzzy matches
matching_evidence = defaultdict(int)
for class_id, class_name in ch_classes:
matches = store.fuzzy_keyword_match(class_name)
keys_in_matches = set()
for m in matches:
# record
if m.score > min_relevance_score:
key = (m.db_name, m.source_name, m.field_name)
keys_in_matches.add(key)
for k in keys_in_matches:
matching_evidence[k] += 1
num_classes = len(ch_classes)
num_potential_matches = len(matching_evidence)
cutoff_likely_match = float(num_potential_matches) / num_classes  # force float division
if cutoff_likely_match > cutoff_likely_match_threshold:  # too many candidate fields relative to classes: likely noise, skip this hierarchy
continue
sorted_matching_evidence = sorted(matching_evidence.items(), key=operator.itemgetter(1), reverse=True)
# a perfect match would score 1
for key, value in sorted_matching_evidence:
score = float(value) / num_classes
if score > scoring_threshold:
match = (key, (kr_name, ch_name))
matchings.append(match)
else:
break  # ordered by evidence, so once one score fails the threshold, none of the rest will pass
return matchings

if __name__ == "__main__":
print("Matcher lib")
25 changes: 24 additions & 1 deletion ontomatch/ss_api.py
Expand Up @@ -224,7 +224,7 @@ def find_matchings(self):
# L7: [Attribute names] -> [class names] (content - fuzzy naming)
print("Finding L7 matchings...")
st = time.time()
l7_matchings = matcherlib.find_hierarchy_content_fuzzy()
l7_matchings = matcherlib.find_hierarchy_content_fuzzy(self.kr_handlers, self.store_client)
print("Finding L7 matchings...OK, " + str(len(l7_matchings)) + " found")
et = time.time()
print("Took: " + str(et - st))
@@ -498,11 +498,34 @@ def test_find_semantic_sim():
if sim > 0.4:
print(str(cl) + " -> " + str(sim))

def test_fuzzy(path_to_serialized_model):
# Deserialize model
network = fieldnetwork.deserialize_network(path_to_serialized_model)
# Create client
store_client = StoreHandler()

# Retrieve indexes
schema_sim_index = io.deserialize_object(path_to_serialized_model + 'schema_sim_index.pkl')
content_sim_index = io.deserialize_object(path_to_serialized_model + 'content_sim_index.pkl')

# Create ontomatch api
om = SSAPI(network, store_client, schema_sim_index, content_sim_index)
# Load parsed ontology
om.add_krs([("efo", "cache_onto/efo.pkl")], parsed=True)

matchings = matcherlib.find_hierarchy_content_fuzzy(om.kr_handlers, store_client)

for m in matchings:
print(m)

if __name__ == "__main__":

#test_find_semantic_sim()
#exit()

test_fuzzy("../models/chembl21/")
exit()

test("../models/chembl21/")
exit()
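
Each matching produced by find_hierarchy_content_fuzzy pairs a field key with the hierarchy that accumulated the evidence, so test_fuzzy prints tuples of this shape (the concrete values below are illustrative, not taken from a real run):

# one matching: ((db_name, source_name, field_name), (kr_name, ch_name))
# e.g. (('chembl21', 'assays', 'assay_type'), ('efo', 'assay'))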

