## TEXTO Ontology Matching

* To install dependencies, please run
`
pip install -r requirements.txt
`.
* Please change the data_path to corresponding dataset.


In [82]:
data_path = './data/schema-wikidata/'

In [83]:
import parse as onto_parser
from embedding import Embedding, similarity_evaluate, pool_simi
from string_match import string_matching
from evaluate import Precision_Recall_F1

import pandas as pd
from sentence_transformers import SentenceTransformer
import gensim.downloader as api

In [84]:
# parse input ontologies
onto_parser.rdf2csv(data_path+"source.rdf", "source.csv")
onto_parser.rdf2csv(data_path+"target.rdf", "target.csv")
onto_parser.reference2csv(data_path+'reference.rdf', 'reference.csv')

In [85]:
# load data and pre-trained model
df_reference, df_source, df_target = pd.read_csv('reference.csv'), pd.read_csv('source.csv'), pd.read_csv('target.csv')
gold_standard = df_reference.apply(lambda row: (row['Class1_id'], row['Class2_id']), axis=1).tolist()
source_classes_kl = df_source.to_dict(orient='list')
target_classes_kl = df_target.to_dict(orient='list')

w2v_model = api.load("glove-wiki-gigaword-300")
sbert_model = SentenceTransformer('all-mpnet-base-v2')

In [86]:
# parameters
alpha = 0.5
cf_limit = 0.4
k = 5

* String Matching and Text embedding

In [87]:
embedder_desc = Embedding(sbert_model, name='SBert')
embedder_label = Embedding(w2v_model, name='Glove')

simi_matrix = string_matching(source_classes_kl['label'], target_classes_kl['label'])
print('Now calculating cosine similarity by using labels knowledge')
label_simi = similarity_evaluate(source_classes_kl['label'], target_classes_kl['label'], embedder_label, simi_matrix, metrics='cos')
print('Now calculating cosine similarity by using descriptions knowledge')
desc_simi = similarity_evaluate(source_classes_kl['comment'], target_classes_kl['comment'], embedder_desc, simi_matrix, metrics='cos')

# weighted average
class_simi = pool_simi(desc_simi, label_simi, pooling='weighted', weights=[alpha,1-alpha]) + simi_matrix.toarray()

string matching: 100%|██████████| 343/343 [00:17<00:00, 19.79it/s]


Now calculating cosine similarity by using labels knowledge


generate embedding for source ontology classes: 100%|██████████| 343/343 [00:00<00:00, 9628.08it/s]
generate embedding for target ontology classes: 100%|██████████| 343/343 [00:00<00:00, 10088.12it/s]


Now calculating cosine similarity by using descriptions knowledge


generate embedding for source ontology classes: 100%|██████████| 343/343 [00:06<00:00, 49.14it/s]
generate embedding for target ontology classes: 100%|██████████| 343/343 [00:07<00:00, 43.04it/s]


* Candidate Selection and Max-Weight Matching

In [88]:
from scipy.sparse import csr_matrix
import numpy as np
from scipy.sparse.csgraph import min_weight_full_bipartite_matching

def candidate_slection(similarity, k=5):
    row_max_indices = np.argpartition(similarity, -k, axis=1)[:, -k:]
    col_max_indices = np.argpartition(similarity, -k, axis=1)[:, -k:]
    result = np.zeros_like(similarity)
    result[np.arange(similarity.shape[0])[:, np.newaxis], row_max_indices] = similarity[np.arange(similarity.shape[0])[:, np.newaxis], row_max_indices]
    result[np.arange(similarity.shape[0])[:, np.newaxis], col_max_indices] = similarity[np.arange(similarity.shape[0])[:, np.newaxis], col_max_indices]
    return result

new_simi = candidate_slection(class_simi, k)
graph = csr_matrix(new_simi + 1e-9)
row_ixs, col_ixs = min_weight_full_bipartite_matching(graph, maximize=True)

* Final alignment generation
* Note: using the commented final_alignments when evaluating on nell-dbpedia dataset

In [90]:
final_alignments = []
for i in range(len(row_ixs)):
    if class_simi[row_ixs[i], col_ixs[i]] > cf_limit:
        final_alignments.append((source_classes_kl['id'][row_ixs[i]], target_classes_kl['id'][col_ixs[i]]))
        # final_alignments.append((target_classes_kl['id'][col_ixs[i]], source_classes_kl['id'][row_ixs[i]]))
# evaluate
print('Precision_Recall_F1 score:', Precision_Recall_F1(gold_standard, final_alignments).round(3))

Precision_Recall_F1 score: [0.994 0.977 0.985]
