In [1]:
from sentence_transformers import SentenceTransformer, util
import torch.backends
import torch.backends.mps
import torch
import rdflib
from tqdm import tqdm


BERTModel = 'multi-qa-mpnet-base-dot-v1'
BIOBERT = 'FremyCompany/BioLORD-2023' # this model accommodate semantic textual similarity
NLPModel = 'en_core_sci_lg'
device = 'gpu' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

In [3]:
biobert_model = SentenceTransformer(BIOBERT, device=device)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [8]:
g1 = rdflib.Graph()
g1.parse('test_bg.ttl', format='ttl')
g2 = rdflib.Graph()
g2.parse('test.ttl', format='ttl')

<Graph identifier=N97bc0a996cc74ba9afed79d08439f8f2 (<class 'rdflib.graph.Graph'>)>

In [51]:
# calculate embeddings for each predicate and object


def get_embeddings(g):
    embeddings = {}

    def get_subject_embedding(s):
        s_embeddings = []
        for p, o in g.predicate_objects(subject=s):
            if (p, o) in embeddings:
                s_embeddings += [embeddings[(p,o)]]
            else:
                if not str(o).startswith('file'):  
                    stacked_embeddings = biobert_model.encode([str(p), str(o)], convert_to_tensor=True)
                    p_o_embedding = torch.mean(stacked_embeddings, 0)
                else:
                    o_embedding = get_subject_embedding(o)
                    p_embedding = biobert_model.encode(str(p), convert_to_tensor=True)
                    stacked_embeddings = torch.stack([p_embedding, o_embedding], dim=0)
                    p_o_embedding = torch.mean(stacked_embeddings, 0)
                s_embeddings += [p_o_embedding]
                embeddings[(p,o)] = p_o_embedding
        return torch.mean(torch.stack(s_embeddings, 0), 0)

    subject_embeddings = {str(s).split('/')[-1]:get_subject_embedding(s) for s in tqdm(g.subjects())}
            
    return subject_embeddings

g1_embeddings_dict = get_embeddings(g1)
g2_embeddings_dict = get_embeddings(g2)

11it [00:02,  5.08it/s]
19it [00:00, 68.12it/s]


In [54]:
g1_keys, g1_embeddings = zip(*g1_embeddings_dict.items())
g2_keys, g2_embeddings = zip(*g2_embeddings_dict.items())

g1_tensor = torch.stack(g1_embeddings)  # Shape: [len(dict1), embedding_dim]
g2_tensor = torch.stack(g1_embeddings)  # Shape: [len(dict2), embedding_dim]

# Compute cosine similarity (matrix multiplication + normalization)
g1_norm = g1_tensor / g1_tensor.norm(dim=1, keepdim=True)
g2_norm = g2_tensor / g2_tensor.norm(dim=1, keepdim=True)
similarity_matrix = torch.mm(g1_norm, g2_norm.T)  # Shape: [len(dict1), len(dict2)]

# Retrieve top N matches for each entry in dict1
top_n = 1  # Change to desired number of top matches
top_n_scores, top_n_indices = torch.topk(similarity_matrix, top_n, dim=1)

# Map back to keys for results
results = {}
for i, key1 in enumerate(g1_keys):
    matches = [(g2_keys[idx], top_n_scores[i, j].item()) for j, idx in enumerate(top_n_indices[i])]
    results[key1] = matches

# Print results
print("Similarity Scores:")
for key1, matches in results.items():
    print(f"\n{key1}:")
    for match in matches:
        print(f"  {match[0]}: {match[1]:.4f}")


Similarity Scores:

bg.json#bioProcID:
  test.ttl#local_bioProcID2: 1.0000

bg.json#mediatorID:
  test.cellml#sinkID2: 1.0000

bg.json#sourceID:
  test.ttl#local_sinkID2: 1.0000

bg.json#sinkID:
  test.ttl#local_sourceID2: 1.0000
