In [1]:
from sentence_transformers import SentenceTransformer, util
import torch.backends
import torch.backends.mps
import torch
import rdflib
from tqdm import tqdm


BIOBERT = 'FremyCompany/BioLORD-2023' # this model accommodate semantic textual similarity
device = 'gpu' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

In [2]:
biobert_model = SentenceTransformer(BIOBERT, device=device)

In [3]:
g1 = rdflib.Graph()
g1.parse('test_bg.ttl', format='ttl')
g2 = rdflib.Graph()
g2.parse('test.ttl', format='ttl')

<Graph identifier=Nd68083d6a1a6450e8e44326935e6573c (<class 'rdflib.graph.Graph'>)>

In [10]:
# calculate embeddings for each predicate and object


def get_embeddings(g):
    embeddings = {}

    def get_subject_embedding(s):
        s_embeddings = []
        for p, o in g.predicate_objects(subject=s):
            if (p, o) in embeddings:
                s_embeddings += [embeddings[(p,o)]]
            else:
                if not str(o).startswith('file'):  
                    stacked_embeddings = biobert_model.encode([str(p), str(o)], convert_to_tensor=True)
                    p_o_embedding = torch.mean(stacked_embeddings, 0)
                else:
                    o_embedding = get_subject_embedding(o)
                    p_embedding = biobert_model.encode(str(p), convert_to_tensor=True)
                    stacked_embeddings = torch.stack([p_embedding, o_embedding], dim=0)
                    p_o_embedding = torch.mean(stacked_embeddings, 0)
                s_embeddings += [p_o_embedding]
                embeddings[(p,o)] = p_o_embedding
        return torch.mean(torch.stack(s_embeddings, 0), 0)

    subject_embeddings = {str(s).split('/')[-1]:get_subject_embedding(s) for s in tqdm(g.subjects())}
            
    return subject_embeddings

g1_embeddings_dict = get_embeddings(g1)
g2_embeddings_dict = get_embeddings(g2)

11it [00:02,  4.45it/s]
19it [00:00, 61.47it/s]


In [16]:
def compare(G1_embeddings_dict, G2_embeddings_dict):
    g1_keys, g1_embeddings = zip(*G1_embeddings_dict.items())
    g2_keys, g2_embeddings = zip(*G2_embeddings_dict.items())
    
    g1_tensor = torch.stack(g1_embeddings)  # Shape: [len(dict1), embedding_dim]
    g2_tensor = torch.stack(g2_embeddings)  # Shape: [len(dict2), embedding_dim]
    
    # Compute cosine similarity (matrix multiplication + normalization)
    g1_norm = g1_tensor / g1_tensor.norm(dim=1, keepdim=True)
    g2_norm = g2_tensor / g2_tensor.norm(dim=1, keepdim=True)
    
    # Retrieve top K matches for each entry in dict1
    top_k = 1  # Change to desired number of top matches
    
    for k, e in G1_embeddings_dict.items():
        cos_scores = util.pytorch_cos_sim(e, g2_tensor)[0]
        top_results = torch.topk(cos_scores, top_k)
        print(k)
        for _, (score, idx) in enumerate(zip(top_results[0], top_results[1])):
            print('   ', g1_keys[idx], float(score))


In [17]:
compare(g1_embeddings_dict, g2_embeddings_dict)

bg.json#bioProcID
    bg.json#bioProcID 1.0
bg.json#sourceID
    bg.json#sourceID 1.0
bg.json#sinkID
    bg.json#sinkID 1.0
bg.json#mediatorID
    bg.json#mediatorID 1.0


In [18]:
compare(g2_embeddings_dict, g1_embeddings_dict)

test.cellml#sourceID2
    test.cellml#sourceID2 1.0
test.cellml#sinkID2
    test.cellml#sinkID2 0.9999998807907104
test.ttl#local_bioProcID2
    test.ttl#local_bioProcID2 1.0
test.ttl#local_sourceID2
    test.ttl#local_sourceID2 1.0
test.cellml#bioProcID2
    test.cellml#bioProcID2 1.0
test.ttl#local_sinkID2
    test.ttl#local_sinkID2 1.0
test.cellml#mediatorID2
    test.cellml#mediatorID2 1.0
test.ttl#local_mediatorID2
    test.ttl#local_mediatorID2 1.0
