In [28]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from tqdm import tqdm

Let's fetch the dataframes.

In [39]:
model = SentenceTransformer('all-mpnet-base-v2')
vtt_domain_df = pd.read_csv("./data/dataframes/df_relationships_vtt_domain.csv")
comp_domain_df = pd.read_csv("./data/dataframes/df_relationships_comp_domain.csv")

In [81]:
# Different relationship types we have:
print("Vtt domain relationships", vtt_domain_df["relationship type"].unique())
print("Comp domain relationships", comp_domain_df["relationship type"].unique())

Vtt domain relationships ['DEVELOPED_BY' 'COLLABORATION']
Comp domain relationships ['DEVELOPED_BY']


In [43]:
comp_domain_df.head(2)

Unnamed: 0,Document number,Source Company,relationship description,source id,source type,source description,relationship type,target id,target type,target description,Link Source Text,Source Text
0,0,FORTUM OYJ,Fortum Corporation is responsible for executin...,First nuclear decommissioning project in Finla...,Innovation,The first nuclear reactor decommissioning proj...,DEVELOPED_BY,Fortum Corporation,Organization,A company with over 40 years of experience in ...,https://www.fortum.com/media/2020/04/fortum-aw...,"Press release\n 08 April 2020, 7:00 EEST ..."
1,0,FORTUM OYJ,VTT Technical Research Centre of Finland Ltd c...,First nuclear decommissioning project in Finla...,Innovation,The first nuclear reactor decommissioning proj...,DEVELOPED_BY,VTT Technical Research Centre of Finland Ltd,Organization,A research organization in Finland contracting...,https://www.fortum.com/media/2020/04/fortum-aw...,"Press release\n 08 April 2020, 7:00 EEST ..."


In [42]:
vtt_domain_df.head(2)

Unnamed: 0,Document number,VAT id,relationship description,source id,source type,source english_id,source description,relationship type,target id,target type,target english_id,target description,Link Source Text,Source Text
0,0,FI10292588,"FiR 1 nuclear research reactor was developed, ...",FiR 1,Innovation,FiR 1,FiR 1 is a Triga-type nuclear research reactor...,DEVELOPED_BY,FI26473754,Organization,VTT Technical Research Centre of Finland Ltd.,VTT is a Finnish research and innovation partn...,https://www.vttresearch.com/en/news-and-ideas/...,Skip to main content Beyond the obvious Open m...
1,0,FI10292588,Centre for Nuclear Safety is being developed a...,Centre for Nuclear Safety,Innovation,Centre for Nuclear Safety,A modern research facility under construction ...,DEVELOPED_BY,FI26473754,Organization,VTT Technical Research Centre of Finland Ltd.,VTT is a Finnish research and innovation partn...,https://www.vttresearch.com/en/news-and-ideas/...,Skip to main content Beyond the obvious Open m...


In [63]:
def create_text_to_compare(df):
    english_id = "/" + df["source english_id"] if "source english_id" in df.columns else ""
    text_to_compare = df["source description"].fillna("") + " | Developer by " + df["source id"].fillna("") + english_id
    return text_to_compare

In [75]:
# Clean the VTT domain df.

vtt_domain_df["text_to_compare"] = create_text_to_compare(vtt_domain_df)

vtt_domain_df = vtt_domain_df[vtt_domain_df["source type"] != "Organization"]
vtt_domain_df = vtt_domain_df.drop_duplicates(subset="source description", keep="first")

# Clean the comp domain df.

comp_domain_df["text_to_compare"] = create_text_to_compare(comp_domain_df)

comp_domain_df = comp_domain_df[comp_domain_df["source type"] != "Organization"]
comp_domain_df = comp_domain_df.drop_duplicates(subset="source description", keep="first")

print("Comp domain df length:", len(comp_domain_df), " - VTT domain df length:", len(vtt_domain_df))
print("Example of text to compare", vtt_domain_df.iloc[0]["text_to_compare"])

Comp domain df length: 1528  - VTT domain df length: 1504
Example of text to compare FiR 1 is a Triga-type nuclear research reactor located in Otaniemi, Espoo, Finland, decommissioned by VTT after serving since 1962, used for nuclear research, training, and medical radiation therapy. | Developer by FiR 1/FiR 1


Embed everything.

In [36]:
vtt_embeddings = model.encode(vtt_domain_df["text_to_compare"].tolist(), convert_to_tensor=True)
comp_embeddings = model.encode(comp_domain_df["text_to_compare"].tolist(), convert_to_tensor=True)

In [69]:
threshold = 0.80
similar_pairs = []

print("Comparing entries pair by pair...")

for i in tqdm(range(len(vtt_embeddings))):
    for j in range(i + 1, len(comp_embeddings)):
        sim = util.cos_sim(vtt_embeddings[i], comp_embeddings[j]).item()
        
        if sim > threshold:
            i_elem = vtt_domain_df.iloc[i]
            j_elem = comp_domain_df.iloc[j]
            source_i = i_elem["text_to_compare"]
            source_j = j_elem["text_to_compare"]
            
            print(f"\nSimilarity: {sim:.3f} | From: {i+1} To: {j+1}")
            print(f"→ Source {i+1} description: {source_i} and columns: {i_elem.index}")
            print(f"→ Source {j+1} description: {source_j} and columns: {j_elem.index}")
            
            
            similar_pairs.append((i, j, sim))

# Output results
print(f"\nFound {len(similar_pairs)} high-similarity pairs (> {threshold}):")
for i, j, score in similar_pairs:
    print(f"[{i}] '{df.iloc[i]['source english_id']}' <--> [{j}] '{df.iloc[j]['source english_id']}' | Similarity: {score:.3f}")

Comparing entries pair by pair...


  0%|          | 0/1504 [00:00<?, ?it/s]

Document number                                                             0
VAT id                                                             FI10292588
relationship description    FiR 1 nuclear research reactor was developed, ...
source id                                                               FiR 1
source type                                                        Innovation
source english_id                                                       FiR 1
source description          FiR 1 is a Triga-type nuclear research reactor...
relationship type                                                DEVELOPED_BY
target id                                                          FI26473754
target type                                                      Organization
target english_id               VTT Technical Research Centre of Finland Ltd.
target description          VTT is a Finnish research and innovation partn...
Link Source Text            https://www.vttresearch.com/en/news-




AttributeError: 'Series' object has no attribute 'columns'