In [28]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from tqdm import tqdm

Let's fetch the dataframes.

In [39]:
model = SentenceTransformer('all-mpnet-base-v2')
vtt_domain_df = pd.read_csv("./data/dataframes/df_relationships_vtt_domain.csv")
comp_domain_df = pd.read_csv("./data/dataframes/df_relationships_comp_domain.csv")

In [43]:
comp_domain_df.head(2)

Unnamed: 0,Document number,Source Company,relationship description,source id,source type,source description,relationship type,target id,target type,target description,Link Source Text,Source Text
0,0,FORTUM OYJ,Fortum Corporation is responsible for executin...,First nuclear decommissioning project in Finla...,Innovation,The first nuclear reactor decommissioning proj...,DEVELOPED_BY,Fortum Corporation,Organization,A company with over 40 years of experience in ...,https://www.fortum.com/media/2020/04/fortum-aw...,"Press release\n 08 April 2020, 7:00 EEST ..."
1,0,FORTUM OYJ,VTT Technical Research Centre of Finland Ltd c...,First nuclear decommissioning project in Finla...,Innovation,The first nuclear reactor decommissioning proj...,DEVELOPED_BY,VTT Technical Research Centre of Finland Ltd,Organization,A research organization in Finland contracting...,https://www.fortum.com/media/2020/04/fortum-aw...,"Press release\n 08 April 2020, 7:00 EEST ..."


In [42]:
vtt_domain_df.head(2)

Unnamed: 0,Document number,VAT id,relationship description,source id,source type,source english_id,source description,relationship type,target id,target type,target english_id,target description,Link Source Text,Source Text
0,0,FI10292588,"FiR 1 nuclear research reactor was developed, ...",FiR 1,Innovation,FiR 1,FiR 1 is a Triga-type nuclear research reactor...,DEVELOPED_BY,FI26473754,Organization,VTT Technical Research Centre of Finland Ltd.,VTT is a Finnish research and innovation partn...,https://www.vttresearch.com/en/news-and-ideas/...,Skip to main content Beyond the obvious Open m...
1,0,FI10292588,Centre for Nuclear Safety is being developed a...,Centre for Nuclear Safety,Innovation,Centre for Nuclear Safety,A modern research facility under construction ...,DEVELOPED_BY,FI26473754,Organization,VTT Technical Research Centre of Finland Ltd.,VTT is a Finnish research and innovation partn...,https://www.vttresearch.com/en/news-and-ideas/...,Skip to main content Beyond the obvious Open m...


In [None]:
def create_text_to_compare(row):
    text_to_compare = row["source_id"]

In [None]:
# Clean the VTT domain df.



vtt_domain_df["text_to_compare"] = (
    vtt_domain_df["source id"].fillna("") + " " +
    vtt_domain_df["source english_id"].fillna("") + " " +
    vtt_domain_df["source description"].fillna(""))

vtt_domain_df = vtt_domain_df[vtt_domain_df["source type"] != "Organization"]
vtt_domain_df = vtt_domain_df.drop_duplicates(subset="source description", keep="first")

# Clean the comp domain df.

comp_domain_df["text_to_compare"] = (
    comp_domain_df["source id"].fillna("") + " " +
    comp_domain_df["source description"].fillna(""))

comp_domain_df = comp_domain_df[comp_domain_df["source type"] != "Organization"]
comp_domain_df = comp_domain_df.drop_duplicates(subset="source description", keep="first")

print("Comp domain df length:", len(comp_domain_df), " - VTT domain df length:", len(vtt_domain_df))

Comp domain df length: 1528  - VTT domain df length: 1504


Embed everything.

In [36]:
vtt_embeddings = model.encode(vtt_domain_df["text_to_compare"].tolist(), convert_to_tensor=True)
comp_embeddings = model.encode(comp_domain_df["text_to_compare"].tolist(), convert_to_tensor=True)

In [37]:
threshold = 0.80
similar_pairs = []

print("Comparing entries pair by pair...")

for i in tqdm(range(len(vtt_embeddings))):
    for j in range(i + 1, len(comp_embeddings)):
        sim = util.cos_sim(vtt_embeddings[i], comp_embeddings[j]).item()
        
        if sim > threshold:
            source_i = vtt_domain_df.iloc[i]["text_to_compare"]
            source_j = comp_domain_df.iloc[j]["text_to_compare"]
            
            print(f"\nSimilarity: {sim:.3f} | From: {i+1} To: {j+1}")
            print(f"→ Source {i+1} description: {source_i}")
            print(f"→ Source {j+1} description: {source_j}")
            
            similar_pairs.append((i, j, sim))

# Output results
print(f"\nFound {len(similar_pairs)} high-similarity pairs (> {threshold}):")
for i, j, score in similar_pairs:
    print(f"[{i}] '{df.iloc[i]['source english_id']}' <--> [{j}] '{df.iloc[j]['source english_id']}' | Similarity: {score:.3f}")

Comparing entries pair by pair...


  0%|          | 0/1504 [00:00<?, ?it/s]


Similarity: 0.828 | From: 1 To: 880
→ Source 1 description: FiR 1 FiR 1 FiR 1 is a Triga-type nuclear research reactor located in Otaniemi, Espoo, Finland, decommissioned by VTT after serving since 1962, used for nuclear research, training, and medical radiation therapy.
→ Source 880 description: FiR 1 -tutkimusreaktori A 250 kilowatt research nuclear reactor located in Otaniemi, Espoo, used from 1962 to 2015, now undergoing decommissioning.


  0%|          | 1/1504 [00:00<21:03,  1.19it/s]


Similarity: 0.803 | From: 2 To: 285
→ Source 2 description: Centre for Nuclear Safety Centre for Nuclear Safety A modern research facility under construction by VTT in Otaniemi, Espoo for nuclear safety research, equipped with state-of-the-art technology for studying radioactive materials and improving nuclear safety.
→ Source 285 description: nuclear safety house A new four-storey building bringing together all nuclear safety research activities of VTT under one roof, integrating office and laboratory spaces with an activity-based environment to improve collaboration and experimental research capabilities.


  1%|          | 10/1504 [00:07<17:28,  1.42it/s]


Similarity: 0.846 | From: 10 To: 1442
→ Source 10 description: bio-based packaging solution bio-based packaging solution A compostable and lightweight packaging material made of cellulose films developed by VTT that reduces the use of plastics, extends shelf life of food, and reduces food waste and microplastics.
→ Source 1442 description: bio-based, recyclable and reusable packaging materials Bio-based, recyclable and reusable packaging materials developed by Paptic Ltd that aim to replace plastic alternatives.


  2%|▏         | 31/1504 [00:20<15:45,  1.56it/s]


Similarity: 0.826 | From: 32 To: 791
→ Source 32 description: 5G-Safe project 5G-Safe project A project exploring the use of the 5G mobile network to improve road safety through fast data transmission solutions, collecting vehicle and road data for applications such as road weather services, road maintenance, and self-driving car control.
→ Source 791 description: Challenge Finland 5G-SAFE project A research and development project conducted by VTT on new 5G-enabled road safety services in Finland.

Similarity: 0.892 | From: 32 To: 884
→ Source 32 description: 5G-Safe project 5G-Safe project A project exploring the use of the 5G mobile network to improve road safety through fast data transmission solutions, collecting vehicle and road data for applications such as road weather services, road maintenance, and self-driving car control.
→ Source 884 description: 5G-Safe -hanke The 5G-Safe project developing new vehicle network solutions and local road weather and safety services to suppo

  4%|▍         | 61/1504 [00:39<14:51,  1.62it/s]


Similarity: 0.874 | From: 62 To: 640
→ Source 62 description: New wood-based materials to prevent the spread of microorganisms in hospitals New wood-based materials to prevent the spread of microorganisms in hospitals Innovative renewable wood-based materials developed to prevent the spread of microorganisms in hospital environments, aiming to replace traditional materials with bio-based solutions.
→ Source 640 description: Sami&Samu An innovation involving wood-based substances with antimicrobial qualities to tackle microbial resistance in hospitals and replace plastics with bio-based materials.

Similarity: 0.800 | From: 62 To: 1079
→ Source 62 description: New wood-based materials to prevent the spread of microorganisms in hospitals New wood-based materials to prevent the spread of microorganisms in hospitals Innovative renewable wood-based materials developed to prevent the spread of microorganisms in hospital environments, aiming to replace traditional materials with bio-based so

  4%|▍         | 62/1504 [00:40<14:59,  1.60it/s]


Similarity: 0.885 | From: 63 To: 640
→ Source 63 description: New wood-based antimicrobial and low-carbon hospital materials New wood-based antimicrobial and low-carbon hospital materials Innovative renewable wood-based materials, surface finishes and textiles developed to prevent the spread of microorganisms in hospitals, aiming to replace traditional oil-based plastics and reduce waste.
→ Source 640 description: Sami&Samu An innovation involving wood-based substances with antimicrobial qualities to tackle microbial resistance in hospitals and replace plastics with bio-based materials.

Similarity: 0.826 | From: 63 To: 1079
→ Source 63 description: New wood-based antimicrobial and low-carbon hospital materials New wood-based antimicrobial and low-carbon hospital materials Innovative renewable wood-based materials, surface finishes and textiles developed to prevent the spread of microorganisms in hospitals, aiming to replace traditional oil-based plastics and reduce waste.
→ Source 10

  5%|▍         | 71/1504 [00:45<14:46,  1.62it/s]


Similarity: 0.889 | From: 72 To: 495
→ Source 72 description: Carbamate technology Carbamate technology A fibre production method transforming wood-based dissolving pulp and discarded cotton into viscose-type fibres without harmful chemicals.
→ Source 495 description: carbamate technology A technology owned by VTT producing cellulose carbamate fibre from recycled cotton by forming carbamate groups and dissolving cellulose in cold sodium zincate solution.


  5%|▍         | 72/1504 [00:46<15:10,  1.57it/s]


Similarity: 0.853 | From: 73 To: 497
→ Source 73 description: Ioncell-F technology Ioncell-F technology A technology based on direct dissolution of cellulose using ionic liquid solvent to produce strong textile fibres by dry-jet wet spinning.
→ Source 497 description: Ioncell-F technology A technology based on direct dissolution of cellulose using dry-jet wet spinning technique, developed in collaboration between Aalto University and University of Helsinki.


  5%|▍         | 73/1504 [00:47<15:47,  1.51it/s]


Similarity: 0.894 | From: 73 To: 1505
→ Source 73 description: Ioncell-F technology Ioncell-F technology A technology based on direct dissolution of cellulose using ionic liquid solvent to produce strong textile fibres by dry-jet wet spinning.
→ Source 1505 description: Ioncell-F-prosessi A technology developed through Aalto University and University of Helsinki collaboration, using ionic liquid solvents and a dry-jet wet-spinning process to produce textile fibers with excellent strength.

Similarity: 0.880 | From: 74 To: 496
→ Source 74 description: BioCelSol technology BioCelSol technology A technology that enhances cellulose dissolution via mechanical and enzymatic treatments prior to fibre spinning.
→ Source 496 description: BioCelSol technology A technology jointly owned by VTT and Tampere University of Technology enhancing cellulose dissolution by mechanical and enzymatic treatments before dissolution in sodium zincate.


  5%|▍         | 74/1504 [00:47<16:03,  1.48it/s]


Similarity: 0.838 | From: 74 To: 1506
→ Source 74 description: BioCelSol technology BioCelSol technology A technology that enhances cellulose dissolution via mechanical and enzymatic treatments prior to fibre spinning.
→ Source 1506 description: BioCelSol-teknologia A technology demonstrated by VTT that enhances cellulose dissolution via mechanical-enzymatic treatment before solvent and spinning processes for textile fiber production.


  5%|▍         | 75/1504 [00:48<15:59,  1.49it/s]


Similarity: 0.872 | From: 76 To: 991
→ Source 76 description: Finnish 5G technology enables efficient mobile streaming to large audiences Finnish 5G technology enables efficient mobile streaming to large audiences A novel 5G technology in Finland using evolved Multimedia Broadcast Multicast Service (eMBMS) enabling high-quality live video streaming to large audiences on commercial smartphones in test networks.
→ Source 991 description: evolved Multimedia Broadcast Multicast Service (eMBMS) technology A technology enabling live streaming of TV channels in mobile wireless networks, enhancing bandwidth efficiency and reliability, demonstrated for the first time in Finland in 5G test networks.


  5%|▌         | 77/1504 [00:50<15:28,  1.54it/s]


KeyboardInterrupt: 