In [28]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from tqdm import tqdm

Let's fetch the dataframes.

In [96]:
model = SentenceTransformer('all-mpnet-base-v2')
vtt_domain_df = pd.read_csv("./data/dataframes/df_relationships_vtt_domain.csv")
comp_domain_df = pd.read_csv("./data/dataframes/df_relationships_comp_url.csv")

vtt_domain_df = vtt_domain_df[vtt_domain_df["relationship type"] == "DEVELOPED_BY"].copy()
comp_domain_df = comp_domain_df[comp_domain_df["relationship type"] == "DEVELOPED_BY"].copy()

In [97]:
# Different relationship types we have:
print("Vtt domain relationships", vtt_domain_df["relationship type"].unique())
print("Comp domain relationships", comp_domain_df["relationship type"].unique())

Vtt domain relationships ['DEVELOPED_BY']
Comp domain relationships ['DEVELOPED_BY']


In [98]:
print(comp_domain_df.columns)
comp_domain_df.sample(n=3)

Index(['Unnamed: 0', 'Document number', 'Source Company',
       'relationship description', 'source id', 'source type',
       'source english_id', 'source description', 'relationship type',
       'target id', 'target type', 'target english_id', 'target description',
       'Link Source Text', 'Source Text'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,Document number,Source Company,relationship description,source id,source type,source english_id,source description,relationship type,target id,target type,target english_id,target description,Link Source Text,Source Text
417,417,62,VALMET OYJ,Valmet developed the industrial-scale pyrolize...,pyrolizer plant,Innovation,pyrolizer plant,An industrial-scale plant to be delivered to C...,DEVELOPED_BY,FI25530198,Organization,Beyond Circularity,An ambitious R&D program and ecosystem launche...,https://www.valmet.com/insights/articles/susta...,"Nov 15, 2023 \r\n Valme..."
5133,5133,975,HAMEEN AMMATTIKORKEAKOULU OY,The KieMaRa-hanke innovation was developed and...,KieMaRa-hanke,Innovation,KieMaRa-hanke,Comprehensive circular economy concepts and so...,DEVELOPED_BY,FI26174893,Organization,Hämeen ammattikorkeakoulu,A Finnish university of applied sciences contr...,https://www.hamk.fi/projektit/kiemara/,Kokonaisvaltaisesta kiertotaloudesta maaseudun...
164,164,33,UPM-KYMMENE OYJ,The innovation 'Xampla plant-based protein mat...,Xampla plant-based protein material,Innovation,Xampla plant-based protein material,A material developed by Xampla that replaces s...,DEVELOPED_BY,temp_35,Organization,Xampla,A startup company specializing in biomass indu...,https://www.upm.com/articles/beyond-fossils/22...,"\n Traffic, construction and t..."


In [99]:
print(vtt_domain_df.columns)
vtt_domain_df.sample(n=3)

Index(['Document number', 'VAT id', 'relationship description', 'source id',
       'source type', 'source english_id', 'source description',
       'relationship type', 'target id', 'target type', 'target english_id',
       'target description', 'Link Source Text', 'Source Text'],
      dtype='object')


Unnamed: 0,Document number,VAT id,relationship description,source id,source type,source english_id,source description,relationship type,target id,target type,target english_id,target description,Link Source Text,Source Text
6356,845,FI19274001,Vaasa participated in developing the data-base...,data-based framework for assessing social sust...,Innovation,data-based framework for assessing social sust...,A newly developed framework that helps assess ...,DEVELOPED_BY,FI02096026,Organization,Vaasa,An organization collaborating in the Sustainab...,https://www.vttresearch.com/en/project_news/wh...,Skip to main content Beyond the obvious Open m...
8666,1224,FI19274001,The innovation 'VTT:n innovaatio uudistaa pakk...,VTT:n innovaatio uudistaa pakkaukset - näytta...,Innovation,VTT:n innovaatio uudistaa pakkaukset - näytta...,An innovation by VTT that renovates packaging ...,DEVELOPED_BY,FI26473754,Organization,VTT,"VTT is a visionary research, development and i...",https://www.vttresearch.com/fi/uutiset-ja-tari...,Hyppää pääsisältöön Beyond the obvious ...
3047,428,FI32297208,VTT is participating in the development of com...,components and their fabrication technologies ...,Innovation,components and their fabrication technologies ...,Development of components and their fabricatio...,DEVELOPED_BY,FI26473754,Organization,VTT,A visionary research and innovation partner fo...,https://www.vttresearch.com/en/news-and-ideas/...,Skip to main content Beyond the obvious Open m...


In [100]:
def create_text_to_compare(df):

    text_to_compare = df["source id"] + " - " + df["source description"].fillna("") + " | Developed by " + df["target id"].fillna("")
    return text_to_compare

In [101]:
# Clean the VTT domain df.

vtt_domain_df["text_to_compare"] = create_text_to_compare(vtt_domain_df)

vtt_domain_df = vtt_domain_df[vtt_domain_df["source type"] != "Organization"]
vtt_domain_df = vtt_domain_df.drop_duplicates(subset="source description", keep="first")

# Clean the comp domain df.

comp_domain_df["text_to_compare"] = create_text_to_compare(comp_domain_df)

comp_domain_df = comp_domain_df[comp_domain_df["source type"] != "Organization"]
comp_domain_df = comp_domain_df.drop_duplicates(subset="source description", keep="first")

print("Comp domain df length:", len(comp_domain_df), " - VTT domain df length:", len(vtt_domain_df))
print("Example of text to compare:", vtt_domain_df.iloc[0]["text_to_compare"])

Comp domain df length: 1912  - VTT domain df length: 1504
Example of text to compare: FiR 1 - FiR 1 is a Triga-type nuclear research reactor located in Otaniemi, Espoo, Finland, decommissioned by VTT after serving since 1962, used for nuclear research, training, and medical radiation therapy. | Developed by FI26473754


Embed everything.

In [102]:
vtt_embeddings = model.encode(vtt_domain_df["text_to_compare"].tolist(), convert_to_tensor=True)
comp_embeddings = model.encode(comp_domain_df["text_to_compare"].tolist(), convert_to_tensor=True)

In [69]:
threshold = 0.80
similar_pairs = []

print("Comparing entries pair by pair...")

for i in tqdm(range(len(vtt_embeddings))):
    for j in range(i + 1, len(comp_embeddings)):
        sim = util.cos_sim(vtt_embeddings[i], comp_embeddings[j]).item()
        
        if sim > threshold:
            i_elem = vtt_domain_df.iloc[i]
            j_elem = comp_domain_df.iloc[j]
            source_i = i_elem["text_to_compare"]
            source_j = j_elem["text_to_compare"]
            
            print(f"\nSimilarity: {sim:.3f} | From: {i+1} To: {j+1}")
            print(f"→ Source {i+1} description: {source_i} and columns: {i_elem.index}")
            print(f"→ Source {j+1} description: {source_j} and columns: {j_elem.index}")
            
            
            similar_pairs.append((i, j, sim))

# Output results
print(f"\nFound {len(similar_pairs)} high-similarity pairs (> {threshold}):")
for i, j, score in similar_pairs:
    print(f"[{i}] '{df.iloc[i]['source english_id']}' <--> [{j}] '{df.iloc[j]['source english_id']}' | Similarity: {score:.3f}")

Comparing entries pair by pair...


  0%|          | 0/1504 [00:00<?, ?it/s]

Document number                                                             0
VAT id                                                             FI10292588
relationship description    FiR 1 nuclear research reactor was developed, ...
source id                                                               FiR 1
source type                                                        Innovation
source english_id                                                       FiR 1
source description          FiR 1 is a Triga-type nuclear research reactor...
relationship type                                                DEVELOPED_BY
target id                                                          FI26473754
target type                                                      Organization
target english_id               VTT Technical Research Centre of Finland Ltd.
target description          VTT is a Finnish research and innovation partn...
Link Source Text            https://www.vttresearch.com/en/news-




AttributeError: 'Series' object has no attribute 'columns'