In [36]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd

In [37]:
# Loaded models keyed by model name, so repeated calls with the same name reuse
# one instance instead of constructing the model again on every call.
# Re-running this cell resets the cache.
_MODEL_CACHE = {}


def _get_model(model_name):
    """Return the SentenceTransformer for ``model_name``, constructing it on first use."""
    if model_name not in _MODEL_CACHE:
        _MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _MODEL_CACHE[model_name]


def calc_cosine_sim(model_name, sentence_src, sentence_target):
    """Embed two collections of sentences and return their pairwise cosine scores.

    Args:
        model_name: Identifier passed to ``SentenceTransformer`` to select the model.
        sentence_src: Sentence(s) encoded as the first operand.
        sentence_target: Sentence(s) encoded as the second operand.

    Returns:
        The result of ``util.pytorch_cos_sim(embedding_src, embedding_target)``.
        The calling cells treat it as a 2-D tensor with one row per source
        sentence and one column per target sentence.
    """
    model = _get_model(model_name)
    embedding_src = model.encode(sentence_src, convert_to_tensor=True)
    embedding_target = model.encode(sentence_target, convert_to_tensor=True)
    return util.pytorch_cos_sim(embedding_src, embedding_target)

In [39]:
# Read the "Exio4_vs_FAO" sheet and keep only the first row for each FAO
# "Item", renumbering the index from 0 afterwards.
df_exio2fao = (
    pd.read_excel("../data/raw/Exio4_vs_FCL.xlsx", sheet_name="Exio4_vs_FAO")
    .drop_duplicates(subset=["Item"], keep="first")
    .reset_index(drop=True)
)

In [40]:
# Preview the first five rows of the de-duplicated mapping table.
df_exio2fao.head(n=5)

Unnamed: 0,description,Exio prod code,Item,Item code,Unnamed: 4,Unnamed: 5
0,Agave fibres nes,C_Agavs,Agave fibres nes,800,,
1,"Alcohol, Non-Food Purposes",C_Alc_food,"Alcohol, Non-Food Purposes",632,,
2,"Almonds, Shelled",C_Almod,"Almonds, Shelled",231,,
3,Almonds,C_Almol,Almonds,221,,
4,"Anise, badian, fennel",C_Anisr,"Anise, badian, fennel",711,,


In [41]:
# FAO item names are the source sentences; Exiobase descriptions are the targets.
arr_fao_descrip = df_exio2fao["Item"].values
arr_exio_descrip = df_exio2fao["description"].values

# Pairwise similarity between every FAO item and every Exiobase description.
cosine_score = calc_cosine_sim(
    model_name="all-mpnet-base-v2",
    sentence_src=arr_fao_descrip,
    sentence_target=arr_exio_descrip,
)

In [42]:
# Order each row's scores from highest to lowest; `indices` holds the original
# column position of every sorted score.
sorted_cs, indices = cosine_score.sort(descending=True, dim=1)

In [43]:
# Only the top-ranked match per FAO product is kept, so take column 0 of the
# sorted scores and of the sort indices once, instead of converting a whole
# tensor row to NumPy for every product inside a loop.
best_scores = sorted_cs[:, 0].cpu().numpy()
best_exio_ix = indices[:, 0].cpu().numpy()

# Rows of df_exio2fao holding each product's best match, in product order.
# `.loc` is label-based, which is the same lookup the per-row version used.
best_exio_rows = df_exio2fao.loc[best_exio_ix]

# Build the frame in a single call rather than enlarging an empty DataFrame one
# cell at a time through `.loc`.
result_df = pd.DataFrame(
    {
        "fao_description": arr_fao_descrip,
        "exio_description_nlp": best_exio_rows["description"].to_numpy(),
        "exio_code_nlp": best_exio_rows["Exio prod code"].to_numpy(),
        # Scores are stored rounded to three decimals.
        "cosine_score": [round(float(score), 3) for score in best_scores],
    }
)


In [44]:
# Inspect the last five best-match rows.
result_df.tail(n=5)

Unnamed: 0,fao_description,exio_description_nlp,exio_code_nlp,cosine_score
445,Ducks,Duck meat,C_Meat_duck,0.562
446,Turkeys,Turkey meat,C_Meat_turk,0.725
447,Asses,Meat of asses,C_Meat_ass,0.602
448,Horses,Offals of horses,C_Offas_hors,0.711
449,Rabbits,Rabbit meat,C_Meat_rabb,0.619


In [45]:
# Right-join the NLP matches onto the reference table through the FAO item
# name, then discard fully duplicated rows.
df_merge = result_df.merge(
    right=df_exio2fao,
    how="right",
    left_on="fao_description",
    right_on="Item",
).drop_duplicates()

In [46]:
# Row counts of the merged, reference and NLP-result frames, in that order.
tuple(len(frame) for frame in (df_merge, df_exio2fao, result_df))

(450, 450, 450)

In [47]:
# Keep the columns needed for the comparison and flag the rows where the
# NLP-selected Exio code equals the reference Exio code.
# `assign` adds the column to the freshly selected frame instead of setting it
# on a selection afterwards, and the column-wise `==` replaces a row-by-row
# `apply`.
df_merge = df_merge[
    [
        "fao_description",
        "exio_description_nlp",
        "exio_code_nlp",
        "description",
        "Exio prod code",
        "cosine_score",
    ]
].assign(compare=lambda frame: frame["exio_code_nlp"] == frame["Exio prod code"])


In [48]:
# Share of rows whose NLP-selected code equals the reference code.
len(df_merge.loc[df_merge["compare"].eq(True)]) / len(df_merge)

0.9777777777777777

In [49]:
# The 20 rows with the lowest best-match cosine score.
df_merge.sort_values("cosine_score").head(n=20)

Unnamed: 0,fao_description,exio_description_nlp,exio_code_nlp,description,Exio prod code,cosine_score,compare
445,Ducks,Duck meat,C_Meat_duck,Poultry - Meat (live),C_Polmeat,0.562,False
447,Asses,Meat of asses,C_Meat_ass,Other animal products,C_Othanpr,0.602,False
449,Rabbits,Rabbit meat,C_Meat_rabb,Other animal products,C_Othanpr,0.619,False
349,Pigs,Pig meat,C_Meat_pig,Pig meat (live),C_Pigmeat,0.668,False
356,Chickens,Hen eggs,C_Eggsl,Poultry - Meat (live),C_Polmeat,0.682,False
186,Goats,Goats - Meat (live),C_Goameat,Goats - Meat (live),C_Goameat,0.688,True
382,Sheep,Sheep - Milk,C_Shemilk,Sheep - Meat (live),C_Shemeat,0.693,False
335,Camels,Fat of camels,C_Fat_cam,Other animal products,C_Othanpr,0.705,False
448,Horses,Offals of horses,C_Offas_hors,Other animal products,C_Othanpr,0.711,False
446,Turkeys,Turkey meat,C_Meat_turk,Poultry - Meat (live),C_Polmeat,0.725,False


In [50]:
# Rows where the NLP-selected code differs from the reference code.
df_merge.loc[df_merge["compare"].eq(False)]

Unnamed: 0,fao_description,exio_description_nlp,exio_code_nlp,description,Exio prod code,cosine_score,compare
73,Cattle,Cattle - Milk,C_Catmilk,Cattle - Meat (live),C_Catmeat,0.779,False
335,Camels,Fat of camels,C_Fat_cam,Other animal products,C_Othanpr,0.705,False
349,Pigs,Pig meat,C_Meat_pig,Pig meat (live),C_Pigmeat,0.668,False
356,Chickens,Hen eggs,C_Eggsl,Poultry - Meat (live),C_Polmeat,0.682,False
382,Sheep,Sheep - Milk,C_Shemilk,Sheep - Meat (live),C_Shemeat,0.693,False
445,Ducks,Duck meat,C_Meat_duck,Poultry - Meat (live),C_Polmeat,0.562,False
446,Turkeys,Turkey meat,C_Meat_turk,Poultry - Meat (live),C_Polmeat,0.725,False
447,Asses,Meat of asses,C_Meat_ass,Other animal products,C_Othanpr,0.602,False
448,Horses,Offals of horses,C_Offas_hors,Other animal products,C_Othanpr,0.711,False
449,Rabbits,Rabbit meat,C_Meat_rabb,Other animal products,C_Othanpr,0.619,False


In [51]:
# Spot-check one pair of labels.
# NOTE: this rebinds `cosine_score`, replacing the full FAO-by-Exio score
# matrix computed earlier in the notebook.
cosine_score = calc_cosine_sim(
    model_name="all-mpnet-base-v2",
    sentence_src=["Cattle"],
    sentence_target=["Other animal products"],
)

In [52]:
# Display the similarity score for the single "Cattle" / "Other animal products" pair.
cosine_score

tensor([[0.4923]])