In [1]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import sqlite3

## Data preparation

In [30]:
# Load the whole correspondence table. The connection is closed in `finally`
# so the SQLite handle is released even when the query raises.
conn = sqlite3.connect("../scripts/data/clean/correspondence.db")
try:
    sql_query = "SELECT * FROM correspondence"
    df_corr = pd.read_sql_query(sql_query, conn)
finally:
    conn.close()

# Suffix every external name with the name of the source it comes from.
df_corr["name_external"] = df_corr["name_external"] + ", " + df_corr["source_external"]
df_corr.head()

Unnamed: 0,name_bonsai,name_external,source_external
0,agave fibres nes,"agave fibres nes, food and agriculture organiz...",food and agriculture organization corporate st...
1,"alcohol, non-food purposes","alcohol, non-food purposes, food and agricultu...",food and agriculture organization corporate st...
2,"almonds, shelled","almonds, shelled, food and agriculture organiz...",food and agriculture organization corporate st...
3,almonds,"almonds, food and agriculture organization cor...",food and agriculture organization corporate st...
4,"anise, badian, fennel","anise, badian, fennel, food and agriculture or...",food and agriculture organization corporate st...


In [31]:
# Underlying value arrays of the two name columns of the correspondence table.
arr_external = df_corr.loc[:, "name_external"].values
arr_bonsai = df_corr.loc[:, "name_bonsai"].values

# Disabled example calls for two of the sentence-transformer models:
# cosine_score = calc_cosine_sim("all-mpnet-base-v2", arr_external, arr_bonsai)
# cosine_score = calc_cosine_sim("all-MiniLM-L6-v2", arr_external, arr_bonsai)

# Pretrained models

## Sentence transformer

Encoder Models:
- multi-qa-mpnet-base-dot-v1
- all-MiniLM-L6-v2
- all-mpnet-base-v2

In [86]:
def calc_cosine_sim(model_name, sentence_src, sentence_target):
    """Compute cosine similarities between two collections of sentences.

    A SentenceTransformer is loaded by ``model_name`` on every call, both
    collections are encoded as tensors (source first, then target), and the
    result of ``util.pytorch_cos_sim(source, target)`` is returned.

    :param model_name: name passed to ``SentenceTransformer``.
    :param sentence_src: sentences encoded as the first argument.
    :param sentence_target: sentences encoded as the second argument.
    :return: the similarity tensor produced by ``util.pytorch_cos_sim``.
    """
    encoder = SentenceTransformer(model_name)
    src_vectors, target_vectors = (
        encoder.encode(sentences, convert_to_tensor=True)
        for sentences in (sentence_src, sentence_target)
    )
    return util.pytorch_cos_sim(src_vectors, target_vectors)

def match_classification(arr_external, arr_bonsai, model_name, df_reference=None):
    """Match every external name to its most similar bonsai name and score the result.

    The predicted name is taken from ``arr_bonsai`` itself (the candidates that
    were actually embedded) instead of being looked up by label in a global
    frame, so the function stays correct for any candidate array.

    :param arr_external: sequence of external names to classify.
    :param arr_bonsai: sequence of candidate bonsai names.
    :param model_name: SentenceTransformer model name; also used as the suffix
        of the prediction column ``"name_bonsai_" + model_name``.
    :param df_reference: frame holding the expected pairs in the columns
        ``name_external`` and ``name_bonsai``. Defaults to the notebook-level
        ``df_corr``.
    :return: the right-merge of the predictions with ``df_reference`` (duplicates
        dropped) plus a boolean ``compare`` column that is True where the
        prediction equals the expected ``name_bonsai``. The share of True rows
        is printed.
    """
    if df_reference is None:
        df_reference = df_corr  # notebook-level correspondence table

    cosine_score = calc_cosine_sim(model_name, arr_external, arr_bonsai)

    # Only the best candidate per row is used, so a row-wise max replaces the
    # full sort; both results are moved to the CPU once instead of per row.
    best_scores, best_indices = cosine_score.max(dim=1)
    best_scores = best_scores.cpu().numpy()
    best_indices = best_indices.cpu().numpy()

    predicted_col = "name_bonsai_" + model_name
    # Build the frame in one shot rather than enlarging it row by row.
    result_df = pd.DataFrame(
        {
            "name_external": list(arr_external),
            predicted_col: [arr_bonsai[i] for i in best_indices],
            "cosine_score": [round(float(score), 3) for score in best_scores],
        }
    )

    df_merge = result_df.merge(
        df_reference,
        on=["name_external"],
        how="right",
    ).drop_duplicates()
    df_merge["compare"] = df_merge[predicted_col] == df_merge["name_bonsai"]
    # Mean of the boolean column is the share of correct matches (nan, not a
    # ZeroDivisionError, when the merged frame is empty).
    print(df_merge["compare"].mean())
    return df_merge

In [87]:
# Run the matching with the multi-qa-mpnet-base-dot-v1 encoder.
df_merge = match_classification(
    arr_external,
    arr_bonsai,
    model_name="multi-qa-mpnet-base-dot-v1",
)

0.5581818181818182


In [77]:
# Row counts of the merged result and of the reference table. `result_df` is
# not referenced here: at this point it only exists inside
# match_classification, so using it breaks a top-to-bottom run. The counts are
# printed because only the last expression of a cell is displayed.
print(len(df_merge), len(df_corr))
df_merge.head()

Unnamed: 0,name_external,name_bonsai_nlp,cosine_score,name_bonsai,source_external
0,"agave fibres nes, food and agriculture organiz...",agave fibres nes,0.765,agave fibres nes,food and agriculture organization corporate st...
1,"alcohol, non-food purposes, food and agricultu...","alcohol, non-food purposes",0.693,"alcohol, non-food purposes",food and agriculture organization corporate st...
2,"almonds, shelled, food and agriculture organiz...","almonds, shelled",0.717,"almonds, shelled",food and agriculture organization corporate st...
3,"almonds, food and agriculture organization cor...",almonds,0.697,almonds,food and agriculture organization corporate st...
4,"anise, badian, fennel, food and agriculture or...","anise, badian, fennel",0.703,"anise, badian, fennel",food and agriculture organization corporate st...


In [35]:
# Show the mismatched rows, least confident first. Filtering and sorting are
# combined into the cell's final expression so the sort is actually displayed
# instead of being computed and discarded.
df_merge[df_merge["compare"] == False].sort_values(by="cosine_score", ascending=True)

Unnamed: 0,name_external,name_bonsai_nlp,cosine_score,name_bonsai,source_external,compare
473,"aquavit, 40 % vol., average values","juice, orange, single strength",0.407,"beverages, distilled alcoholic",concito,False
1093,"remoulade, average values","peppermint, spearmint",0.445,"eggs, hen, in shell",concito,False
1095,"remoulade, average values","peppermint, spearmint",0.445,"oil, sunflower",concito,False
1359,"water, tap, drinking, average values",steam and hot water supply services,0.448,food prep nes,concito,False
731,"dumplings, average values",cake of kapok,0.459,pastry,concito,False
...,...,...,...,...,...,...
737,energy drink,"beverages, distilled alcoholic",0.584,food prep nes,concito,False
1346,"vegan nuggets, soy based",soybeans,0.584,soya paste,concito,False
617,"celeriac, celery root, raw",lettuce and chicory,0.584,roots and tubers nes,concito,False
1272,"spring roll, frozen, ready meals","oats, rolled",0.585,fonio,concito,False


## BERT model

In [32]:
from transformers import BertTokenizer, BertModel
import torch
from torch.nn.functional import cosine_similarity
import pandas as pd
from torch import Tensor, device

# Initialize BERT tokenizer and model
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell also runs on machines without CUDA. The device is built through
# `torch.device` rather than by calling the name `device`, which this
# assignment rebinds from the imported class to an instance.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
def encode_sentences(sentences):
    """Embed sentences with the notebook-level BERT tokenizer and model.

    The sentences are tokenized with padding and truncation, the resulting
    tensors are moved to ``device``, and the model is run without gradient
    tracking.

    :param sentences: the sentences handed to the tokenizer.
    :return: the last hidden state at sequence position 0 for every sentence.
    """
    encoded = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        model_output = model(**encoded)

    # Position 0 holds the [CLS] token, used here as the sentence representation.
    cls_embeddings = model_output.last_hidden_state[:, 0, :]
    return cls_embeddings
    
def cos_sim(a: Tensor, b: Tensor):
    """
    Pairwise cosine similarity between the rows of ``a`` and the rows of ``b``.

    Non-tensor inputs are converted with ``torch.tensor`` and 1-D inputs are
    treated as a single row.
    :return: Matrix with res[i][j]  = cos_sim(a[i], b[j])
    """
    def _as_rows(value):
        # Convert to a tensor if needed and promote a 1-D vector to one row.
        rows = value if isinstance(value, torch.Tensor) else torch.tensor(value)
        if len(rows.shape) == 1:
            rows = rows.unsqueeze(0)
        return rows

    lhs, rhs = _as_rows(a), _as_rows(b)
    # L2-normalise each row so the matrix product yields cosine similarities.
    lhs_unit = torch.nn.functional.normalize(lhs, p=2, dim=1)
    rhs_unit = torch.nn.functional.normalize(rhs, p=2, dim=1)
    return torch.mm(lhs_unit, rhs_unit.transpose(0, 1))
    
# Plain Python lists of the external (source) and bonsai (target) names.
source_sentence = arr_external.tolist()
target_sentence = arr_bonsai.tolist()  # quick-test alternative: ["buffalo meat", "buffalos - meat (live)"]

# Embed both lists with the BERT encoder defined in this cell.
source_embedding = encode_sentences(source_sentence)
target_embedding = encode_sentences(target_sentence)

In [33]:
# Source-by-target cosine similarities, then each row ordered from the most to
# the least similar target (scores and the matching target positions).
similarity = cos_sim(source_embedding, target_embedding)
sorted_cs, indices = torch.sort(similarity, dim=1, descending=True)

In [34]:
# Display the similarity matrix (rows: source sentences, columns: target sentences).
similarity

tensor([[0.8116, 0.7409, 0.7865,  ..., 0.7822, 0.7822, 0.7207],
        [0.7806, 0.7711, 0.7763,  ..., 0.7949, 0.7949, 0.7347],
        [0.7930, 0.7415, 0.8101,  ..., 0.7839, 0.7839, 0.7206],
        ...,
        [0.7789, 0.7717, 0.7774,  ..., 0.8345, 0.8345, 0.7520],
        [0.7258, 0.7239, 0.7556,  ..., 0.7531, 0.7531, 0.6888],
        [0.6820, 0.6750, 0.7135,  ..., 0.6858, 0.6858, 0.6463]],
       device='cuda:0')

In [35]:
# Best BERT match per source sentence. The frame is built in one pass from the
# first column of the sorted scores/indices instead of being enlarged row by
# row, and the predicted name is read from `target_sentence` (the list that was
# embedded) rather than looked up by label in `df_corr`.
predicted_col = "name_bonsai_" + model_name
top_scores = sorted_cs[:, 0].cpu().numpy()
top_indices = indices[:, 0].cpu().numpy()
result_df = pd.DataFrame(
    {
        "name_external": source_sentence,
        predicted_col: [target_sentence[i] for i in top_indices],
        "cosine_score": [round(float(score), 3) for score in top_scores],
    }
)

# Attach the expected bonsai name of every external name and flag the rows
# where the prediction agrees with it.
df_merge = result_df.merge(
    df_corr,
    on=["name_external"],
    how="right",
    ).drop_duplicates()
df_merge["compare"] = df_merge[predicted_col] == df_merge["name_bonsai"]
# Share of correct matches (nan instead of a ZeroDivisionError on an empty frame).
print(df_merge["compare"].mean())

0.03636363636363636


In [29]:
# Rows where the BERT prediction differs from the expected bonsai name.
df_merge[df_merge["compare"].eq(False)]

Unnamed: 0,name_external,name_bonsai_bert-base-uncased,cosine_score,name_bonsai,source_external,compare
52,buffaloes,yams,0.850,buffalos - meat (live),food and agriculture organization corporate st...,False
53,buffalo milk,buffalo meat,0.943,buffalos - milk,food and agriculture organization corporate st...,False
61,camel milk,dry skim cow milk,0.955,camel - milk,food and agriculture organization corporate st...,False
73,cattle,wheat,0.943,cattle - meat (live),food and agriculture organization corporate st...,False
74,"cow milk, whole (fresh)","milk, skimmed cow",0.926,cattle - milk,food and agriculture organization corporate st...,False
...,...,...,...,...,...,...
1422,solar thermal,biogas,0.925,electricity,international energy agency,False
1423,"""tide, wave and ocean""","cocoons, reelable",0.803,electricity,international energy agency,False
1424,wind,garlic,0.969,electricity,international energy agency,False
1425,other sources,millet,0.924,electricity,international energy agency,False
