In [9]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import math
from sentence_transformers import LoggingHandler, SentenceTransformer

In [53]:
# Evaluation set: one [product description, expected 6-digit HS code] pair per row.
# Codes are written as strings so leading zeros (e.g. '020450') are preserved.
eval_data = [
    ['Soybeans', '120190'],
    ['Cell phones', '851713'],
    ['Crude oil', '270900'],
    ['Pharmaceuticals waste', '300692'],
    ['Solar cell panels', '854143'],
    ['Cotton', '520100'],
    ['Perfumes', '330300'],
    ['Textile fabrics', '520911'],
    ['Meat of goat', '020450'],
    ['Room Cooling', '841510'],
    ['badminton shoes', '640411'],
    ['Leather shoes', '640510'],
]

In [48]:
class SimilaritySearch:
    """Rank the entries of an embedding database by cosine similarity to a text query.

    Parameters
    ----------
    database : sequence
        One entry per searchable item; ``entry[0]`` is used as that item's
        embedding vector. (Assumes every vector has the same dimensionality as
        the output of ``model.encode`` -- TODO confirm against the code that
        produced the database.)
    model : object
        Any object exposing ``encode(text)`` that returns a NumPy array; the
        result is reshaped into a single row before comparison.

    Attributes
    ----------
    target_embedding
        Embedding of the most recent query, or ``None`` before the first search.
    similarity_scores
        One-element list holding the ``(n_items, 1)`` array of cosine
        similarities from the most recent search, or ``None``.
    top_indices
        Positions into ``database`` ordered from most to least similar for the
        most recent search, or ``None``.
    """

    def __init__(self, database, model):
        self.database = database
        self.model = model
        self.target_embedding = None
        self.similarity_scores = None
        self.top_indices = None

    def search(self, target_text, top_k=None):
        """Embed ``target_text`` and rank every database entry against it.

        All three result attributes are overwritten on each call, so a single
        instance can be reused for any number of queries.

        Parameters
        ----------
        target_text : str
            Query text passed to ``model.encode``.
        top_k : int, optional
            When given, only the ``top_k`` best positions are returned.
            ``self.top_indices`` always keeps the full ranking.

        Returns
        -------
        numpy.ndarray
            Positions into ``database``, most similar first.
        """
        self.target_embedding = self.model.encode(target_text)
        vectors = [entry[0] for entry in self.database]
        # Column vector of scores: one row per database entry, one column for the query.
        scores = cosine_similarity(vectors, self.target_embedding.reshape(1, -1))
        # Kept wrapped in a list so existing readers of `similarity_scores[0]` still work.
        self.similarity_scores = [scores]
        # argsort is ascending; reversing it yields best match first.
        self.top_indices = np.argsort(scores[:, 0])[::-1]
        if top_k is None:
            return self.top_indices
        return self.top_indices[:top_k]

In [32]:
# Vectorizer: the sentence-embedding model used to encode query text
# (SimilaritySearch.search calls its `encode` method).
# NOTE(review): 'mini_lm_5M' is presumably a local model directory or model name -- confirm.
model = SentenceTransformer('mini_lm_5M')

In [33]:
# Load the pre-computed embedding database that SimilaritySearch ranks against.
# NOTE(review): pickle.load can execute arbitrary code while deserialising --
# only load a 'vectorized_data.pkl' that was produced by a trusted source.
with open('vectorized_data.pkl', 'rb') as f:
    database = pickle.load(f)

In [34]:
# Load the Harmonized System table. `hscode` is pinned to str so leading zeros
# (e.g. '020450') survive parsing; the evaluation compares against string codes.
df = pd.read_csv('data/harmonized-system.csv', dtype={'hscode': str})

# Keep only level-6 rows and renumber them 0..n-1 without mutating in place.
# The evaluation cell looks rows up by position using indices ranked over
# `database`, so row order here is assumed to match the order of the vectors in
# 'vectorized_data.pkl' -- TODO confirm.
df_lv6 = df[df['level'] == 6].reset_index(drop=True)

In [54]:
# Collect the TOP_K most similar level-6 HS codes for every evaluation query.
TOP_K = 5

# search() overwrites all per-query state, so one searcher can serve every sample.
similarity_search = SimilaritySearch(database, model)

top_5 = []
for sample in eval_data:
    similarity_search.search(sample[0])
    top_index = similarity_search.top_indices[:TOP_K]
    # Positional lookup: ranked indices are treated as row positions in df_lv6.
    predicted_codes = df_lv6.iloc[top_index]['hscode'].tolist()
    print(predicted_codes)
    top_5.append(predicted_codes)
    

['210310', '120190', '120110', '120810', '150710']
['851713', '851714', '851718', '850690', '851769']
['270900', '271020', '271390', '150810', '151411']
['300692', '382530', '382510', '854911', '854999']
['854142', '854143', '854149', '841912', '853720']
['520100', '520291', '520300', '520210', '520299']
['330300', '330720', '330790', '330730', '330749']
['590110', '590310', '590320', '590390', '540720']
['020450', '020443', '020430', '020410', '020442']
['841850', '841899', '847960', '841582', '841891']
['640411', '950659', '640212', '640312', '640590']
['640510', '640420', '640320', '640359', '411410']


In [57]:
# 1-based rank of the expected HS code inside each query's top-5 prediction list.
# A query whose expected code is absent from its list contributes no entry, so
# len(ranks) is the number of hits.
ranks = []
for predicted_codes, sample in zip(top_5, eval_data):
    try:
        rank = predicted_codes.index(sample[1]) + 1
    except ValueError:
        # list.index raises ValueError when the expected code is not in the list: a miss.
        # Any other exception is a real bug and is allowed to surface.
        continue
    ranks.append(rank)

ranks

[2, 1, 1, 1, 2, 1, 1, 1, 1, 1]

In [64]:
# Top-5 accuracy in percent: hits divided by the number of evaluation queries.
# `ranks` holds one entry per query whose expected code was found, so its
# length is the hit count.
top5_accuracy = 100 * (len(ranks) / len(eval_data))
print("The top 5 accuracy of the model is:", top5_accuracy)

The top 5 accuracy of the model is: 83.33333333333334
