In [57]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import LoggingHandler, SentenceTransformer

class VectorizeDatabase:
    """Build an in-memory embedding index from one level of an HS-code table.

    Attributes:
        df: Source DataFrame. Must contain the columns 'level',
            'description' and 'hscode'.
        model: Object exposing ``encode(texts)`` that accepts a list of
            strings and returns one vector per string (for example a
            ``SentenceTransformer``).
        database: List of ``(embedding, hscode)`` tuples. Empty until
            ``vectorize_database`` has been called.
    """

    def __init__(self, dataframe, model):
        self.df = dataframe
        self.model = model
        self.database = []

    def vectorize_database(self, level=6):
        """Embed every description at ``level`` and pair it with its hscode.

        Args:
            level: Value of the 'level' column to select. Defaults to 6,
                which is what this method used before the parameter existed.

        Returns:
            The list of ``(embedding, hscode)`` tuples, in the row order of
            the filtered DataFrame. The same list is stored on
            ``self.database``.
        """
        # .loc returns a new frame, so the caller's DataFrame is never touched
        # and no in-place work on a filtered slice is needed.
        selected = self.df.loc[self.df['level'] == level, ['description', 'hscode']]
        descriptions = selected['description'].tolist()
        hscodes = selected['hscode'].tolist()

        if not descriptions:
            # Nothing to embed: skip the model call entirely.
            self.database = []
            return self.database

        # One batched call instead of one call per row.
        embeddings = self.model.encode(descriptions)
        self.database = list(zip(embeddings, hscodes))
        return self.database

In [58]:
# Load the HS-code table and the embedding model.
# NOTE(review): 'hscode' dtype is inferred by read_csv here - confirm that
# codes with leading zeros come through as strings.
df = pd.read_csv('data/harmonized-system.csv')
model = SentenceTransformer('mini_lm_5M')

# Same filter that VectorizeDatabase applies internally; later cells index
# this frame by position (.iloc), so the two must stay in step.
df_lv6 = df.loc[df['level'] == 6]

vectorize_database = VectorizeDatabase(df, model)
database = vectorize_database.vectorize_database()

In [83]:
import pickle

# Write the vectorized database to disk.
with open('vectorized_data.pkl', 'wb') as pkl_file:
    pickle.dump(database, pkl_file)

In [59]:
class SimilaritySearch:
    """Rank database entries by cosine similarity to a text query.

    Attributes:
        database: Sequence of tuples whose first element is an embedding
            vector (the layout produced by ``VectorizeDatabase``).
        model: Object exposing ``encode(text)`` that returns a 1-D numpy
            vector for a single string.
        target_embedding: Embedding of the most recent query, or None.
        similarity_scores: One-element list holding the similarity array of
            the most recent query (one row per database entry), or None.
        top_indices: Database positions of the most recent query, ordered
            from most to least similar, or None.
    """

    def __init__(self, database, model):
        self.database = database
        self.model = model
        self.target_embedding = None
        self.similarity_scores = None
        self.top_indices = None

    def search(self, target_text):
        """Score every database entry against ``target_text``.

        Args:
            target_text: Query string passed to ``model.encode``.

        Returns:
            Array of database positions sorted by descending similarity.
            The same array is stored on ``self.top_indices``.

        Raises:
            ValueError: If the database is empty.
        """
        if len(self.database) == 0:
            # Fail with a clear message before any encoding or scoring work.
            raise ValueError("cannot search an empty database")

        self.target_embedding = self.model.encode(target_text)
        vectors = [entry[0] for entry in self.database]
        # One column because there is a single query vector.
        scores = cosine_similarity(vectors, self.target_embedding.reshape(1, -1))

        # Stored wrapped in a list to keep the attribute layout existing
        # callers may rely on (similarity_scores[0] is the score array).
        self.similarity_scores = [scores]
        self.top_indices = np.argsort(scores[:, 0])[::-1]
        return self.top_indices

In [81]:
similarity_search = SimilaritySearch(database, model)
# NOTE(review): the query below is an empty string - confirm whether a real
# query text was intended here.
similarity_search.search('')

# Five best matches; positions index into df_lv6.
top_index = similarity_search.top_indices[:5]
print(df_lv6['hscode'].iloc[top_index].tolist())

['910211', '910219', '911019', '910212', '910119']


In [82]:
# Descriptions of the same top matches, in rank order.
df_lv6['description'].iloc[top_index].tolist()

['Wrist-watches; electrically operated, with or without a stop-watch, (other than those of heading no. 9101) with mechanical display only',
 'Wrist-watches; electrically operated, with or without a stop-watch, (other than those of heading no. 9101) without mechanical or opto-electronic display',
 'Watches; rough movements',
 'Wrist-watches; electrically operated, with or without a stop-watch, (other than those of heading no. 9101) with opto-electronic display only',
 'Wrist-watches; electrically operated, with or without a stop-watch, with case of precious metal or metal clad with precious metal, without mechanical display']