In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity
import nltk

from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [2]:
laptops = pd.read_csv("data/laptops.csv")

In [60]:
features_dict = {}
features_dict['display_size'] = [10.1, 11.6, 12.4, 13.0, 13.3, 13.4, 13.5, 13.6, 14.0, 14.1, 14.2, 14.5, 15.0, 15.3, 15.6, 16.0, 16.1, 16.2, 17.3, 18.0]
features_dict['brand'] = ['acer', 'apple', 'asus', 'avita', 'axl', 'chuwi', 'dell', 'fujitsu', 'gigabyte', 'honor', 'hp', 'iball', 'infinix', 'jio', 'lenovo', 'lg', 'microsoft', 'msi', 'primebook', 'realme', 'samsung', 'tecno', 'ultimus', 'walker', 'wings', 'zebronics']
features_dict['ram_memory'] = ['2gb', '4gb', '8gb', '12gb', '16gb', '32gb', '36gb']
features_dict['processor_tier'] = ['celeron', 'core i3', 'core i5', 'core i7', 'core i9', 'core ultra 7', 'm1', 'm2', 'm3', 'other', 'pentium', 'ryzen 3', 'ryzen 5', 'ryzen 7', 'ryzen 9']

In [None]:
new_corpus = api.load("word2vec-google-news-300	")



In [None]:
def get_embedding(word, model):
    try:
        return model[word]
    except KeyError:
        return np.zeros(model.vector_size)
    

In [61]:
precomputed_embeddings = {}
for category, items in features_dict.items():
    precomputed_embeddings[category] = {
        item: get_embedding(str(item), new_corpus) for item in items
    }

In [62]:
def calculate_cosine_similarity(input_vector, category_embeddings):
    similarities = {}
    for label, vector in category_embeddings.items():
        if np.any(vector):
            similarity = cosine_similarity([input_vector], [vector])[0][0]
            similarities[label] = similarity
    return similarities

In [None]:
def classify_tokens(tokens, precomputed_embeddings, model):
    results = []
    
    for token in tokens:
        embedding = get_embedding(token, model)
        
        if not np.any(embedding): # if there is no embedding returned, then we return 0
            results.append((token, "unknown", 0))
            continue
        
        # Calculate similarity for each category
        token_scores = {}
        for category, embeddings in precomputed_embeddings.items():
            similarities = calculate_cosine_similarity(embedding, embeddings)
            if similarities:
                best_match = max(similarities, key=similarities.get)
                token_scores[category] = (best_match, similarities[best_match])
        
        # Find the highest similarity category
        if token_scores:
            final_category = max(token_scores, key=lambda x: token_scores[x][1])
            best_label, best_score = token_scores[final_category]

            if best_score > 0.7: ## our score is 0.5 for it to be classified as a label
                results.append((token, final_category, best_score))
        else:
            results.append((token, "unknown", 0))
    
    return results


In [64]:
def preprocess_sentences(input):
    tokens = input.split(" ")
    stop_words = set(stopwords.words('english'))

    return [token for token in tokens if token not in stop_words]


In [70]:
import nltk
from nltk.corpus import stopwords

input = "i want a core i9 laptop from dell brand please. my budget is 1500. it should be 8gb"
tokens = preprocess_sentences(input)

result = classify_tokens(tokens, precomputed_embeddings,new_corpus)

for token, category, score in result: 
    print(f"token is {token}, it's score is {score}, and the category is {category}")

token is dell, it's score is 1.0000001192092896, and the category is brand
token is please., it's score is 0, and the category is unknown
token is 1500., it's score is 0, and the category is unknown
token is 8gb, it's score is 1.0, and the category is ram_memory


In [None]:
from gensim.models import FastText

domain_sentences = ['celeron', 'core i3', 'core i5', 'core i7', 'core i9', 'core ultra 7', 'm1', 'm2', 'm3', 'other', 'pentium', 'ryzen 3', 'ryzen 5', 'ryzen 7', 'ryzen 9']
# Assuming you have a FastText model trained or loaded
fasttext_model = FastText(sentences=domain_sentences, vector_size=300)

embedding = fasttext_model.wv['core i7']  # FastText handles phrases better 
