### BERT 

In [1]:
from transformers import BertTokenizer, BertModel
import torch
from scipy.spatial.distance import cosine


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained("bert-base-uncased")


def get_word_embedding(word):
    inputs = tokenizer(word, return_tensors='pt')
    with torch.no_grad():
        outputs = bert(**inputs)
    word_embedding = outputs.last_hidden_state[0][1].numpy()
    return word_embedding

rims_embedding = get_word_embedding("Panamera")

words_to_compare = ["turismo", "Macan", "Car", "Models", "Porsche","Boxster","BMW_X6"]

def cosine_similarity(emb1,emb2):
    return 1 - cosine(emb1,emb2)

similarities = {}
for word in words_to_compare:
    word_embedding = get_word_embedding(word)
    similarity = cosine_similarity(rims_embedding, word_embedding)
    similarities[word] = similarity

sorted_similar_words = sorted(similarities.items(), key=lambda x: x[1], reverse=True)


print("Most similar word to 'Panamera':", sorted_similar_words[0][0])
print("All similarities:", sorted_similar_words)



Most similar word to 'Panamera': BMW_X6
All similarities: [('BMW_X6', 0.4768236997788239), ('Macan', 0.4240032517990302), ('Car', 0.37994390226362995), ('Boxster', 0.3322195664028227), ('Porsche', 0.29447236004579835), ('Models', 0.2554825586111331), ('turismo', 0.11738918452304026)]


### Word2Vec

In [7]:
from gensim.models import KeyedVectors

model_path = './GoogleNews-vectors-negative300.bin.gz'
word2vec = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [3]:
similar_words = word2vec.most_similar('Panamera', topn=5)
print("Words similar to 'Panamera':", similar_words)

Words similar to 'Panamera': [('Boxster', 0.7418075203895569), ('Porsche_Panamera', 0.7396426200866699), ('coupé', 0.7206888198852539), ('Passat_CC', 0.7171149849891663), ('BMW_X6', 0.7134248614311218)]


In [4]:
similar_words = word2vec.most_similar('turismo', topn=5)
print("Words similar to 'turismo':", similar_words)

Words similar to 'turismo': [('nuevos', 0.6451931595802307), ('la_próxima', 0.6395909190177917), ('ciudad', 0.6383398175239563), ('speciale', 0.6336588859558105), ('del_mundo', 0.6315416097640991)]


In [5]:
similar_words = word2vec.most_similar('Macan', topn=5)
print("Words similar to 'turismo':", similar_words)

Words similar to 'turismo': [('ER_Ejercito', 0.5041291117668152), ('Daligdig', 0.4992683529853821), ('Dewa_Made', 0.49745988845825195), ('Juan_Babauta', 0.4964611232280731), ('Rahman_Nava', 0.4960399866104126)]


In [10]:
similar_words = word2vec.most_similar('Drivetrain', topn=5)
print("Words similar to 'Drivetrain':", similar_words)

Words similar to 'Drivetrain': [('Driveline', 0.6034685373306274), ('Automatic_Transmission', 0.5890157222747803), ('powertrain', 0.5830097198486328), ('drivetrain', 0.575386106967926), ('Tecstar', 0.5679606795310974)]


In [9]:
similar_words = word2vec.most_similar('ECU', topn=5)
print("Words similar to 'ECU':", similar_words)

Words similar to 'turismo': [('UAB', 0.6781703233718872), ('FSU', 0.6764650344848633), ('Clemson', 0.6736557483673096), ('Southern_Miss', 0.671258807182312), ('UCF', 0.6631731390953064)]


In [12]:
similar_words = word2vec.most_similar('Homologation', topn=5)
print("Words similar to 'Homologation':", similar_words)

Words similar to 'Homologation': [('FIA_homologation', 0.6235979199409485), ('Technical_Specifications', 0.5964402556419373), ('India_ARAI', 0.5924283862113953), ('homologation', 0.5902777910232544), ('Bluetooth_Qualification', 0.5897161364555359)]


In [14]:
similar_words = word2vec.most_similar('drag', topn=5)
print("Words similar to 'drag':", similar_words)

Words similar to 'drag': [('drags', 0.6448634266853333), ('dragging', 0.6019067168235779), ('dragged', 0.5838485956192017), ('Dragging', 0.541349470615387), ('Drag', 0.5289705991744995)]
