In [1]:
from transformers import AutoTokenizer
import csv
import numpy as np
from scipy.spatial.distance import cosine
from dotenv import load_dotenv
import os
# Load variables from a .env file (python-dotenv) so os.getenv can see them.
load_dotenv()
# Hugging Face token read from the environment; None when the variable is unset.
# NOTE(review): hf_token is not passed to from_pretrained below and is not
# referenced by any of the visible cells — confirm whether it is still needed.
hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")

model_name = 'FacebookAI/roberta-base'
# Initialize the tokenizer (module-level; tokenize_words relies on this global)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load vocabulary from glove.6B.300d-vocabulary.txt
def load_vocabulary(vocab_file):
    """Read a vocabulary file that lists one word per line.

    Args:
        vocab_file: Path to a UTF-8 encoded text file, one word per line.

    Returns:
        list[str]: The lines in file order, each stripped of leading and
        trailing whitespace (including the newline). Blank lines are kept as
        empty strings, so list positions still correspond to line numbers.
    """
    # Decode explicitly as UTF-8: relying on the platform default encoding
    # (e.g. cp1252 on Windows) fails or mis-decodes non-ASCII words.
    with open(vocab_file, 'r', encoding='utf-8') as f:
        # Iterate the handle directly; readlines() would build a second,
        # throw-away list of the whole file first.
        return [line.strip() for line in f]

# Tokenize vocabulary words
def tokenize_words(words):
    """Split every word into tokens using the module-level ``tokenizer``.

    Args:
        words: Iterable of word strings.

    Returns:
        dict: Maps each word to the result of ``tokenizer.tokenize(word)``.
        If a word occurs more than once, the result of its last occurrence
        is the one stored.
    """
    return {entry: tokenizer.tokenize(entry) for entry in words}


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Testing with some example words
def test_examples(word_embeddings):
    """Print the nearest neighbours of a fixed set of sample words.

    Args:
        word_embeddings: Mapping from word to embedding vector; it is passed
            through unchanged to ``most_similar``.

    A ``ValueError`` raised while handling one word is printed and the loop
    moves on to the next word.
    """
    for query in ("cactus", "cake", "angry", "quickly", "between", "the"):
        try:
            neighbours = most_similar(query, word_embeddings)
            print(f"Most similar words to '{query}': {neighbours}")
        except ValueError as err:
            print(err)

In [3]:
# Load embeddings from result.csv
def load_embeddings(embedding_file):
    """Load token embeddings from a CSV file.

    The first row is treated as a header and skipped. In every following row,
    column 0 is the token and column 1 is the vector written as
    comma-separated numbers, optionally wrapped in square brackets
    (for example ``"[0.1, 0.2, 0.3]"``). Any further columns are ignored.

    Args:
        embedding_file: Path to the UTF-8 encoded CSV file.

    Returns:
        dict: Maps each token string to a 1-D NumPy array parsed from
        column 1. If a token appears in several rows, the last row wins.
        An empty file yields an empty dict.
    """
    embeddings = {}
    # newline='' is what the csv module asks for so that it can handle line
    # endings (and newlines inside quoted fields) itself; the explicit
    # encoding keeps non-ASCII tokens from depending on the platform default.
    with open(embedding_file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        # Skip header; the default stops an empty file raising StopIteration.
        next(reader, None)
        for row in reader:
            # csv.reader yields [] for blank lines, and a row without a
            # second column has no vector to parse — skip both.
            if len(row) < 2:
                continue
            token = row[0]
            vector_text = row[1].strip().strip("[]")
            embeddings[token] = np.fromstring(vector_text, sep=',')
    return embeddings


In [4]:
# Compute word embeddings by averaging token embeddings
def compute_word_embeddings(tokenized_words, token_embeddings, require_all_tokens=False):
    """Build one vector per word by averaging the vectors of its tokens.

    Args:
        tokenized_words: Mapping from word to its list of token strings.
        token_embeddings: Mapping from token string to embedding vector.
        require_all_tokens: Controls what happens when some of a word's
            tokens have no entry in ``token_embeddings``.
            False (default, the original behaviour): the missing tokens are
            ignored and the word is averaged over the remaining ones. Note
            the consequence: a word tokenized as ``["the", "orem"]`` where
            only ``"the"`` has a vector receives exactly the vector of
            ``"the"``.
            True: such partially covered words are left out of the result.

    Returns:
        dict: Maps word to the element-wise mean (``np.mean(..., axis=0)``)
        of its token vectors. Words for which no token vector is available
        are omitted.
    """
    word_embeddings = {}
    for word, tokens in tokenized_words.items():
        token_vectors = [token_embeddings[token] for token in tokens if token in token_embeddings]
        if not token_vectors:
            # Nothing to average (no tokens, or none of them is known).
            continue
        if require_all_tokens and len(token_vectors) != len(tokens):
            # At least one token was dropped above; in strict mode the
            # partial average is not a faithful vector for the word.
            continue
        word_embeddings[word] = np.mean(token_vectors, axis=0)
    return word_embeddings


In [5]:
# Find most similar words based on cosine similarity
def most_similar(word, word_embeddings, top_n=5):
    if word not in word_embeddings:
        return []
    
    word_vector = word_embeddings[word]
    similarities = {}
    for other_word, other_vector in word_embeddings.items():
        if other_word != word:
            similarities[other_word] = 1 - cosine(word_vector, other_vector)
    # Sort by similarity score
    return sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:top_n]


In [6]:
# Load vocabulary and embeddings
vocab_file = 'glove.6B.300d-vocabulary.txt'
embedding_file = 'result.csv'

# Pipeline: read the words, split each into tokens, read the per-token
# vectors, then average the token vectors into one vector per word.
words = load_vocabulary(vocab_file)
tokenized_words = tokenize_words(words)
token_embeddings = load_embeddings(embedding_file)
word_embeddings = compute_word_embeddings(tokenized_words, token_embeddings)

# Example usage
examples = ["cactus", "cake", "angry","quickly","between","the"]  # Sample query words; edit to try others
for word in examples:
    print(f"Most similar words to {word}:")
    # most_similar returns [] for a word without an embedding, in which case
    # only the heading above is printed.
    similar_words = most_similar(word, word_embeddings)
    for similar, score in similar_words:
        print(f"{similar}: {score:.4f}")


Most similar words to cactus:
creticus: 0.9831
cantus: 0.9829
cercocarpus: 0.9827
carcasses: 0.9825
crescentic: 0.9823
Most similar words to cake:
fruitcake: 1.0000
cakebread: 0.9798
cakewalk: 0.9794
mooncake: 0.9783
cakey: 0.9757
Most similar words to angry:
ryang: 1.0000
ryanggang: 0.9932
mlanghenry: 0.9872
yungang: 0.9867
yanchang: 0.9863
Most similar words to quickly:
quickies: 0.9838
quick-fire: 0.9818
quickness: 0.9811
quickie: 0.9810
quickplay: 0.9807
Most similar words to between:
inbetween: 0.9815
betweenness: 0.9790
inbetweeners: 0.9704
in-between: 0.9681
go-between: 0.9675
Most similar words to the:
theorem: 1.0000
theocrats: 1.0000
andthe: 0.9822
thet: 0.9822
bythe: 0.9821
