<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/NLP-Projects/blob/main/CosineSimilarity-Glove-Embedding/SimilaritySearch_Glove_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import

In [1]:
import numpy as np
import requests
import zipfile
import io
import os

# Download GloVe

In [7]:
def download_glove(filename="glove.6B.50d.txt",
                   url="http://nlp.stanford.edu/data/glove.6B.zip",
                   timeout=60,
                   chunk_size=1024 * 1024):
    """
    Helper function to download the GloVe dataset if it doesn't exist locally.
    Note: This file is ~822MB. It might take time.

    The archive is streamed to a temporary file on disk in fixed-size chunks
    rather than being held in memory as a single bytes object, and the HTTP
    status is checked before the payload is handed to ``zipfile``.

    Args:
        filename (str): File whose presence in the current working directory
            means the download is skipped.
        url (str): Address of the zip archive to fetch.
        timeout (float): Value passed to ``requests.get(timeout=...)`` so a
            stalled connection fails instead of hanging the notebook.
        chunk_size (int): Number of bytes requested per streamed chunk.

    Returns:
        None

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the downloaded payload is not a valid zip archive.
    """
    # Local import: tempfile is not part of the notebook's import cell.
    import tempfile

    if os.path.exists(filename):
        print("GloVe file already exists.")
        return

    print("Downloading GloVe...")
    # The temporary file is removed automatically when the `with` block exits,
    # including when the download or the extraction fails part-way through.
    with tempfile.TemporaryFile() as archive:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail here on 4xx/5xx instead of letting an error page reach zipfile.
            response.raise_for_status()
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:  # skip empty keep-alive chunks
                    archive.write(chunk)

        # Rewind so ZipFile reads the archive from its beginning.
        archive.seek(0)
        with zipfile.ZipFile(archive) as z:
            z.extractall()
    print("Download and extraction complete.")

# Fetch the GloVe files; the helper skips the download when glove.6B.50d.txt already exists.
download_glove()

Downloading GloVe...
Download and extraction complete.


# Build Embedding Dictionary

In [12]:
def load_glove_manual(file_path):
    """
    Parses the raw GloVe text file line by line and converts it into a Python dictionary.

    Each non-blank line is split on whitespace: the first token is the word and
    the remaining tokens are converted to a float32 vector. Blank or
    whitespace-only lines are skipped. If a word occurs more than once, the
    last occurrence wins.

    Args:
        file_path (str): Path to the .txt file (e.g., 'glove.6B.50d.txt')

    Returns:
        dict: A dictionary where keys are words (str) and values are vectors (numpy array).
            If the file does not exist, a small 4-dimensional dummy dictionary
            ('cat', 'dog', 'car') is returned instead so the demo can still run.
    """
    embeddings_dictionary = {}

    print(f"Reading file: {file_path}...")

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                # A blank line (e.g. a trailing newline at the end of the file)
                # has no word token; indexing it would raise IndexError.
                if not values:
                    continue

                word, *coefficients = values
                embeddings_dictionary[word] = np.asarray(coefficients, dtype='float32')

        print(f"Loading complete. Loaded {len(embeddings_dictionary)} words.")
        return embeddings_dictionary

    except FileNotFoundError:
        # Deliberate best-effort fallback: keep the notebook runnable without the dataset.
        print("File not found. Creating a dummy dictionary for demonstration purposes.")
        return {
            'cat': np.array([0.9, 0.1, 0.5, 0.2], dtype='float32'),
            'dog': np.array([0.8, 0.2, 0.4, 0.3], dtype='float32'),
            'car': np.array([0.1, 0.9, 0.0, 0.1], dtype='float32')
        }

# Cosine Similarity

In [13]:
def manual_cosine_similarity(vec_a, vec_b):
    """
    Cosine similarity of two vectors, computed by hand.

    Formula: (A . B) / (||A|| * ||B||)

    Returns 0.0 when either vector has zero length, since the formula
    would otherwise divide by zero.
    """
    inner = np.dot(vec_a, vec_b)
    magnitude_a, magnitude_b = np.linalg.norm(vec_a), np.linalg.norm(vec_b)

    # Only divide when both magnitudes are non-zero
    if magnitude_a != 0 and magnitude_b != 0:
        return inner / (magnitude_a * magnitude_b)
    return 0.0


# Execution

In [14]:
# Location of the GloVe vectors on disk
glove_path = 'glove.6B.50d.txt'

# Parse the file into a word -> vector lookup table
embeddings = load_glove_manual(glove_path)

# The three words being compared
word1, word2, word3 = 'cat', 'dog', 'car'

# Every word needs a vector before any similarity can be computed
if not all(word in embeddings for word in (word1, word2, word3)):
    print("One or more words were not found in the dictionary.")
else:
    # Look up the three vectors
    vec1, vec2, vec3 = (embeddings[word] for word in (word1, word2, word3))

    # Cosine similarity of the first word against each of the other two
    sim_cat_dog = manual_cosine_similarity(vec1, vec2)
    sim_cat_car = manual_cosine_similarity(vec1, vec3)

    print("\nResults:")
    print(f">> Vector for '{word1}' (First 5 dims): {vec1[:5]}")
    print(f">> Similarity between '{word1}' and '{word2}': {sim_cat_dog:.4f}")
    print(f">> Similarity between '{word1}' and '{word3}': {sim_cat_car:.4f}")

    # Sanity check: the two animals should score higher than animal vs. vehicle
    print(
        "\nLogic Check Passed: 'Cat' is mathematically closer to 'Dog' than to 'Car'."
        if sim_cat_dog > sim_cat_car
        else "\nLogic Check Failed."
    )

Reading file: glove.6B.50d.txt...
Loading complete. Loaded 400000 words.

Results:
>> Vector for 'cat' (First 5 dims): [ 0.45281  -0.50108  -0.53714  -0.015697  0.22191 ]
>> Similarity between 'cat' and 'dog': 0.9218
>> Similarity between 'cat' and 'car': 0.3638

Logic Check Passed: 'Cat' is mathematically closer to 'Dog' than to 'Car'.
