# 0. Importing dependencies

### Libraries

In [1]:
from utils import Skipgram, SkipgramNeg, Glove

In [2]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [3]:
import torch
import torch.nn.functional as F
import pickle
import numpy as np
from scipy.stats import spearmanr

### Trained data

In [4]:
# Load the pickled training artefacts (corpus, vocabulary and model hyper-parameters).
# NOTE: pickle can execute arbitrary code while loading; only open a Data.pkl you trust.
# The context manager closes the file handle as soon as the object is loaded.
with open('./app/models/Data.pkl', 'rb') as data_file:
    Data = pickle.load(data_file)
corpus = Data['corpus']
vocab = Data['vocab']
word2index = Data['word2index']
voc_size = Data['voc_size']
embed_size = Data['embedding_size']

### Models

In [5]:
# Instantiate the model and load saved parameters.
# map_location='cpu' lets a checkpoint that was saved on a GPU load on a CPU-only machine;
# the evaluation cells convert embeddings with .detach().numpy(), which needs CPU tensors.
skipgram = Skipgram(voc_size, embed_size)
skipgram.load_state_dict(torch.load('app/models/Skipgram-v1.pt', map_location='cpu'))
skipgram.eval()

Skipgram(
  (embedding_v): Embedding(9775, 50)
  (embedding_u): Embedding(9775, 50)
)

In [6]:
# Instantiate the model and load saved parameters.
# map_location='cpu' lets a checkpoint that was saved on a GPU load on a CPU-only machine;
# the evaluation cells convert embeddings with .detach().numpy(), which needs CPU tensors.
skipgramNeg = SkipgramNeg(voc_size, embed_size)
skipgramNeg.load_state_dict(torch.load('app/models/SkipgramNeg-v1.pt', map_location='cpu'))
skipgramNeg.eval()

SkipgramNeg(
  (embedding_center): Embedding(9775, 50)
  (embedding_outside): Embedding(9775, 50)
  (logsigmoid): LogSigmoid()
)

In [7]:
# Instantiate the model and load saved parameters.
# map_location='cpu' lets a checkpoint that was saved on a GPU load on a CPU-only machine;
# the evaluation cells convert embeddings with .detach().numpy(), which needs CPU tensors.
glove = Glove(voc_size, embed_size)
glove.load_state_dict(torch.load('app/models/GloVe-v1.pt', map_location='cpu'))
glove.eval()

Glove(
  (center_embedding): Embedding(9775, 50)
  (outside_embedding): Embedding(9775, 50)
  (center_bias): Embedding(9775, 1)
  (outside_bias): Embedding(9775, 1)
)

In [8]:
# datapath() resolves the file name inside gensim's bundled test-data directory, so
# glove.6B.100d.txt has to be copied there first; if it is missing, the error raised by
# the load below shows the full path that was expected.
glove_file = datapath('glove.6B.100d.txt')  # pre-trained GloVe vectors, downloaded separately
# no_header=True: the raw GloVe text file has no leading "<count> <dim>" line.
# NOTE: this binds the name `gensim` to the loaded KeyedVectors (not to the gensim package);
# the analogy and similarity cells below rely on that name.
gensim = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True, limit=None)

### Similarity Function

In [9]:
# Function for semantic and syntactic analogy analysis
def similarities(lines, model):
    """Print the analogy accuracy of ``model`` on ``lines``.

    Each line is expected to hold four whitespace-separated words ``a b c d``.
    The vector ``b - a + c`` is compared, by cosine similarity, with the
    embedding of every word in the module-level ``vocab``; the line counts as
    correct when the most similar vocabulary word is ``d``.

    Words are looked up as written first and lower-cased second. Query words
    found neither way use the ``'<UNK>'`` embedding; a target word found
    neither way can never be matched. Lines with fewer than four words (for
    example blank lines) are skipped and do not count towards the accuracy.

    Parameters
    ----------
    lines : iterable of str
        Analogy questions, one per line.
    model : object
        Must provide ``get_vector(word)`` returning a tensor that holds the
        word's embedding; the tensor is flattened to 1-D, so both ``(d,)``
        and ``(1, d)`` results are accepted.

    Returns
    -------
    None
        The accuracy is printed, not returned.
    """
    known_words = set(vocab)  # O(1) membership tests instead of scanning the vocabulary

    def to_vocab_token(word):
        # Prefer the exact spelling, then the lower-cased one; None if neither is known.
        if word in known_words:
            return word
        lowered = word.lower()
        return lowered if lowered in known_words else None

    def embed(word):
        # Always query the model under evaluation and flatten to a 1-D vector so the
        # stacked matrix below is (len(vocab), dim) whatever shape get_vector returns.
        return model.get_vector(word).detach().reshape(-1)

    # Prepare the vectors of all the words, row i belonging to vocab[i]
    all_word_vectors = torch.stack([embed(word) for word in vocab])

    correct = 0
    evaluated = 0
    # Perform vector manipulation for each set of four words
    for line in lines:
        words = line.split()
        if len(words) < 4:
            continue  # blank or malformed line
        evaluated += 1

        first, second, third = (embed(to_vocab_token(word) or '<UNK>') for word in words[:3])

        # b - a + c, with a batch dimension so it broadcasts against every vocabulary row
        result_vector = (second - first + third).unsqueeze(0)

        # One cosine similarity per vocabulary word (reduce over the embedding axis)
        scores = F.cosine_similarity(result_vector, all_word_vectors, dim=1)

        # Get the closest word from vocabulary
        closest_word = vocab[torch.argmax(scores).item()]

        target = to_vocab_token(words[3])
        if target is not None and closest_word == target:
            correct += 1

    accuracy = (correct / evaluated) * 100 if evaluated else 0.0

    print('---------------------------------------------------------')
    print(f'Accuracy : {accuracy: .2f}%')
    return


# 1. Semantic and Syntactic Analysis

## Loading test text files

In [10]:
# Read the semantic (capital-country) analogy file into a list of raw lines (one string per line)
with open('./Test data/semantic(capital country).txt', 'r') as file:
    sem_lines = file.readlines()

In [11]:
# Read the syntactic (past-tense) analogy file into a list of raw lines (one string per line)
with open('./Test data/syntatic(past tense).txt', 'r') as file:
    syn_lines = file.readlines()

## Skipgram model

### Semantic

In [12]:
# Print the Skipgram model's accuracy on the capital-country analogy lines
similarities(sem_lines, skipgram)

---------------------------------------------------------
Accuracy :  0.00%


### Syntactic

In [13]:
# Print the Skipgram model's accuracy on the past-tense analogy lines
similarities(syn_lines, skipgram)

---------------------------------------------------------
Accuracy :  0.00%


## Skipgram Negative Sampling model

### Semantic

In [14]:
# Run the capital-country analogy evaluation with the negative-sampling Skipgram model
similarities(sem_lines, skipgramNeg)

---------------------------------------------------------
Accuracy :  0.00%


### Syntactic

In [15]:
# Run the past-tense analogy evaluation with the negative-sampling Skipgram model
similarities(syn_lines, skipgramNeg)

---------------------------------------------------------
Accuracy :  0.00%


## GloVe Model

### Semantic

In [16]:
# Run the capital-country analogy evaluation with the locally trained GloVe model
similarities(sem_lines, glove)

---------------------------------------------------------
Accuracy :  0.00%


### Syntactic

In [17]:
# Run the past-tense analogy evaluation with the locally trained GloVe model
similarities(syn_lines, glove)

---------------------------------------------------------
Accuracy :  0.00%


## GloVe(Gensim) Model

In [18]:
def analogy(lines):
    """Print the analogy accuracy of the pre-trained vectors bound to ``gensim``.

    Each line is expected to hold four whitespace-separated words ``a b c d``.
    All words are lower-cased. The top result of
    ``most_similar(positive=[c, b], negative=[a])`` is compared with ``d``.

    Query words missing from the vectors are replaced by ``'unknown'``. The
    target word is not replaced, so an out-of-vocabulary target can never be
    counted as correct. Lines with fewer than four words (for example blank
    lines) are skipped and do not count towards the accuracy.

    Parameters
    ----------
    lines : iterable of str
        Analogy questions, one per line.

    Returns
    -------
    None
        The accuracy is printed, not returned.
    """
    correct = 0
    evaluated = 0
    # Perform vector manipulation for each set of four words
    for line in lines:
        words = [word.lower() for word in line.split()]  # Convert all words to lower case
        if len(words) < 4:
            continue  # blank or malformed line
        evaluated += 1

        # Fall back to 'unknown' for query words the vectors do not contain
        first, second, third = (word if word in gensim else 'unknown' for word in words[:3])

        # Only the best match is used, so ask for a single neighbour
        result = gensim.most_similar(positive=[third, second], negative=[first], topn=1)

        # Get the closest word from most similar output
        closest_word = result[0][0]
        if closest_word == words[3]:
            correct += 1

    accuracy = (correct / evaluated) * 100 if evaluated else 0.0

    print('---------------------------------------------------------')
    print(f'Accuracy : {accuracy: .2f}%')
    return


### Semantic

In [19]:
# Print the pre-trained GloVe (gensim) accuracy on the capital-country analogy lines
analogy(sem_lines)

---------------------------------------------------------
Accuracy :  93.87%


### Syntactic

In [20]:
# Print the pre-trained GloVe (gensim) accuracy on the past-tense analogy lines
analogy(syn_lines)

---------------------------------------------------------
Accuracy :  55.45%


# 2. Similarity Analysis

In [21]:
# Read the word-similarity gold-standard file into a list of raw lines (one string per line);
# the similarity functions below parse each line as two words followed by a numeric score
with open('./Test data/wordsim_similarity_goldstandard.txt', 'r') as file:
    sim_lines = file.readlines()

## Defining functions

In [22]:
def cosine_similarity(A, B):
    """Return the cosine similarity between two vectors.

    Both inputs are flattened first, so row vectors such as shape ``(1, d)``
    are accepted alongside plain 1-D vectors.

    Parameters
    ----------
    A, B : array_like
        Vectors with the same number of elements.

    Returns
    -------
    float
        ``dot(A, B) / (||A|| * ||B||)``, or ``0.0`` when either vector has zero
        norm (the ratio is undefined there and would otherwise evaluate to NaN).
    """
    a = np.asarray(A).flatten()
    b = np.asarray(B).flatten()
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [23]:
def _embedding_or_unk(model, word):
    """Return ``word``'s embedding as a NumPy array, falling back to ``'<UNK>'``."""
    # Try the word as written, then lower-cased (dict.fromkeys drops the duplicate
    # attempt when the word is already lower case).
    for candidate in dict.fromkeys((word, word.lower())):
        try:
            return model.get_vector(candidate).detach().numpy()
        except Exception:
            # Deliberate best-effort lookup; unlike a bare ``except`` this still lets
            # KeyboardInterrupt / SystemExit propagate.
            continue
    return model.get_vector('<UNK>').detach().numpy()


def similar(lines, model):
    """Correlate a model's word-pair cosine similarities with the scores in ``lines``.

    Each line is expected to hold ``word1 word2 score``. Words the model cannot
    embed (as written or lower-cased) use the ``'<UNK>'`` embedding. Lines with
    fewer than three fields (for example blank lines) are skipped.

    Parameters
    ----------
    lines : iterable of str
        Word pairs with their reference similarity score.
    model : object
        Must provide ``get_vector(word)`` returning a tensor.

    Returns
    -------
    The result of ``scipy.stats.spearmanr`` between the reference scores and
    the model's cosine similarities; element ``[0]`` is the correlation.
    """
    scores_real = []  # Store scores in test file
    scores_pred = []  # Store cosine similarities of models

    # Loop through test file
    for line in lines:
        words = line.split()
        if len(words) < 3:
            continue  # blank or malformed line
        first, second = (_embedding_or_unk(model, word) for word in words[:2])
        scores_real.append(float(words[2]))  # Third field should be the score
        scores_pred.append(cosine_similarity(first, second))

    return spearmanr(scores_real, scores_pred)  # Spearman rank-value correlation


## Skipgram model

In [24]:
# similar() returns the spearmanr result; [0] picks out the correlation value
print(f'Skipgram correlation score: {similar(sim_lines,skipgram)[0]}')

Skipgram correlation score: 0.008293643289343964


## Skipgram (Negative sampling) model

In [25]:
# similar() returns the spearmanr result; [0] picks out the correlation value
print(f'Skipgram (Neg) correlation score: {similar(sim_lines,skipgramNeg)[0]}')

Skipgram (Neg) correlation score: 0.08838395787478513


## GloVe model

In [26]:
# similar() returns the spearmanr result; [0] picks out the correlation value.
# The label names the model actually being scored here (GloVe).
print(f'GloVe correlation score: {similar(sim_lines,glove)[0]}')

Skipgram correlation score: -0.0013843071649341963


## GloVe (Gensim) model

In [27]:
def _gensim_vector_or_unknown(model, word):
    """Return ``word``'s vector from ``model``, falling back to the ``'unknown'`` vector."""
    # Try the word as written, then lower-cased (dict.fromkeys drops the duplicate
    # attempt when the word is already lower case).
    for candidate in dict.fromkeys((word, word.lower())):
        try:
            return model.get_vector(candidate)
        except KeyError:  # raised for keys the vectors do not contain
            continue
    return model.get_vector('unknown')


# Similar function but for gensim (because gensim vectors do not need to be detached)
def similar_gensim(lines, model):
    """Correlate gensim word-pair cosine similarities with the scores in ``lines``.

    Each line is expected to hold ``word1 word2 score``. Words missing from
    ``model`` (as written or lower-cased) use the vector of ``'unknown'``.
    Lines with fewer than three fields (for example blank lines) are skipped.

    Parameters
    ----------
    lines : iterable of str
        Word pairs with their reference similarity score.
    model : object
        Must provide ``get_vector(word)`` returning an array-like vector and
        raising ``KeyError`` for missing words.

    Returns
    -------
    The result of ``scipy.stats.spearmanr`` between the reference scores and
    the model's cosine similarities; element ``[0]`` is the correlation.
    """
    scores_real = []
    scores_pred = []
    for line in lines:
        words = line.split()
        if len(words) < 3:
            continue  # blank or malformed line
        first, second = (_gensim_vector_or_unknown(model, word) for word in words[:2])
        scores_real.append(float(words[2]))
        scores_pred.append(cosine_similarity(np.asarray(first), np.asarray(second)))

    return spearmanr(scores_real, scores_pred)

In [28]:
# similar_gensim() returns the spearmanr result; [0] picks out the correlation value.
# The label names the model actually being scored here (pre-trained GloVe via gensim).
print(f'GloVe (Gensim) correlation score: {similar_gensim(sim_lines,gensim)[0]}')

Skipgram correlation score: 0.5962863369934295


## Human model

In [29]:
# Human baseline: same correlation, but the predicted scores are typed in by the user
def similar_human(lines):
    """Correlate interactively entered ratings with the scores in ``lines``.

    For every line (``word1 word2 score``) the user is prompted until a numeric
    rating between 0 and 10 (inclusive) is entered. Returns the ``spearmanr``
    result between the file's scores and the entered ratings.
    """
    gold_scores = []
    human_scores = []

    for entry in lines:
        tokens = entry.split()
        gold_scores.append(float(tokens[2]))

        rating = None
        while rating is None:
            try:
                # Ask the user for input
                candidate = float(input(f"How would you rate the relation between '{tokens[0]}' and '{tokens[1]}' on a scale of 0 to 10: "))
            except ValueError:
                print("Invalid input. Please enter a numeric value.")
                continue

            # Accept only ratings inside the 0-10 range; otherwise ask again
            if 0 <= candidate <= 10:
                rating = candidate
            else:
                print("Invalid input. Please enter a score between 0 and 10.")

        human_scores.append(rating)

    return spearmanr(gold_scores, human_scores)

In [30]:
# Interactive: prompts once per line of sim_lines; [0] picks out the correlation value
print(f'Human correlation score: {similar_human(sim_lines)[0]}')

Invalid input. Please enter a numeric value.
Invalid input. Please enter a score between 0 and 10.
Human correlation score: 0.6637109000604999


# Test scores are tabulated in the README file.