# Analogy Prediction

In [237]:
import os
import gensim
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
from gensim.scripts.glove2word2vec import glove2word2vec

## Pre-training Word2Vector and Glove word embeddings from gensim with Google News files
 Note: For compression file issues, files such as 'GoogleNews-vectors-negative300.bin' and 'glove.840B.300d.txt' haven't been included in this zip. Kindly download them in this folder for running the jupyter code
-  https://nlp.stanford.edu/projects/glove/

In [238]:
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True,  limit=500000)
glove2word2vec('glove.840B.300d.txt', 'glove.840B.300d.txt.word2vec')
glove_model = gensim.models.KeyedVectors.load_word2vec_format('./glove.840B.300d.txt.word2vec', binary=False, limit=500000)


## Using Mikolov's analogy test file
-  #### Store starting position of all categories

In [239]:
test_file = open('word-test.v1.txt', 'r')

categories=[]
analogy_loc ={}

while(True):
    line = test_file.readline()
    
    if not line:
        break;
        
    elif(line.startswith(':')):
        category = line[2:].strip()
        position = test_file.tell()
        analogy_loc[category]= position
        categories.append(category)


## Question 1
### Groups to execute analogy prediction on:
-  capital-world
-  currency
-  city-in-state
-  family
-  gram1-adjective-to-adverb
-  gram2-opposite
-  gram3-comparative
-  gram6-nationality-adjective

In [240]:
test_groups=['capital-world', 'currency', 'city-in-state', 'family', 'gram1-adjective-to-adverb', 'gram2-opposite', 'gram3-comparative', 'gram6-nationality-adjective']

## Word2Vector Embedding Model
-  Feeding first word as negative and next two words as positive to recieve top negative as predicted outcome
-  If the predicted word is same as actual word, increment the score of embedding model
-  Accuracy calculation per group: correct predictions * 100 / total words
-  Average of accuracies from all groups: (overall accuracy / 8)

## Glove Embedding Model
-  Feeding first word as negative and next two words as positive to recieve top negative as predicted outcome
-  If the predicted word is same as actual word, increment the score of embedding model
-  Accuracy calculation per group: correct predictions * 100 / total words
-  Average of accuracies from all groups: (overall accuracy / 8)

In [248]:
overall_w2v_accuracy = 0
overall_glove_accuracy = 0

for group in test_groups:
    total_words = 0
    w2v_score = 0
    glove_score = 0 
    accuracy = 0
    test_file.seek(analogy_loc[group])
    
    for line in test_file:
        if (line.startswith(':')):
            break
        total_words += 1
        line = line.split()
        line = [w.strip() for w in line]
        
        try:
            w2v_prediction = w2v_model.most_similar(positive=[line[1], line[2]], negative=[line[0]], topn=1)
            glove_prediction = glove_model.most_similar(positive=[line[1], line[2]], negative=[line[0]], topn=1)

            if(w2v_prediction[0][0] == line[3]):
                w2v_score += 1
            if(glove_prediction[0][0] == line[3]):
                glove_score += 1
            
        except Exception as ex:
                exception=1

    if w2v_score > 0:
        accuracy = (w2v_score * 100) / total_words
        
    print('Accuracy of Word2Vec model for group : ', (group, accuracy))
    overall_w2v_accuracy+=accuracy
    accuracy = 0
    if glove_score > 0:
        accuracy = (glove_score * 100) / total_words
    print('Accuracy of Glove model for group : ', (group, accuracy))
    overall_glove_accuracy+=accuracy

Accuracy of Word2Vec model for group :  ('capital-world', 78.9787798408488)
Accuracy of Glove model for group :  ('capital-world', 90.36251105216623)
Accuracy of Word2Vec model for group :  ('currency', 29.907621247113163)
Accuracy of Glove model for group :  ('currency', 20.785219399538107)
Accuracy of Word2Vec model for group :  ('city-in-state', 71.99027158492096)
Accuracy of Glove model for group :  ('city-in-state', 70.08512363194163)
Accuracy of Word2Vec model for group :  ('family', 85.17786561264822)
Accuracy of Glove model for group :  ('family', 95.8498023715415)
Accuracy of Word2Vec model for group :  ('gram1-adjective-to-adverb', 29.233870967741936)
Accuracy of Glove model for group :  ('gram1-adjective-to-adverb', 42.84274193548387)
Accuracy of Word2Vec model for group :  ('gram2-opposite', 42.98029556650246)
Accuracy of Glove model for group :  ('gram2-opposite', 34.35960591133005)
Accuracy of Word2Vec model for group :  ('gram3-comparative', 91.14114114114115)
Accuracy o

## Average accuracy

In [250]:
print('Average accuracies:')
print('Word2Vec Model: ', overall_w2v_accuracy/8)
print('Glove Model: ', overall_glove_accuracy/8)

Average accuracies:
Word2Vec Model:  64.93326639239415
Glove Model:  66.54270907532153


### Observation
</br> Glove embedding model has comparatively better prediction rate.

## Question 2
### Comparison of antonyms cosine similarities
__Word__    __Antonym__ </br>  
decrease --- increase </br>  
leave --- stay </br>  
ascend --- descend </br>  
go --- come </br>  
above --- below </br>  

In [251]:
def cosine_similarities(wrd):
    print('\nTop 10 similar words to : ', wrd)
    similar_words = w2v_model.similar_by_word(word=wrd, topn=10)
    [print(s) for s in similar_words]

In [252]:
verbs = ['decrease', 'leave', 'ascend', 'go', 'above']
for v in verbs:
    cosine_similarities(v)


Top 10 similar words to :  decrease
('increase', 0.8370319604873657)
('decreases', 0.8093847632408142)
('decreased', 0.7642107009887695)
('reduction', 0.7175438404083252)
('increased', 0.7083162069320679)
('decreasing', 0.6931016445159912)
('decline', 0.6863038539886475)
('increases', 0.6454968452453613)
('Decreased', 0.574552059173584)
('reduced', 0.5725899934768677)

Top 10 similar words to :  leave
('leaving', 0.6598548889160156)
('stay', 0.5787086486816406)
('depart', 0.5559219121932983)
('Leaving', 0.5488995313644409)
('left', 0.5250931978225708)
('leaves', 0.5131403803825378)
('return', 0.5068632364273071)
('vacate', 0.4940752387046814)
('quit', 0.4841381311416626)
('rejoin', 0.4835888743400574)

Top 10 similar words to :  ascend
('ascended', 0.718950629234314)
('ascending', 0.7094936370849609)
('ascends', 0.6623241901397705)
('climb', 0.6534912586212158)
('ascent', 0.6063636541366577)
('descend', 0.5348040461540222)
('ascension', 0.5295450687408447)
('clamber', 0.52797752618789

__Word__    __Antonym__   __Similarity__ </br>  
-  decrease --- increase --- 83% </br>  
-  leave --- stay --- 53% </br>  
-  ascend --- descend--- 53% </br>  
-  go --- come --- 66% </br>  
-  above --- below --- 80% </br>  

__Why are word embeddings similar for antonyms?__  
</br> Based on above output, we can see that often opposite words are listed as top ten similar words based on their word embeddings. This could happen because even though these terms have opposite meanings, they hold a similar context. Words like 'decrease' and 'increase', 'leave' and 'stay' appear in the same sentence more frequently. Hence, inspite of being semantically diverse, such words are still found to be similar based on frequency and statistics of a corpora.

## Question 3
### Prediction on a Custom-made Test

### Category 1 (Animal Sounds)
-  duck quack dog bark
-  duck quack bat screech
-  duck quack dolphin click

### Category 2 (Owner Company - AI Products)
-  apple siri google google_assistant
-  apple siri amazon alexa
-  apple siri microsoft cortana

In [245]:
custom_test_file = open('custom-test.txt', 'r')

custom_categories=[]
custom_analogy_loc ={}

while(True):
    
    line = custom_test_file.readline()
    if not line:
        break;
        
    elif(line.startswith(':')):
        custom_analogy_loc[line[2:].strip()]= custom_test_file.tell()
        custom_categories.append(line[2:].strip())

-  Feeding first word as negative and next two words as positive to recieve top negative as predicted outcome
-  If the predicted word is same as actual word, increment the score of respective embedding models
-  Accuracy calculation per group: correct predictions * 100 / total words
-  Average of accuracies from all groups: (overall accuracy / 8)

In [246]:
custom_test_groups=['animal-sounds', 'company-AI_product']

for group in custom_test_groups:
    total_words = 0
    w2v_score = 0
    glove_score = 0 
    accuracy = 0
    custom_test_file.seek(custom_analogy_loc[group])
    print('\nGroup: ', group)
    for line in custom_test_file:
        if (line.startswith(':')):
            break
        total_words += 1
        line = line.split()
        line = [w.strip() for w in line]
        
        try:
            w2v_prediction = w2v_model.most_similar(positive=[line[1], line[2]], negative=[line[0]], topn=1)
            print(line[0], line[1], line[2], line[3], end =" ")
            print('Word2Vec Prediction: ', w2v_prediction[0][0], end =" ")
            glove_prediction = glove_model.most_similar(positive=[line[1], line[2]], negative=[line[0]], topn=1)
            
            print('Glove Prediction: ', glove_prediction[0][0])
            if(w2v_prediction[0][0] == line[3]):
                w2v_score += 1
            if(glove_prediction[0][0] == line[3]):
                glove_score += 1
            
        except Exception as ex:
                print(ex)

    if w2v_score > 0:
        accuracy = (w2v_score * 100) / total_words
        
    print('\nAccuracy of Word2Vec model for group : ', (group, accuracy))
    accuracy = 0
    if glove_score > 0:
        accuracy = (glove_score * 100) / total_words
    print('Accuracy of Glove model for group : ', (group, accuracy))


Group:  animal-sounds
duck quack dog bark Word2Vec Prediction:  dogs Glove Prediction:  puppy
duck quack bat screech Word2Vec Prediction:  bats Glove Prediction:  bats
duck quack dolphin click Word2Vec Prediction:  dolphins Glove Prediction:  dolphins

Accuracy of Word2Vec model for group :  ('animal-sounds', 0)
Accuracy of Glove model for group :  ('animal-sounds', 0)

Group:  company-AI_product
"word 'siri' not in vocabulary"
"word 'siri' not in vocabulary"
"word 'siri' not in vocabulary"

Accuracy of Word2Vec model for group :  ('company-AI_product', 0)
Accuracy of Glove model for group :  ('company-AI_product', 0)


### Observation on results:
-  In the first set of analogies, words are known to the pretrained word embedding models, but in a different context. Hence, we can see its failed attempts at predicting the fourt word.
-  The second category set has words completely unknown to word embedding models like 'siri'. Therefore, its unable to proceed with any predictions.