In [1]:
from gensim.test.utils import get_tmpfile, common_texts
from gensim.models import Word2Vec

In [2]:
# Show the toy corpus imported from gensim's test utilities: a list of tokenized documents.
print(common_texts)

[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]


In [3]:
# path = get_tmpfile("word2vec.model")
# model = Word2Vec(common_texts, size=300, window=5, min_count=1, workers=4)
# model.save("word2vec.model")

In [4]:
# Load the Word2Vec model stored in "word2vec.model". The cell that would create and save
# that file is commented out above, so the file must already exist in the working directory.
model = Word2Vec.load("word2vec.model")
# Run one extra training epoch on a single two-word sentence.
# NOTE(review): the (0, 2) result recorded below presumably means that none of the 2 raw
# words were effectively trained on (i.e. "hello"/"world" are not in the vocabulary) — confirm.
model.train([["hello", "world"]], total_examples=1, epochs=1)

(0, 2)

In [5]:
# Look up the embedding of a single word by indexing the model's keyed vectors.
vector = model.wv['computer']  # numpy vector of a word
# Displayed as the cell result: the dimensionality of the embedding.
vector.shape

(300,)

In [6]:
# Try to fetch the four vectors needed for the king/man/queen/woman analogy.
# Indexing `model.wv` with a word that is not in the vocabulary raises KeyError.
try:
    w11 = model.wv['king']
    w12 = model.wv['man']
    w21 = model.wv['queen']
    w22 = model.wv['woman']
except KeyError:
    # Catch only the out-of-vocabulary failure, so that unrelated problems
    # (e.g. `model` not being defined, or a KeyboardInterrupt) are not silently hidden.
    print("Some words were not found in the vocabulary...")

Some words were not found in the vocabulary...


### https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html
### https://radimrehurek.com/gensim/auto_examples/index.html

#### <font color='red'>Too slow to load each time. Is there a way to store this?</font>

In [7]:
import gensim.downloader as api
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader API.
# NOTE(review): the markdown note above says this is slow on every run — presumably the
# returned vectors could be saved locally once and reloaded from disk instead; confirm.
wv = api.load('word2vec-google-news-300')

In [8]:
# Number of entries in the pretrained vocabulary.
# NOTE(review): `wv.vocab` looks like the gensim 3.x API; newer gensim releases are
# understood to expose `wv.key_to_index` instead — verify before upgrading.
print(len(wv.vocab))
# Embedding of a single word, followed by its dimensionality.
vec_king = wv['king']
print(len(vec_king))

3000000
300


### <font color = 'red'>My Questions</font>

From Paper Page 5 (Figure 1 New model architectures): 
* The CBOW architecture predicts the **current word based on the context**, and 
* The Skip-gram **predicts surrounding words given the current word**.

My understanding:
* So for CBOW, output should be a V sized softmax?
* So for Skip-gram (let's say we pick C = 4 surrounding words, ±2), the output should be of size 4, each a V-sized softmax (this matches slide 10 from the class PPT but contradicts slide 8).


Slides from Class clarification:
* Slide 8: Output (why does it have to be 1 for each vocab? Should it not just be a V-sized softmax for the neighboring C words?)


Also 
* Slide 13, how does "Dot Product/Similarity" help? and what are we comparing this to?



### <font color ='red'> Homework related questions: </font>

1. Question 1: How do you want us to compare the vectors (they are 300x1)? Are you looking for a similarity measure like cosine similarity? --> use the one provided by the library. Gensim --> ?wv.relative_cosine_similarity
2. Question 4 and 5, are we supposed to only calculate the distance for Spacy and not for Gensim? Is Gensim same as manual and scipy (does not look like it below).

Question 3: Are the vectors the same?

### <font color ='red'> More General Questions: </font>
1. Case matters below?
2. Debiasing?
4. Concept of soft cosine similarity and relative cosine similarity?

In [42]:
# Analogy test cases, each stored as two word pairs: [(a, b), (c, d)].
# The cases are written as flat (a, b, c, d) rows and regrouped into pairs.
four_grams = [
    [(a, b), (c, d)]
    for a, b, c, d in (
        ('king', 'queen', 'man', 'woman'),
        ('king', 'man', 'queen', 'woman'),
        ('King', 'man', 'Queen', 'woman'),
        ('King', 'man', 'queen', 'woman'),
        ('man', 'woman', 'boy', 'girl'),
        ('Ottawa', 'Canada', 'Nairobi', 'Kenya'),
        ('big', 'bigger', 'tall', 'taller'),
        ('yen', 'japan', 'ruble', 'russia'),
        ('man', 'doctor', 'woman', 'nurse'),  ## Bias in language
        ('France', 'Paris', 'England', 'London'),
    )
]

In [55]:
import numpy as np  # imported here because this cell sits above the cell that imports numpy


def cosine(v1, v2):
    """Return the cosine similarity of two vectors.

    Computed as dot(v1, v2) / (||v1|| * ||v2||), using numpy's dot product
    and default vector norm.
    """
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))


# Analogy arithmetic: Paris - France + England, compared against London.
calc = wv['Paris'] - wv['France'] + wv['England']
cosine(calc, wv['London'])

0.6500488

In [50]:
import numpy as np
from numpy.linalg import norm
from scipy import spatial
from gensim.matutils import softcossim

for four_gram in four_grams:
    # Each case is [(a, b), (c, d)]; the quantity under test is a - b + d compared with c.
    (word_a, word_b), (word_c, word_d) = four_gram
    lhs = wv[word_a] - wv[word_b] + wv[word_d]
    rhs = wv[word_c]

    print("-"*50)
    print("1.0 Using inbuilt functions to find most similar word ...")
    print(f"Positive: {word_a} {word_d} | Negative: {word_b}")
    top_similar = wv.most_similar(
        positive=[word_a, word_d],
        negative=[word_b],
        topn=5,
    )
    print(f"'{word_b}:{word_a}' as '{word_d}': ?")
    # The first hit is the best match, given as a (word, similarity) pair.
    best_word, best_score = top_similar[0]
    print(f"Ans: '{best_word}' with a similarity of {best_score}")
    print(top_similar)

    print("\n2.0 Using manual vecotor addition/subtraction ...")

    # Gensim's distance function needs words as input, not vectors, so it cannot
    # be applied to lhs/rhs:
    # gensim_similarity = 1 - wv.distance(lhs, rhs)
    # print(f"Similarity using Gensim: {gensim_similarity}")

    # Cosine similarity via scipy (1 - cosine distance).
    cosine_similarity1 = 1 - spatial.distance.cosine(lhs, rhs)
    print(f"Cosine Similarity using scipy: {cosine_similarity1}")

    # The same quantity computed by hand with numpy.
    cosine_similarity2 = np.dot(lhs, rhs) / (norm(lhs) * norm(rhs))
    print(f"Cosine Similarity using numpy: {cosine_similarity2}")

    # print(softcossim(lhs,rhs, wv.similarity_matrix))

--------------------------------------------------
1.0 Using inbuilt functions to find most similar word ...
Positive: king woman | Negative: queen
'queen:king' as 'woman': ?
Ans: 'man' with a similarity of 0.72110915184021
[('man', 0.72110915184021), ('boy', 0.5595242977142334), ('teenage_girl', 0.513959527015686), ('girl', 0.49721550941467285), ('teenager', 0.4869248569011688)]

2.0 Using manual vecotor addition/subtraction ...
Cosine Similarity using scipy: 0.6913302540779114
Cosine Similarity using numpy: 0.6913303732872009
--------------------------------------------------
1.0 Using inbuilt functions to find most similar word ...
Positive: king woman | Negative: man
'man:king' as 'woman': ?
Ans: 'queen' with a similarity of 0.7118192911148071
[('queen', 0.7118192911148071), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321243286133)]

2.0 Using manual vecotor addition/subtraction ...
Cosine Similarity usin

In [51]:
# Paris - France + England via gensim's built-in analogy lookup; the arguments are
# named explicitly, as in the most_similar call inside the loop cell.
wv.most_similar(positive=['Paris', 'England'], negative=['France'])

[('London', 0.6441212892532349),
 ('stock_symbol_BNK', 0.5432794094085693),
 ('ticker_symbol_BNK', 0.5106071829795837),
 ('LSO_St_Lukes', 0.474970281124115),
 ('Leeds', 0.465692400932312),
 ('Englands', 0.4634588956832886),
 ('Islamabad_Slyvia_Hui', 0.46285343170166016),
 ('Manchester', 0.4587923288345337),
 ('Covent_Garden', 0.4549728333950043),
 ('Kensington', 0.4531983435153961)]

In [18]:
#?wv.similarity_matrix

In [19]:
#?wv.relative_cosine_similarity

In [20]:
#?softcossim

## Spacy

### https://spacy.io/usage/vectors-similarity

In [21]:
import spacy
# Load the "en_core_web_lg" spaCy pipeline; the cells below read word vectors from it.
nlp = spacy.load("en_core_web_lg")

In [22]:
# Process three real words plus one nonsense string, to show the out-of-vocabulary case.
tokens = nlp("dog cat banana afskfsd")

# Columns printed for each token:
# Text: The original token text.
# Has vector: Does the token have a vector representation?
# Vector norm: The L2 norm of the token's vector (the square root of the sum of the values squared)
# OOV: Out-of-vocabulary
# Shape: The shape of the token's vector array

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov, token.vector.shape)

dog True 7.0336733 False (300,)
cat True 6.6808186 False (300,)
banana True 6.700014 False (300,)
afskfsd False 0.0 True (300,)


In [23]:
# Pairwise similarity for every ordered pair of tokens (each token is also compared with itself).
tokens = nlp("dog cat banana")

for token1 in tokens:
    for token2 in tokens:
        # The print belongs to the inner loop: one output line per (token1, token2) pair.
        print(token1.text, token2.text, token1.similarity(token2))

dog dog 1.0
dog cat 0.80168545
dog banana 0.24327643
cat dog 0.80168545
cat cat 1.0
cat banana 0.28154364
banana dog 0.24327643
banana cat 0.28154364
banana banana 1.0


In [56]:
?token1.similarity

[1;31mDocstring:[0m
Make a semantic similarity estimate. The default estimate is cosine
similarity using an average of word vectors.

other (object): The object to compare with. By default, accepts `Doc`,
    `Span`, `Token` and `Lexeme` objects.
RETURNS (float): A scalar similarity score. Higher is more similar.

DOCS: https://spacy.io/api/doc#similarity
[1;31mType:[0m      builtin_function_or_method


In [24]:
# Shape of the vector attached to `token1`.
# NOTE(review): `token1` is presumably left over from the pairwise-similarity loop above — confirm.
token1.vector.shape

(300,)

In [27]:
# Two short sentences that differ in both the verb and the animal.
doc1 = nlp("I hate cats")
doc2 = nlp("I like dogs")

# Document-level similarity. Per the `similarity` docstring shown earlier in this notebook,
# the default estimate is cosine similarity using an average of word vectors.
doc1.similarity(doc2)

# # Analyzing survey data
# doc1 = nlp("I returned to India because of family reasons. I had older parents who needed help.")
# doc2 = nlp("I decided to stay back in the US because of more opportunities.")
# doc1.similarity(doc2)

0.9088526108382569

In [53]:
# Analogy test cases as flat four-word lists [a, b, c, d].
four_grams = [
    list(quad)
    for quad in (
        ('king', 'queen', 'man', 'woman'),
        ('king', 'man', 'queen', 'woman'),
        ('King', 'man', 'Queen', 'woman'),
        ('King', 'man', 'queen', 'woman'),
        ('man', 'woman', 'boy', 'girl'),
        ('Ottawa', 'Canada', 'Nairobi', 'Kenya'),
        ('big', 'bigger', 'tall', 'taller'),
        ('yen', 'japan', 'ruble', 'russia'),
        ('man', 'doctor', 'woman', 'nurse'),  ## Bias in language
        ('Paris', 'France', 'London', 'England'),
    )
]

In [54]:
for four_gram in four_grams:
    print(four_gram)
    # Run the pipeline on each of the four words in turn; `.vector` is read from each result.
    token1, token2, token3, token4 = map(nlp, four_gram[:4])

    # Analogy arithmetic: first - second + fourth, to be compared with the third.
    lhs = token1.vector - token2.vector + token4.vector
    rhs = token3.vector

    print("\n2.0 Using manual vecotor addition/subtraction ...")

    # Cosine similarity via scipy (1 - cosine distance).
    cosine_similarity1 = 1 - spatial.distance.cosine(lhs, rhs)
    print(f"Cosine Similarity using scipy: {cosine_similarity1}")

    # The same quantity computed by hand with numpy.
    cosine_similarity2 = np.dot(lhs, rhs) / (norm(lhs) * norm(rhs))
    print(f"Cosine Similarity using numpy: {cosine_similarity2}")


['king', 'queen', 'man', 'woman']

2.0 Using manual vecotor addition/subtraction ...
Cosine Similarity using scipy: 0.7820168733596802
Cosine Similarity using numpy: 0.7820168137550354
['king', 'man', 'queen', 'woman']

2.0 Using manual vecotor addition/subtraction ...
Cosine Similarity using scipy: 0.7880843877792358
Cosine Similarity using numpy: 0.7880844473838806
['King', 'man', 'Queen', 'woman']

2.0 Using manual vecotor addition/subtraction ...
Cosine Similarity using scipy: 0.7880843877792358
Cosine Similarity using numpy: 0.7880844473838806
['King', 'man', 'queen', 'woman']

2.0 Using manual vecotor addition/subtraction ...
Cosine Similarity using scipy: 0.7880843877792358
Cosine Similarity using numpy: 0.7880844473838806
['man', 'woman', 'boy', 'girl']

2.0 Using manual vecotor addition/subtraction ...
Cosine Similarity using scipy: 0.832800567150116
Cosine Similarity using numpy: 0.8328006863594055
['Ottawa', 'Canada', 'Nairobi', 'Kenya']

2.0 Using manual vecotor addition/su