In [33]:
from nltk.corpus import brown
from nltk.util import ngrams
from nltk import word_tokenize
from collections import Counter
from sklearn.decomposition import PCA
import requests
from collections import defaultdict
import numpy as np 
from scipy.sparse import csr_matrix 
from sklearn import metrics
import scipy
from sklearn.metrics.pairwise import pairwise_distances

In [34]:
#Pre-exercise
 
words = brown.words()

words = [w.lower() for w in words if w.isalpha()]

c = Counter(words)
top_5k = c.most_common(5000)
print(f"top 5 most common words: {top_5k[:5]}")
print(f"top 5 least common words: {top_5k[-5:]}")

top 5 most common words: [('the', 69971), ('of', 36412), ('and', 28853), ('to', 26158), ('a', 23195)]
top 5 least common words: [('vertex', 19), ('rourke', 19), ('killpath', 19), ('haney', 19), ('letch', 19)]


In [35]:
# load in words from table 1
r = requests.get('https://raw.githubusercontent.com/AlexGrinch/ro_sgns/master/datasets/rg65.csv')
rg65_set = set([word for line in r.text.split('\n') for word in line.strip().split(';')[:2]])

top_5k_words_initial = [i[0] for i in top_5k]
top_5k_words_after = [i[0] for i in top_5k]
new_words = []
for word in rg65_set:
    if word not in top_5k_words_initial:
        top_5k_words_after.append(word)
        new_words.append(word)

print(f"number of new words added: {len(new_words)}")

number of new words added: 30


In [36]:
#get bigrams

bigrams = ngrams(words, 2)
top_5k_bigrams = []

for gram in bigrams:
    if gram[0] in top_5k_words_after and gram[1] in top_5k_words_after:
        top_5k_bigrams.append(gram)

bigrams_freq = Counter(top_5k_bigrams)
print(f"numbr of bigrams that matched the search: {len(top_5k_bigrams)}")

numbr of bigrams that matched the search: 754420


In [37]:
# create M1 matrix
M1 = csr_matrix((len(top_5k_words_after), len(top_5k_words_after)),  
                          dtype = np.int32).toarray()

index_dict = {k: v for v, k in enumerate(top_5k_words_after)}

for entry in bigrams_freq:
    row = entry[0]
    col = entry[1]
    count = bigrams_freq[entry]   
    
    M1[index_dict[row], index_dict[col]] = count

In [38]:
#ppmi on M1
def ppmi(mat):
    col_totals = mat.sum(axis=0)
    total = col_totals.sum()
    row_totals = mat.sum(axis=1)
    expected = np.outer(row_totals, col_totals) / total
    mat = mat / (expected + 1e-16)
    # Silence distracting warnings about log(0):
    with np.errstate(divide='ignore'):
        mat = np.log(mat)
    mat[np.isinf(mat)] = 0.0  # log(0) = 0
    mat[mat < 0] = 0.0
    return mat

M1_plus = ppmi(M1)

In [39]:
#PCA
pca10 = PCA(n_components=10)
pca100 = PCA(n_components=100)
pca300 = PCA(n_components=300)

M2_10 = pca10.fit_transform(M1_plus)
M2_100 = pca100.fit_transform(M1_plus)
M2_300 = pca300.fit_transform(M1_plus)


In [40]:
#pairs of words 
rg65 = [s.strip().split(';') for s in r.text.split('\n')]

P = []
S = []
for w1, w2, score in rg65:
    if w1 in new_words or w2 in new_words:
        P.append((w1, w2))
        S.append(float(score))
        

In [41]:
models = [M1, M1_plus, M2_10, M2_100, M2_300]
M1_a, M1_plus_a, M2_10_a, M2_100_a, M2_300_a = [], [], [], [], []
arrs = [M1_a, M1_plus_a, M2_10_a, M2_100_a, M2_300_a]
for i, model in enumerate(models):
    for w1, w2, in P:
        arrs[i].append(metrics.pairwise.cosine_similarity(model[index_dict[w1]].reshape(1, -1), model[index_dict[w2]].reshape(1, -1))[0][0])


In [42]:
#pearson
model_names = ['M1', 'M1_plus', 'M2_10', 'M2_100', 'M2_300']
for i, arr in enumerate(arrs):
    pearson = scipy.stats.pearsonr(arr, S)
    print(f"pearson stats for model {model_names[i]}: {pearson[0]}")

pearson stats for model M1: 0.23682594116301078
pearson stats for model M1_plus: 0.201237999273438
pearson stats for model M2_10: 0.24786070380823913
pearson stats for model M2_100: 0.41870300892924095
pearson stats for model M2_300: 0.39571575881845256


In [43]:
# get P and S from table 1
rg65 = [s.strip().split(';') for s in r.text.split('\n')]

P = []
S = []
for w1, w2, score in rg65:
    if w1 in new_words or w2 in new_words:
        P.append((w1, w2))
        S.append(float(score))

print(P)
print(S)

[('cord', 'smile'), ('rooster', 'voyage'), ('fruit', 'furnace'), ('autograph', 'shore'), ('automobile', 'wizard'), ('mound', 'stove'), ('grin', 'implement'), ('asylum', 'fruit'), ('asylum', 'monk'), ('graveyard', 'madhouse'), ('glass', 'magician'), ('boy', 'rooster'), ('cushion', 'jewel'), ('monk', 'slave'), ('asylum', 'cemetery'), ('grin', 'lad'), ('shore', 'woodland'), ('monk', 'oracle'), ('boy', 'sage'), ('automobile', 'cushion'), ('mound', 'shore'), ('lad', 'wizard'), ('forest', 'graveyard'), ('food', 'rooster'), ('cemetery', 'woodland'), ('shore', 'voyage'), ('bird', 'woodland'), ('furnace', 'implement'), ('crane', 'rooster'), ('hill', 'woodland'), ('cemetery', 'mound'), ('glass', 'jewel'), ('magician', 'oracle'), ('crane', 'implement'), ('brother', 'lad'), ('sage', 'wizard'), ('oracle', 'sage'), ('bird', 'crane'), ('bird', 'cock'), ('brother', 'monk'), ('asylum', 'madhouse'), ('furnace', 'stove'), ('magician', 'wizard'), ('hill', 'mound'), ('cord', 'string'), ('glass', 'tumbler')

In [44]:
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [45]:
cosine_sims = []

for w1, w2 in P:
    cosine_sims.append(metrics.pairwise.cosine_similarity(model[w1].reshape(1, -1), model[w2].reshape(1, -1))[0][0])

In [46]:
pearson = scipy.stats.pearsonr(cosine_sims, S)
print(f"pearson correlation between w2v and human similarities {pearson[0]}")

pearson correlation between w2v and human similarities 0.7516600628757987


In [47]:
# create analogies dictionary

r = requests.get('http://www.fit.vutbr.cz/~imikolov/rnnlm/word-test.v1.txt')

analogies_dict = {}
current_collection = ''
for line in r.text.split("\n")[1:]:
    if line.startswith(":"):
        current_collection = line[1:].strip()
        analogies_dict[current_collection] = []
    else:
        analogies_dict[current_collection].append(line.strip().split())

In [48]:
import operator

sem_analogies = []
syn_analogies = []
sem_num = 0
syn_num = 0

for key in analogies_dict.keys():
    if 'gram' in key:
        lst_choice = syn_analogies
        num_choice = syn_num
    else:
        lst_choice = sem_analogies
        num_choice = sem_num

    for lst in analogies_dict[key]:
        if len(lst) == 4:
            flag = True
            for i in range(4):
                if lst[i] not in top_5k_words_after:
                    flag = False
            if flag:
                lst_choice.append(lst)

print(f"number of semantic analogies involving the top 5000 words: {len(sem_analogies)}")
print(f"number of syntactic analogies involving the top 5000 words: {len(syn_analogies)}")

with open('sem_analogies.txt', 'w') as f:
    f.write(": sem-tests\n")
    for i in range(len(sem_analogies)):
        f.write(f"{sem_analogies[i][0]} {sem_analogies[i][1]} {sem_analogies[i][2]} {sem_analogies[i][3]}\n") 

with open('syn_analogies.txt', 'w') as f:
    f.write(": syn-tests\n")
    for i in range(len(syn_analogies)):
        f.write(f"{syn_analogies[i][0]} {syn_analogies[i][1]} {syn_analogies[i][2]} {syn_analogies[i][3]}\n") 


number of semantic analogies involving the top 5000 words: 90
number of syntactic analogies involving the top 5000 words: 2024


In [49]:
sem_evaluation = model.evaluate_word_analogies("sem_analogies.txt")[0]
syn_evaluation = model.evaluate_word_analogies("syn_analogies.txt")[0]

print(f"w2v accuracy on semantic analogies: {sem_evaluation}%")
print(f"w2v accuracy on syntactic analogies: {syn_evaluation}%")

w2v accuracy on semantic analogies: 0.9222222222222223%
w2v accuracy on syntactic analogies: 0.6773715415019763%


In [50]:
m2300_sem_results = []

# sem analogies
for sem in sem_analogies:
    w1, w2, w3, target = sem
    
    m2300_vec =  M2_300[index_dict[w2]] - M2_300[index_dict[w1]] + M2_300[index_dict[w3]]
    #square distance metrix to find closest vector
    distances = pairwise_distances(m2300_vec.reshape(1, -1), M2_300, metric="l2").reshape(5030)
    ids = distances.argsort()[0]
    word = top_5k_words_after[ids]

    if word == target:
        m2300_sem_results.append(1)
    else:
        m2300_sem_results.append(0)

print(f"m2300 got {sum(m2300_sem_results)} out of {len(m2300_sem_results)} on the semantic dataset")

m2300 got 0 out of 90 on the semantic dataset


In [51]:
#syn analogies
m2300_syn_results = []

# this takes a while to run so please feel free to increase or decrease this amount
max_analogies = 300

# sem analogies
for syn in syn_analogies[:max_analogies]:
    w1, w2, w3, target = syn
    
    m2300_vec =  M2_300[index_dict[w2]] - M2_300[index_dict[w1]] + M2_300[index_dict[w3]]
    #square distance metrix to find closest vector
    distances = pairwise_distances(m2300_vec.reshape(1, -1), M2_300, metric="l2").reshape(5030)
    ids = distances.argsort()[0]
    word = top_5k_words_after[ids]

    if word == target:
        m2300_syn_results.append(1)
    else:
        m2300_syn_results.append(0)

print(f"m2300 got {sum(m2300_syn_results)} out of {len(m2300_syn_results)} on the semantic dataset")

m2300 got 0 out of 300 on the semantic dataset
