In [1]:
from nltk.corpus import brown, stopwords
from nltk.probability import FreqDist
from nltk import word_tokenize 
from nltk.util import ngrams

from scipy import sparse, stats
import pandas as pd
import numpy as np

from collections import Counter
from itertools import combinations
from math import log
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pprint import pformat
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds, norm
from string import punctuation
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# https://gist.github.com/emaadmanzoor/1d06e0751a3f7d39bc6814941b37531d


In [3]:
data = map(lambda sent_words: ' '.join(sent_words), brown.sents())
df = pd.DataFrame(data)
df.head()

Unnamed: 0,0
0,The Fulton County Grand Jury said Friday an in...
1,The jury further said in term-end presentments...
2,The September-October term jury had been charg...
3,`` Only a relative handful of such reports was...
4,The jury said it did find that many of Georgia...


In [4]:
%%time
# Tokenize and remove punctuation
punctrans = str.maketrans(dict.fromkeys(punctuation))
def tokenize(title):
    x = title.lower() # Lowercase
    x = x.encode('ascii', 'ignore').decode() # Keep only ascii chars.
    x = x.translate(punctrans) # Remove punctuation
    return x.split()

texts_tokenized = df[0].apply(tokenize)
texts_tokenized.head()

CPU times: user 314 ms, sys: 44.5 ms, total: 358 ms
Wall time: 356 ms


0    [the, fulton, county, grand, jury, said, frida...
1    [the, jury, further, said, in, termend, presen...
2    [the, septemberoctober, term, jury, had, been,...
3    [only, a, relative, handful, of, such, reports...
4    [the, jury, said, it, did, find, that, many, o...
Name: 0, dtype: object

In [12]:
%%time
# Uni- Bi-gram counters
def count_grams(texts_tokenized):
    cx = Counter()
    cxy = Counter()
    for text in texts_tokenized:
        for x in text:
            cx[x] += 1
        for x, y in map(sorted, combinations(text, 2)):
            cxy[(x, y)] += 1
    return cx, cxy

cx, cxy = count_grams(texts_tokenized)
cx

CPU times: user 8.56 s, sys: 139 ms, total: 8.7 s
Wall time: 8.7 s


Counter({'the': 69971,
         'fulton': 17,
         'county': 155,
         'grand': 48,
         'jury': 67,
         'said': 1961,
         'friday': 60,
         'an': 3748,
         'investigation': 51,
         'of': 36412,
         'atlantas': 4,
         'recent': 179,
         'primary': 96,
         'election': 77,
         'produced': 90,
         'no': 2202,
         'evidence': 204,
         'that': 10594,
         'any': 1344,
         'irregularities': 8,
         'took': 426,
         'place': 570,
         'further': 218,
         'in': 21341,
         'termend': 1,
         'presentments': 1,
         'city': 393,
         'executive': 55,
         'committee': 168,
         'which': 3561,
         'had': 5133,
         'overall': 47,
         'charge': 122,
         'deserves': 16,
         'praise': 17,
         'and': 28853,
         'thanks': 37,
         'atlanta': 35,
         'for': 9489,
         'manner': 124,
         'was': 9815,
         'conducted': 55,

In [26]:
%%time

rg65_table1 = pd.read_csv("rg65_table1.txt", header=[0])
def add_rg65(cx, cxy, rg65_table1):
    rg65_table1 = rg65_table1[['context','word']].to_numpy()
    rg65 = list(map(lambda pair: (pair[0], pair[1]), rg65_table1))
    crg = Counter(rg65)
    # Take top 5000 words and append rg65, keys with greater than 1 appear in both
    cw = Counter()
    for w in cx.most_common()[:5000]:
        cw[w[0]] += 1
    for (x,y) in rg65:
        # print(x, y)
        cw[x] += 1
        cw[y] += 1
        cxy[(x, y)] += 1
    for x, y in list(cxy.keys()):
        if x not in cw or y not in cw:
            del cxy[(x, y)]
    return cw, cx, cxy, crg

def remove_stopwords(cx, cxy):
    print('%d unigrams before' % len(cx))
    for x in stopwords.words('english'):
        del cx[x]
    print('%d unigrams after' % len(cx))
    print('%d bigrams before' % len(cxy))
    # Remove bigrams of removed unigrams
    for x, y in list(cxy.keys()):
        if x not in cx or y not in cx:
            del cxy[(x, y)]
    print('%d bigrams after' % len(cxy))
    return cx, cxy

# cw: top 5000 brown corpus and rg65 table 1 words
# cx: clean brown unigram counter
# cxy: clean brown bigram counter
# crg: rg65 table pairs
cx, cxy = remove_stopwords(cx, cxy)
cw, cx, cxy, crg = add_rg65(cx, cxy, rg65_table1)





47880 unigrams before
47880 unigrams after
2547612 bigrams before
2547610 bigrams after
CPU times: user 1.66 s, sys: 3.8 ms, total: 1.67 s
Wall time: 1.66 s


In [27]:
# Write top 5000 brown words
def brown_5000(cx):
    with open(r'brown_5000.txt', 'w') as f:
        f.write(" ".join(map(lambda tok_count: tok_count[0], cx.most_common()[:5000])))
               
brown_5000(cx)

In [28]:
%%time
# W words of interest
W = list(cw.keys())
# index to/from token maps
x2i, i2x = {}, {}
for i, x in enumerate(cw.keys()):
    x2i[x] = i
    i2x[i] = x

CPU times: user 1.73 ms, sys: 44 µs, total: 1.77 ms
Wall time: 1.78 ms


In [29]:
x2i['county']

528

In [30]:
%%time

def model_matrices(unigram_counter, bigram_counter):
    sx = sum(unigram_counter.values())
    sxy = sum(bigram_counter.values())
    
    ppmi_counter = Counter()
    cooccurrence_counter = Counter()
    
    rows, cols = [], []
    ppmi_data = []
    cooccurence_data = []
    for (x, y), n in bigram_counter.items():
        rows.append(x2i[x])
        cols.append(x2i[y])
        
        ppmi_data.append(max(0, log((n / sxy) / (cw[x] / sx) / (cw[y] / sx))))
        ppmi_counter[(x, y)] = ppmi_data[-1]
        
        cooccurence_data.append(cxy[(x,y)])
        cooccurrence_counter[(x,y)] = cooccurence_data[-1]
    ppmi = csc_matrix((ppmi_data, (rows, cols)))
    cooccurence = csc_matrix((cooccurence_data, (rows, cols)))
    return ppmi, ppmi_counter, cooccurence, cooccurrence_counter

ppmi, ppmi_counter, cooccurrence, cooccurence_counter = model_matrices(cw, cxy)

CPU times: user 3.06 s, sys: 35.7 ms, total: 3.1 s
Wall time: 3.09 s


In [31]:
M1 = ppmi.asfptype()
M1_plus = cooccurrence.asfptype()
print('%d non-zero elements in cooccurrence' % M1.count_nonzero())
print('Sample PPMI values\n', pformat(ppmi_counter.most_common()[:10]))

M1_plus

1157092 non-zero elements in cooccurrence
Sample PPMI values
 [(('af', 'af'), 9.107352479164453),
 (('states', 'united'), 8.879673772684852),
 (('mrs', 'mrs'), 8.594513769665781),
 (('new', 'york'), 8.445388226041194),
 (('one', 'one'), 8.07016300312275),
 (('would', 'would'), 8.04429949253283),
 (('another', 'one'), 8.04429949253283),
 (('one', 'would'), 7.9671637636506025),
 (('one', 'two'), 7.948115568679908),
 (('cent', 'per'), 7.928697482822806)]


<5031x5031 sparse matrix of type '<class 'numpy.float64'>'
	with 1157096 stored elements in Compressed Sparse Column format>

In [53]:
%%time
# help(svds)
def normalize_matrix(matrix):
    norms = np.sqrt(np.sum(np.square(matrix), axis=1, keepdims=True))
    matrix /= np.maximum(norms, 1e-7)
    return matrix

def model_svd(matrix, k=10, normalize=True):
    print(type(matrix))
    U, S, VT = svds(matrix, k=k)
    print('U shape', U.shape)
    print('S shape', S.shape)
    print('VT shape', VT.shape)
    if (normalize):
        U = normalize_matrix(U)
    return {"U": U, "S": S, "VT": VT}

# k10_model = model_svd(M1_plus, k=10)
# k100_model = model_svd(M1_plus, k=100)
k300_model = model_svd(M1_plus, k=300)
# k1000_model = model_svd(M1_plus, k=1000)



<class 'scipy.sparse.csc.csc_matrix'>
U shape (5031, 300)
S shape (300,)
VT shape (300, 5031)
CPU times: user 22.6 s, sys: 284 ms, total: 22.9 s
Wall time: 5.82 s


In [33]:
def bigrams_intersection(crg, cw):
    cp = Counter()
    for (x,y), count in crg.items():
        if cw[x]==2 and cw[x]==2: #since we iterated over both top 5000 and rg into same counter cw
            # print('both', x,y, 'in W')
            cp[(x,y)] += 1
    # P = list(sum(bigrams_intersection(crg, cw), ())) # Flatten array of bigram keys
    return list(cp.keys())


# P: all pairs of words available in rg65 who also show up in brown top 5000
P = bigrams_intersection(crg, cw)

def rg65_human_similarities(rg65_table1):
    cs = Counter()
    for row in rg65_table1[['context','word','human_similarity']].to_numpy():
        cs[tuple(row[:2])] = row[2]
    return cs

S = rg65_human_similarities(rg65_table1)


In [55]:
# P: pair of unigrams that appear in both brown_5000 and rg65
def compute_cosine_pairs(P, model):
    U = model["U"]
    similarity_counter = Counter()
    for (x,y) in P:
        print(x,y)
        # print(x2i[x], x2i[y])
        # print(U[x2i[x]], U[x2i[y]])
        x_vec = U[x2i[x]].reshape(1, -1)
        y_vec = U[x2i[y]].reshape(1, -1)
        # print(cosine_similarity(x_vec, y_vec)[0][0])
        
        similarity_counter[(x,y)] = cosine_similarity(x_vec, y_vec)[0][0]
    return similarity_counter

def human_similarity_pearson(s_counter, S):
    pearson_x = []
    pearson_y = []
    for (x,y), cos_sim in s_counter.items():
        print(x,y, cos_sim)
        pearson_x.append(cos_sim)
        pearson_y.append(S[(x,y)])
        
    pearson_x = np.array(pearson_x)
    pearson_y = np.array(pearson_y)
    return stats.pearsonr(pearson_x, pearson_y)
    
s_k300 = compute_cosine_pairs(P, k300_model)
human_similarity_pearson(s_k300, S)

cord smile
autograph shore
cord string
autograph signature
cock rooster
cord smile 0.015823829787096358
autograph shore 0.09595738272156654
cord string -0.07427110536061576
autograph signature -0.06904959470665581
cock rooster 0.03614513495138225


(-0.6641241992956836, 0.22151855381696003)

In [35]:
from gensim.models import KeyedVectors, Word2Vec
google_model = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)
# google_model['dog']




In [182]:
# Word pairs human similarity scores for tokens both in RG65 and brown corpus
for p in P:
    print(p, S[p])

('cord', 'smile') 0.02
('autograph', 'shore') 0.06
('cord', 'string') 3.41
('autograph', 'signature') 3.59
('cock', 'rooster') 3.68


In [50]:
cosine_similarity(np.array([1,2,3]).reshape(-1,1), np.array([-1,-2,-3]).reshape(-1,1))

array([[-1., -1., -1.],
       [-1., -1., -1.],
       [-1., -1., -1.]])

In [37]:
P_wv = KeyedVectors(vector_size=300)
P_wv

for p in list(set(sum(P, ()))):
    print(p)
    P_wv.add_vector(p, google_model[p])
    
    

smile
shore
signature
string
rooster
cock
autograph
cord




In [180]:
def wv_cos_sim(wv, x_tok, y_tok):
    x = wv[pp[0]].reshape(1, -1)
    y = wv[pp[1]].reshape(1, -1)
    return cosine_similarity(x,y)[0][0]
    
G_sim_counter = Counter()
for pp in P:
    G_sim_counter[pp] = wv_cos_sim(google_model, pp[0], pp[1])
    print(pp, G_sim_counter[pp])

('cord', 'smile') 0.018116448
('autograph', 'shore') 0.03465592
('cord', 'string') 0.18951255
('autograph', 'signature') 0.3132112
('cock', 'rooster') 0.47867876


In [52]:
print(G_sim_counter.items())
# word2vec vs pearson
human_similarity_pearson(G_sim_counter, S)

dict_items([(('cord', 'smile'), 0.018116448), (('autograph', 'shore'), 0.03465592), (('cord', 'string'), 0.18951255), (('autograph', 'signature'), 0.3132112), (('cock', 'rooster'), 0.47867876)])
cord smile 0.018116448
autograph shore 0.03465592
cord string 0.18951255
autograph signature 0.3132112
cock rooster 0.47867876


(0.8731961158103346, 0.05316120417270493)

In [178]:
def summarize_analogy(analogy_test):
    for test in analogy_test[1]:
        section = test["section"]
        num_correct = len(test["correct"])
        num_incorrect= len(test["incorrect"])
        if (num_correct != 0) and (num_incorrect != 0):
            accuracy = "{:.2f}".format(100*num_correct/(num_correct+num_incorrect))
            print(section, num_correct, num_incorrect, accuracy)
            # print(section, '&', num_correct, '&', num_incorrect, '&', accuracy)

In [167]:
%%time
analogy_test = google_model.evaluate_word_analogies('analogy_word-test.v1.txt')

CPU times: user 28min 41s, sys: 7.54 s, total: 28min 49s
Wall time: 7min 12s


In [168]:
summarize_analogy(analogy_test)

capital-common-countries 421 85 83.20
capital-world 3552 816 81.32
currency 230 578 28.47
city-in-state 1779 688 72.11
family 436 70 86.17
gram1-adjective-to-adverb 290 702 29.23
gram2-opposite 353 459 43.47
gram3-comparative 1216 116 91.29
gram4-superlative 987 135 87.97
gram5-present-participle 829 227 78.50
gram6-nationality-adjective 1250 139 89.99
gram7-past-tense 1020 540 65.38
gram8-plural 1159 173 87.01
gram9-plural-verbs 593 277 68.16
Total accuracy 14115 5005 73.82


In [169]:
k300_model["U"].shape
lsa_wv = KeyedVectors(vector_size=300)
for (i,w) in enumerate(i2x.values()):
    lsa_wv.add_vectors(w, k300_model["U"][i, :])
    
lsa_analogy_test = lsa_wv.evaluate_word_analogies('analogy_word-test.v1.txt')
summarize_analogy(lsa_analogy_test)

family 2 54 3.57
gram1-adjective-to-adverb 2 340 0.58
gram3-comparative 4 236 1.67
gram5-present-participle 3 269 1.10
gram6-nationality-adjective 7 62 10.14
gram7-past-tense 18 582 3.00
gram8-plural 13 329 3.80
gram9-plural-verbs 7 125 5.30
Total accuracy 56 2131 2.56


In [179]:
lsa_comparable_tests = {}
for test in lsa_analogy_test[1]:
    if len(test['correct'])!=0  and len(test['incorrect'])!=0:
        lsa_comparable_tests[test['section']] = test
        
word2vec_comparable_tests = {}
for test in analogy_test[1]:
    if test['section'] in lsa_comparable_tests.keys():
        # print(test['section'])
        all_lsa_samples = lsa_comparable_tests[test['section']]['correct'] + lsa_comparable_tests[test['section']]['incorrect']
        analogy_in_lsa = lambda analogy: analogy in all_lsa_samples
        # print(lsa_section_tests)
        # print(len(test['correct']))
        test['correct'] = list(filter(analogy_in_lsa, test['correct']))
        test['incorrect'] = list(filter(analogy_in_lsa, test['incorrect']))
        word2vec_comparable_tests[test['section']] = test
        # print(len(test['correct']))
        
summarize_analogy((_, word2vec_comparable_tests.values()))        
        


family 53 3 94.64
gram1-adjective-to-adverb 106 236 30.99
gram3-comparative 191 49 79.58
gram5-present-participle 214 58 78.68
gram6-nationality-adjective 60 9 86.96
gram7-past-tense 433 167 72.17
gram8-plural 282 60 82.46
gram9-plural-verbs 105 27 79.55
Total accuracy 1548 639 70.78


In [227]:
import pickle
embeddings_pkl = pickle.load( open( "embeddings/data.pkl", "rb" ) )

In [323]:
embed_d

[1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990]

In [361]:
embed_E = embeddings_pkl['E']
embed_w = embeddings_pkl['w']
embed_d = embeddings_pkl['d']

df = pd.DataFrame(embed_E)
w2i = Counter(); i2w = Counter()
for i,w in enumerate(embed_w):
    w2i[w] = i
    i2w[i] = w

y2i = Counter(); i2y = Counter()
for i,y in enumerate(embed_d):
    y2i[y] = i
    i2y[i] = y

    
# for year in embed_d:
#     E_counter = Counter()
#     for word in embed_w:
#         E_counter[(year,word)] = df[y2i[year]][w2i[word]]

In [368]:
# y2i['1990']
y2i
# y2i.keys()
# y2i['1910']

Counter({1900: 0,
         1910: 1,
         1920: 2,
         1930: 3,
         1940: 4,
         1950: 5,
         1960: 6,
         1970: 7,
         1980: 8,
         1990: 9})

In [376]:
cosine_similarity(df[y2i['1900']][w2i['hell']].reshape(1,-1), df[y2i['1910']][w2i['hell']].reshape(1,-1))

array([[1.]])

In [378]:
# Compute semantic displacement using cos-dis (i.e. 1 - cos-sim)
def delta_cos_dis(t, t_delta):
    delta_counter = Counter()
    for w_token in embed_w:
        delta_counter[w_token] = 1 - cosine_similarity(df[y2i[t]][w2i[w_token]].reshape(1,-1), df[y2i[t_delta]][w2i[w_token]].reshape(1,-1))[0][0]
    return delta_counter
delta = delta_cos_dis('1900','1910')

In [386]:
delta

Counter({'time': 0.0,
         'man': 4.440892098500626e-16,
         'years': -2.220446049250313e-16,
         'part': 0.0,
         'way': -6.661338147750939e-16,
         'life': 0.0,
         'people': 0.0,
         'work': 0.0,
         'world': -2.220446049250313e-16,
         'states': 0.0,
         'state': 1.1102230246251565e-16,
         'day': 0.0,
         'united': -4.440892098500626e-16,
         'men': 1.1102230246251565e-16,
         'number': -2.220446049250313e-16,
         'case': 0.0,
         'fact': -2.220446049250313e-16,
         'use': 0.0,
         'york': 3.3306690738754696e-16,
         'place': 1.1102230246251565e-16,
         'end': 0.0,
         'year': -2.220446049250313e-16,
         'war': 0.0,
         'government': -4.440892098500626e-16,
         'order': -2.220446049250313e-16,
         'law': -4.440892098500626e-16,
         'point': 0.0,
         'country': -4.440892098500626e-16,
         'hand': -2.220446049250313e-16,
         'god': -2.220446

In [319]:
a = np.ndarray((10,10))

In [321]:
a[0, 0]

6.9244586081332e-310