# NumPy Implementation

In [1]:
# import pandas as pd
import numpy as np
import os
import json
import string
import random
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
def read_wiki(V=20000,n=None,folder='./large_files/enwiki-preprocessed/'):
    """Build a capped vocabulary and index-encoded sentences from text files.

    The files in ``folder`` are read line by line, twice: once to count words
    and once to encode sentences (so tokens are never all held in memory).
    A line is used only if its first character is not one of
    ``[ * - | = { }`` or a backslash; it is then stripped of ASCII
    punctuation, lower-cased and split on whitespace. Lines that yield fewer
    than two tokens are ignored in both passes.

    Args:
        V: maximum vocabulary size, including the ``'<UNK>'`` entry. It is
            capped at the number of distinct words found.
        n: number of files to read (``None`` reads all of them). File names
            are sorted first so the selection is reproducible; ``os.listdir``
            alone returns names in arbitrary order.
        folder: directory containing the text files.

    Returns:
        ``(sents, word2idx)`` where ``sents`` is a list of sentences, each a
        list of integer word indices, and ``word2idx`` maps the ``V-1`` most
        frequent words plus ``'<UNK>'`` (always the last index) to indices.
        Words outside the vocabulary are encoded as the ``'<UNK>'`` index.
    """
    # Same character set as before, written without invalid escape sequences.
    skip_first = '[*-|={}\\'
    strip_punct = str.maketrans('','',string.punctuation)
    paths = [os.path.join(folder,name) for name in sorted(os.listdir(folder))[:n]]

    def tokenized_lines():
        # Single place for the filter/tokenise rules shared by both passes.
        for path in paths:
            # `with` guarantees every file handle is closed.
            with open(path) as fh:
                for line in fh:
                    if line and line[0] not in skip_first:
                        tokens = line.translate(strip_punct).lower().split()
                        if len(tokens)>1:
                            yield tokens

    # Pass 1: word frequencies.
    all_words_count={}
    for tokens in tokenized_lines():
        for word in tokens:
            all_words_count[word]=all_words_count.get(word,0)+1

    V=min(V, len(all_words_count))

    # Stable sort: ties keep first-seen order.
    ranked = sorted(all_words_count.items(),key=lambda x: x[1],reverse=True)
    # max(..., 0) stops a non-positive V from turning into a negative slice
    # bound, which would silently keep almost the whole vocabulary.
    top_words = [w for w,c in ranked[:max(V-1,0)]]+['<UNK>']
    word2idx={w:i for i,w in enumerate(top_words)}
    unk=word2idx['<UNK>']

    # Pass 2: encode sentences with the final vocabulary.
    sents=[[word2idx.get(word,unk) for word in tokens] for tokens in tokenized_lines()]
    return sents,word2idx

In [3]:
def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)), applied element-wise via NumPy."""
    denominator = 1+np.exp(-z)
    return 1/denominator

def sgd(word,targets,label,lr,W1,W2):
    """Run one SGD step of binary logistic regression for `word` vs `targets`.

    The model predicts sigmoid(W2[:, t] . W1[word]) for every target index t
    and is pushed towards `label`. W1 and W2 are updated in place.

    Args:
        word: row index into W1 (the input word).
        targets: sequence of column indices into W2; may contain repeats.
        label: 1 for true (word, context) pairs, 0 for negative samples.
        lr: learning rate.
        W1: array indexed as W1[word] -> vector of size D.
        W2: array indexed as W2[:, targets] -> D x N.

    Returns:
        The binary cross-entropy summed over the targets (non-negative up to
        the 1e-10 smoothing term).
    """
    # An explicit integer dtype also makes an empty target list a valid index.
    targets = np.asarray(targets, dtype=np.intp)

    hidden = W1[word]               # D
    out_vecs = W2[:,targets]        # D x N (copy, so it holds pre-update values)
    p_n = sigmoid(out_vecs.T.dot(hidden))  # N,
    err = p_n-label                 # N,

    # Both gradients are computed from the pre-update weights.
    gW2 = np.outer(hidden,err)      # D x N
    gW1 = out_vecs.dot(err)         # D, sum over targets of err[t] * W2[:, t]

    # `W2[:,targets] -= ...` is buffered: when an index appears more than once
    # in `targets` only one of its columns of gW2 is applied. ufunc.at
    # accumulates every contribution.
    np.subtract.at(W2,(slice(None),targets),lr*gW2)
    W1[word]-=lr*gW1

    # Binary cross-entropy is the NEGATIVE log-likelihood; 1e-10 avoids log(0).
    cost = -(label*np.log(p_n+1e-10)+(1-label)*np.log(1-p_n+1e-10))
    return cost.sum()

def get_context(pos,sentence,window_size):
    """Return the words within `window_size` positions of `pos`, excluding `pos`.

    For a sentence of the form  x x x x c c c pos c c c x x x x  (window_size=3)
    the result is the six `c` entries, in sentence order. The window is
    clipped at both ends of the sentence.

    Args:
        pos: index of the centre word in `sentence`.
        sentence: sequence of words (or word indices).
        window_size: number of words to take on each side of `pos`.

    Returns:
        A list with up to 2 * window_size context entries.
    """
    start = max(0,pos-window_size)
    # The slice end is exclusive, so +1 is needed to include the word at
    # pos + window_size; without it the right side is one word short.
    end = min(len(sentence),pos+window_size+1)
    return [w for i,w in enumerate(sentence[start:end],start=start) if i!=pos]

def neg_sampling_dstn(sentences,vocab_size):
    """Return the distribution used to draw negative samples.

    Each word index is counted over all sentences, the counts are raised to
    the power 0.75, and the result is normalised to sum to 1. Frequent words
    are therefore drawn more often, but less than proportionally to their
    raw frequency.
    """
    counts = np.zeros(vocab_size)
    for tokens in sentences:
        for idx in tokens:
            counts[idx] = counts[idx]+1

    smoothed = np.power(counts,0.75)
    return smoothed/smoothed.sum()

In [4]:
def train(n,V=20000,window_size=4,lr=0.025,final_lr=0.0001,num_neg=1,
          epoches=20,D=50,threshold=1e-5):
    """Train word embeddings with negative sampling and save them to ./model.

    For every (subsampled) sentence and every position in it, the centre
    word and its context get one `sgd` update with label 1; then `num_neg`
    randomly drawn words each get one `sgd` update with label 0 against the
    same context. Writes ./model/word2vec.json and ./model/weights.npz and
    plots the per-epoch cost.

    Args:
        n: number of corpus files to read (None reads all), passed to read_wiki.
        V: maximum vocabulary size, passed to read_wiki.
        window_size: context window size, passed to get_context.
        lr: initial learning rate.
        final_lr: learning rate reached after the last epoch (linear decay).
        num_neg: negative words drawn per centre word. The default of 1
            reproduces the previous behaviour: the old local `num_neg=5`
            was never read and exactly one negative word was drawn.
        epoches: number of passes over the sentences.
        D: embedding size.
        threshold: subsampling threshold; a word w is dropped with
            probability 1 - sqrt(threshold / p_neg[w]).

    Returns:
        (word2idx, W1, W2): the vocabulary, the vocab_size x D input
        embeddings and the D x vocab_size output embeddings.
    """
    sentences,word2idx = read_wiki(V,n)

    vocab_size=len(word2idx)

    # learning rate decay (guarded so epoches=0 does not divide by zero)
    lr_delta=(lr-final_lr)/epoches if epoches else 0.0

    W1=np.random.randn(vocab_size,D) # input-to-hidden
    W2=np.random.randn(D,vocab_size) # hidden-to-output

    # distribution for drawing negative samples
    p_neg = neg_sampling_dstn(sentences,vocab_size)

    costs = []
    total_words = sum(len(sent) for sent in sentences)
    print("Total words: ",total_words)

    # per-word probability of being dropped from a sentence (subsampling)
    p_drop=1-np.sqrt(threshold/p_neg)

    for epoch in range(epoches):
        random.shuffle(sentences)
        cost=0
        counter=0

        for sentence in sentences:

            # subsample: keep each word with probability 1 - p_drop[word]
            sentence = [w for w in sentence if np.random.random()<1-p_drop[w]]

            if len(sentence)<2: continue

            # randomly order words so we don't always see samples in the same order
            randomly_ordered_sentence = np.random.choice(len(sentence),size = len(sentence),replace=False)

            for pos in randomly_ordered_sentence:
                # the middle word
                word = sentence[pos]
                # the positive context words
                context_words = get_context(pos,sentence,window_size)
                targets=np.array(context_words)

                # Draw the negative word(s) before the positive update so the
                # RNG call order for num_neg=1 matches the original code.
                neg_words = [np.random.choice(vocab_size,p=p_neg) for _ in range(num_neg)]

                # do one iteration of stochastic gradient descent
                cost+=sgd(word,targets,1,lr,W1,W2)
                for neg_word in neg_words:
                    cost+=sgd(neg_word,targets,0,lr,W1,W2)
            counter+=1
            if counter % 2000 == 0:
                print("  processed %s / %s\r" % (counter, len(sentences)))
        print("epoch complete:", epoch, "cost:", cost)

        # save the cost
        costs.append(cost)

        # update the learning rate
        lr -= lr_delta

    plt.plot(costs)
    plt.show()

    # exist_ok avoids the check-then-create race of isdir() + mkdir()
    os.makedirs('./model',exist_ok=True)

    with open('./model/word2vec.json','w') as f:
        json.dump(word2idx,f)

    np.savez('./model/weights.npz',W1,W2)
    print("saved model")
    return word2idx, W1, W2

In [None]:
# Train on the full corpus: n=None ends up as the slice bound in read_wiki's
# os.listdir(...)[:n], so every file in the corpus folder is read.
# NOTE(review): the captured log below stops part-way through the first pass
# over the sentences - presumably the run was interrupted; confirm before
# relying on the saved model.
word2idx,W1,W2=train(None)

Total words:  86478677
  processed 2000 / 1271558
  processed 4000 / 1271558
  processed 6000 / 1271558
  processed 8000 / 1271558
  processed 10000 / 1271558
  processed 12000 / 1271558
  processed 14000 / 1271558
  processed 16000 / 1271558
  processed 18000 / 1271558
  processed 20000 / 1271558
  processed 22000 / 1271558
  processed 24000 / 1271558
  processed 26000 / 1271558
  processed 28000 / 1271558
  processed 30000 / 1271558
  processed 32000 / 1271558
  processed 34000 / 1271558
  processed 36000 / 1271558
  processed 38000 / 1271558
  processed 40000 / 1271558
  processed 42000 / 1271558
  processed 44000 / 1271558
  processed 46000 / 1271558
  processed 48000 / 1271558
  processed 50000 / 1271558
  processed 52000 / 1271558
  processed 54000 / 1271558
  processed 56000 / 1271558
  processed 58000 / 1271558
  processed 60000 / 1271558
  processed 62000 / 1271558
  processed 64000 / 1271558
  processed 66000 / 1271558
  processed 68000 / 1271558
  processed 70000 / 1271558
 

  processed 570000 / 1271558
  processed 572000 / 1271558
  processed 574000 / 1271558
  processed 576000 / 1271558
  processed 578000 / 1271558
  processed 580000 / 1271558
  processed 582000 / 1271558
  processed 584000 / 1271558
  processed 586000 / 1271558
  processed 588000 / 1271558
  processed 590000 / 1271558
  processed 592000 / 1271558
  processed 594000 / 1271558
  processed 596000 / 1271558
  processed 598000 / 1271558
  processed 600000 / 1271558
  processed 602000 / 1271558
  processed 604000 / 1271558
  processed 606000 / 1271558
  processed 608000 / 1271558
  processed 610000 / 1271558
  processed 612000 / 1271558
  processed 614000 / 1271558
  processed 616000 / 1271558
  processed 618000 / 1271558
  processed 620000 / 1271558
  processed 622000 / 1271558
  processed 624000 / 1271558
  processed 626000 / 1271558
  processed 628000 / 1271558
  processed 630000 / 1271558
  processed 632000 / 1271558
  processed 634000 / 1271558
  processed 636000 / 1271558
  processed 63

In [5]:
def load_model(model_dir='./model'):
    """Load the vocabulary and weight matrices written by the training cell.

    Args:
        model_dir: directory holding `word2vec.json` and `weights.npz`.

    Returns:
        (word2idx, W1, W2): the JSON-decoded vocabulary dict and the arrays
        stored under 'arr_0' and 'arr_1' in the .npz archive.
    """
    with open(os.path.join(model_dir,'word2vec.json'),'r') as f:
        word2idx=json.load(f)
    # np.load on an .npz returns an NpzFile that keeps the archive open; the
    # context manager closes it once both arrays have been read into memory.
    with np.load(os.path.join(model_dir,'weights.npz')) as weights:
        W1=weights['arr_0']
        W2=weights['arr_1']
    return word2idx,W1,W2

In [6]:

def find_analogies_GloVe(w1,w2,w3):
    """Print the word closest to  w1 - w2 + w3  by cosine distance.

    NOTE(review): this function reads the notebook-level names `word2vec`
    (word -> vector), `embedding` (matrix of all vectors) and `idx2word`
    (row index -> word). None of them is defined in the cells visible here;
    confirm they exist before calling.

    Args:
        w1, w2, w3: words of the analogy "w1 - w2 = ? - w3".

    Returns:
        None. The result is printed. If any word is missing from `word2vec`,
        each missing word is reported and nothing else is computed.
    """
    missing = [w for w in (w1, w2, w3) if w not in word2vec]
    for w in missing:
        print("{} not in word2vec".format(w))
    if missing:
        # Previously execution continued and failed with a KeyError below.
        return

    king = word2vec[w1]
    man = word2vec[w2]
    woman = word2vec[w3]
    v0 = king-man+woman
    # Shapes are taken from the data instead of from the globals D and V.
    distances = pairwise_distances(v0.reshape(1,-1), embedding, metric='cosine')
    distances = distances.ravel()
    idxs = distances.argsort()[:4]

    # Stays None only if every returned neighbour is one of the query words,
    # which can happen when the vocabulary has fewer than four words.
    best_word = None
    for idx in idxs:
        word = idx2word[idx]
        if  word not in (w1,w2,w3):
            best_word = word
            break
    print(w1, "-", w2, "=", best_word, "-", w3)

def analogy(pos1, neg1, pos2, neg2, word2idx, idx2word,  word_embedding):
    """Print how the embedding completes  pos1 - neg1 = ? - neg2.

    The query vector is  emb[pos1] - emb[neg1] + emb[neg2]. The ten rows of
    `word_embedding` closest to it by cosine distance are listed, and the
    nearest one that is not pos1, neg1 or neg2 is reported as the answer.
    `pos2` is the expected answer: it must be in the vocabulary but does not
    enter the computation.

    Args:
        pos1, neg1, pos2, neg2: the four words of the analogy.
        word2idx: dict mapping word -> row index of `word_embedding`.
        idx2word: dict mapping row index -> word.
        word_embedding: 2-D array with one row per vocabulary word.

    Returns:
        None. Everything is printed. If any of the four words is missing
        from `word2idx`, the first missing one is reported and nothing else
        is computed.
    """
    V,D=word_embedding.shape
    for w in (pos1, neg1, pos2, neg2):
        if w not in word2idx:
            print(w," not in the dictionary")
            return

    p1=word_embedding[word2idx[pos1]]
    n1=word_embedding[word2idx[neg1]]
    n2=word_embedding[word2idx[neg2]]

    vec=p1-n1+n2
    distances = pairwise_distances(vec.reshape((1,D)),word_embedding,metric='cosine').reshape(V)

    idx=distances.argsort()[:10]

    # Stays None only if every returned neighbour is one of the excluded
    # query words (tiny vocabularies); previously that case raised
    # UnboundLocalError at the print below.
    best_word=None
    for i in idx:
        word= idx2word[i]
        if word not in (pos1, neg1, neg2):
            best_word=word
            break
    print("got: %s - %s = %s - %s" % (pos1, neg1, best_word, neg2))
    print("closest 10:")
    for i in idx:
        print(idx2word[i], distances[i])
    print()

In [7]:
# Restore the saved vocabulary and weights, then build the index -> word lookup.
word2idx,W1,W2=load_model()
idx2word=dict((idx,word) for word,idx in word2idx.items())

# Each row is (pos1, neg1, pos2, neg2), read as "pos1 - neg1 = pos2 - neg2".
analogy_quads = [
    ('king', 'man', 'queen', 'woman'),
    ('king', 'prince', 'queen', 'princess'),
    ('miami', 'florida', 'dallas', 'texas'),
    ('einstein', 'scientist', 'picasso', 'painter'),
    ('japan', 'sushi', 'germany', 'bratwurst'),
    ('man', 'woman', 'he', 'she'),
    ('man', 'woman', 'uncle', 'aunt'),
    ('man', 'woman', 'brother', 'sister'),
    ('man', 'woman', 'husband', 'wife'),
    ('man', 'woman', 'actor', 'actress'),
    ('man', 'woman', 'father', 'mother'),
    ('heir', 'heiress', 'prince', 'princess'),
    ('nephew', 'niece', 'uncle', 'aunt'),
    ('france', 'paris', 'japan', 'tokyo'),
    ('france', 'paris', 'china', 'beijing'),
    ('february', 'january', 'december', 'november'),
    ('france', 'paris', 'germany', 'berlin'),
    ('week', 'day', 'year', 'month'),
    ('week', 'day', 'hour', 'minute'),
    ('france', 'paris', 'italy', 'rome'),
    ('paris', 'france', 'rome', 'italy'),
    ('france', 'french', 'england', 'english'),
    ('japan', 'japanese', 'china', 'chinese'),
    ('china', 'chinese', 'america', 'american'),
    ('japan', 'japanese', 'italy', 'italian'),
    ('japan', 'japanese', 'australia', 'australian'),
    ('walk', 'walking', 'swim', 'swimming'),
]

# Run the whole list twice: first on the input embeddings alone, then on the
# average of the input embeddings and the transposed output embeddings.
for We in (W1, (W1 + W2.T) / 2):
    print("**********")

    for pos1, neg1, pos2, neg2 in analogy_quads:
        analogy(pos1, neg1, pos2, neg2, word2idx, idx2word, We)



**********
got: king - man = throne - woman
closest 10:
king 0.14414545237811727
throne 0.24656253580136755
son 0.267252433482311
emperor 0.26938471601024083
refused 0.27929521222287734
duke 0.28422610458449316
kingdom 0.28608770960783136
kings 0.2870797133460291
death 0.28884894888982726
empire 0.2932562292513611

got: king - prince = reign - princess
closest 10:
princess 0.15603193879218136
king 0.303246093978617
reign 0.35603959915712147
emperor 0.3677059694634932
rome 0.3842310894522529
church 0.38556999550249393
kings 0.386555663253876
throne 0.3902889021363075
duchy 0.392025662558692
holy 0.39738539408202

got: miami - florida = thirteenth - texas
closest 10:
miami 0.03146014966786037
thirteenth 0.48799562373487304
illustrations 0.4976661846445509
persecution 0.5245347552419982
embedded 0.5257009224935012
hierarchy 0.526959423369191
humor 0.5277639982051912
expects 0.5293928472867034
fbi 0.5326471935564904
remember 0.5382997996050276

picasso  not in the dictionary
sushi  not in 

got: japan - japanese = athens - chinese
closest 10:
chinese 0.21953527885205648
japan 0.2788977490014287
athens 0.3280739820399725
independence 0.3491686747849645
soldiers 0.3510886814516735
destroyed 0.3515397833248818
occupation 0.35357179371364555
colonial 0.3594536264729594
serving 0.36548033588682216
germany 0.36837307771440153

got: china - chinese = television - american
closest 10:
american 0.14386885145887796
television 0.2610146565708056
park 0.2715182477531476
actors 0.2798059081721269
2013 0.28558477013483186
america 0.29585443404035805
2005 0.3097679121421274
17 0.31432517672335114
1993 0.3181790369467482
directors 0.31960490582034284

got: japan - japanese = france - italian
closest 10:
italian 0.1390341003255292
france 0.244921477661169
succeeded 0.2537441912663132
defeat 0.2694627957496559
chief 0.2736483227087031
minister 0.2790252203531466
wars 0.2799364218658662
brother 0.2852730349752506
queen 0.2859845664544576
remained 0.28658925031589266

got: japan - japanese =

# TensorFlow Implementation