In [6]:
import json
import numpy as np
import sys
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

from scipy import stats
import pandas as pd 

from tqdm import tqdm
from itertools import islice
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from scipy.spatial.distance import cosine

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [3]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [None]:
def get_windows(seq,n):
    '''
    returns a sliding window (of width n) over data from the iterable
    taken from: https://stackoverflow.com/questions/6822725/rolling-or-sliding-window-iterator/6822773#6822773
    '''
    it = iter(seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result


def sample_examples(docs,max_window_size,n_windows):
    '''generate target,context pairs and negative examples'''
    windows = []
    for i,doc in enumerate(docs):
        windows.append(list(get_windows(doc, 2*np.random.randint(1, max_window_size) + 1 )))
    windows = [elt for sublist in windows for elt in sublist] # flatten
    windows = list(np.random.choice(windows,size=n_windows)) # select a subset
    
    all_negs = list(np.random.choice(token_ints, size=n_negs*len(windows), p=neg_distr))
    return windows,all_negs

def compute_dot_products(pos,negs,target):
    prods = Wc[pos+negs,] @ Wt[target,] # (n_pos+n_negs,d) X (d,) -> (n_pos+n_negs,)
    return prods

def compute_loss(prodpos,prodnegs):
    '''prodpos and prodnegs are numpy vectors containing the dot products of the context word vectors with the target word vector'''
    term_pos, term_negs = np.log(1 + np.exp(-prodpos)), np.log(1 + np.exp(prodnegs))
    return np.sum(term_pos) + np.sum(term_negs)

def compute_gradients(pos,negs,target,prodpos,prodnegs):
    factors_pos = 1/(np.exp(prodpos)+1)
    factors_negs = 1/(np.exp(-prodnegs)+1)

    partial_pos = np.array([factors_pos[k,] * -Wt[target,] for k in range(len(factors_pos))])
    partial_negs = np.array([factors_negs[k,] * Wt[target,] for k in range(len(factors_negs))])
    
    term_pos = - Wc[pos,].T @ factors_pos
    term_negs = Wc[negs,].T @ factors_negs
    partial_target = np.sum(term_pos,axis=0) + np.sum(term_negs,axis=0)
    
    return partial_pos, partial_negs,partial_target

# = = = = = = = = = = = = = = = = = = = = = 

max_window_size = 5 # extends on both sides of the target word
n_windows = int(1e6) # number of windows to sample at each epoch
n_negs = 5 # number of negative examples to sample for each positive
d = 64 # dimension of the embedding space
n_epochs = 15
lr_0 = 0.03
decay = 1e-6

resume = True
train = True

with open('/content/drive/MyDrive/nlp_centrale/imdb_files/doc_ints.txt', 'r') as file:
    docs = file.read().splitlines()

docs = [[int(eltt) for eltt in elt.split()] for elt in docs]

with open('/content/drive/MyDrive/nlp_centrale/imdb_files/vocab.json', 'r') as file:
    vocab = json.load(file)

vocab_inv = {v:k for k,v in vocab.items()}

with open('/content/drive/MyDrive/nlp_centrale/imdb_files/counts.json', 'r') as file:
    counts = json.load(file)

token_ints = range(1,len(vocab)+1)
neg_distr = [counts[vocab_inv[elt]] for elt in token_ints]
neg_distr = np.sqrt(neg_distr)
neg_distr = neg_distr/sum(neg_distr) # normalize

# ========== train model ==========
if train:
    
    total_its = int(1e6)*13
    if not resume :
        Wt = np.random.normal(size=(len(vocab)+1,d)) # + 1 is for the OOV token
        Wc = np.random.normal(size=(len(vocab)+1,d))
    else:
        Wt = np.load('/content/drive/MyDrive/nlp_centrale/imdb_files/input_vecs.npy')
        Wc = np.load('/content/drive/MyDrive/nlp_centrale/imdb_files/output_vecs.npy')
    
    for epoch in range(n_epochs):
        print("Epoch : %i/%i"%(epoch+1, n_epochs))
        
        windows,all_negs = sample_examples(docs,max_window_size,n_windows)
        print('training examples sampled')
        
        np.random.shuffle(windows)
        
        total_loss = 0
        
        with tqdm(total=len(windows),unit_scale=True,postfix={'loss':0.0,'lr':lr_0},ncols=50) as pbar: #desc="Epoch : %i/%i" % (epoch+1, n_epochs)
            for i,w in enumerate(windows):
                
                target = w[int(len(w)/2)] # elt at the center
                pos = list(w)
                del pos[int(len(w)/2)] # all elts but the center one
                
                negs = all_negs[n_negs*i:n_negs*i+n_negs]
                
                prods = compute_dot_products(pos,negs,target)
                prodpos = prods[0:len(pos),]
                prodnegs = prods[len(pos):(len(pos)+len(negs)),]
                
                partials_pos,partials_negs,partial_target = compute_gradients(pos,negs,target,prodpos,prodnegs)
                
                lr = lr_0 * 1/(1+decay*total_its)
                total_its += 1
                
                Wt[target,] -= lr * partial_target
                Wc[pos,] -= partials_pos * lr
                Wc[negs,] -= partials_negs * lr
                
                total_loss += compute_loss(prodpos,prodnegs)
                loss_printed = round(total_loss/(i+1), 4)
                lr_printed = round(lr, 4)
                pbar.set_postfix({"loss" : str(loss_printed), "lr" : str(lr_printed)})
                pbar.update(1)
        if epoch % 1 == 0:
            np.save('/content/drive/MyDrive/nlp_centrale/imdb_files/input_vecs',Wt,allow_pickle=False) # pickle disabled for portability reasons
            np.save('/content/drive/MyDrive/nlp_centrale/imdb_files/output_vecs',Wc,allow_pickle=False)
            print('word vectors saved to disk')
    
else:
    Wt = np.load('/content/drive/MyDrive/nlp_centrale/imdb_files/input_vecs.npy')
    Wc = np.load('/content/drive/MyDrive/nlp_centrale/imdb_files/output_vecs.npy')

In [10]:
def my_cos_similarity(word1,word2):
    try:
      embed_1 = Wt[vocab[word1],].reshape(1,-1)
    except KeyError:
      embed_1 = Wt[0,].reshape(1,-1)
    try:
      embed_2 = Wt[vocab[word2],].reshape(1,-1)
    except KeyError:
      embed_2 = Wt[0,].reshape(1,-1)
    sim = cosine(embed_1, embed_2)
    return round(float(sim),4)

In [11]:
def loadPairs(path):
    data = pd.read_csv(path, delimiter='\t')
    pairs = zip(data['word1'], data['word2'], data['SimLex999'])
    return pairs

In [12]:
pairs = loadPairs("/content/drive/MyDrive/nlp_centrale/SimLex-999.txt")
our_similarities,original_similarities = [],[]
for a, b, original_similarity in pairs:
  our_similarities.append(my_cos_similarity(a, b)) 
  original_similarities.append(original_similarity)
corr = stats.spearmanr(our_similarities,original_similarities).correlation
print('spearman correlation :',corr)

spearman correlation : 0.03943479803014715
