In [None]:
import numpy as np
import pandas as pd
import random 
import torch
from nltk.corpus import stopwords
from collections import Counter

from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from collections import OrderedDict
from nltk.corpus import stopwords

from torch import nn
from torch.functional import F
from torch import optim
from tqdm import tqdm

In [None]:
def tokenize_doc(sent, 
                 lemma=False, 
                 remove_stopwords=False):
    """Lower-case a document and split it into whitespace-delimited tokens.

    Args:
        sent: the raw document text (a string).
        lemma: if True, lemmatize every token with the module-level
            ``lemmatizer`` object, which must exist by the time such a call
            is made.
        remove_stopwords: if True, drop every token found in NLTK's English
            stop-word list.

    Returns:
        list of str: the processed tokens, in document order.
    """
    # a simple tokenizer: case folding followed by a whitespace split
    tokens = sent.lower().split()
    if lemma:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    if remove_stopwords:
        # Read the stop-word list once per call (not once per token) and keep
        # it in a set so that every membership test is O(1).
        stop_words = set(stopwords.words('english'))
        tokens = [token for token in tokens if token not in stop_words]
    return tokens

def basic_text_processing(corpus, num_words):
    """Tokenize a corpus and build a vocabulary of its most frequent words.

    Args:
        corpus: iterable of document strings.
        num_words: maximum vocabulary size; only the ``num_words`` most
            frequent tokens are kept (``None`` keeps every token).

    Returns:
        tuple ``(w2i, i2w, train_tokens)``:
            w2i: dict mapping word -> integer index.
            i2w: dict mapping integer index -> word; index 0 is the most
                frequent word.
            train_tokens: the corpus token stream, in order, with every word
                outside the vocabulary removed.
    """
    all_tokens = []

    # tokenization
    for doc in tqdm(corpus):
        all_tokens.extend(tokenize_doc(doc, lemma=False, remove_stopwords=False))

    # Keep only the top ``num_words`` MOST FREQUENT words so that the
    # vocabulary can never grow beyond the requested size.
    word_counts = Counter(all_tokens)
    kept = dict(word_counts.most_common(num_words))
    train_tokens = [word for word in all_tokens if word in kept]

    # ``most_common`` already orders words by descending frequency and dicts
    # preserve insertion order, so enumerating ``kept`` yields the indices.
    i2w = dict(enumerate(kept))
    w2i = {word: ii for ii, word in i2w.items()}

    return w2i, i2w, train_tokens

In [None]:
def get_targets(words, idx, window_size=5):
    """Return the context words surrounding ``words[idx]``.

    The window radius is drawn uniformly from ``1..window_size`` on every
    call, so nearer neighbours are selected more often than distant ones.

    Args:
        words: list of tokens (or token ids).
        idx: position of the centre word inside ``words``.
        window_size: largest radius that may be sampled; must be >= 1.

    Returns:
        list: up to ``2 * R`` neighbours of the centre word (fewer near the
        edges of ``words``), excluding the centre word itself.
    """
    # honour the caller's window size instead of a hard-coded maximum of 5
    R = random.randint(1, window_size)
    start = max(0, idx - R)
    end = min(idx + R, len(words) - 1)
    # left neighbours + right neighbours; ``end + 1`` because slices exclude the stop index
    return words[start:idx] + words[idx + 1:end + 1]

In [None]:
def get_batches(words, batch_size, window_size = 5):
    """Yield skip-gram (input, target) pairs, one batch of centre words at a time.

    Args:
        words: list of token ids making up the training stream.
        batch_size: number of consecutive centre words taken per batch.
        window_size: maximum context radius, forwarded to ``get_targets``.

    Yields:
        tuple ``(batch_x, batch_y)`` of equal-length lists: ``batch_x[k]`` is
        a centre word and ``batch_y[k]`` one of its context words. Contexts
        never cross a batch boundary because they are drawn from the current
        slice only.
    """
    for i in range(0, len(words), batch_size):
        curr = words[i:i + batch_size]   # current batch
        batch_x, batch_y = [], []

        for ii in range(len(curr)):
            # pass the window size through; it used to be silently dropped here
            y = get_targets(curr, ii, window_size)
            # repeat the centre word once per sampled context word
            batch_x.extend([curr[ii]] * len(y))
            batch_y.extend(y)

        yield batch_x, batch_y

In [None]:
# The CC-News dataset contains news articles from news sites all over the world.
# The data is available on AWS S3 in the Common Crawl bucket at /crawl-data/CC-NEWS/.
# This version of the dataset has been prepared using news-please, an integrated
# web crawler and information extractor for news.
# It contains 708241 English-language news articles published between Jan 2017
# and December 2019, and represents a small portion of the English-language
# subset of the CC-News dataset.

from datasets import load_dataset
dataset = load_dataset("cc_news")
# Slice the rows first and only then pick the column, so that just the first
# 150000 articles are materialised instead of the whole text column.
corpus = dataset['train'][:150000]['text']

In [None]:
# Hyper-parameters: every tunable setting of the notebook lives in this cell.
# TODO: pick suitable values for the settings below. The learning rate given
# here should work as is, but feel free to change it.
n_vocab = 50_000            # upper bound on the vocabulary size
n_embed = 500               # dimensionality of each embedding vector
lr = 1e-3                   # learning rate
n_negative_samples = 100    # negative examples drawn per positive example
ws = 5                      # context window size
batch_size = 500            # batch size used when sampling positive examples
n_epochs = 10               # number of passes over the training data
device = 'cpu'              # device that index tensors are moved to

In [None]:
# Heads-up: this cell can take around 20 minutes, so be patient!
# Optional: save these intermediate results to disk so that the next time
# you open Google Colab you do not have to run this cell again.
w2i, i2w, train_tokens = basic_text_processing(corpus, num_words=n_vocab)
# encode the training token stream as integer vocabulary indices
int_words = list(map(w2i.__getitem__, train_tokens))
print("Vocab Size:", len(w2i))

### Training

In [None]:
def train_skipgram(W,
                   C,
                   int_words,
                   n_vocab,
                   n_embed,
                   learning_rate,
                   n_negative_samples,
                   batch_size,
                   window_size,
                   n_epochs,
                   print_every=100):
    """Train skip-gram embeddings with negative sampling; W and C are updated in place.

    Relies on the module-level ``get_batches`` generator and on the
    module-level ``device`` setting for tensor placement.

    Args:
        W: trainable ``(n_vocab, n_embed)`` matrix of target-word vectors.
        C: trainable ``(n_vocab, n_embed)`` matrix of context-word vectors.
        int_words: training token stream encoded as integer vocabulary ids.
        n_vocab: number of rows of W/C; negatives are sampled from ``0..n_vocab-1``.
        n_embed: embedding dimensionality.
        learning_rate: Adam learning rate.
        n_negative_samples: negative contexts drawn per positive pair.
        batch_size: number of centre words per batch.
        window_size: maximum context radius.
        n_epochs: number of passes over ``int_words``.
        print_every: print the current loss every this many steps.

    Returns:
        None. The trained weights are left in ``W`` and ``C``.
    """
    optimizer = optim.Adam([W, C], lr=learning_rate)    
    
    # uniform_ works in place, so the optimizer created above keeps tracking
    # the very same tensors
    W = torch.nn.init.uniform_(W, -0.10, +0.10)
    C = torch.nn.init.uniform_(C, -0.10, +0.10)

    # Uniform noise distribution over the vocabulary. It never changes, so it
    # is built once instead of once per step.
    noise_dist = torch.ones(n_vocab)
    log_sigmoid = torch.nn.functional.logsigmoid

    step = 0
    
    for epoch in range(n_epochs):
        for inputs, targets in get_batches(int_words, batch_size=batch_size, window_size=window_size):
            if not inputs:
                # a batch holding a single word has no context pairs; sampling
                # zero negatives for it would make torch.multinomial fail
                continue
            step += 1
            
            targets_indices = torch.LongTensor(inputs).to(device)
            contexts_indices = torch.LongTensor(targets).to(device)
            
            # retrieve vectors of target words and positive context words
            embeded_targets = W[targets_indices]
            embeded_pos_contexts = C[contexts_indices]
            
            # Number of (target, context) pairs in this batch. Kept in its own
            # name: overwriting ``batch_size`` here would change the batching
            # of every later epoch.
            n_pairs = embeded_targets.shape[0]
            
            # retrieve vectors of negative examples
            noise_words = torch.multinomial(noise_dist, 
                                            num_samples=n_pairs*n_negative_samples, 
                                            replacement=True)
            noise_words = noise_words.to(device)
            embed_neg_contexts = C[noise_words].view(n_pairs, n_negative_samples, n_embed)
            
            # negative-sampling loss:
            #   -[ log sigma(w . c_pos) + sum_k log sigma(-w . c_neg_k) ]
            # averaged over all pairs in the batch
            pos_scores = (embeded_targets * embeded_pos_contexts).sum(dim=1)
            neg_scores = torch.bmm(embed_neg_contexts,
                                   embeded_targets.unsqueeze(2)).squeeze(2)
            pos_loss = log_sigmoid(pos_scores)
            neg_loss = log_sigmoid(-neg_scores).sum(dim=1)
            loss = -(pos_loss + neg_loss).mean()
            
            # optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if (step % print_every) == 0:
                print("Epoch: {}/{} | Loss: {:.4f}".format(epoch+1, n_epochs, loss.item()))

In [None]:
# Allocate the two trainable weight matrices, W and C, one row per vocabulary
# entry. They start as zeros; train_skipgram re-initialises them uniformly
# before training.
W = nn.Parameter(torch.zeros(n_vocab, n_embed, dtype=torch.float32))
C = nn.Parameter(torch.zeros(n_vocab, n_embed, dtype=torch.float32))

In [None]:
# Run skip-gram training with the hyper-parameters from the configuration cell.
train_skipgram(
    W,
    C,
    int_words,
    n_vocab=n_vocab,
    n_embed=n_embed,
    learning_rate=lr,
    n_negative_samples=n_negative_samples,
    batch_size=batch_size,
    window_size=ws,
    n_epochs=n_epochs,
    print_every=100,
)

In [None]:
# final embeddings is the summation of the two matrix (check lecture slides)
# The sum is written into a brand-new array: a NumPy view of a CPU tensor
# shares its memory, so an in-place ``+=`` on that view would silently
# overwrite W and make this cell give a different result on every re-run.
embeddings = W.detach().cpu().numpy() + C.detach().cpu().numpy()

### Evaluation via Visualization

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project the embeddings of the first n_viz_words vocabulary entries (the most
# frequent words, since indices are assigned by descending frequency) to 2-D.
n_viz_words = 250
tsne = TSNE(random_state=0)  # fixed seed so the figure is reproducible across runs
embeddings_tsne = tsne.fit_transform(embeddings[:n_viz_words, :])

fig, ax = plt.subplots(figsize = (10, 10))
# one vectorised scatter call on the explicit Axes instead of one call per point
ax.scatter(embeddings_tsne[:, 0], embeddings_tsne[:, 1], color = 'red', s=40)
for i in range(n_viz_words):
    ax.annotate(i2w[i], (embeddings_tsne[i, 0], embeddings_tsne[i, 1]), alpha = 0.7)
ax.set_title("t-SNE projection of the {} most frequent words".format(n_viz_words))

### Evaluation via Document Similarity

In [None]:
# Text-processing switches for the document-similarity evaluation
# (presumably meant to be forwarded to tokenize_doc -- confirm against the
# code that consumes them).
use_lemmatization = False
remove_stopwords = False

# WordNet lemmatizer instance; tokenize_doc looks this global up by name
# when it is called with lemma=True.
lemmatizer = WordNetLemmatizer()

In [None]:
def transform(query, w2i, embeddings, strategy):
    """Turn a query/document string into a single vector of word embeddings.

    The text is tokenized with the module-level ``tokenize_doc`` helper using
    the module-level ``use_lemmatization`` and ``remove_stopwords`` switches.
    Tokens that are missing from ``w2i`` are skipped.

    Args:
        query: the query or document text.
        w2i: dict mapping word -> row index in ``embeddings``.
        embeddings: 2-D NumPy array of trained word vectors, one row per word.
        strategy: ``'average'`` (mean of the word vectors) or
            ``'concatenate'`` (word vectors joined end to end, in token order).

    Returns:
        1-D NumPy array. Its length is the embedding size for ``'average'``
        and (number of known tokens) * (embedding size) for ``'concatenate'``.
        When no token is in the vocabulary, a zero vector of embedding size
        is returned for either strategy.
    """
    assert strategy in ['average', 'concatenate']
    
    # get vectors of each word in the query
    tokens = tokenize_doc(query,
                          lemma=use_lemmatization,
                          remove_stopwords=remove_stopwords)
    indices = [w2i[token] for token in tokens if token in w2i]
    if not indices:
        # nothing to aggregate: fall back to an all-zero vector
        return np.zeros(embeddings.shape[1], dtype=embeddings.dtype)
    word_vectors = embeddings[indices]   # one row per known token
    
    # sentence aggregation strategy
    if strategy == 'average':
        vector = word_vectors.mean(axis=0)
    else:
        vector = word_vectors.reshape(-1)
    
    return vector

In [None]:
# Sanity check of document similarity on two sentences with a similar meaning.
q = transform('today I am very happy', w2i, embeddings, strategy='average')
v = transform('today I feel so fantastic', w2i, embeddings, strategy='average')
# cosine similarity = dot product divided by the product of the two norms
sim = np.dot(q, v) / (np.linalg.norm(q) * np.linalg.norm(v))
# the printed score is expected to be high, i.e. close to 1.0
print("Cosine Similarity: {}".format(sim))

In [None]:
def evaluation(strategy, path='./quora_train.csv'):
    """Return the mean cosine similarity over the question pairs of a CSV file.

    Uses the module-level ``transform``, ``w2i`` and ``embeddings``.

    Args:
        strategy: aggregation strategy passed on to ``transform``
            (``'average'`` or ``'concatenate'``).
        path: CSV file with ``question1`` and ``question2`` columns.

    Returns:
        The mean of the per-pair cosine similarities. A pair in which either
        vector has zero norm contributes a similarity of 0.0.
    """
    df = pd.read_csv(path)
    print("Loaded {} pairs".format(len(df)))
    pairs = list(zip(df['question1'].astype(str), df['question2'].astype(str)))
    
    all_sims = []
    
    for doc1, doc2 in tqdm(pairs):
        q = transform(doc1, w2i, embeddings, strategy=strategy)
        v = transform(doc2, w2i, embeddings, strategy=strategy)
        
        # zero-pad the shorter vector so both have the same length
        # (the lengths can differ under the 'concatenate' strategy)
        diff = len(q) - len(v)
        if diff > 0:
            v = np.pad(v, (0, diff))
        else:
            q = np.pad(q, (0, -diff))
        
        # Cosine similarity is undefined for a zero vector; score such a pair
        # as 0.0 so that a single NaN cannot poison the overall mean.
        denom = np.linalg.norm(q) * np.linalg.norm(v)
        sim = np.dot(q, v) / denom if denom > 0 else 0.0
        all_sims.append(sim)
        
    return np.mean(all_sims)

In [None]:
# Mean pairwise similarity when documents are built by averaging word vectors.
avg_similarity = evaluation(strategy='average')
print("Final Average Similarity using Average Strategy: {}".format(avg_similarity))

In [None]:
# Mean pairwise similarity when documents are built by concatenating word vectors.
avg_similarity = evaluation(strategy='concatenate')
print("Final Average Similarity using Concatenation Strategy: {}".format(avg_similarity))