In [1]:
import numpy as np
import pandas as pd
import random 
import torch
import regex as re
from nltk.corpus import stopwords
from collections import Counter

from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import RegexpTokenizer 
from collections import OrderedDict
from nltk.corpus import stopwords

from torch import nn
from torch.functional import F
from torch import optim
from tqdm import tqdm

In [2]:
remove_stopwords = True
use_lemmatization = True
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r"\w+")
catchedStopWords = stopwords.words('english')

In [3]:
def tokenize_doc(sent,
                 lemma=False, 
                 remove_stopwords=False):
    
    # a simple tokenizer with case folding and an option to use lemmatization
    sent = sent.lower()
    tokens = sent.split()
    
    if lemma:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    if remove_stopwords:
        tokens = [token for token in tokens if token not in catchedStopWords]
    return tokens

def basic_text_processing(corpus, num_words):
    vocab = set()
    all_tokens = []
    
    # tokenization
    for doc in tqdm(corpus):
        tokens = tokenize_doc(doc, lemma=use_lemmatization, remove_stopwords=remove_stopwords)
        vocab.update(set(tokens))
        all_tokens.extend(tokens)
    print("Tokenization complete")
    # TODO START
    # We only want to train with the top num_words MOST FREQUENT words
    # Output a variable called ``train_tokens" that is similar to all_tokens
    # variable but without infrequent words
    freq_words = Counter(all_tokens)
    freq_words = sorted(freq_words, key=freq_words.get, reverse=True)
    freq_words = freq_words[:num_words]
    train_tokens = [token for token in all_tokens if token in freq_words]
    # TODO END
    
    # generating vocabulary from the train_tokens
    word_counts = Counter(train_tokens)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True) 
    i2w = {ii: word for ii, word in enumerate(sorted_vocab)}
    w2i = {word: ii for ii, word in i2w.items()}
    
    return  w2i, i2w, train_tokens

In [41]:
def get_contexts(words, idx):
    contexts = [words[idx-2], words[idx-1], words[idx+1], words[idx+2]]
    return contexts

In [91]:
def get_batches(words, batch_size):  #, window_size = 4):
    for i in range(0, len(words), batch_size):
        curr = words[i:i + batch_size]   # current batch
        batch_x, batch_y = [], []
        #batch = []
        for ii in range(2,len(curr)-2):
            x = get_contexts(curr, ii)
            y = [curr[ii]]
            #batch.append((x,y))
            batch_x.append(x)
            batch_y.append(y)
        
        yield batch_x, batch_y

In [6]:
#CC-News dataset contains news articles from news sites all over the world. 
#The data is available on AWS S3 in the Common Crawl bucket at /crawl-data/CC-NEWS/. 
#This version of the dataset has been prepared using news-please - an integrated web crawler and information extractor for news.
#It contains 708241 English language news articles published between Jan 2017 and December 2019. 
#It represents a small portion of the English language subset of the CC-News dataset.

from datasets import load_dataset
dataset = load_dataset("cc_news")
corpus = dataset['train']['text'][:150000]


Found cached dataset cc_news (/home/nate/.cache/huggingface/datasets/cc_news/plain_text/1.0.0/ae469e556251e6e7e20a789f93803c7de19d0c4311b6854ab072fecb4e401bd6)


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# all configurations go here
# TODO
# You will need to set configurations below to a suitable values
# As for learning rate, the current value should work (but you are welcome to change it)
n_vocab = 10000  # maximum size of vocab
n_embed = 500 # size of embedding
lr = 0.003 # learning rate
ws = 5  # window size
batch_size =  500 # batch size for sampling positive examples
n_epochs =  10 #umber of training epochs
device = 'cpu'

In [8]:
# this cell might take 20 minutes to run, so be patient!
# optional: you might want to save these intermediate results to disk
# so that next time you open Google Colab, you don't need to
# run this again
w2i, i2w, train_tokens = basic_text_processing(corpus, num_words=n_vocab)
int_words = [w2i[token] for token in train_tokens]
print("Vocab Size:", len(w2i))

100%|██████████████████████████████████| 150000/150000 [02:39<00:00, 940.69it/s]


Tokenization complete
Vocab Size: 10000


In [68]:
a = int_words[0:500]

In [69]:
a[1:6]

[4966, 6699, 15, 137, 3837]

In [92]:

for x, y in get_batches(a, 50):
    print(x,y)
    break

[[1077, 4966, 15, 137], [4966, 6699, 137, 3837], [6699, 15, 3837, 63], [15, 137, 63, 5], [137, 3837, 5, 8281], [3837, 63, 8281, 1168], [63, 5, 1168, 9], [5, 8281, 9, 137], [8281, 1168, 137, 2578], [1168, 9, 2578, 7210], [9, 137, 7210, 951], [137, 2578, 951, 2894], [2578, 7210, 2894, 1168], [7210, 951, 1168, 121], [951, 2894, 121, 686], [2894, 1168, 686, 4], [1168, 121, 4, 998], [121, 686, 998, 1295], [686, 4, 1295, 2667], [4, 998, 2667, 951], [998, 1295, 951, 3168], [1295, 2667, 3168, 2994], [2667, 951, 2994, 4996], [951, 3168, 4996, 1288], [3168, 2994, 1288, 1295], [2994, 4996, 1295, 8881], [4996, 1288, 8881, 2210], [1288, 1295, 2210, 2210], [1295, 8881, 2210, 1295], [8881, 2210, 1295, 1150], [2210, 2210, 1150, 4996], [2210, 1295, 4996, 13], [1295, 1150, 13, 228], [1150, 4996, 228, 70], [4996, 13, 70, 370], [13, 228, 370, 459], [228, 70, 459, 7], [70, 370, 7, 30], [370, 459, 30, 87], [459, 7, 87, 25], [7, 30, 25, 112], [30, 87, 112, 932], [87, 25, 932, 1339], [25, 112, 1339, 932], [11

### Training

In [62]:
def train_cbow(          C,
                   W,
                   int_words,
                   n_vocab,
                   n_embed,
                   learning_rate,
                   batch_size,
                   n_epochs,
                   print_every=100):
    
    optimizer = optim.Adam([C, W], lr=learning_rate)    
    
    C = torch.nn.init.uniform_(W, -0.10, +0.10)
    W = torch.nn.init.uniform_(C, -0.10, +0.10)

    step = 0
    
    for epoch in range(n_epochs):
        for inputs, targets in get_batches(int_words, batch_size=batch_size):
            step += 1
            
            contexts_indices = torch.LongTensor(inputs).to(device) # number_batches x window_size
            labels_indices = torch.LongTensor(targets).to(device) # number_batches x label
            
            
            # retrieve vectors of target words and positive context words
            embeded_contexts = C[contexts_indices] # number_batches x window_size x embedding
            embeded_labels = W[labels_indices] # number_batches x label x embedding
            yield contexts_indices, labels_indices, embeded_contexts, embeded_labels
            
            b_size, embed_size = embeded_contexts.shape
               
            
            #print(embeded_targets.shape)
            #print(embeded_pos_contexts.shape)
            #print(embed_neg_contexts.shape)
            # TODO START
            # calculating the final loss and output it to ``loss" variable
            embeded_contexts = embeded_contexts.view(b_size,embed_size,1)
            
            pos_loss = torch.bmm(embeded_pos_contexts, embeded_targets).sigmoid().log().squeeze()     
            
            loss = -(pos_loss + neg_loss).mean()
            # loss = 
            # TODO END
            
            
            # optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if (step % print_every) == 0:
                print("Epoch: {}/{} | Loss: {:.4f}".format(epoch+1, n_epochs, loss.item()))


In [51]:
# initialization of W and C weight matrix
C = torch.nn.Parameter(torch.zeros((n_vocab, n_embed), dtype=torch.float32))
W = torch.nn.Parameter(torch.zeros((n_vocab, n_embed), dtype=torch.float32))

In [108]:
a,b,c,d = train_cbow(    C,
               W,
               int_words,
               n_vocab=n_vocab,
               n_embed=n_embed,
               learning_rate=lr,
               batch_size=batch_size,
               n_epochs=n_epochs,
               print_every=100)

In [122]:
d[0].shape

torch.Size([1, 500])

In [105]:
W[b]

tensor([[[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]]], grad_fn=<IndexBackward0>)

In [None]:
# final embeddings is the summation of the two matrix (check lecture slides)
embeddings = W.data.to('cpu').data.numpy()
embeddings += C.data.to('cpu').data.numpy()

### Evaluation via Visualization

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

n_viz_words = 100
tsne = TSNE()
embeddings_tsne = tsne.fit_transform(embeddings[:n_viz_words, :])

fig, ax = plt.subplots(figsize = (10, 10))
for i in range(n_viz_words):
    plt.scatter(*embeddings_tsne[i, :], color = 'red', s=40)
    plt.annotate(i2w[i], (embeddings_tsne[i, 0], embeddings_tsne[i, 1]), alpha = 0.7)

### Evaluation via Document Similarity

In [None]:
def transform(query, w2i, embeddings, strategy):
    # TODO
    # input: query or document, vocabulary w2i
    # input: trained word2vec embeddings
    # input: strategy: either 'average' or 'concatenate'
    # output: vector representation of the document query
    w2v_query = tokenize_doc(query, lemma=use_lemmatization, remove_stopwords=remove_stopwords)
    idx = [w2i[word] for word in w2v_query]
    v = [embeddings[i] for i in idx]
    
    assert strategy in ['average', 'concatenate']
    
    # get vectors of each word in the query
    
    # sentence aggregation strategy
    if strategy == 'average':
        # TODO START
        vector = [em.mean() for em in v]
        # vector = 
        # TODO END
    else:
        # TODO START
        vector = np.ravel(v)
        # TODO END
    
    return vector

In [None]:
# test document similarity
q = transform('today I am very happy', w2i, embeddings, strategy='average')
v = transform('today I feel so fascinated', w2i, embeddings, strategy='average')
sim = np.dot(q, v)/(np.linalg.norm(q)* np.linalg.norm(v))
print("Cosine Similarity: {}".format(sim)) # this score should be high / close to 1.0

In [None]:
def evaluation(strategy):
    df = pd.read_csv('./quora_train.csv')
    print("Loaded {} pairs".format(len(df)))
    pairs = list(zip(df['question1'].astype(str), df['question2'].astype(str)))
    
    all_sims = []
    
    for doc1, doc2 in tqdm(pairs):
        q = transform(doc1, w2i, embeddings, strategy=strategy)
        v = transform(doc2, w2i, embeddings, strategy=strategy)
        
        diff = len(q) - len(v)
        if diff > 0:
            v = np.pad(v, (0, np.abs(diff)))
        else:
            q = np.pad(q, (0, np.abs(diff)))
        
        sim = np.dot(q, v)/(np.linalg.norm(q)* np.linalg.norm(v))
        all_sims.append(sim)
        
    return np.mean(all_sims)

In [None]:
avg_similarity = evaluation('average')
print("Final Average Similarity using Average Strategy: {}".format(avg_similarity))

In [None]:
avg_similarity = evaluation('concatenate')
print("Final Average Similarity using Concatenation Strategy: {}".format(avg_similarity))