In [1]:
import numpy as np
import pandas as pd
import random 
import torch
import regex as re
from nltk.corpus import stopwords
from collections import Counter

from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import RegexpTokenizer 
from collections import OrderedDict
from nltk.corpus import stopwords

from torch import nn
from torch.functional import F
from torch import optim
from tqdm import tqdm

import time

In [2]:
remove_stopwords = True
use_lemmatization = True
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r"\w+")
catchedStopWords = stopwords.words('english')

In [3]:
def tokenize_doc(sent,
                 lemma=False, 
                 remove_stopwords=False):
    
    # a simple tokenizer with case folding and an option to use lemmatization
    sent = sent.lower()
    tokens = sent.split()
    tokens = [*Counter(tokens).keys()]
    
    if lemma:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    if remove_stopwords:
        tokens = [token for token in tokens if token not in catchedStopWords]
    return tokens

def basic_text_processing(corpus, num_words):
    vocab = set()
    all_tokens = []
    
    # tokenization
    for doc in tqdm(corpus):
        tokens = tokenize_doc(doc, lemma=use_lemmatization, remove_stopwords=remove_stopwords)
        vocab.update(set(tokens))
        all_tokens.extend(tokens)
    print("Tokenization complete")
    # TODO START
    # We only want to train with the top num_words MOST FREQUENT words
    # Output a variable called ``train_tokens" that is similar to all_tokens
    # variable but without infrequent words
    freq_words = dict(Counter(all_tokens).most_common(num_words))
    train_tokens = [token for token in all_tokens if token in freq_words]

    # TODO END
    
    # generating vocabulary from the train_tokens
    word_counts = Counter(train_tokens)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True) 
    i2w = {ii: word for ii, word in enumerate(sorted_vocab)}
    w2i = {word: ii for ii, word in i2w.items()}
    
    return  w2i, i2w, train_tokens

In [4]:
def get_contexts(words, idx):
    contexts = [words[idx-2], words[idx-1], words[idx+1], words[idx+2]]
    return contexts

In [5]:
def get_batches(words, batch_size):  #, window_size = 4):
    for i in range(0, len(words), batch_size):
        curr = words[i:i + batch_size]   # current batch
        batch_x, batch_y = [], []
        #batch = []
        for ii in range(2,len(curr)-2):
            x = get_contexts(curr, ii)
            y = curr[ii]
            #batch.append((x,y))
            batch_x.append(x)
            batch_y.append(y)
        
        yield batch_x, batch_y
        #yield batch

In [6]:
#CC-e woNews dataset contains news articles from news sites all over thrld. 
#The data is available on AWS S3 in the Common Crawl bucket at /crawl-data/CC-NEWS/. 
#This version of the dataset has been prepared using news-please - an integrated web crawler and information extractor for news.
#It contains 708241 English language news articles published between Jan 2017 and December 2019. 
#It represents a small portion of the English language subset of the CC-News dataset.

from datasets import load_dataset
dataset = load_dataset("cc_news")
corpus = dataset['train']['text'][:150000]


Found cached dataset cc_news (/home/nhatvan1561/.cache/huggingface/datasets/cc_news/plain_text/1.0.0/ae469e556251e6e7e20a789f93803c7de19d0c4311b6854ab072fecb4e401bd6)


  0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# all configurations go here
# TODO
# You will need to set configurations below to a suitable values
# As for learning rate, the current value should work (but you are welcome to change it)
n_vocab = 100000  # maximum size of vocab
n_embed = 200 # size of embedding
lr = 0.003 # learning rate
ws = 5  # window size
batch_size =  50 # batch size for sampling positive examples
n_epochs =  5 #umber of training epochs
device = 'cpu'

In [8]:
# this cell might take 20 minutes to run, so be patient!
# optional: you might want to save these intermediate results to disk
# so that next time you open Google Colab, you don't need to
# run this again
w2i, i2w, train_tokens = basic_text_processing(corpus, num_words=n_vocab)
int_words = [w2i[token] for token in train_tokens]
print("Vocab Size:", len(w2i))

100%|█████████████████████████████████| 150000/150000 [02:04<00:00, 1202.40it/s]


Tokenization complete
Vocab Size: 100000


### Training

In [11]:
def train_cbow(    U,
                   V,
                   int_words,
                   n_vocab,
                   n_embed,
                   learning_rate,
                   batch_size,
                   n_epochs,
                   print_every=100):
    
    optimizer = optim.Adam([U, V], lr=learning_rate)    
    
    U = torch.nn.init.uniform_(U, -0.10, +0.10)
    V = torch.nn.init.uniform_(V, -0.10, +0.10)

    step = 0
    
    for epoch in range(n_epochs):
        for inputs, targets in get_batches(int_words, batch_size=batch_size):
            step += 1
            
            contexts_indices = torch.LongTensor(inputs).to(device) # number_batches x window_size
            targets_indices = torch.LongTensor(targets).to(device) # number_batches x label
            #return contexts_indices, targets_indices
            b_size = contexts_indices.shape[0]
            
            # one hot of target words
            T = torch.zeros(b_size,n_vocab, 1)
            for i in range(b_size):
                T[i][targets_indices[i]] = 1
            # retrieve vectors of target words and positive context words
            W1 = U[contexts_indices].mean(1) 
            W1 = W1.view(b_size, n_embed,1)
            W2 = V.expand(b_size,-1,-1)
            o_layer = torch.bmm(W2, W1).sigmoid()   
            o_layer = o_layer/o_layer.sum(1).view(b_size,1,1)
            return o_layer, T
            #print(embeded_targets.shape)
            #print(embeded_pos_contexts.shape)
            #print(embed_neg_contexts.shape)
            # TODO START
            # calculating the final loss and output it to ``loss" variable
            #embeded_contexts = embeded_contexts.view(b_size,embed_size,1)
            #embeded_targets = embeded_targets.view(b_size, 1, embed_size) 
            #z = torch.bmm(embeded_targets, embeded_contexts).sigmoid().squeeze()
            #a = z/z.sum()
            #loss = -(1/len(a))*(z*torch.log(a)+(1-z)*torch.log(1-a)).sum()
        
            #print(loss[0])
            
            # TODO END
            
            
            # optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            if (step % print_every) == 0:
                print("Epoch: {}/{} | Loss: {:.4f}".format(epoch+1, n_epochs, loss.item()))


In [12]:
# initialization of W and C weight matrix
U = torch.nn.Parameter(torch.zeros((n_vocab, n_embed), dtype=torch.float32))
V = torch.nn.Parameter(torch.zeros((n_vocab, n_embed), dtype=torch.float32))

In [13]:
y, Y = train_cbow(    U,
               V,
               int_words,
               n_vocab=n_vocab,
               n_embed=n_embed,
               learning_rate=lr,
               batch_size=batch_size,
               n_epochs=n_epochs,
               print_every=100)

In [14]:
y.shape

torch.Size([46, 100000, 1])

In [120]:
Y.shape

torch.Size([46, 100000, 1])

In [130]:
 loss = torch.nn.CrossEntropyLoss()

In [20]:
y.argmax(1)

tensor([[21663],
        [98524],
        [77772],
        [85128],
        [97785],
        [28078],
        [48414],
        [46132],
        [58613],
        [31283],
        [22797],
        [77793],
        [91335],
        [77177],
        [ 4219],
        [63472],
        [25091],
        [55643],
        [68583],
        [79094],
        [34265],
        [ 7443],
        [71864],
        [93226],
        [71679],
        [71864],
        [61327],
        [61498],
        [20589],
        [28801],
        [53512],
        [65849],
        [82534],
        [88567],
        [75023],
        [73372],
        [87642],
        [92898],
        [64565],
        [68656],
        [96853],
        [31389],
        [61100],
        [33296],
        [81077],
        [10557]])

In [16]:
[np.array(np.arange(10)),y[0].argmax()]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [131]:
loss(y[0],Y[0])

tensor(-0., grad_fn=<DivBackward1>)

In [128]:
a = 0
for i,j in zip(y,Y):
    print(loss(i,j))

tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBackward1>)
tensor(-0., grad_fn=<DivBack

In [None]:
# final embeddings is the summation of the two matrix (check lecture slides)
embeddings = U.data.to('cpu').data.numpy()
embeddings += V.data.to('cpu').data.numpy()

In [88]:
torch.bmm(W2, W1).sigmoid().shape

torch.Size([46, 100000, 1])

### Evaluation via Visualization

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

n_viz_words = 200
tsne = TSNE()
embeddings_tsne = tsne.fit_transform(embeddings[:n_viz_words, :])

fig, ax = plt.subplots(figsize = (20, 20))
for i in range(n_viz_words):
    plt.scatter(*embeddings_tsne[i, :], color = 'red', s=40)
    plt.annotate(i2w[i], (embeddings_tsne[i, 0], embeddings_tsne[i, 1]), alpha = 0.7)

### Evaluation via Document Similarity

In [None]:
def transform(query, w2i, embeddings, strategy):
    # TODO
    # input: query or document, vocabulary w2i
    # input: trained word2vec embeddings
    # input: strategy: either 'average' or 'concatenate'
    # output: vector representation of the document query
    w2v_query = tokenize_doc(query, lemma=use_lemmatization, remove_stopwords=remove_stopwords)
    idx = [w2i[word] for word in w2v_query]
    v = [embeddings[i] for i in idx]
    
    assert strategy in ['average', 'concatenate']
    
    # get vectors of each word in the query
    
    # sentence aggregation strategy
    if strategy == 'average':
        # TODO START
        vector = [em.mean() for em in v]
        # vector = 
        # TODO END
    else:
        # TODO START
        vector = np.ravel(v)
        # TODO END
    
    return vector

In [None]:
# test document similarity
q = transform('today I am very happy', w2i, embeddings, strategy='average')
v = transform('today I feel so fascinated', w2i, embeddings, strategy='average')
sim = np.dot(q, v)/(np.linalg.norm(q)* np.linalg.norm(v))
print("Cosine Similarity: {}".format(sim)) # this score should be high / close to 1.0

In [None]:
def evaluation(strategy):
    df = pd.read_csv('./quora_train.csv')
    print("Loaded {} pairs".format(len(df)))
    pairs = list(zip(df['question1'].astype(str), df['question2'].astype(str)))
    
    all_sims = []
    
    for doc1, doc2 in tqdm(pairs):
        q = transform(doc1, w2i, embeddings, strategy=strategy)
        v = transform(doc2, w2i, embeddings, strategy=strategy)
        
        diff = len(q) - len(v)
        if diff > 0:
            v = np.pad(v, (0, np.abs(diff)))
        else:
            q = np.pad(q, (0, np.abs(diff)))
        
        sim = np.dot(q, v)/(np.linalg.norm(q)* np.linalg.norm(v))
        all_sims.append(sim)
        
    return np.mean(all_sims)

In [None]:
avg_similarity = evaluation('average')
print("Final Average Similarity using Average Strategy: {}".format(avg_similarity))

In [None]:
avg_similarity = evaluation('concatenate')
print("Final Average Similarity using Concatenation Strategy: {}".format(avg_similarity))