In [1]:
import numpy as np
from gensim.models import KeyedVectors
import matplotlib.pyplot as plt
import preprocess
import os

%matplotlib inline

In [2]:
texts, labels = preprocess.download_and_extract_ted()

# Preprocessing


In [3]:
input_texts = preprocess.preprocess_ted(texts)

In [4]:
input_texts = preprocess.clean_tokens_ted(input_texts)

There are 4474850 tokens in the dataset.
There are 18438 tokens that appear only once.
There are 18538 unique tokens to remove.
It took 0.4788999557495117 seconds to remove all unnecessary items.
There are now only 1926086 tokens in the dataset.


In [5]:
#remove all inputs that have less than 500 tokens in them
input_texts, labels = preprocess.remove_short_texts(input_texts, labels)

There are now only 1924 inputs left.


In [6]:
input_texts = preprocess.pad_texts(input_texts)

In [7]:
labels, label_lookup = preprocess.preprocess_labels(labels)

In [8]:
word_to_index, index_to_word, input_indices_list = preprocess.compute_indexes(input_texts)

In [9]:
from gensim.models import KeyedVectors
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

glove_file = datapath('/home/maxime/Documents/nlp/practical_2_bis/glove.txt')
tmp_file = get_tmpfile("word2vec.txt")
glove2word2vec(glove_file, tmp_file)
glove = KeyedVectors.load_word2vec_format(tmp_file)
glove_vectors = glove.wv

  if __name__ == '__main__':


In [10]:
embeddings = preprocess.clean_vocabulary(word_to_index, glove)
print (np.shape(embeddings))

vocabulary size: 35562 words
found 34558 word vectors, 0.9717676171193971 of our vocabulary
missing words e.g. ['yementimes', 'unsignalized', 'compliances', 'sistas', 'mouaz', 'paraorchestra', 'kelps', 'neurotypical', 'futureless', 'biogenerative', 'flosses', 'sietas', 'myesha', 'britlin', 'isil', 'legadema', 'taimina', 'superbetter', 'patternicity', 'redbrigade', 'rocketcam', 'decellularized', 'templated', 'intiwatana', 'feki', 'retweeted', 'aremeyaw', 'novich', 'vishna', 'romotive', 'eppasod', 'animaris', 'blicket', 'otwoma', 'mahabuba', 'sarcosuchus', 'hyperconnectivity', 'gymnosophist', 'bidialectal', 'dracorex', 'rrrrrrr', 'kaesava', 'impatients', 'solidariot', 'monogamously', 'siyathemba', 'capric', 'terrapower', 'schizophonia', 'kleptoparasites']
(35562, 50)


In [11]:
inputs_train, inputs_test, inputs_cv = preprocess.generate_datasets(input_indices_list, labels, label_lookup)

(1540, 192, 192)


In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [13]:
class Classifier(nn.Module):
    def __init__(self, input_size):
        super(Classifier, self).__init__()
        
        self.embed = nn.Embedding(np.shape(embeddings)[0], np.shape(embeddings)[1])
        self.embed.weight.data.copy_(torch.from_numpy(embeddings))
        self.fc1 = nn.Linear(input_size, 50)
        self.dropout1 = nn.Dropout(.5)
        self.tanh1 = nn.Tanh()
        self.fc2 = nn.Linear(50, 8)
        self.logsoftmax = nn.LogSoftmax()
        
    def forward(self, x):
        embeds = self.embed(x)#.view((1, -1))
        embeds = torch.mean(embeds, 1)
        out = self.fc1(embeds)
        out = self.dropout1(out)
        out = self.tanh1(out)
        out = self.fc2(out)
        out = self.logsoftmax(out)
        return out

In [14]:
def evaluate_model(model, mode):
    if mode == 'training':
        batch_size = 1540
        dataset = inputs_train
        
    elif mode == 'validation':
        batch_size = 192
        dataset = inputs_cv
    else:
        batch_size = 192
        dataset = inputs_test
          
    
    batch = preprocess.get_data_batch(0, batch_size, dataset)
    text = torch.from_numpy(batch[0]).long()
    labels = torch.from_numpy(batch[1]).long()
    
    model.eval()
    with torch.no_grad():
        outputs = model(text)
        loss = criterion(outputs, labels)
    y_hat = torch.argmax(outputs, dim = 1)
    
    score = 0
    for i in range (y_hat.size()[0]):
        if y_hat[i].item() == labels[i]:
            score += 1
    return score / len(y_hat)

In [15]:
from IPython.display import clear_output

losses = []
mean_loss = 0

model = Classifier(50).double()
model.train()
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters())

num_epochs = 100
for epoch in range(num_epochs):
    i = 0
    print('Epoch [{}/{}], Mean Loss {:.4f}, Training score {:.4f}, Validation score {:.4f}'
          .format(epoch+1, num_epochs, mean_loss, 
                  evaluate_model(model, 'training'), evaluate_model(model, 'validation')))
    
    
    while 'computing the dataset by 44-size minibatches': # 1540 % 44 = 0 and im lazy krkr  
        if(i > 1539):
            i = 0
            break
        else:
            mini_batch = preprocess.get_data_batch(i, 44, inputs_train)
            text = torch.from_numpy(mini_batch[0]).long()
            labels = torch.from_numpy(mini_batch[1]).long()
            
        # Forward pass
        outputs = model(text)
        loss = criterion(outputs, labels)
        losses.append(loss)
        mean_loss = sum(losses) / len (losses)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        i += 44



Epoch [1/100], Mean Loss 0.0000, Training score 0.1331, Validation score 0.1094
Epoch [2/100], Mean Loss 1.7073, Training score 0.5571, Validation score 0.5573
Epoch [3/100], Mean Loss 1.5412, Training score 0.5571, Validation score 0.5573
Epoch [4/100], Mean Loss 1.4653, Training score 0.5571, Validation score 0.5573
Epoch [5/100], Mean Loss 1.4149, Training score 0.5682, Validation score 0.5781
Epoch [6/100], Mean Loss 1.3731, Training score 0.5903, Validation score 0.5885
Epoch [7/100], Mean Loss 1.3346, Training score 0.6130, Validation score 0.5990
Epoch [8/100], Mean Loss 1.2983, Training score 0.6273, Validation score 0.6042
Epoch [9/100], Mean Loss 1.2638, Training score 0.6403, Validation score 0.6146
Epoch [10/100], Mean Loss 1.2310, Training score 0.6571, Validation score 0.6250
Epoch [11/100], Mean Loss 1.1996, Training score 0.6792, Validation score 0.6198
Epoch [12/100], Mean Loss 1.1695, Training score 0.7019, Validation score 0.6250
Epoch [13/100], Mean Loss 1.1405, Tra

KeyboardInterrupt: 

In [16]:
evaluate_model(model, 'testing')



0.6197916666666666