imports:

In [1]:
import codecs
import os
import csv
import re
import unicodedata
import random
import itertools

import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
import math
from tqdm import tqdm
from livelossplot import PlotLosses
from livelossplot.outputs import MatplotlibPlot
import matplotlib as plt
import gensim
import json


In [2]:
# Reserved vocabulary indices:
#   PAD_token - used for padding short sentences
#   SOS_token - start-of-sentence token
#   EOS_token - end-of-sentence token
PAD_token, SOS_token, EOS_token = 0, 1, 2

Data preprocessing

In [3]:
from helper import print_data_for_visualization, get_QA, to_csw

# Optional look at the raw data (left disabled):
# print_data_for_visualization()

# Raw JSON dataset and the question/answer text file derived from it.
data_path, csw_path = '.data/train-v2.0.json', '.data/QA.txt'

# One-time setup: read the JSON data and build QA.txt.
# Uncomment and run this line only the first time.
# to_csw(get_QA(data_path=data_path), csw_path)

In [4]:
from helper import get_pairs, filter_pairs

# Point data_path at the question/answer text file under the current
# working directory, then read in and normalize the pairs.
data_path = os.path.join(os.getcwd(), '.data/QA.txt')
pairs = get_pairs(data_path=data_path)
print(f'len(pairs) = {len(pairs)}')

# Length limits handed to filter_pairs: 10 for questions, 5 for answers.
# presumably measured in words - TODO confirm in helper.filter_pairs
max_question_lenght = 10
max_answer_lenght = 5
MAX_LEN = (max_question_lenght, max_answer_lenght)
pairs = filter_pairs(pairs, MAX_LEN)

# Peek at the first few pairs that survived the filter.
print(pairs[:5])

len(pairs) = 86809
Read 86809 sentence pairs
Trimming pairs with sentence longer than MAX_LEN((10, 5))......
trimmed to 23768 pairs
[['when did beyonce start becoming popular ?', 'in the late 1990s'], ['in which decade did beyonce become famous ?', 'late 1990s'], ['what album made her a worldwide known artist ?', 'dangerously in love'], ['who managed the destiny s child group ?', 'mathew knowles'], ['when did beyonce rise to fame ?', 'late 1990s']]


Building Vocabulary

In [5]:
# Build the vocabulary from every question and answer in `pairs`.
from vocabulary import Vocab

vocab = Vocab('SQuAD2.0')  # fresh vocabulary object
print('populating vocabulary........')
for pair in pairs:
    # pair[0] is the question, pair[1] the answer; register both.
    for sentence in (pair[0], pair[1]):
        vocab.add_sentence(sentence)
print('Counted words: {}'.format(vocab.numb_words))

populating vocabulary........
Counted words: 25900


In [6]:
# Trim words with frequency less than min_freq = 5 (set at Vocab.__init__).
vocab.trim()


def _all_words_known(sentence):
    """Return True when every whitespace-separated word of `sentence` is in vocab.word2int."""
    return all(word in vocab.word2int for word in sentence.split())


# Keep only the pairs whose question (pair[0]) and answer (pair[1]) contain
# no trimmed words.
keep_pairs = [
    pair for pair in pairs
    if _all_words_known(pair[0]) and _all_words_known(pair[1])
]

# Guard the ratio so an empty `pairs` list reports 0 instead of raising
# ZeroDivisionError.
kept_ratio = len(keep_pairs) / len(pairs) if pairs else 0.0
print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), kept_ratio))

pairs = keep_pairs

........This is print from Voc class after call to trimm method.........
keep_words 8190 / 25900 = 0.3162
..........................End of print..................................

Trimmed from 23768 pairs to 9319, 0.3921 of total


In [7]:
# Build input and target tensors for a handful of randomly drawn pairs.
from helper import pairs2TrainData

batch_size = 5
sample_pairs = [random.choice(pairs) for _ in range(batch_size)]
batches = pairs2TrainData(sample_pairs, vocab)
(input_variable,
 lengths,
 target_variable,
 max_target_len) = batches

# Print each piece of the batch next to its label.
for label, value in (
    ("input_variable:", input_variable),
    ("lengths:", lengths),
    ("target_variable:", target_variable),
    ("max_target_len:", max_target_len),
):
    print(label, value)


input_variable: tensor([[   1, 7996,  171, 2178,  543,  175,   18, 6150,  493,    9,    2],
        [   1,   18, 5730,   35,  531, 1420,  171, 6345, 2039,    9,    2],
        [   1,   44,   51, 2303,   32, 1116,  387,  349,  908,    9,    2],
        [   1,   18,  289,   42,  924,   10, 2129, 2518,    9,    2,    0],
        [   1,   10,   18,  174,   58, 4350, 4351,  720,    9,    2,    0]])
lengths: tensor([11, 11, 11, 10, 10])
target_variable: tensor([[   1, 2222, 5056,    2,    0],
        [   1, 3641, 6345, 2039,    2],
        [   1,  291,  470,    2,    0],
        [   1, 1056,    2,    0,    0],
        [   1, 3925,    2,    0,    0]])
max_target_len: 5


In [9]:
# Further data preparation: shuffle, then hold out 20% of the pairs for
# validation and train on the remaining 80% (no separate test set is built here).
#
# The shuffle uses a dedicated, seeded generator so that, given the same input
# order, the split is identical on every run and does not depend on how much
# of the global `random` state earlier cells consumed.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(pairs)  # in place, as the original random.shuffle(pairs) was

# Number of pairs in the validation slice (20%, rounded down).
split = int(math.floor(len(pairs) * 0.2))
train_data, valid_data = pairs[split:], pairs[:split]

# Both splits, keyed by name.
data = {'train': train_data, 'valid': valid_data}


In [10]:
# Sanity-check the batch generator by pulling a single batch from it.
from helper import data_loaders

# NOTE(review): the variable is called train_loaders but it is fed data['valid'].
train_loaders = data_loaders(data=data['valid'], vocab=vocab, batch_size=512)
first_batch = next(train_loaders)
input_tensor, lenght_tensor, target_tensor, max_target_len = first_batch

# Report the shape of the input and target tensors.
for label, tensor in (
    ("input_tensor shape:", input_tensor),
    ("target_tensor shape:", target_tensor),
):
    print(label, tensor.shape)

input_tensor shape: torch.Size([512, 11])
target_tensor shape: torch.Size([512, 6])


In [12]:
# Hyperparameters, pretrained embeddings, model, loss and optimizer in one place.
from helper import initialize_embeddings
from model import Seq2Seq
#ALL TOGETHER:
# Training settings.
# presumably clip is a gradient-clipping threshold - TODO confirm in trainer.optimize
clip = 20
teacher_forcing_ratio = 0.75
# NOTE(review): 0.1 is far above Adam's default learning rate of 1e-3 - confirm this is intended.
lr = 0.1
epochs = 5

# Encoder input size and decoder output size are both the vocabulary word count.
input_size = output_size = vocab.numb_words
embedding_dim = 100
hidden_size = 100
n_layers = 2
dropout = 0.2
batch_size = 128

# Load the saved Word2Vec model and use it to initialize an embedding table.
# NOTE(review): `embeddings` is not passed to Seq2Seq below, so unless the model
# obtains it some other way the pretrained vectors are not used - confirm.
w2v = gensim.models.Word2Vec.load('brown.embedding')
embeddings = nn.Embedding(input_size, embedding_dim)
initialize_embeddings(w2v,embeddings,vocab)

# Encoder and decoder share the same hidden size.
model = Seq2Seq(
    encoder_input_size=input_size,
    encoder_hidden_size = hidden_size,
    embedding_dim=embedding_dim,
    n_layers=n_layers,
    dropout=dropout,
    decoder_hidden_size=hidden_size,
    decoder_output_size=output_size
)
# NOTE(review): no ignore_index is set while batches are padded with PAD_token (0);
# check whether trainer.optimize masks padded positions before calling the criterion.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Pick the GPU when one is available. The model is not moved to `device` in this cell.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")

# Count only parameters that require gradients.
print(f'Total trainable parameters:{sum(p.numel() for p in model.parameters() if p.requires_grad)}')

Initializet total 4336 embeddings with brown embedings, out of total 15173
Building encoder and decoder ...
Models built and ready to go!
Total trainable parameters:2789293


In [13]:
from trainer import optimize
# Training call, currently commented out. The log shown under this cell appears
# to come from an earlier run of it. Uncomment the block below to retrain.
#optimize(
    #data=data,
    #model=model,
    #optimizer=optimizer,
    #criterion=criterion,
    #n_epochs=epochs,
    #save_path=os.path.join(os.getcwd(), 'save_model'),
    #device=device,
    #vocab=vocab,
    #batch_size=batch_size,
    #clip=clip,
    #teacher_forcing=teacher_forcing_ratio,
    #interactive_tracking=False    
   # )

Training: 59it [00:19,  3.09it/s]                                               
Validation: 15it [00:01,  8.92it/s]                                             


3.4672562591100147
3.6068306763966884
Epoch: 1 	Training Loss: 3.467256 	Validation Loss: 3.606831


Training: 59it [01:12,  1.23s/it]                                               
Validation: 15it [00:03,  4.83it/s]                                             


2.883501501406653
3.7161989847819012
Epoch: 2 	Training Loss: 2.883502 	Validation Loss: 3.716199


Training: 59it [01:28,  1.50s/it]                                               
Validation: 15it [00:01,  8.89it/s]                                             


2.798107838226576
3.403855641682943
Epoch: 3 	Training Loss: 2.798108 	Validation Loss: 3.403856


Training: 59it [00:22,  2.61it/s]                                               
Validation: 15it [00:01,  9.22it/s]                                             


2.7328448901742197
3.4540433247884117
Epoch: 4 	Training Loss: 2.732845 	Validation Loss: 3.454043


Training: 59it [01:08,  1.15s/it]                                               
Validation: 15it [00:02,  6.19it/s]                                             

2.680362733743958
3.4418474674224853
Epoch: 5 	Training Loss: 2.680363 	Validation Loss: 3.441847





In [51]:
# After GPU training: read the saved weights, remapping the stored tensors onto
# the CPU, then copy them into the model.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
checkpoint_state = torch.load('model_save_last', map_location='cpu')
model.load_state_dict(checkpoint_state)

<All keys matched successfully>

In [49]:
from helper import evaluateInput
# Switch the model to evaluation mode before any inference.
model.eval()
# Evaluation call on the model and vocabulary, currently disabled.
#evaluateInput(model, vocab)
