imports:

In [1]:
import codecs
import os
import csv
import re
import unicodedata
import random
import itertools

import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
import math

In [2]:
# Reserved vocabulary indices for the special tokens:
#   PAD_token - pads short sentences
#   SOS_token - marks the start of a sentence
#   EOS_token - marks the end of a sentence
PAD_token, SOS_token, EOS_token = 0, 1, 2

Visualize data

In [3]:
from helper import print_data_for_visualization
#print_data_for_visualization()

Extract questions and answers from data

In [2]:
# RUN this cell only the first time: it parses the raw SQuAD JSON and
# (re)writes the tab-separated question/answer file .data/QA.txt.
from helper import get_QA

path = '.data/train-v2.0.json'
# get_QA yields two parallel sequences; an answer may be None (see the sample printed below)
questions, answers = get_QA(path=path)
print(tuple(zip(questions[86815:86821], answers[86815:86821])))

# Field separator used in QA.txt
delimiter = '\t'
# Output path for QA.txt
data_path = os.path.join(os.getcwd(), '.data/QA.txt')
# newline='' is what the csv module expects for files it writes: it stops the
# text layer from translating the '\n' row terminator (e.g. to '\r\n' on Windows).
with open(data_path, 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, delimiter=delimiter, lineterminator='\n')
    # Keep only the pairs that actually have an answer
    writer.writerows(
        (question, answer)
        for question, answer in zip(questions, answers)
        if answer is not None
    )


(('When the CPJP continued to fight, what did other groups do?', None), ('What is one thing the FACA peace agreement called for when signed in April 2007?', None), ('Who became president in 2013?', 'Michel Djotodia'), ('What was Bozize indicted for?', 'crimes against humanity'), ('What mass murder did Bozize commit?', 'genocide'), ('How many people were displaced in the unrests?', '200,000'))


In [4]:
from helper import normalizeString

# QA.txt is produced by csv.writer (tab-delimited, default minimal quoting), so it
# must be parsed with csv.reader: a plain line.split('\t') would leave the CSV
# quoting (wrapping quotes and doubled '"') inside any field that contains '"'.
data_path = os.path.join(os.getcwd(), '.data/QA.txt')
with open(data_path, 'r', encoding='utf-8', newline='') as f:
    # pairs: list of [question, answer] lists, each string passed through normalizeString
    pairs = [
        [normalizeString(field) for field in row]
        for row in csv.reader(f, delimiter='\t')
    ]

pairs[5:10]

[['in what r&b group was she the lead singer ?', "destiny's child"],
 ['what album made her a worldwide known artist ?', 'dangerously in love'],
 ["who managed the destiny's child group ?", 'mathew knowles'],
 ['when did beyonce rise to fame ?', 'late 1990s'],
 ["what role did beyonce have in destiny's child ?", 'lead singer']]

In [5]:
# Survey sentence lengths (in whitespace-separated words) across all pairs,
# questions and answers alike, ordered from longest to shortest.
max_len = sorted(
    (len(sentence.split()) for qa_pair in pairs for sentence in qa_pair),
    reverse=True,
)
#max_len[300:400] keep pairs with MAX_LENGTH up to 20 words

In [6]:
from helper import filter_pairs
# Trim pairs against the MAX_LEN = 20 limit.
# NOTE(review): filter_pairs lives in helper.py; its printed log says it trims
# pairs with a sentence longer than MAX_LEN - confirm the exact comparison there.
MAX_LEN = 20
# Rebinds `pairs` to the filtered result
pairs = filter_pairs(pairs, MAX_LEN)

Read 86821 sentence pairs
Trimming pairs with sentence longer than MAX_LEN(20)......
trimmed to 83988 pairs


Building Vocabulary

In [7]:
# Build the vocabulary from every question and answer in `pairs`
from vocabulary import Vocab

# Fresh vocabulary object, labelled with the dataset name
vocab = Vocab('SQuAD2.0')
print('populating vocabulary........')
for qa_pair in pairs:
    # index 0 is the question, index 1 is the answer
    for sentence in (qa_pair[0], qa_pair[1]):
        vocab.add_sentence(sentence)
print('Counted words: {}'.format(vocab.numb_words))

populating vocabulary........
Counted words: 69400


In [8]:
# Drop rare words from the vocabulary (original note: frequency below min_freq = 3)
vocab.trim()

# Keep a pair only when every word of both its question (index 0) and its
# answer (index 1) is still present in the trimmed vocabulary.
keep_pairs = [
    qa_pair
    for qa_pair in pairs
    if all(
        token in vocab.word2int
        for sentence in (qa_pair[0], qa_pair[1])
        for token in sentence.split()
    )
]
print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), len(keep_pairs) / len(pairs)))

# From here on `pairs` refers to the filtered list
pairs = keep_pairs

........This is print from Voc class after call to trimm method.........
keep_words 22939 / 69400 = 0.3305
..........................End of print..................................

Trimmed from 83988 pairs to 46199, 0.5501 of total


In [9]:
# Build input and target tensors for one small, randomly drawn batch
from helper import pairs2TrainData

batch_size = 5
# Draw batch_size pairs at random (with replacement)
sampled_pairs = [random.choice(pairs) for _ in range(batch_size)]
batches = pairs2TrainData(sampled_pairs, vocab)
input_variable, lengths, target_variable, max_target_len = batches

# Show each component of the batch
for label, value in (
    ("input_variable:", input_variable),
    ("lengths:", lengths),
    ("target_variable:", target_variable),
    ("max_target_len:", max_target_len),
):
    print(label, value)


input_variable: tensor([[   1,    1,    1,    1,    1],
        [  10,    3,   14,   14,   14],
        [1965,    4, 1340,  117,  235],
        [  14,   11,  117,   11,   70],
        [ 197, 9266,   11,  566, 9668],
        [   4,  449, 6991, 1494,   18],
        [  11, 6054, 2670,   70, 8768],
        [2639,  373, 6316, 2521,    9],
        [2594,  754,   70,    9,    2],
        [3068,  286,    9,    2,    0],
        [ 197, 5558,    2,    0,    0],
        [9482,    9,    0,    0,    0],
        [ 180,    2,    0,    0,    0],
        [   9,    0,    0,    0,    0],
        [   2,    0,    0,    0,    0]])
lengths: tensor([15, 13, 11, 10,  9])
target_variable: tensor([[    1,     1,     1,     1,     1],
        [ 1574,   868,    11,  6585, 17686],
        [ 2594,   463,   755,  2521,     2],
        [ 3068,  4691,  6991,     2,     0],
        [  197,     2,  1340,     0,     0],
        [    2,     0,     2,     0,     0]])
max_target_len: 6


In [10]:
# Smoke test: build the encoder/decoder and run one forward pass through seq2seq
from helper import Seq2seq

# Model hyperparameters
embedding_dim = 300
hidden_size = 512
n_layers = 2
dropout = 0.25
# Input and output share the same vocabulary, hence the same size
input_size = vocab.numb_words
output_size = input_size

embeddings = nn.Embedding(input_size, embedding_dim)

seq2seq = Seq2seq(input_size, embeddings, embedding_dim, hidden_size, output_size, dropout, n_layers)
a = seq2seq(input_variable, target_variable, max_target_len, lengths)

# Decoder output for the first item of the batch
first_sample = a[:, 0]
print(a.shape)  # (seq_len, batch_size)
print(a)
print(first_sample)
# Map the predicted indices back to words
print([vocab.int2word[token_id] for token_id in first_sample.tolist()])

Building encoder and decoder ...
Models built and ready to go!
torch.Size([6, 5])
tensor([[19715, 17712,  4480,  4480, 21347],
        [ 4298,  4480, 16284,  4480,  4480],
        [17406, 17712,  4480, 17843, 16222],
        [10829, 20345, 13556, 16464, 16222],
        [10829, 17712,  6534,  7908, 17406],
        [13605, 15847,  6534, 10997,  7739]])
tensor([19715,  4298, 17406, 10829, 10829, 13605])
['somerset,', 'satellite', '51st', 'ligands', 'ligands', 'asexual']


In [11]:
# Total number of elements across the seq2seq parameters that have requires_grad set
sum(p.numel() for p in seq2seq.parameters() if p.requires_grad)

26188486

In [12]:
# Train / validation / test split: 70% / 20% / 10%
# NOTE(review): no random seed is set in the visible cells, so the split differs between runs.
random.shuffle(pairs)

total = len(pairs)
n_valid = int(total * 0.2)  # 20% of all pairs for validation
n_test = int(total * 0.1)   # 10% of all pairs for testing

# Consecutive slices of the shuffled list: validation first, then test, the rest is training
valid_data = pairs[:n_valid]
test_data = pairs[n_valid:n_valid + n_test]
train_data = pairs[n_valid + n_test:]

# Dict holding all three subsets
data = {'train': train_data, 'valid': valid_data, 'test': test_data}


In [23]:
# Sanity-check the data loader by pulling a single training batch
from helper import data_loaders

train_loaders = data_loaders(data=data['train'], vocab=vocab, batch_size=256)
first_batch = next(train_loaders)
input_tensor, lenght_tensor, target_tensor, max_target_len = first_batch

# Report the shapes of the input and target tensors
for label, tensor in (
    ("input_tensor shape:", input_tensor),
    ("target_tensor shape:", target_tensor),
):
    print(label, tensor.shape)


input_tensor shape: torch.Size([21, 256])
target_tensor shape: torch.Size([17, 256])
