In [38]:
import copy
import math
import os
import random
import sys

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm_notebook, tqdm

# Make the helper modules under utils/ importable by bare name.
sys.path.append('utils/')
import loading_text_and_tokenization
import utils.global_variables as gl
import utils.ngram_utils as ngram_utils
from utils.amazon_dataset import AmazonDataset, pad, batchify
from utils.neural_lm import BagOfNGrams, DecoderMLP, seq2seq
from utils.ngram_utils import NgramLM

_tqdm = tqdm_notebook

In [39]:
# Seed every RNG this notebook imports (random, numpy, torch) so that runs are
# repeatable. torch.manual_seed stays last so the cell still displays the generator.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


<torch._C.Generator at 0x7ff9ec0447b0>

In [40]:
# Device preference: the GPU is selected only when it is both requested and available.
use_cuda = True
# NOTE(review): the seq2seq models below are constructed with use_cuda=False, while
# score_sentence/generate_sentence move their inputs to `device` -- confirm the two
# agree on machines where CUDA is available.
device = torch.device("cuda" if (torch.cuda.is_available() and use_cuda) else "cpu")


In [4]:
def _read_reviews(path):
    """Return the non-empty lines of the text file at `path` (one review per line)."""
    with open(path, 'r') as handle:
        raw_text = handle.read()
    return [line for line in raw_text.split('\n') if line]


# Read the train / validation reviews from their .txt files; blank lines are dropped.
train_data = _read_reviews('../data/amazon_train.txt')
valid_data = _read_reviews('../data/amazon_valid.txt')
    

In [5]:
# Optional subsampling for quick experiments: set a limit to an integer to keep only
# that many reviews, or leave it as None to keep the full dataset.
MAX_TRAIN_REVIEWS = None
MAX_VALID_REVIEWS = None
train_data = train_data[:MAX_TRAIN_REVIEWS]
valid_data = valid_data[:MAX_VALID_REVIEWS]

# Sanity check: first review, container type, number of reviews, element type.
train_data[0], type(train_data), len(train_data), type(train_data[0])

("this is a great tutu and at a really great price . it doesn ' t look cheap at all . i ' m so glad i looked on amazon and found such an affordable tutu that isn ' t made poorly . a + + ",
 list,
 22288,
 str)

In [6]:
# Tokenize both datasets. Each call returns the tokenized reviews plus a second value
# bound to all_tokens_* (presumably the flat token list -- confirm in ngram_utils).
# TODO: this step takes a really long time -- find out why.
train_data_tokenized, all_tokens_train = ngram_utils.tokenize_dataset(train_data)
valid_data_tokenized, all_tokens_valid = ngram_utils.tokenize_dataset(valid_data)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [7]:
# Context size: passed to pad_dataset as `n`, and used below to slice each sequence
# into (N-token context, next-token target) pairs.
N = 10

In [8]:
# Pad every tokenized review for context size N (see ngram_utils.pad_dataset for the
# exact scheme; the vocabulary built from the padded data contains '<sos>' and '<eos>').
train_data_padded = ngram_utils.pad_dataset(train_data_tokenized, n=N)
valid_data_padded = ngram_utils.pad_dataset(valid_data_tokenized, n=N)

In [9]:
# Build the vocabulary from the padded *training* data only, then show its size and
# its first ten entries.
vocab = ngram_utils.get_vocab(train_data_padded)
vocab_size = len(vocab)
vocab_size, vocab[:10]

(20805, ('<sos>', '<eos>', '.', 'the', 'i', ',', 'and', 'a', 'to', "'"))

In [10]:
# Build the id <-> token lookup structures from the vocabulary.
# NOTE(review): the recorded output is (20808, 20806), i.e. id2token has two more
# entries than token2id -- this suggests some tokens appear twice in id2token
# (possibly special tokens that are already in `vocab`); verify in get_dict.
id2token, token2id = ngram_utils.get_dict(vocab)
len(id2token), len(token2id)

(20808, 20806)

In [11]:
# Convert the padded token sequences to id sequences using token2id.
train_data_ids = ngram_utils.get_ids(train_data_padded, token2id)
valid_data_ids = ngram_utils.get_ids(valid_data_padded, token2id)

In [12]:
# Wrap the training id sequences in a dataset, then slide a window over every sequence
# to build (N-token context, next token) pairs for the loader.
train_dataset = AmazonDataset(train_data_ids, max_inp_length=None, use_cuda=True)
train_dataset_ngrams = [
    (seq[start:start + N], seq[start + N])
    for seq in train_dataset
    for start in range(len(seq) - N)
]
train_loader = DataLoader(
    train_dataset_ngrams,
    batch_size=2048,
    collate_fn=batchify,
    shuffle=True,
)

100%|██████████| 107790/107790 [00:05<00:00, 20292.95it/s]


In [13]:
# Same windowing as for the training data: every validation sequence is cut into
# (N-token context, next token) pairs.
valid_dataset = AmazonDataset(valid_data_ids, max_inp_length=None, use_cuda=True)
valid_dataset_ngrams = [
    (seq[start:start + N], seq[start + N])
    for seq in valid_dataset
    for start in range(len(seq) - N)
]
valid_loader = DataLoader(
    valid_dataset_ngrams,
    batch_size=2048,
    collate_fn=batchify,
    shuffle=True,
)

100%|██████████| 15172/15172 [00:00<00:00, 43012.11it/s]


In [14]:
# Number of (context, target) pairs in each split; used later to normalise the
# batch-size-weighted loss sums.
num_train = len(train_dataset_ngrams)
num_valid = len(valid_dataset_ngrams)
num_train, num_valid

(2485766, 356599)

In [15]:
# Encoder: bag-of-n-grams model over the len(id2token) token ids.
encoder = BagOfNGrams(
    len(id2token),
    emb_dim=300,
    hidden_size=256,
    out_size=128,
    activation='ReLU',
    nlayers=2,
    reduce='mean',
    dropout=0.1,
    batch_norm=False,
)
encoder

BagOfNGrams(
  (embedding): EmbeddingBag(20808, 300, mode=mean)
  (layers): ModuleList(
    (0): Linear(in_features=300, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1)
    (3): Linear(in_features=256, out_features=128, bias=True)
  )
)

In [16]:
# Decoder: takes the 128-d encoder output and produces one output per id2token entry.
decoder = DecoderMLP(
    input_size=128,
    output_size=len(id2token),
    hidden_size=256,
)
decoder

DecoderMLP(
  (linear): Linear(in_features=128, out_features=256, bias=True)
  (out): Linear(in_features=256, out_features=20808, bias=True)
  (log_softmax): LogSoftmax()
)

In [17]:
# Full language model wrapping the encoder and decoder built above.
# NOTE(review): use_cuda is hard-coded to False here although the notebook-level
# `use_cuda` flag is True -- confirm which one is intended.
model = seq2seq(
    encoder,
    decoder,
    id2token,
    use_cuda=False,
    lr=0.1,
    size_ngrams=N,
)
model

seq2seq(
  (encoder): BagOfNGrams(
    (embedding): EmbeddingBag(20808, 300, mode=mean)
    (layers): ModuleList(
      (0): Linear(in_features=300, out_features=256, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.1)
      (3): Linear(in_features=256, out_features=128, bias=True)
    )
  )
  (decoder): DecoderMLP(
    (linear): Linear(in_features=128, out_features=256, bias=True)
    (out): Linear(in_features=256, out_features=20808, bias=True)
    (log_softmax): LogSoftmax()
  )
  (criterion): NLLLoss()
)

In [18]:
# True: train `model` from scratch and checkpoint the best epoch.
# False: skip training and load the saved checkpoint for the current N instead.
TRAIN = False

In [19]:
if TRAIN:
    num_epochs = 10
    log_interval = 10
    best_eval_loss = np.inf
    # Ask the loader for its real batch count; deriving it from the size of the
    # current batch is wrong whenever the last batch is smaller.
    num_batches = len(train_loader)

    for epoch in range(num_epochs):
        # Train
        cur_loss = 0
        # Count batches from 1 so that every logged average covers exactly
        # `log_interval` batches (a 0-based index folds 11 batches into the first log).
        for i, (data, labels) in enumerate(train_loader, start=1):
            _, loss = model.train_step(data, labels)
            cur_loss += loss

            if i % log_interval == 0:
                cur_loss = cur_loss / log_interval
                print('| Epoch {:3d} | Train Loss {:5.2f} | Train PPL {:8.2f} | {:5d}/{:5d} Batches'.format(
                    epoch, cur_loss, math.exp(cur_loss), i, num_batches))
                cur_loss = 0

        # Eval (every epoch): sum the batch losses weighted by len(data), then divide
        # by the number of validation examples.
        eval_loss = 0
        for data, labels in valid_loader:
            _, loss = model.eval_step(data, labels)
            eval_loss += len(data) * loss
        eval_loss = eval_loss / num_valid
        print('-' * 89)
        print('| Epoch {:3d} | Valid Loss {:5.2f} | Valid PPL {:8.2f}'.format(
            epoch, eval_loss, math.exp(eval_loss)))
        print('-' * 89)

        # Save the model if the validation loss is the best we've seen so far.
        # best_eval_loss starts at +inf, so the first epoch always checkpoints.
        if eval_loss < best_eval_loss:
            model.save_model('neural_lm_amazon_model_N{}'.format(N) + '.pt')
            best_eval_loss = eval_loss
    pretrained_model = model

else:
    # Load Pretrained Model.
    # The wrapper gets its own copies of the encoder/decoder so that loading the
    # checkpoint cannot modify the modules that `model` (and any other wrapper built
    # from the same objects) still holds.
    pretrained_model = seq2seq(copy.deepcopy(encoder), copy.deepcopy(decoder), id2token,
                               use_cuda=False, lr=0.1, size_ngrams=N)
    pretrained_model.load_model('neural_lm_amazon_model_N{}'.format(N) + '.pt')

## Perplexity (Train + Valid Data)

In [20]:
def get_perplexity(loader, num_data, model):
    """Return exp(mean loss) of `model` over all batches produced by `loader`.

    Each batch loss returned by ``model.eval_step(data, labels)`` is weighted by
    ``len(data)``; the weighted sum is divided by `num_data` before exponentiation.

    Args:
        loader: iterable yielding ``(data, labels)`` batches.
        num_data: total number of examples used as the normaliser.
        model: object exposing ``eval_step(data, labels) -> (prediction, loss)``.

    Returns:
        The perplexity as a float.
    """
    weighted_loss_sum = 0
    for batch, targets in loader:
        _, batch_loss = model.eval_step(batch, targets)
        weighted_loss_sum += len(batch) * batch_loss
    mean_loss = weighted_loss_sum / num_data
    return math.exp(mean_loss)

In [21]:
# Load the pretrained model for N = 3.
# NOTE(review): rebinding the global N does not rebuild train_loader / valid_loader,
# which were sliced with N = 10.
N = 3
# Use private copies of the encoder/decoder: every seq2seq wrapper in this notebook is
# built from the same two module objects, so if load_model writes weights in place
# (not visible here) each load would otherwise overwrite the other models.
model_n3 = seq2seq(copy.deepcopy(encoder), copy.deepcopy(decoder), id2token,
                   use_cuda=False, lr=0.1, size_ngrams=N)
model_n3.load_model('neural_lm_amazon_model_N{}'.format(N) + '.pt')
model_n3

seq2seq(
  (encoder): BagOfNGrams(
    (embedding): EmbeddingBag(20808, 300, mode=mean)
    (layers): ModuleList(
      (0): Linear(in_features=300, out_features=256, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.1)
      (3): Linear(in_features=256, out_features=128, bias=True)
    )
  )
  (decoder): DecoderMLP(
    (linear): Linear(in_features=128, out_features=256, bias=True)
    (out): Linear(in_features=256, out_features=20808, bias=True)
    (log_softmax): LogSoftmax()
  )
  (criterion): NLLLoss()
)

In [None]:
# Validation perplexity of the N = 3 model.
# NOTE(review): valid_loader holds contexts sliced with N = 10 -- confirm how a model
# created with size_ngrams=3 handles the longer contexts.
valid_ppl = get_perplexity(valid_loader, num_valid, model_n3)
valid_ppl

118.45357802182568

In [None]:
# Training perplexity of the N = 3 model.
# NOTE(review): train_loader holds contexts sliced with N = 10 -- confirm how a model
# created with size_ngrams=3 handles the longer contexts.
train_ppl = get_perplexity(train_loader, num_train, model_n3)
train_ppl

111.81766466410237

In [None]:
# Load the pretrained model for N = 5.
# NOTE(review): rebinding the global N does not rebuild train_loader / valid_loader,
# which were sliced with N = 10.
N = 5
# Use private copies of the encoder/decoder: every seq2seq wrapper in this notebook is
# built from the same two module objects, so if load_model writes weights in place
# (not visible here) each load would otherwise overwrite the other models.
model_n5 = seq2seq(copy.deepcopy(encoder), copy.deepcopy(decoder), id2token,
                   use_cuda=False, lr=0.1, size_ngrams=N)
model_n5.load_model('neural_lm_amazon_model_N{}'.format(N) + '.pt')
model_n5

seq2seq(
  (encoder): BagOfNGrams(
    (embedding): EmbeddingBag(20808, 300, mode=mean)
    (layers): ModuleList(
      (0): Linear(in_features=300, out_features=256, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.1)
      (3): Linear(in_features=256, out_features=128, bias=True)
    )
  )
  (decoder): DecoderMLP(
    (linear): Linear(in_features=128, out_features=256, bias=True)
    (out): Linear(in_features=256, out_features=20808, bias=True)
    (log_softmax): LogSoftmax()
  )
  (criterion): NLLLoss()
)

In [None]:
# Validation perplexity of the N = 5 model.
# NOTE(review): valid_loader holds contexts sliced with N = 10 -- confirm how a model
# created with size_ngrams=5 handles the longer contexts.
valid_ppl = get_perplexity(valid_loader, num_valid, model_n5)
valid_ppl

39.204964041909726

In [None]:
# Training perplexity of the N = 5 model.
# NOTE(review): train_loader holds contexts sliced with N = 10 -- confirm how a model
# created with size_ngrams=5 handles the longer contexts.
train_ppl = get_perplexity(train_loader, num_train, model_n5)
train_ppl

In [None]:
# Load the pretrained model for N = 7.
# NOTE(review): rebinding the global N does not rebuild train_loader / valid_loader,
# which were sliced with N = 10.
N = 7
# Use private copies of the encoder/decoder: every seq2seq wrapper in this notebook is
# built from the same two module objects, so if load_model writes weights in place
# (not visible here) each load would otherwise overwrite the other models.
model_n7 = seq2seq(copy.deepcopy(encoder), copy.deepcopy(decoder), id2token,
                   use_cuda=False, lr=0.1, size_ngrams=N)
model_n7.load_model('neural_lm_amazon_model_N{}'.format(N) + '.pt')
model_n7

In [None]:
# Validation perplexity of the N = 7 model.
# NOTE(review): valid_loader holds contexts sliced with N = 10 -- confirm how a model
# created with size_ngrams=7 handles the longer contexts.
valid_ppl = get_perplexity(valid_loader, num_valid, model_n7)
valid_ppl

In [None]:
# Training perplexity of the N = 7 model.
# NOTE(review): train_loader holds contexts sliced with N = 10 -- confirm how a model
# created with size_ngrams=7 handles the longer contexts.
train_ppl = get_perplexity(train_loader, num_train, model_n7)
train_ppl

In [None]:
# Load the pretrained model for N = 10 (the context size the loaders were built with).
N = 10
# Use private copies of the encoder/decoder: every seq2seq wrapper in this notebook is
# built from the same two module objects, so if load_model writes weights in place
# (not visible here) each load would otherwise overwrite the other models.
model_n10 = seq2seq(copy.deepcopy(encoder), copy.deepcopy(decoder), id2token,
                    use_cuda=False, lr=0.1, size_ngrams=N)
model_n10.load_model('neural_lm_amazon_model_N{}'.format(N) + '.pt')
model_n10

In [None]:
# Validation perplexity of the N = 10 model.
valid_ppl = get_perplexity(valid_loader, num_valid, model_n10)
valid_ppl

In [None]:
# Training perplexity of the N = 10 model.
train_ppl = get_perplexity(train_loader, num_train, model_n10)
train_ppl

## Score Sentences

In [None]:
def score_sentence(sent, model):
    """Score `sent` with `model` and return the exponential of that score.

    `sent` is handed to ``ngram_utils.tokenize_dataset`` unchanged, so it should have
    the same form as a dataset (the cells below pass a one-element list of strings).
    Tokens are mapped to ids with the global `token2id` and the resulting tensor is
    moved to the global `device` before calling ``model.evaluate(..., score_only=True)``.

    NOTE(review): several recorded results below are smaller than 1, which a
    perplexity cannot be -- check the sign / normalisation of the score returned by
    ``model.evaluate``.
    """
    token_lists, _ = ngram_utils.tokenize_dataset(sent)
    id_lists = ngram_utils.get_ids(token_lists, token2id)
    inputs = torch.LongTensor(id_lists).to(device)
    _, score = model.evaluate(inputs, score_only=True)
    return math.exp(score)

In [22]:
# Score the same sentence with each pretrained model (N = 3, 5, 7, 10).
sentence = ['i like pandas']
ppl3, ppl5, ppl7, ppl10 = (
    score_sentence(sentence, lm)
    for lm in (model_n3, model_n5, model_n7, model_n10)
)
ppl3, ppl5, ppl7, ppl10

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(219)evaluate()
-> loss = self.criterion(decoder_output, torch.LongTensor([ys[0][i]]))


(Pdb)  c


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(218)evaluate()
-> import pdb; pdb.set_trace()


(Pdb)  c


> /home/roberta/ammi-2019-nlp/01-day-LM/utils/neural_lm.py(219)evaluate()
-> loss = self.criterion(decoder_output, torch.LongTensor([ys[0][i]]))


(Pdb)  c


> <ipython-input-21-479a1d43fe1a>(7)score_sentence()
-> ppl = math.exp(scores)


(Pdb)  c


3437.9760046972974

In [None]:
# Score the same sentence with each pretrained model (N = 3, 5, 7, 10).
sentence = ['tutu tutu is not my favorit']
ppl3, ppl5, ppl7, ppl10 = (
    score_sentence(sentence, lm)
    for lm in (model_n3, model_n5, model_n7, model_n10)
)
ppl3, ppl5, ppl7, ppl10

In [43]:
# Score the same sentence with each pretrained model (N = 3, 5, 7, 10).
sentence = ['i really like this watch']
ppl3, ppl5, ppl7, ppl10 = (
    score_sentence(sentence, lm)
    for lm in (model_n3, model_n5, model_n7, model_n10)
)
ppl3, ppl5, ppl7, ppl10

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




(0.23688190956640393,
 0.2069621170292099,
 0.19724340297057574,
 0.1701903869411314)

In [44]:
# Score the same sentence with each pretrained model (N = 3, 5, 7, 10).
sentence = ['training neural networks']
ppl3, ppl5, ppl7, ppl10 = (
    score_sentence(sentence, lm)
    for lm in (model_n3, model_n5, model_n7, model_n10)
)
ppl3, ppl5, ppl7, ppl10

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




(0.1542097808793169,
 0.1426822753179523,
 0.14866593411932333,
 0.13848500821297985)

In [47]:
# Score the same sentence with each pretrained model (N = 3, 5, 7, 10).
sentence = ['this is a great tutu']
ppl3, ppl5, ppl7, ppl10 = (
    score_sentence(sentence, lm)
    for lm in (model_n3, model_n5, model_n7, model_n10)
)
ppl3, ppl5, ppl7, ppl10

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




(0.22389753328859022,
 0.24349304077503534,
 0.2568703826212508,
 0.24887823072216506)

In [48]:
# Score the same sentence with each pretrained model (N = 3, 5, 7, 10).
sentence = ['my wife really likes the color of this dress']
ppl3, ppl5, ppl7, ppl10 = (
    score_sentence(sentence, lm)
    for lm in (model_n3, model_n5, model_n7, model_n10)
)
ppl3, ppl5, ppl7, ppl10

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




(0.21159407812486142,
 0.2024794603470719,
 0.18907617427561665,
 0.20109196484969336)

## Generate Sentences

In [49]:
def generate_sentence(model, context=None):
    """Generate a sentence with `model`, optionally continuing a given context.

    Args:
        model: object exposing ``evaluate(tensor, use_context=...)`` that returns a
            ``(generated, scores)`` pair.
        context: ``None`` to generate without a prompt, otherwise a value accepted by
            ``ngram_utils.tokenize_dataset`` (the cells below pass a one-element list
            of strings).

    Returns:
        The ``(generated, scores)`` pair exactly as returned by ``model.evaluate``.
    """
    use_context = context is not None
    if use_context:
        tokenized, _ = ngram_utils.tokenize_dataset(context)
        inputs = torch.LongTensor(ngram_utils.get_ids(tokenized, token2id))
    else:
        # No prompt: pass a placeholder tensor and tell the model not to use it.
        inputs = torch.LongTensor([[0]])

    # The previous version also computed math.exp(scores) here and discarded the
    # result; that dead computation could only raise, so it has been removed.
    generated, scores = model.evaluate(inputs.to(device), use_context=use_context)
    return generated, scores

### No Context

In [None]:
# Generate from the N = 3 model without a prompt, print element 0 of each generated
# item separated by spaces, then display the scores as the cell result.
# (Writing `print(...), scores` on one line would display the tuple (None, scores).)
generated, scores = generate_sentence(model_n3)
print(' '.join(word[0] for word in generated))
scores

In [59]:
# Generate from the N = 5 model without a prompt and print element 0 of each
# generated item, separated by spaces.
generated, scores = generate_sentence(model_n5)
print(' '.join(word[0] for word in generated))

i ' m 5 ' about 140 34 34 <eos>


In [60]:
# Generate from the N = 7 model without a prompt and print element 0 of each
# generated item, separated by spaces.
generated, scores = generate_sentence(model_n7)
print(' '.join(word[0] for word in generated))

i ' m a size 6 and i i ordered ordered large it it . . <eos>


In [61]:
# Generate from the N = 10 model without a prompt and print element 0 of each
# generated item, separated by spaces.
generated, scores = generate_sentence(model_n10)
print(' '.join(word[0] for word in generated))

i have been wearing these for years and they are comfortable . <eos>


### With Context

In [62]:
# Continue the prompt 'this dress' with the N = 3 model and print the continuation.
generated, scores = generate_sentence(model_n3, context=['this dress'])
print(' '.join(word[0] for word in generated))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


bra . <eos>


In [63]:
# Continue the prompt 'this dress' with the N = 5 model and print the continuation.
generated, scores = generate_sentence(model_n5, context=['this dress'])
print(' '.join(word[0] for word in generated))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


is . . . . . . . . . . . . . . . . . . .


In [64]:
# Continue the prompt 'this dress' with the N = 7 model and print the continuation.
generated, scores = generate_sentence(model_n7, context=['this dress'])
print(' '.join(word[0] for word in generated))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


is the most comfortable bra ever . <eos>


In [65]:
# Continue the prompt 'this dress' with the N = 10 model and print the continuation.
generated, scores = generate_sentence(model_n10, context=['this dress'])
print(' '.join(word[0] for word in generated))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


is the is a perfect fit . <eos>


In [70]:
# Continue the prompt 'i like' with the N = 3 model and print the continuation.
generated, scores = generate_sentence(model_n3, context=['i like'])
print(' '.join(word[0] for word in generated))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


the way . <eos>


In [71]:
# Continue the prompt 'i like' with the N = 5 model and print the continuation.
generated, scores = generate_sentence(model_n5, context=['i like'])
print(' '.join(word[0] for word in generated))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


the way the color . . <eos>


In [72]:
# Continue the prompt 'i like' with the N = 7 model and print the continuation.
generated, scores = generate_sentence(model_n7, context=['i like'])
print(' '.join(word[0] for word in generated))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


the way it is . <eos>


In [73]:
# Continue the prompt 'i like' with the N = 10 model and print the continuation.
generated, scores = generate_sentence(model_n10, context=['i like'])
print(' '.join(word[0] for word in generated))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


the color , it is very soft , but is very comfortable , , , soft soft soft , ,
