<a href="https://colab.research.google.com/github/mmsamiei/just-practice-deep/blob/master/Ben_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import spacy

import random
import math
import os
import time

In [0]:
SEED = 1

random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [0]:
spacy_en = spacy.load('en')
def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [0]:
SRC = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True, batch_first=True)
TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True, batch_first=True)

In [0]:
from torchtext.data import TabularDataset, interleave_keys

train_dataset = TabularDataset(path="./formatted_movie_lines.txt", format="CSV",
                               fields=[("query", SRC),("response", TRG)],
                               csv_reader_params={"delimiter":'\t'})

In [0]:
SRC.build_vocab(train_dataset, min_freq=2)
TRG.build_vocab(train_dataset, min_freq=2)

In [185]:
len(train_dataset)

221282

In [0]:
##############

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [241]:
from torchtext.data import Dataset

def my_filter_pred(example, limited_word = 16):
  if(len(example.query) <= limited_word and len(example.response) <= limited_word):
    return True
  else:
    return False

my_train_dataset = Dataset(examples = train_dataset.examples,
               fields=[("query", SRC),("response", TRG)],
               filter_pred = my_filter_pred)

print("len of this my_train_dataset is {}".format(len(my_train_dataset)))

len of this my_train_dataset is 120314


In [0]:
BATCH_SIZE = 256

train_iterator = BucketIterator(my_train_dataset,
     batch_size=BATCH_SIZE,
     sort_key=lambda x: interleave_keys(len(x.query), len(x.response)),
     sort = True,
     device=device,
     shuffle = True)

In [243]:
len(list(iter(train_iterator)))

470

In [244]:
for batch in iter(train_iterator):
  print(batch.query.shape)

torch.Size([256, 5])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 5])
torch.Size([256, 5])
torch.Size([256, 5])
torch.Size([256, 5])
torch.Size([256, 5])
torch.Size([256, 5])
torch.Size([256, 5])
torch.Size([256, 5])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 5])
torch.Size([256, 5])
torch.Size([256, 5])
torch.Size([256, 5])
torch.Size([256, 5])
torch.Size([256, 5])
torch.Size([256, 5])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 4])
torch.Size([256, 5])
torch.Size([256, 5])
torch.Size([256, 5])
torch.Size([256, 5])
torch.Size([256, 5])
torch.Size([256, 7])
torch.Size([256, 6])
torch.Size([256, 6])
torch.Size([2

In [0]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hid_dim, n_layers, n_heads, pf_dim, encoder_layer, self_attention, positionwise_feedforward, dropout, device):
        super().__init__()

        self.input_dim = input_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pf_dim = pf_dim
        self.encoder_layer = encoder_layer
        self.self_attention = self_attention
        self.positionwise_feedforward = positionwise_feedforward
        self.dropout = dropout
        self.device = device
        
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(1000, hid_dim)
        
        self.layers = nn.ModuleList([encoder_layer(hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device) 
                                     for _ in range(n_layers)])
        
        self.do = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src sent len]
        #src_mask = [batch size, src sent len]
        
        pos = torch.arange(0, src.shape[1]).unsqueeze(0).repeat(src.shape[0], 1).to(self.device)
        
        src = self.do((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        
        #src = [batch size, src sent len, hid dim]
        
        for layer in self.layers:
            src = layer(src, src_mask)
            
        return src

In [0]:
class EncoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device):
        super().__init__()
        
        self.ln = nn.LayerNorm(hid_dim)
        self.sa = self_attention(hid_dim, n_heads, dropout, device)
        self.pf = positionwise_feedforward(hid_dim, pf_dim, dropout)
        self.do = nn.Dropout(dropout)
        
    def forward(self, src, src_mask):
        
        #src = [batch size, src sent len, hid dim]
        #src_mask = [batch size, src sent len]
        
        src = self.ln(src + self.do(self.sa(src, src, src, src_mask)))
        
        src = self.ln(src + self.do(self.pf(src)))
        
        return src

In [0]:
class SelfAttention(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        
        assert hid_dim % n_heads == 0
        
        self.w_q = nn.Linear(hid_dim, hid_dim)
        self.w_k = nn.Linear(hid_dim, hid_dim)
        self.w_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc = nn.Linear(hid_dim, hid_dim)
        
        self.do = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim // n_heads])).to(device)
        
    def forward(self, query, key, value, mask=None):
        
        bsz = query.shape[0]
        
        #query = key = value [batch size, sent len, hid dim]
                
        Q = self.w_q(query)
        K = self.w_k(key)
        V = self.w_v(value)
        
        #Q, K, V = [batch size, sent len, hid dim]
        
        Q = Q.view(bsz, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)
        K = K.view(bsz, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)
        V = V.view(bsz, -1, self.n_heads, self.hid_dim // self.n_heads).permute(0, 2, 1, 3)
        
        #Q, K, V = [batch size, n heads, sent len, hid dim // n heads]
        
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        
        #energy = [batch size, n heads, sent len, sent len]
        
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = self.do(torch.softmax(energy, dim=-1))
        
        #attention = [batch size, n heads, sent len, sent len]
        
        x = torch.matmul(attention, V)
        
        #x = [batch size, n heads, sent len, hid dim // n heads]
        
        x = x.permute(0, 2, 1, 3).contiguous()
        
        #x = [batch size, sent len, n heads, hid dim // n heads]
        
        x = x.view(bsz, -1, self.n_heads * (self.hid_dim // self.n_heads))
        
        #x = [batch size, src sent len, hid dim]
        
        x = self.fc(x)
        
        #x = [batch size, sent len, hid dim]
        
        return x

In [0]:
class PositionwiseFeedforward(nn.Module):
    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        
        self.hid_dim = hid_dim
        self.pf_dim = pf_dim
        
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        
        self.do = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [batch size, sent len, hid dim]
        
        x = self.do(torch.relu(self.fc_1(x)))
        
        #x = [batch size, sent len, pf dim]
        
        x = self.fc_2(x)
        
        #x = [batch size, sent len, hid dim]
        
        return x

In [0]:
class Decoder(nn.Module):
    def __init__(self, output_dim, hid_dim, n_layers, n_heads, pf_dim, decoder_layer, self_attention, positionwise_feedforward, dropout, device):
        super().__init__()
        
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.pf_dim = pf_dim
        self.decoder_layer = decoder_layer
        self.self_attention = self_attention
        self.positionwise_feedforward = positionwise_feedforward
        self.dropout = dropout
        self.device = device
        
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(1000, hid_dim)
        
        self.layers = nn.ModuleList([decoder_layer(hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device)
                                     for _ in range(n_layers)])
        
        self.fc = nn.Linear(hid_dim, output_dim)
        
        self.do = nn.Dropout(dropout)
        
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim])).to(device)
        
    def forward(self, trg, src, trg_mask, src_mask):
        
        #trg = [batch_size, trg sent len]
        #src = [batch_size, src sent len]
        #trg_mask = [batch size, trg sent len]
        #src_mask = [batch size, src sent len]
        
        pos = torch.arange(0, trg.shape[1]).unsqueeze(0).repeat(trg.shape[0], 1).to(self.device)
                
        trg = self.do((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
        
        #trg = [batch size, trg sent len, hid dim]
        
        for layer in self.layers:
            trg = layer(trg, src, trg_mask, src_mask)
            
        return self.fc(trg)

In [0]:
class DecoderLayer(nn.Module):
    def __init__(self, hid_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout, device):
        super().__init__()
        
        self.ln = nn.LayerNorm(hid_dim)
        self.sa = self_attention(hid_dim, n_heads, dropout, device)
        self.ea = self_attention(hid_dim, n_heads, dropout, device)
        self.pf = positionwise_feedforward(hid_dim, pf_dim, dropout)
        self.do = nn.Dropout(dropout)
        
    def forward(self, trg, src, trg_mask, src_mask):
        
        #trg = [batch size, trg sent len, hid dim]
        #src = [batch size, src sent len, hid dim]
        #trg_mask = [batch size, trg sent len]
        #src_mask = [batch size, src sent len]
                
        trg = self.ln(trg + self.do(self.sa(trg, trg, trg, trg_mask)))
                
        trg = self.ln(trg + self.do(self.ea(trg, src, src, src_mask)))
        
        trg = self.ln(trg + self.do(self.pf(trg)))
        
        return trg

In [0]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, sos_idx, pad_idx, device, maxlen=50):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.sos_idx = sos_idx
        self.pad_idx = pad_idx
        self.device = device
        self.maxlen = maxlen
        
    def make_masks(self, src, trg):
        
        #src = [batch size, src sent len]
        #trg = [batch size, trg sent len]
        
        src_mask = (src != self.pad_idx).unsqueeze(1).unsqueeze(2)
        
        trg_pad_mask = (trg != self.pad_idx).unsqueeze(1).unsqueeze(3)
        
        #src_mask = [batch size, 1, 1, src sent len]
        #trg_pad_mask = [batch size, 1, trg sent len, 1]
        
        trg_len = trg.shape[1]
        
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device=self.device)).bool()
                        
        #trg_sub_mask = [trg sent len, trg sent len]
            
        trg_mask = trg_pad_mask & trg_sub_mask
        
        #trg_mask = [batch size, 1, trg sent len, trg sent len]
        
        return src_mask, trg_mask
    
    def forward(self, src, trg):
        
        #src = [batch size, src sent len]
        #trg = [batch size, trg sent len]
                
        src_mask, trg_mask = self.make_masks(src, trg)
        
        enc_src = self.encoder(src, src_mask)
        
        #enc_src = [batch size, src sent len, hid dim]
                
        out = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        #out = [batch size, trg sent len, output dim]
        
        return out
    
    def translate_sequences(self, src):
        #src = [batch size, src sent len]
        
        batch_size, src_len = src.shape
        trg = src.new_full((batch_size, 1), self.sos_idx)
        #trg = [batch size, 1]
        src_mask, trg_mask = self.make_masks(src, trg)
        
        enc_src = self.encoder(src, src_mask)
        
        #enc_src = [batch size, src sent len, hid dim]
        
        translation_step = 0
        while translation_step < self.maxlen:
            out = self.decoder(trg, enc_src, trg_mask, src_mask)
            # out - [batch size, trg sent len, output dim]
            out = torch.argmax(out[:, -1], dim=1) # batch size
            out = out.unsqueeze(1) # batch size, 1
            trg = torch.cat((trg, out), dim=1)
            # trg - [batch size, trg sent len]
            src_mask, trg_mask = self.make_masks(src, trg)
            translation_step += 1
        return trg

In [0]:
input_dim = len(SRC.vocab)
hid_dim = 512
n_layers = 4
n_heads = 8
pf_dim = 2048
dropout = 0.2

enc = Encoder(input_dim, hid_dim, n_layers, n_heads, pf_dim, EncoderLayer, SelfAttention, PositionwiseFeedforward, dropout, device)

In [0]:
output_dim = len(TRG.vocab)
hid_dim = 512
n_layers = 4
n_heads = 8
pf_dim = 2048
dropout = 0.2

dec = Decoder(output_dim, hid_dim, n_layers, n_heads, pf_dim, DecoderLayer, SelfAttention, PositionwiseFeedforward, dropout, device)

In [0]:
pad_idx = SRC.vocab.stoi['<pad>']
sos_idx = SRC.vocab.stoi['<sos>']

model = Seq2Seq(enc, dec, sos_idx, pad_idx, device).to(device)

In [141]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 72,986,775 trainable parameters


In [0]:
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

In [0]:
class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))
    
    def zero_grad(self):
        self.optimizer.zero_grad()

In [0]:
optimizer = NoamOpt(hid_dim, 1, 2000,
            torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

In [0]:
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [0]:
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch.query
        trg = batch.response
        
        #if(i%100==1):
        #  print("src shape {}->batch {}: {}".format(src.shape,i, loss))

        optimizer.zero_grad()
        
        output = model(src, trg[:,:-1])
                
        #output = [batch size, trg sent len - 1, output dim]
        #trg = [batch size, trg sent len]
            
        output = output.contiguous().view(-1, output.shape[-1])
        trg = trg[:,1:].contiguous().view(-1)
                
        #output = [batch size * trg sent len - 1, output dim]
        #trg = [batch size * trg sent len - 1]
            
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg[:,:-1])
            
            #output = [batch size, trg sent len - 1, output dim]
            #trg = [batch size, trg sent len]
            
            output = output.contiguous().view(-1, output.shape[-1])
            trg = trg[:,1:].contiguous().view(-1)
            
            #output = [batch size * trg sent len - 1, output dim]
            #trg = [batch size * trg sent len - 1]
            
            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [0]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [247]:
N_EPOCHS = 150
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    #valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # if valid_loss < best_valid_loss:
    #     best_valid_loss = valid_loss
    #     torch.save(model.state_dict(), 'tut6-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    #print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 3m 51s
	Train Loss: 4.132 | Train PPL:  62.290
Epoch: 02 | Time: 3m 51s
	Train Loss: 3.953 | Train PPL:  52.093


KeyboardInterrupt: ignored

In [0]:
torch.save(model.state_dict(), "ben-5.pt")

In [210]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [209]:
!mv /content/ben-5.pt /content/drive/My\ Drive/Ben/

mv: cannot move '/content/ben-5.pt' to '/content/drive/My Drive/Ben/': No such file or directory


In [205]:
source_sentence = ["<sos>"] + my_train_dataset[450].query + ["<eos>"]
#source_sentence = ["<sos>"] + ["not","bad"] + ["<eos>"]
target_sentence = ["<sos>"] + my_train_dataset[450].response + ["<eos>"]
print(' '.join(source_sentence))
print(' '.join(target_sentence))

<sos> but , david . <eos>
<sos> i was n't hallucinating . <eos>


In [204]:
x = SRC.numericalize([source_sentence]).to(device)
# y = TRG.numericalize([target_sentence]).to(device)
# We actually do not have y in real world, translation should only
# rely on source data. translate_sequences should work worse than
# model(x, y), as it uses its own predicted tokens rather than
# tokens from gold example (y).
translation = model.translate_sequences(x)
translation = translation[0].cpu().detach().numpy()

for x in translation[1:]:
    word = TRG.vocab.itos[x]
    if word == "<eos>":
        break
    print(word, end=' ')
    #print(x)

get it ? 

In [248]:
for i in range(10200,10250):
  print("\n"+"**"*10)
  source_sentence = ["<sos>"] + my_train_dataset[i].query + ["<eos>"]
  target_sentence = ["<sos>"] + my_train_dataset[i].response + ["<eos>"]
  print(' '.join(source_sentence))
  print(' '.join(target_sentence))
  x = SRC.numericalize([source_sentence]).to(device)
  # y = TRG.numericalize([target_sentence]).to(device)
  # We actually do not have y in real world, translation should only
  # rely on source data. translate_sequences should work worse than
  # model(x, y), as it uses its own predicted tokens rather than
  # tokens from gold example (y).
  translation = model.translate_sequences(x)
  translation = translation[0].cpu().detach().numpy()
  for x in translation[1:]:
      word = TRG.vocab.itos[x]
      if word == "<eos>":
          break
      print(word, end=' ')



********************
<sos> ... i ... everyone has been very kind to me . <eos>
<sos> of course . how long did you and mr. treves prepare for this interview ? <eos>
i 'm not right .   i 'm not . 
********************
<sos> it was a great pleasure to meet you , mr. merrick . <eos>
<sos> i am very pleased to meet you . <eos>
i 'm going to be . 
********************
<sos> i am very pleased to meet you . <eos>
<sos> i hope we can talk together again sometime . good day . <eos>
i 'm going to have to have to have to be a little . 
********************
<sos> how long has this man been here ? <eos>
<sos> three quarters of an hour . <eos>
i do n't know . 
********************
<sos> abominable things these machines . one ca n't reason with them . <eos>
<sos> what a mess . <eos>
i do n't know . 
********************
<sos> i say freddie , what are you about ? <eos>
<sos> oh nothing ... nothing of any great importance . <eos>
i 'm not .   i 'm going to the way . 
********************
<sos> certainl