<a href="https://colab.research.google.com/github/mmsamiei/thesis-prototype/blob/master/phase2_simple_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
!cp /content/drive/My\ Drive/Thesis/phase-2/tokenized_bert.json /content/train.json

In [0]:
from torch.utils.data import Dataset, DataLoader
import os
import torch
import json
from torch.utils.data.sampler import SubsetRandomSampler

In [0]:
class MyDataset(Dataset):
    """My dataset."""

    def __init__(self, json_file):
        """
        Args:
            json_file (string): Path to the json file with annotations.
        """
        self.dialogues = json.load(open(json_file))
        self.root_dir = json_file

        self.dialogues = sorted(self.dialogues, key=lambda x: len(x['history']))

    def __len__(self):
        return len(self.dialogues)

    def __getitem__(self, idx):
      
        
        history_lst = self.dialogues[idx]['history']
        true_lst = self.dialogues[idx]['true_sentence']
        false_lst = self.dialogues[idx]['false_sentenc']


        if(len(history_lst)>40):
          history_lst = history_lst[-40:]

        if(len(true_lst)>40):
          true_lst = true_lst[:40]
        
        if(len(false_lst)>40):
          false_lst = false_lst[:40]
        

        history = torch.LongTensor(history_lst)
        true_sample = torch.LongTensor(true_lst)
        false_sample = torch.LongTensor(false_lst)

        sample = {'history': history, 'true_sample': true_sample, 'false_sample': false_sample}

        return sample

In [0]:
dataset = MyDataset('train.json')

In [0]:
def my_collate_fn(batch):

  len_batch = len(batch)

  
  max_len_history = max([len(data['history']) for data in batch])
  max_len_true_sample = max([len(data['true_sample']) for data in batch])
  max_len_false_sample = max([len(data['false_sample']) for data in batch])
  
  padding_ind = 1 ## for bert is 0 but because embedding hate 0!! we used 1
  result_history = torch.ones(len_batch, max_len_history)
  result_true_sample = torch.ones(len_batch, max_len_true_sample)
  result_false_sample = torch.ones(len_batch, max_len_false_sample)

  for i, data in enumerate(batch):
    p1 = len(data['history'])
    result_history[i, :p1] = data['history']
    p2 = len(data['true_sample'])
    result_true_sample[i, :p2] = data['true_sample']
    p3 = len(data['false_sample'])
    result_false_sample[i, :p3] = data['false_sample']



  return result_history.T.long(), result_true_sample.T.long(), result_false_sample.T.long()

sampler = torch.utils.data.SequentialSampler(dataset)
dataset_loader = torch.utils.data.DataLoader(dataset, batch_size=32, sampler=sampler,
                                             shuffle=False, collate_fn=my_collate_fn)

In [0]:
for _, batch in enumerate(dataset_loader):
  history, true_sample, false_sample = batch

# Model

In [0]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [0]:
import numpy as np
import torch.nn as nn

class RepresentModel(nn.Module):
  
  def __init__(self, hid_size, vocab_size, n_head, n_layers, pf_size, max_len, device):
    super().__init__()

    self.device = device
    
    self.hid_size = hid_size
    self.pf_size = pf_size
    self.max_len = max_len
    self.n_head = n_head

    self.embedding = nn.Embedding(vocab_size, hid_size)

    self.position_enc = nn.Embedding(self.max_len, self.hid_size)
    self.position_enc.weight.data = self.position_encoding_init(self.max_len, self.hid_size)
    self.scale = torch.sqrt(torch.FloatTensor([self.hid_size])).to(device)

    self.layer_norm = nn.LayerNorm(self.hid_size)
    self.encoder_layer = nn.TransformerEncoderLayer(d_model=hid_size, nhead = n_head, dim_feedforward = pf_size)
    self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=n_layers, norm=self.layer_norm)

    self._init_weights()
  
  def forward(self, x):
    ## x = [sent_len, batch_size]
    sent_len, batch_size = x.shape[0], x.shape[1]

    temp = x
    temp = self.embedding(temp)

    pos = torch.arange(1,sent_len+1).unsqueeze(1).repeat(1,batch_size).to(self.device)
    temp_pos_emb = self.position_enc(pos)

    temp = temp * self.scale + temp_pos_emb
    temp = self.encoder(temp)
    temp = temp[0,:]
    return temp

  def _init_weights(self):
    for p in self.parameters():
      if p.dim() > 1:
        nn.init.xavier_uniform_(p)

  def append_decoder_layer(self):
    appended_mod = nn.TransformerEncoderLayer(d_model=self.hid_size, nhead = self.n_head, dim_feedforward = self.pf_size).to(self.device)
    for p in appended_mod.parameters():
      if p.dim() > 1:
        nn.init.xavier_uniform_(p)
    self.encoder.layers.append(appended_mod)
    self.encoder.num_layers += 1

  
  def position_encoding_init(self, n_position, d_pos_vec):
    ''' Init the sinusoid position encoding table '''

    # keep dim 0 for padding token position encoding zero vector
    position_enc = np.array([
        [pos / np.power(10000, 2*i/d_pos_vec) for i in range(d_pos_vec)]
        if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])

    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i
    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1
    temp = torch.from_numpy(position_enc).type(torch.FloatTensor)
    temp = temp.to(self.device)
    return temp

In [0]:
hid_size = 16
vocab_size = 34000 
n_head = 8
n_layers = 1
pf_size = 32
max_len = 500
model = RepresentModel(hid_size, vocab_size, n_head, n_layers, pf_size, max_len, device).to(device)

In [11]:
test_len = 20
batch_size = 64
test_input = torch.LongTensor(test_len, batch_size).random_(1,vocab_size).to(device)
model(test_input).shape

torch.Size([64, 16])

In [12]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 556,480 trainable parameters


In [0]:
class NoamOpt:
    "Optim wrapper that implements rate."
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        "Implement `lrate` above"
        if step is None:
            step = self._step
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))
    
    def zero_grad(self):
        self.optimizer.zero_grad()

In [14]:
!pip install -U tqdm

Collecting tqdm
[?25l  Downloading https://files.pythonhosted.org/packages/72/c9/7fc20feac72e79032a7c8138fd0d395dc6d8812b5b9edf53c3afd0b31017/tqdm-4.41.1-py2.py3-none-any.whl (56kB)
[K     |█████▊                          | 10kB 26.4MB/s eta 0:00:01[K     |███████████▌                    | 20kB 1.7MB/s eta 0:00:01[K     |█████████████████▎              | 30kB 2.5MB/s eta 0:00:01[K     |███████████████████████         | 40kB 1.6MB/s eta 0:00:01[K     |████████████████████████████▉   | 51kB 2.0MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 2.0MB/s 
[?25hInstalling collected packages: tqdm
  Found existing installation: tqdm 4.28.1
    Uninstalling tqdm-4.28.1:
      Successfully uninstalled tqdm-4.28.1
Successfully installed tqdm-4.41.1


In [0]:
from tqdm.notebook import tqdm

def train_one_epoch(model,train_iter, optimizer, criterion, clip):
  epoch_loss = 0
  model.train()
  for batch in tqdm(train_iter):
    optimizer.zero_grad()
    history, true_sample, false_sample = batch
    batch_size = history.shape[1]
    ### hist = [sent_len, batch]
    history_rpr = model(history.to(device))
    true_rpr = model(true_sample.to(device))
    false_rpr = model(false_sample.to(device))
    ## rpr = [batch_size, hidden]
    cos = nn.PairwiseDistance().to(device)
    tru_sml = -1*cos(history_rpr, true_rpr)
    fls_sml = -1*cos(history_rpr, false_rpr)
    mini_batch_tensor = torch.ones(batch_size).to(device)
    loss = criterion(tru_sml, fls_sml, mini_batch_tensor)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    epoch_loss += loss.item()
  return epoch_loss / len(train_iter)

In [0]:
def train(model, train_iter, optimizer, criterion, clip, N_EPOCH):
  for epoch in range(N_EPOCH):
    epoch_loss = train_one_epoch(model, train_iter, optimizer, criterion, clip)
    print(epoch_loss)

In [17]:
optimizer = NoamOpt(hid_size, 1, 2000,
              torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
criterion = torch.nn.MarginRankingLoss(margin=10).to(device)
train(model, dataset_loader, optimizer, criterion, 1, 3)

HBox(children=(FloatProgress(value=0.0, max=4023.0), HTML(value='')))


0.43450488240126794


HBox(children=(FloatProgress(value=0.0, max=4023.0), HTML(value='')))


0.01133512950213193


HBox(children=(FloatProgress(value=0.0, max=4023.0), HTML(value='')))


0.0012296017591792673


In [0]:
def evaluate(model,valid_iter, criterion):
  epoch_loss = 0
  model.eval()
  for batch in tqdm(valid_iter):
    history, true_sample, false_sample = batch
    batch_size = history.shape[1]
    ### hist = [sent_len, batch]
    history_rpr = model(history.to(device))
    true_rpr = model(true_sample.to(device))
    false_rpr = model(false_sample.to(device))
    ## rpr = [batch_size, hidden]
    cos = nn.PairwiseDistance().to(device)
    tru_sml = -1*cos(history_rpr, true_rpr)
    fls_sml = -1*cos(history_rpr, false_rpr)
    mini_batch_tensor = torch.ones(batch_size).to(device)
    loss = criterion(tru_sml, fls_sml, mini_batch_tensor)
    epoch_loss += loss.item()
  return epoch_loss / len(valid_iter)

In [0]:
valid_dataset = MyDataset('/content/drive/My Drive/Thesis/phase-2/valid_tokenized_bert.json')
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=32, sampler=sampler,
                                             shuffle=False, collate_fn=my_collate_fn)

In [20]:
evaluate(model,valid_loader, criterion)

HBox(children=(FloatProgress(value=0.0, max=4023.0), HTML(value='')))




0.07794507536430606

# **inference**

In [21]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |████████████████████████████████| 450kB 2.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/1f/8e/ed5364a06a9ba720fddd9820155cc57300d28f5f43a6fd7b7e817177e642/sacremoses-0.0.35.tar.gz (859kB)
[K     |████████████████████████████████| 860kB 56.7MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 56.0MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.35-cp36-none-any.whl size=883999 sha256=7599c2e61dc153cc451383

In [22]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [0]:
history = ["I think science fiction is an amazing genre for anything. Future science, technology, time travel, FTL travel, they're all such interesting concepts", 
           "I'm a huge fan of science fiction myself!"
           ]
sentences = ["Science fiction film (or sci-fi film) is a genre that uses speculative, fictional science-based depictions of phenomena that are not fully accepted by mainstream science, such as extraterrestrial lifeforms, alien worlds, extrasensory perception and time travel, along with futuristic elements such as spacecraft, robots, cyborgs, interstellar travel or other technologies.",
             "Science fiction films have often been used to focus on political or social issues, and to explore philosophical issues like the human condition.",
             "In many cases, tropes derived from written science fiction may be used by filmmakers ignorant of or at best indifferent to the standards of scientific plausibility and plot logic to which written science fiction is traditionally held."]

In [0]:
def text_to_idslist(text):
  '''
  get text, add [SEP] and [CLS] to it and returns it as indices list
  '''
  text = str(text)
  text = "[CLS] " + text + " [SEP]"
  return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

In [0]:
def history_to_idslist(history: []):
  '''
  get list of history texts and return one 1d list of indices.
  '''
  temp = ' [SEP] '.join(history)
  temp = ' [CLS] ' + temp + ' [SEP] '
  return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(temp))

In [42]:
temp = history_to_idslist(history)
temp = torch.LongTensor(temp)
temp = temp.unsqueeze(1)
temp.shape

torch.Size([18, 1])

In [0]:
def similarity(model, history, sentence):
  history_tensor = torch.LongTensor(history_to_idslist(history)).unsqueeze(1)
  sentence_tensor = torch.LongTensor(text_to_idslist(sentence)).unsqueeze(1)
  history_tensor = history_tensor.to(device)
  sentence_tensor = sentence_tensor.to(device)
  # history_tensor = [sent_len, 1]
  # sentence_tensor = [sent_len, num_sentences]
  model.eval()
  hst_rpr = model(history_tensor) ## hst_rpr = [1, hidden]
  snt_rpr = model(sentence_tensor) ## snt_rpr = [1, hidden]
  cos = nn.PairwiseDistance()
  similarity = -1*cos(hst_rpr, snt_rpr).item()
  return(similarity)

  

In [57]:
print(similarity(model, history, sentences[0]))
print(similarity(model, history, sentences[1]))
print(similarity(model, history, sentences[2]))

-20.85930633544922
-2.9274656772613525
-20.723146438598633
