<a href="https://colab.research.google.com/github/mmsamiei/AI_projects/blob/master/phase2_albert_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab-only: mount the user's Google Drive at /content/drive so later cells
# can copy the tokenized training file from it. This triggers an interactive
# authorization prompt.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [1]:
!pip install transformers



In [0]:
!cp /content/drive/My\ Drive/Thesis/phase-2/tokenized_albert-base_v1_phase2.json ./train.json

In [0]:
from torch.utils.data import Dataset, DataLoader
import os
import torch
import json
from torch.utils.data.sampler import SubsetRandomSampler

In [0]:
class MyDataset(Dataset):
    """Dataset of tokenized dialogues loaded from a JSON file.

    The file is expected to hold a sequence of records, each with three lists
    of token ids stored under the keys ``'history'``, ``'true_sentence'`` and
    ``'false_sentenc'``.  Records are kept sorted by ascending history length,
    so neighbouring indices have histories of similar size.
    """

    def __init__(self, json_file, max_len=40):
        """
        Args:
            json_file (string): Path to the json file with annotations.
            max_len (int): Maximum number of token ids kept per sequence when
                an item is fetched. Defaults to 40.
        """
        # A context manager closes the handle as soon as parsing is done
        # instead of leaving it open until the garbage collector gets to it.
        with open(json_file) as json_handle:
            dialogues = json.load(json_handle)

        self.root_dir = json_file
        self.max_len = max_len
        self.dialogues = sorted(dialogues, key=lambda x: len(x['history']))

    def __len__(self):
        """Return the number of dialogue records."""
        return len(self.dialogues)

    def __getitem__(self, idx):
        """Return one record as a dict of 1-D LongTensors.

        Returns:
            dict with keys ``'history'``, ``'true_sample'`` and
            ``'false_sample'``; every tensor has at most ``max_len`` entries.
        """
        record = self.dialogues[idx]
        limit = self.max_len

        history_lst = record['history']
        true_lst = record['true_sentence']
        # NOTE(review): key spelling 'false_sentenc' is kept exactly as in the
        # original code; presumably it matches the data file -- verify.
        false_lst = record['false_sentenc']

        # The history keeps its LAST `limit` ids (the most recent part) ...
        if len(history_lst) > limit:
            history_lst = history_lst[len(history_lst) - limit:]

        # ... while both candidate sentences keep their FIRST `limit` ids.
        if len(true_lst) > limit:
            true_lst = true_lst[:limit]

        if len(false_lst) > limit:
            false_lst = false_lst[:limit]

        return {
            'history': torch.LongTensor(history_lst),
            'true_sample': torch.LongTensor(true_lst),
            'false_sample': torch.LongTensor(false_lst),
        }

In [0]:
# Load the tokenized dialogues from the local copy created by the earlier
# `cp ... ./train.json` cell.
dataset = MyDataset('train.json')

In [0]:
def my_collate_fn(batch):
  """Pad a list of dataset samples into three rectangular LongTensors.

  Args:
    batch: list of dicts, each holding 1-D integer tensors under the keys
      'history', 'true_sample' and 'false_sample'.

  Returns:
    Tuple ``(history, true_sample, false_sample)`` of int64 tensors shaped
    ``[len(batch), longest sequence of that field in the batch]``.  Shorter
    sequences are right-padded with ``padding_ind``.

  Raises:
    ValueError: if ``batch`` is empty (there is no maximum length to pad to).
  """
  # Padding id is 1 rather than 0 (the original author avoided 0 because of
  # the embedding layer).
  padding_ind = 1

  def pad_field(key):
    # Build the padded matrix directly as int64 so token ids never take a
    # round trip through float32.
    sequences = [data[key] for data in batch]
    longest = max(len(seq) for seq in sequences)
    padded = torch.full((len(sequences), longest), padding_ind, dtype=torch.long)
    for row, seq in enumerate(sequences):
      padded[row, :len(seq)] = seq
    return padded

  return pad_field('history'), pad_field('true_sample'), pad_field('false_sample')

# Batches are drawn in dataset order (no shuffling), so each batch groups
# dialogues whose histories have similar lengths and needs little padding.
sampler = torch.utils.data.SequentialSampler(dataset)
dataset_loader = DataLoader(
    dataset,
    batch_size=256,
    sampler=sampler,
    shuffle=False,
    collate_fn=my_collate_fn,
)

In [0]:
# Sanity check: walk the whole loader once and print the padded history shape
# of every batch.
for batch in dataset_loader:
  history, true_sample, false_sample = batch
  print(history.shape)

# Model

In [0]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
import torch
from transformers import *
import numpy as np
import torch.nn as nn

class RepresentModel(nn.Module):
  """Sentence encoder: pretrained ALBERT followed by a 768 -> 128 projection.

  The representation of a sequence is the hidden state of its first token,
  passed through a linear layer.
  """

  def __init__(self, device, finetune_albert=False):
    """
    Args:
      device: stored on the instance as ``self.device``; the module is not
        moved here, callers still have to call ``.to(device)``.
      finetune_albert (bool): when False (default) every ALBERT weight is
        frozen and only ``fc`` is trainable.  This is what the original code
        effectively did: its loop set ``requires_grad = False`` on every
        parameter after the if/elif, cancelling the selective unfreeze.  When
        True, the parameters whose names contain
        ``'encoder.albert_layer_groups.0'`` or ``'pooler.'`` are left
        trainable, which is what that if/elif was written to do.
    """
    super().__init__()

    self.device = device
    # Built but never handed to from_pretrained, so it has no influence on the
    # loaded network; kept only because it is a public attribute.
    self.albert_config = AlbertConfig(num_hidden_layers=6)
    self.albert = AlbertModel.from_pretrained('albert-base-v1')
    self.fc = nn.Linear(768, 128)

    trainable_markers = ('encoder.albert_layer_groups.0', 'pooler.')
    for name, p in self.albert.named_parameters():
      selected = any(marker in name for marker in trainable_markers)
      p.requires_grad = bool(finetune_albert) and selected

  def forward(self, x, attention_mask=None):
    """Encode a batch of token-id sequences.

    Args:
      x: integer tensor of token ids, [batch, sent].
      attention_mask: optional mask forwarded unchanged to ALBERT.  With the
        default ``None`` the call is equivalent to ``self.albert(x)``, i.e.
        padded positions are attended to like ordinary tokens.

    Returns:
      Tensor [batch, 128].
    """
    ## first output of ALBERT: [batch, sent_len, 768]
    hidden_states = self.albert(x, attention_mask=attention_mask)[0]
    ## hidden state of the first token: [batch, 768]
    first_token = hidden_states[:, 0, :]
    return self.fc(first_token)


In [0]:
# hid_size is handed to NoamOpt as `model_size` in the optimizer cell below;
# 768 is also the input width of RepresentModel.fc.
hid_size = 768
# Build the encoder and move all of its weights to the selected device.
model = RepresentModel(device).to(device)

In [10]:
# Smoke test: run a batch of random token ids through the model and display
# the shape of the result (expected [batch_size, 128], the width of model.fc).
test_len = 20
batch_size = 64
vocab_size = 30000
# Ids start at 1, so id 0 is never drawn.
test_input = torch.LongTensor(batch_size, test_len).random_(1,vocab_size).to(device)
model(test_input).shape

torch.Size([64, 128])

In [11]:
def count_parameters(model):
    """Return the total number of scalar weights in `model` that have
    `requires_grad` set, i.e. the ones an optimizer would update."""
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Report how many of the model's weights have requires_grad=True.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 98,432 trainable parameters


In [0]:
class NoamOpt:
    """Optimizer wrapper that sets the learning rate on every step.

    The rate is

        factor * model_size ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)

    which grows linearly for the first `warmup` steps and then decays with the
    inverse square root of the step number.
    """

    def __init__(self, model_size, factor, warmup, optimizer):
        """
        Args:
            model_size: value whose inverse square root scales the rate.
            factor: constant multiplier applied to the rate.
            warmup: number of steps over which the rate increases.
            optimizer: wrapped optimizer; must expose `param_groups`,
                `step()` and `zero_grad()`.
        """
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        "Update parameters and rate"
        self._step += 1
        rate = self.rate()
        # Overwrite the learning rate of every parameter group before stepping,
        # so whatever `lr` the wrapped optimizer was built with is ignored.
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()

    def rate(self, step = None):
        """Return the learning rate for `step` (default: the current step).

        For `step <= 0` (e.g. when called before the first `step()`), 0.0 is
        returned instead of raising on `0 ** -0.5`; this matches the value the
        linear warm-up term takes at step 0.
        """
        if step is None:
            step = self._step
        if step <= 0:
            return 0.0
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))

    def zero_grad(self):
        "Clear the gradients held by the wrapped optimizer."
        self.optimizer.zero_grad()

In [13]:
!pip install -U tqdm

Requirement already up-to-date: tqdm in /usr/local/lib/python3.6/dist-packages (4.41.1)


In [0]:
from tqdm.notebook import tqdm

def train_one_epoch(model,train_iter, optimizer, criterion, clip):
  """Run one pass over `train_iter` and return the mean batch loss.

  Each batch is a `(history, true_sample, false_sample)` triple.  All three are
  encoded with the same `model`; the score of a candidate is the negative
  Euclidean distance between its representation and the history's, and
  `criterion(true_score, false_score, ones)` is minimised.

  Args:
    model: module mapping a batch of sequences to [batch, hidden] vectors.
    train_iter: iterable of batches.
    optimizer: object exposing `zero_grad()` and `step()`.
    criterion: ranking loss called as `criterion(x1, x2, target)`.
    clip: maximum gradient norm passed to `clip_grad_norm_`.

  Returns:
    Mean loss over the batches seen, or 0.0 if `train_iter` produced none.
  """
  model.train()
  # Run on whatever device the model's weights live on instead of relying on
  # a notebook-level global.
  model_device = next(model.parameters()).device
  # Loop-invariant and parameter-free, so build it once.  Despite the name the
  # original gave it (`cos`), this is the pairwise L2 distance.
  distance = nn.PairwiseDistance()

  epoch_loss = 0.0
  n_batches = 0
  for batch in tqdm(train_iter):
    optimizer.zero_grad()
    history, true_sample, false_sample = batch
    ### hist = [batch, sent_len]
    history_rpr = model(history.to(model_device))
    true_rpr = model(true_sample.to(model_device))
    false_rpr = model(false_sample.to(model_device))
    ## rpr = [batch_size, hidden]
    tru_sml = -distance(history_rpr, true_rpr)
    fls_sml = -distance(history_rpr, false_rpr)
    # Target of +1 for every row: the first score should outrank the second.
    target = torch.ones(history.shape[0], device=model_device)
    loss = criterion(tru_sml, fls_sml, target)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    epoch_loss += loss.item()
    n_batches += 1

  # Guard against an empty iterator instead of dividing by zero.
  return epoch_loss / n_batches if n_batches else 0.0

In [0]:
def train(model, train_iter, optimizer, criterion, clip, N_EPOCH):
  """Train `model` for `N_EPOCH` passes over `train_iter`, printing the mean
  loss reported by `train_one_epoch` after each pass.  Returns nothing."""
  for _ in range(N_EPOCH):
    print(train_one_epoch(model, train_iter, optimizer, criterion, clip))

In [0]:
# NoamOpt overwrites the learning rate of every param group on each step, so
# Adam's lr=0 is only a placeholder. Arguments: model_size=hid_size, factor=1,
# warmup=2000 steps.
optimizer = NoamOpt(hid_size, 1, 2000,
              torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))
# Ranking loss with margin 10; train_one_epoch feeds it the negated distances
# as scores together with an all-ones target.
criterion = torch.nn.MarginRankingLoss(margin=10).to(device)
# Train for a single epoch with gradient-norm clipping at 1.
train(model, dataset_loader, optimizer, criterion, 1, 1)

HBox(children=(FloatProgress(value=0.0, max=9642.0), HTML(value='')))