<a href="https://colab.research.google.com/github/myutman/NLP/blob/master/HW4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install transformers



In [0]:
import transformers
from transformers import BertTokenizer, BertModel

import torch
import torch.nn as nn
import torch.optim as optim
from torch.functional import F

from tqdm import tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

import re

In [0]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
class MLP(nn.Module):
    def __init__(self, nin, nout, n_hidden):
        super(MLP, self).__init__()
        self.layers = []
        self.layers.extend([
            nn.Linear(nin, 32),
            nn.Sigmoid(),
            nn.Dropout(0.5)
        ])
        for i in range(1, n_hidden - 1):
            self.layers.extend([
                nn.Linear(32, 32),
                nn.Sigmoid(),
                nn.Dropout(0.5)
            ])
        self.layers.extend([
            nn.Linear(32, nout),
            nn.Softmax(dim=-1)
        ])
        self.model = nn.Sequential(*self.layers)
    
    def forward(self, x):
        return self.model(x)


class QAModel(nn.Module):
    def __init__(self):
        super(QAModel, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
        for par in self.bert.parameters():
            par.requires_grad = False

        self.param_fr = nn.Parameter(torch.ones((768, 1)))
        self.mlp_fr = MLP(256, 256, 3)

        self.param_to = nn.Parameter(torch.ones((768, 1)))
        self.mlp_to = MLP(256, 256, 3)

    def forward(self, x, mask):
        states, _ = self.bert(x, attention_mask=mask)
        print(states.shape)
        fr_vec = (states @ self.param_fr).reshape(-1, 256)
        fr_out = self.mlp_fr(fr_vec)
        
        to_vec = (states @ self.param_to).reshape(-1, 256)
        to_out = self.mlp_to(to_vec)

        return fr_out, to_out

EPS = 1e-9

def J(output_froms, output_tos, froms, tos):
    fr_probs = - torch.log(output_froms[torch.arange(len(output_froms)), froms] + EPS) * (froms != -1)
    to_probs = - torch.log(output_tos[torch.arange(len(output_tos)), tos] + EPS) * (tos != -1)
    no_fr_probs = - torch.log(output_froms + EPS).sum(dim=-1) * (froms == -1)
    no_to_probs = - torch.log(output_tos + EPS).sum(dim=-1) * (tos == -1)
    return fr_probs.sum() + to_probs.sum() + no_fr_probs.sum() + no_to_probs.sum()



In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

In [0]:
df = pd.read_csv('/content/gdrive/My Drive/NLP/train_qa.csv')

In [0]:
batch_size = 8
max_len = 256

def colate_fn(data):
    #print(len(data))
    X, fr, to, mask = zip(*data)
    #print(torch.tensor(X).shape)
    #print(torch.tensor(y).shape)
    #print(torch.tensor(mask).shape)
    return torch.tensor(X), torch.tensor(fr), torch.tensor(to), torch.tensor(mask)

def prepare_dataset(df):
    texts = list(df['paragraph'])
    quests = list(df['question'])
    anss = list(df['answer'])
    X = []
    froms = []
    tos = []
    masks = []
    for text, quest, ans in tqdm(list(zip(texts, quests, anss))):
        text = ' '.join(re.findall('\w+', text.lower()))
        quest = ' '.join(re.findall('\w+', quest.lower()))
        ans = ' '.join(re.findall('\w+', ans.lower()))

        text_tokens = tokenizer.encode(text)[1:-1]
        quest_tokens = tokenizer.encode(quest)[1:-1]
        ans_tokens = tokenizer.encode(ans)[1:-1]
        
        fr = -1
        to = -1
        for i in range(len(text_tokens) - len(ans_tokens)):
            if text_tokens[i:i + len(ans_tokens)] == ans_tokens:
                fr = i
                to = i + len(ans_tokens) - 1
        
        if fr == -1:
            continue

        l = 0
        r = max_len - len(quest_tokens) - 4
        if len(text_tokens) > max_len - len(quest_tokens) - 3:
            step = (max_len - len(quest_tokens) - 3) // 3
        else:
            step = len(text_tokens)
        while len(text_tokens) > 0:
            cnt = min(len(text_tokens), max_len - len(quest_tokens) - 3)
            tokens = [tokenizer.cls_token_id] + quest_tokens + [tokenizer.sep_token_id] + text_tokens[:cnt] + [tokenizer.sep_token_id] + [tokenizer.pad_token_id] * (max_len - cnt - len(quest_tokens) - 3)
            mask = [0 if (i < len(quest_tokens) + 1) or (tokens[i] == tokenizer.pad_token_id) else 1 for i in range(max_len)]
            X.append(tokens)
            masks.append(mask)
            if fr >= l and fr <= r:
                froms.append(len(quest) + 2 + fr - l)
            else:
                froms.append(-1)
            if to >= l and to <= r:
                tos.append(len(quest) + 2 + to - l)
            else:
                tos.append(-1)

            text_tokens = text_tokens[step:]
            l += step
            r += step

    X_train, X_test, froms_train, froms_test, tos_train, tos_test, mask_train, mask_test = train_test_split(X, froms, tos, masks, test_size=0.2)
    train_data = torch.utils.data.DataLoader(list(zip(X_train, froms_train, tos_train, mask_train)), batch_size=batch_size, collate_fn=colate_fn)
    test_data = torch.utils.data.DataLoader(list(zip(X_test, froms_test, tos_test, mask_test)), batch_size=batch_size, collate_fn=colate_fn)
    return train_data, test_data

In [0]:
train_data, test_data = prepare_dataset(df[:200])

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [0]:
model = QAModel()
model.to(device)
model.train()

QAModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
   

In [0]:
#Head training

num_epochs = 3
adam = optim.Adam(filter(lambda par: par.requires_grad, model.parameters()), lr=5e-5)

train_losses = []
test_losses = []

kek = False
for i in tqdm(range(num_epochs)):
    losses = []
    for X, fr, to, mask in train_data:
        adam.zero_grad()
        out_fr, out_to = model(X.to(device), mask.to(device))

        loss = J(out_fr.to(device), out_to.to(device), fr.to(device), to.to(device))
        #loss = J(out_fr, out_to, f: device-side assert triggered at /pytorch/aten/src/THC/generic/THCTensorMath.cu:26r, to)
        losses.append(float(loss.double()))

        loss.backward()
        adam.step()

    train_losses.append(np.mean(losses))

    losses = []
    with torch.no_grad():
        for X, fr, to, mask in test_data:
            adam.zero_grad()
            out_fr, out_to = model(X.to(device), mask.to(device))

            loss = J(out_fr.to(device), out_to.to(device), fr.to(device), to.to(device))
            losses.append(float(loss.double()))

    test_losses.append(np.mean(losses))    

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

torch.Size([8, 256, 768])



RuntimeError: ignored

In [0]:
plt.plot(range(num_epochs), train_losses)
plt.plot(range(num_epochs), test_losses)
plt.legend(['train', 'test'])
plt.show()

In [12]:
!nvidia-smi

Sat Dec  7 14:22:41 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.33.01    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P0    26W /  75W |   1403MiB /  7611MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------