# Sber-gpt3 implimentation for nti2021-ai problem

In [1]:
!pip install transformers
!pip install -q git+https://github.com/gmihaila/ml_things.git
!pip install jsonlines

  Building wheel for ml-things (setup.py) ... [?25l[?25hdone


In [2]:
import io
import os
import torch
from tqdm.notebook import tqdm
import json
import re
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from ml_things import plot_dict, plot_confusion_matrix, fix_text
from sklearn.metrics import classification_report, accuracy_score
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          AutoTokenizer,
                          AdamW, 
                          get_linear_schedule_with_warmup,
                          AutoModelWithLMHead)

set_seed(179)

epochs = 4
batch_size = 10

max_length = 512

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'device: {device}')
model_name_or_path = 'sberbank-ai/rugpt3small_based_on_gpt2'

n_labels = 2

device: cuda


# Downloading data

In [3]:
!mkdir data/

!curl https://raw.githubusercontent.com/AI-Front/NTI/main/semifinals/data/train.jsonl -o data/train.jsonl
!curl https://raw.githubusercontent.com/AI-Front/NTI/main/semifinals/data/val.jsonl -o data/val.jsonl
!curl https://raw.githubusercontent.com/AI-Front/NTI/main/semifinals/data/test.jsonl -o data/test.jsonl

mkdir: cannot create directory ‘data/’: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 2861k  100 2861k    0     0  13.3M      0 --:--:-- --:--:-- --:--:-- 13.3M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  553k  100  553k    0     0  12.8M      0 --:--:-- --:--:-- --:--:-- 12.8M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1698k  100 1698k    0     0  24.0M      0 --:--:-- --:--:-- --:--:-- 24.0M


# QA dataset

In [4]:
class QADataset(Dataset):
    """Question and answers dataset"""
    
    def __init__(self, path, tokenizer, max_len):
        """
        Args:
            path (string): Path to jsonl file
        """
        self.path = path
        data = []
        with open(path, 'r') as json_file:
            json_list = list(json_file)
            for json_str in json_list:
                item = json.loads(json_str)
                text = re.sub(r'\(\d+\)', "", item['passage']['text']).replace('"', '')
                questions = item['passage']['questions']
                for q in questions:
                    question = q['question']
                    ans = q['answers']
                    for a in ans:
                        pad_len = max(0, len(question.split()) + len(a["text"].split()) + len(text.split()) - 509) // 2
                        pad_text = ' '.join(text.split()[pad_len:-pad_len - 1])
                        data.append({"idx": a["idx"],
                                     "text": text,
                                     "question": question,
                                     "answer": a["text"],
                                     "data": f"Контекст: {pad_text} Вопрос: {question} Ответ: {a['text']}"})

                        if 'label' in a:
                            data[-1]["label"] = a["label"]
        self.data = pd.DataFrame.from_dict(data).set_index("idx")
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def get_df(self):
        texts = self.data['data']
        labels = self.data['label'] if 'label' in self.data.columns else [None] * self.data.shape[0]

        return pd.DataFrame({
                    'text': texts,
                    'labels': labels
                }) 
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text = self.data.iloc[idx].data
        label = self.data.iloc[idx].label if "label" in self.data.columns else None

        return {'text': text,
                'label': label}


In [5]:
class Gpt2Collator(object):
    def __init__(self, tokenizer, max_len=None):

        self.tokenizer = tokenizer
        self.max_len = tokenizer.model_max_length if max_len is None else max_len

        return

    def __call__(self, sequences):
        texts = [sequence['text'] for sequence in sequences]
        labels = [sequence['label'] for sequence in sequences]

        inputs = self.tokenizer(text=texts, return_tensors="pt", padding=True, truncation=True,  max_length=self.max_len)
        inputs.update({'labels': torch.tensor(labels)})

        return inputs

In [6]:
def train(dataloader, optimizer_, scheduler_, device_):
    global model

    predictions_labels = []
    true_labels = []
    total_loss = 0

    model.train()

    for batch in tqdm(dataloader, total=len(dataloader)):

        true_labels += batch['labels'].numpy().flatten().tolist()
        batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}
        print(len(batch['input_ids'][0]))

        model.zero_grad()
        outputs = model(**batch)

        loss, logits = outputs[:2]
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        logits = logits.detach().cpu().numpy()
        predictions_labels += logits.argmax(axis=-1).flatten().tolist()

    avg_epoch_loss = total_loss / len(dataloader)
    return true_labels, predictions_labels, avg_epoch_loss



def validation(dataloader, device_):
    global model

    predictions_labels = []
    true_labels = []
    total_loss = 0

    model.eval()

    for batch in tqdm(dataloader, total=len(dataloader)):

        true_labels += batch['labels'].numpy().flatten().tolist()
        batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}

        with torch.no_grad():        

            outputs = model(**batch)

            loss, logits = outputs[:2]
            
            logits = logits.detach().cpu().numpy()
            total_loss += loss.item()
            
            predict_content = logits.argmax(axis=-1).flatten().tolist()
            predictions_labels += predict_content

    avg_epoch_loss = total_loss / len(dataloader)
    return true_labels, predictions_labels, avg_epoch_loss

In [7]:
# Get model's tokenizer.
print('Loading tokenizer...')
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token_id = '[PAD]'

print('Loading model...')
model = AutoModelWithLMHead.from_pretrained(model_name_or_path)
# print(model)

model.to(device)
print('Model loaded to `%s`' % device)

Loading tokenizer...


Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


Loading model...
Model loaded to `cuda`


## **Create dataloaders**

In [8]:
gpt2_classificaiton_collator = Gpt2Collator(tokenizer=tokenizer, 
                                            max_len=max_length)


print('Dealing with Train...')
train_dataset = QADataset(path='data/train.jsonl', 
                            tokenizer=tokenizer, 
                            max_len=max_length)
print('Created `train_dataset` with %d examples!' % len(train_dataset))

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=gpt2_classificaiton_collator)
print('Created `train_dataloader` with %d batches!'%len(train_dataloader))

print()

print('Dealing with Validation...')
valid_dataset =  QADataset(path='data/val.jsonl', 
                            tokenizer=tokenizer,
                            max_len=max_length)
print('Created `valid_dataset` with %d examples!' % len(valid_dataset))

valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=gpt2_classificaiton_collator)
print('Created `eval_dataloader` with %d batches!' % len(valid_dataloader))

Dealing with Train...
Created `train_dataset` with 11950 examples!
Created `train_dataloader` with 1195 batches!

Dealing with Validation...
Created `valid_dataset` with 2235 examples!
Created `eval_dataloader` with 224 batches!


## **Train**

In [9]:
torch.cuda.empty_cache()
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # default is 1e-8.
                  )


total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

all_loss = {'train_loss':[], 'val_loss':[]}
all_acc = {'train_acc':[], 'val_acc':[]}

print("Epoch")
for epoch in tqdm(range(epochs)):
    print()
    print('Training on batches...')

    train_labels, train_predict, train_loss = train(train_dataloader, optimizer, scheduler, device)
    train_acc = accuracy_score(train_labels, train_predict)

    print('Validation on batches...')
    valid_labels, valid_predict, val_loss = validation(valid_dataloader, device)
    val_acc = accuracy_score(valid_labels, valid_predict)

    print("  train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f" % (train_loss, val_loss, train_acc, val_acc))
    print()

    all_loss['train_loss'].append(train_loss)
    all_loss['val_loss'].append(val_loss)
    all_acc['train_acc'].append(train_acc)
    all_acc['val_acc'].append(val_acc)


plot_dict(all_loss, use_xlabel='Epochs', use_ylabel='Value', use_linestyles=['-', '--'])
plot_dict(all_acc, use_xlabel='Epochs', use_ylabel='Value', use_linestyles=['-', '--'])

Epoch


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


Training on batches...


HBox(children=(FloatProgress(value=0.0, max=1195.0), HTML(value='')))

370




ValueError: ignored

## **Evaluate**

In [None]:
true_labels, predictions_labels, avg_epoch_loss = validation(valid_dataloader, device)


evaluation_report = classification_report(true_labels, predictions_labels, labels=[0, 1]], target_names=[0, 1])
print(evaluation_report)

plot_confusion_matrix(y_true=true_labels, 
                      y_pred=predictions_labels, 
                      classes=[0, 1],
                      normalize=True, 
                      magnify=0.1,
                      )