# Современные языковые модели на примере GPT-2 и как их применять в диалоговых системах.

## Generation

In [1]:
from tqdm import tqdm_notebook as tqdm

import numpy as np
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Name of the pretrained checkpoint; the same identifier is passed to both
# from_pretrained() calls so the tokenizer and the model come from one source.
weights_shortcut = 'gpt2'

# `tokenizer` and `model` are notebook-level globals used by the cells below.
tokenizer = GPT2Tokenizer.from_pretrained(weights_shortcut)
model = GPT2LMHeadModel.from_pretrained(weights_shortcut)

In [9]:
# Text the model is asked to continue.
prompt_text = 'My name is'
# return_tensors="pt" makes encode() return a PyTorch tensor instead of a plain list.
encoded_prompt = tokenizer.encode(prompt_text, return_tensors="pt")

In [10]:
# Show the token ids produced for the prompt as the cell output.
encoded_prompt

tensor([[3666, 1438,  318]])

In [11]:
# Run on the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The model weights and the prompt tensor are both moved to the chosen device.
model.to(device)
encoded_prompt = encoded_prompt.to(device)

In [19]:
# Continue the prompt with the library's default decoding settings.
# NOTE(review): `eos_token_ids` is the keyword spelling this notebook was
# written against; confirm it matches the installed transformers version.
encoded_result = model.generate(
    encoded_prompt,
    eos_token_ids=tokenizer.eos_token_id,
)

# Decode the first returned sequence back into text and print it.
result = tokenizer.decode(encoded_result[0], skip_special_tokens=True)
print(result)

My name is John. I'm a man of God. I'm a man of God.


## Training

The training data (`paraphrase_dataset.txt`) was preprocessed from the MimicAndRephrase sentiment dataset: https://github.com/square/MimicAndRephrase/tree/master/datasets/Sentiment/Sentiment

In [20]:
from torch.utils.data import DataLoader

def get_dataset_tensor(dataset_path):
    """Tokenize a text file line by line and pack it into one padded tensor.

    Every line of the file is encoded with the notebook-level ``tokenizer``
    and becomes one row of the result. Lines are passed to the tokenizer as
    read, i.e. the trailing newline is not stripped. Rows shorter than the
    longest sample are right-padded with ``tokenizer.pad_token_id``.

    Args:
        dataset_path: Path to a UTF-8 encoded text file, one sample per line.

    Returns:
        A ``torch.int64`` tensor of shape ``(samples_num, max_tokens_num)``.

    Raises:
        ValueError: If the tokenizer has no padding token configured, or if
            the file contains no lines.
    """
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        # Without this check the failure only shows up inside np.full() as an
        # unrelated-looking conversion error.
        raise ValueError(
            'tokenizer.pad_token is not set; assign it before building the dataset'
        )

    # Explicit encoding: the default used by open() depends on the platform.
    with open(dataset_path, encoding='utf-8') as f:
        tokenized_dataset = [tokenizer.encode(line) for line in f]

    if not tokenized_dataset:
        raise ValueError('No samples found in {!r}'.format(dataset_path))

    samples_num = len(tokenized_dataset)
    max_tokens_num = max(map(len, tokenized_dataset))

    # Start from a matrix filled with the pad id, then overwrite the prefix of
    # each row with the real tokens of the corresponding sample.
    input_ids = np.full((samples_num, max_tokens_num), pad_token_id, dtype=np.int64)
    for i, tokens in enumerate(tokenized_dataset):
        input_ids[i, :len(tokens)] = tokens

    return torch.from_numpy(input_ids)

# Reuse the end-of-sequence token as the padding token. get_dataset_tensor()
# reads tokenizer.pad_token_id, so this must be set before the call below.
tokenizer.pad_token = tokenizer.eos_token

train_data_tensor = get_dataset_tensor(dataset_path='paraphrase_dataset.txt')
# Iterating the loader yields shuffled batches of up to 16 rows of the padded tensor.
train_dataloader = DataLoader(train_data_tensor, batch_size=16, shuffle=True)

In [21]:
from transformers import AdamW, get_linear_schedule_with_warmup

def train_model(model, training_data, epochs_num):
    """Fine-tune ``model`` with AdamW and report one loss value per epoch.

    Each batch is used both as input and as labels
    (``model(input_ids, labels=input_ids)``); the first element of the model
    output is taken as the loss. Batches are moved to the notebook-level
    ``device`` before the forward pass.

    Args:
        model: Model to train. It is updated in place and left in training
            mode when the function returns.
        training_data: Iterable yielding batches of token ids (tensors).
        epochs_num: Number of passes over ``training_data``.

    Returns:
        A ``(model, train_loss)`` tuple. ``model`` is the same object that was
        passed in. ``train_loss`` is a list with one float per epoch: the mean
        of the batch losses seen in that epoch, or ``nan`` if the epoch
        produced no batches.
    """
    optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=1)

    train_loss = []

    # Nothing in the loop switches the model out of training mode, so setting
    # it once is enough.
    model.train()

    for _ in tqdm(range(epochs_num), total=epochs_num):
        epoch_loss_sum = 0.0
        batches_num = 0

        for input_ids in training_data:
            input_ids = input_ids.to(device)
            loss = model(input_ids, labels=input_ids)[0]
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            epoch_loss_sum += loss.item()
            batches_num += 1

        # Average over the epoch instead of keeping only the last batch's
        # loss, and do not fail when the data source yields nothing.
        if batches_num:
            train_loss.append(epoch_loss_sum / batches_num)
        else:
            train_loss.append(float('nan'))

    return model, train_loss

In [30]:
# NOTE(review): in top-to-bottom order this cell runs before train_model() is
# called, so `model` has not been fine-tuned yet at this point. Its execution
# counter is higher than that of the training cell, which suggests it was
# originally run after training - confirm the intended position.
encoded_prompt = tokenizer.encode('Jack lost my keys -> ', return_tensors="pt")
encoded_prompt = encoded_prompt.to(device)

# Sample a continuation (do_sample=True) rather than using the default decoding.
encoded_result = model.generate(
    encoded_prompt,
    eos_token_ids=tokenizer.eos_token_id,
    do_sample=True,
)

result = tokenizer.decode(encoded_result[0], skip_special_tokens=True)
print(result)

Jack lost my keys -> I am sorry to hear your laptop accidentally lost something!


In [24]:
# Fine-tune for two epochs. train_model() returns the very model object it was
# given, so `finetuned_model` and `model` refer to the same updated network;
# `metrics_history` holds one loss value per epoch.
finetuned_model, metrics_history = train_model(model, train_dataloader, epochs_num=2)

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




In [28]:
encoded_prompt = tokenizer.encode('I bought a new car -> ', return_tensors="pt").to(device)

# train_model() puts the network into training mode and never switches it
# back; use eval mode for generation so train-only layers such as dropout do
# not perturb the predictions.
finetuned_model.eval()

# Several distinct candidates only make sense with sampling, so request it
# explicitly instead of relying on the library default.
encoded_result = finetuned_model.generate(
    encoded_prompt,
    eos_token_ids=tokenizer.eos_token_id,
    do_sample=True,
    num_return_sequences=5,
)

# Flatten to (num_sequences, seq_len) so the loop visits one sequence per
# iteration whether or not generate() returns a leading batch dimension.
for cur_sample_tokens in encoded_result.reshape(-1, encoded_result.shape[-1]):
    print(tokenizer.decode(cur_sample_tokens, skip_special_tokens=True))

I bought a new  car -> I am glad you bought a new car!!
I bought a new  car -> I am glad you bought a new car!!
I bought a new  car -> I am glad to hear about your new car!
I bought a new  car -> I am glad you bought a new car!!
I bought a new  car -> I am glad you bought a new car!!


### Next steps
* Compute validation metrics: perplexity/BLEU/ROUGE
* Logging into tensorboard
* Generate N candidates and filter or rerank
* Analyze errors and improve dataset
* Improve training: masking, `lr_scheduler`, multi-gpu training
* Improve generation: try different strategies
* Improve the model: use a bigger model, try different architectures (DialoGPT, XLNet, CTRL, etc.)