In [1]:
!pip install transformers > /dev/null

In [2]:
import warnings
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np

from tqdm.auto import tqdm

from transformers import AutoModelForCausalLM, AutoTokenizer, T5ForConditionalGeneration

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, random_split

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed the random number generators this notebook draws from, so that row
# sampling, the train/test split and batch shuffling repeat across a
# "Restart & Run All" (GPU kernels may still introduce small numeric noise).
seed = 42
np.random.seed(seed)     # used by pandas DataFrame.sample when no random_state is passed
torch.manual_seed(seed)  # used by random_split, DataLoader(shuffle=True) and dropout

# Training hyper-parameters.
epochs = 10      # full passes over the training split
batch_size = 4   # examples per optimisation step

# Input data location (Kaggle dataset mount).
csv_path = "/kaggle/input/tinkoff-ml/data.csv"
# NOTE: despite the name this is a file path, not a directory; train() hands it to torch.save.
out_dir = "/kaggle/working/model.pt"

# ruDialoGPT fine-tuning

In [None]:
# Checkpoint names: the tokenizer is taken from the rugpt3small repository,
# the weights from ruDialoGPT-medium.
# NOTE(review): tokenizer and model come from different checkpoints — presumably
# they share a vocabulary; confirm before relying on this pairing.
tokenizer_checkpoint = "ai-forever/rugpt3small_based_on_gpt2"
model_checkpoint = "tinkoff-ai/ruDialoGPT-medium"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
# Reuse the end-of-sequence token as the padding token, so padded positions hold the EOS id.
tokenizer.pad_token = tokenizer.eos_token

# Load the weights and move them to the selected device in one expression.
model = AutoModelForCausalLM.from_pretrained(model_checkpoint).to(device)

# AdamW over all model parameters; the LinearLR scheduler keeps its default
# settings and is advanced once per epoch inside train().
opt = torch.optim.AdamW(params=model.parameters(), lr=1e-5)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer=opt)

In [4]:
def flatten(l):
    """Return one flat list holding the items of every sub-iterable of ``l``.

    Exactly one level of nesting is removed and the original order is kept.
    """
    return [item for sublist in l for item in sublist]


def collate(batch):
    """Combine a list of variable-length sequence tensors into one padded batch.

    Shorter sequences are filled up with the id of the module-level
    ``tokenizer``'s padding token; the batch dimension comes first.
    """
    pad_id = tokenizer.pad_token_id
    padded = pad_sequence(batch, batch_first=True, padding_value=pad_id)
    return padded


class DialogueDataset(Dataset):
    """Dialogues read from a CSV file, served as flat token-id tensors.

    Every item is the concatenation of one dialogue's turns, each turn followed
    by the EOS token of the module-level ``tokenizer``.
    """

    # Turns with this many characters or more are rejected so the dialogue fits the model.
    MAX_TURN_CHARS = 500

    def __init__(self, csv_path):
        """Load ``csv_path`` and keep only the usable dialogues.

        A dialogue is kept when ``context_2``, ``context_1`` and ``response``
        are all present and shorter than ``MAX_TURN_CHARS`` characters (i.e. it
        has at least three turns).  ``context_3`` is optional: it may be
        missing, but when present it must satisfy the same length limit.
        """
        # Missing cells become the integer 0, which __getitem__ uses as the "no turn" marker.
        frame = pd.read_csv(csv_path).fillna(0)

        def short_text(column):
            # True only for string cells below the limit.  The 0 placeholder and
            # any other non-string value give False, and this also works on a
            # column with no strings at all, where the .str accessor would raise.
            limit = self.MAX_TURN_CHARS
            return column.map(lambda v: isinstance(v, str) and len(v) < limit).astype(bool)

        required_ok = (
            short_text(frame["context_2"])
            & short_text(frame["context_1"])
            & short_text(frame["response"])
        )
        # Previously the length test alone was applied to context_3, which
        # silently discarded every dialogue whose context_3 was missing.
        context_3 = frame["context_3"]
        optional_ok = (context_3 == 0) | short_text(context_3)

        self.csv = frame[required_ok & optional_ok]

    def __len__(self):
        """Number of dialogues that passed the filters."""
        return len(self.csv)

    def __getitem__(self, idx):
        """Return the ``idx``-th dialogue as a 1-D ``torch.LongTensor`` of token ids."""
        # NOTE(review): the first two columns are skipped by position; the rest are
        # presumably context_3, context_2, context_1, response in that order —
        # confirm against the CSV header.
        turns = self.csv.iloc[idx].tolist()[2:]
        if turns[0] == 0:
            # The oldest turn is missing: start from the next one.
            turns = turns[1:]
        token_ids = [
            token_id
            for turn in turns
            for token_id in tokenizer.encode(turn + tokenizer.eos_token)
        ]
        return torch.LongTensor(token_ids)

In [5]:
def train(epochs, model, solver, scheduler, train_loader, test_loader, out_dir):
    """Fine-tune a causal language model and print the mean losses per epoch.

    Args:
        epochs: number of passes over ``train_loader``.
        model: module called as ``model(input_ids, labels=...)`` whose output
            exposes a ``.loss`` attribute.
        solver: optimizer, stepped after every training batch.
        scheduler: learning-rate scheduler, stepped once per epoch.
        train_loader: iterable of token-id tensors used for optimisation.
        test_loader: iterable of token-id tensors used for evaluation only.
        out_dir: path passed to ``torch.save``; the whole model object is
            written there after the last epoch.

    Batches are moved to the module-level ``device``.
    """

    def mean(values):
        # An empty loader produces no losses; report NaN instead of dividing by zero.
        return sum(values) / len(values) if values else float("nan")

    print('Training.')
    print("Epoch\tTrain loss\tTest loss")

    for t in range(epochs):
        # Switch back to training mode on every epoch.  The evaluation pass below
        # puts the model into eval mode, which previously persisted into all later
        # epochs because train() was only called once before the loop.
        model.train()

        epoch_loss = []
        for batch in tqdm(train_loader):
            # One device copy is enough: the batch is both the input and the labels.
            tokens = batch.to(device)

            solver.zero_grad()

            # NOTE(review): labels are the unmodified batch, so padded positions
            # also contribute to the loss — consider masking them.
            loss = model(tokens, labels=tokens).loss
            epoch_loss.append(loss.item())

            loss.backward()
            solver.step()

            del tokens
            torch.cuda.empty_cache()
        scheduler.step()

        model.eval()
        test_loss = []
        with torch.no_grad():
            for batch in tqdm(test_loader):
                tokens = batch.to(device)
                test_loss.append(model(tokens, labels=tokens).loss.item())

                del tokens
                torch.cuda.empty_cache()

        print('%d\t%4.3f\t\t%4.4f' %
                (t+1, mean(epoch_loss), mean(test_loss)))

        torch.cuda.empty_cache()

    torch.save(model, out_dir)
    print("Saved model.")

In [None]:
dataset = DialogueDataset(csv_path)

# 80/20 train/test split; the test part receives whatever the rounding leaves over.
n_total = len(dataset)
train_size = int(n_total * 0.8)
test_size = n_total - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Both loaders pad their batches through collate(); only the training loader shuffles.
loader_kwargs = dict(batch_size=batch_size, collate_fn=collate)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

print(f"Train size: {train_size}, test size: {test_size}")

In [None]:
# Launch the fine-tuning run; the model object is saved to out_dir once all epochs finish.
train(
    epochs=epochs,
    model=model,
    solver=opt,
    scheduler=scheduler,
    train_loader=train_loader,
    test_loader=test_loader,
    out_dir=out_dir,
)

# T5 fine-tuning

In [4]:
# Tokenizer and weights both come from the same ruT5-base checkpoint.
t5_checkpoint = "ai-forever/ruT5-base"

tokenizer = AutoTokenizer.from_pretrained(t5_checkpoint)
# Load the weights and move them to the selected device in one expression.
model = T5ForConditionalGeneration.from_pretrained(t5_checkpoint).to(device)

# AdamW over all model parameters; the LinearLR scheduler keeps its default
# settings and is advanced once per epoch inside train().
opt = torch.optim.AdamW(params=model.parameters(), lr=1e-5)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer=opt)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [5]:
class DialogueDataset(Dataset):
    """Question/answer pairs read from a CSV file, tokenized for seq2seq fine-tuning.

    ``context_1`` is used as the question and ``response`` as the answer.
    Tokenization happens lazily in ``__getitem__`` with the module-level
    ``tokenizer``.
    """

    # Texts with this many characters or more are rejected so they fit the model.
    MAX_TEXT_CHARS = 500
    # Every encoded question/answer is padded (and, if needed, cut) to this many tokens.
    MAX_TOKENS = 512

    def __init__(self, csv_path, max_samples=25000, random_state=None):
        """Load ``csv_path``, filter it and draw a random subset.

        Args:
            csv_path: CSV file with at least ``context_1`` and ``response`` columns.
            max_samples: upper bound on the number of pairs kept.  When fewer
                pairs survive the filters, all of them are used instead of
                raising.
            random_state: forwarded to ``DataFrame.sample``; pass an int for a
                reproducible subset.
        """
        frame = pd.read_csv(csv_path).fillna(0)

        def short_text(column):
            # True only for string cells below the limit; missing cells (filled
            # with 0 above) and other non-string values give False.
            limit = self.MAX_TEXT_CHARS
            return column.map(lambda v: isinstance(v, str) and len(v) < limit).astype(bool)

        frame = frame[short_text(frame["context_1"]) & short_text(frame["response"])]
        # Never request more rows than exist: sample() raises in that case.
        frame = frame.sample(n=min(max_samples, len(frame)), random_state=random_state)

        self.question = frame["context_1"]
        self.answer = frame["response"]

    def __len__(self):
        """Number of question/answer pairs in the sampled subset."""
        return len(self.question)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for pair ``idx``.

        All three are 1-D tensors of length ``MAX_TOKENS``.
        """
        question = self.question.iloc[idx]
        answer = self.answer.iloc[idx]

        encode_kwargs = dict(
            max_length=self.MAX_TOKENS,
            padding="max_length",
            truncation=True,  # guarantees a fixed length so the default collate can stack items
            return_tensors="pt",
        )
        q = tokenizer(question, **encode_kwargs)
        a = tokenizer(answer, **encode_kwargs)

        # Padded label positions are set to -100 so the loss ignores them;
        # otherwise most of the training signal is "predict the pad token".
        labels = a.input_ids.flatten().masked_fill(a.attention_mask.flatten() == 0, -100)

        return {
            "input_ids": q.input_ids.flatten(),
            "attention_mask": q.attention_mask.flatten(),
            "labels": labels
        }

In [6]:
def train(epochs, model, solver, scheduler, train_loader, test_loader, out_dir):
    """Fine-tune a seq2seq model and print the mean losses per epoch.

    Args:
        epochs: number of passes over ``train_loader``.
        model: module called with ``input_ids``, ``attention_mask`` and
            ``labels`` whose output exposes a ``.loss`` attribute.
        solver: optimizer, stepped after every training batch.
        scheduler: learning-rate scheduler, stepped once per epoch.
        train_loader: iterable of dict batches with the keys ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"``, used for optimisation.
        test_loader: same batch format, used for evaluation only.
        out_dir: path passed to ``torch.save``; the whole model object is
            written there after the last epoch.

    Batches are moved to the module-level ``device``.
    """

    def mean(values):
        # An empty loader produces no losses; report NaN instead of dividing by zero.
        return sum(values) / len(values) if values else float("nan")

    def batch_loss(batch):
        # Move the three tensors of one batch to the device and run a forward pass.
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["labels"].to(device),
        )
        return outputs.loss

    print('Training.')
    print("Epoch\tTrain loss\tTest loss")

    for t in range(epochs):
        # Switch back to training mode on every epoch.  The evaluation pass below
        # puts the model into eval mode, which previously persisted into all later
        # epochs because train() was only called once before the loop.
        model.train()

        epoch_loss = []
        for batch in tqdm(train_loader):
            solver.zero_grad()

            loss = batch_loss(batch)
            epoch_loss.append(loss.item())

            loss.backward()
            solver.step()

            torch.cuda.empty_cache()
        scheduler.step()

        model.eval()
        test_loss = []
        with torch.no_grad():
            for batch in tqdm(test_loader):
                test_loss.append(batch_loss(batch).item())
                torch.cuda.empty_cache()

        print('%d\t%4.3f\t\t%4.4f' %
                (t+1, mean(epoch_loss), mean(test_loss)))

        torch.cuda.empty_cache()

    torch.save(model, out_dir)
    print("Saved model.")

In [7]:
dataset = DialogueDataset(csv_path)

# 80/20 train/test split; the test part receives whatever the rounding leaves over.
n_total = len(dataset)
train_size = int(n_total * 0.8)
test_size = n_total - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Items already have a fixed length, so the default collate function is used;
# only the training loader shuffles.
loader_kwargs = dict(batch_size=batch_size)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

print(f"Train size: {train_size}, test size: {test_size}")

Train size: 20000, test size: 5000


In [None]:
# Launch the fine-tuning run; the model object is saved to out_dir once all epochs finish.
train(
    epochs=epochs,
    model=model,
    solver=opt,
    scheduler=scheduler,
    train_loader=train_loader,
    test_loader=test_loader,
    out_dir=out_dir,
)