# GPT-2

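Trains a GPT-2 language model from scratch on Japanese text, tokenized with the pretrained `bert-base-japanese` tokenizer from `transformers`. The checkpoint with the lowest validation loss is saved under `output_dir`.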

## Install dependent libraries
This section installs the required packages. Versions are pinned for reproducibility.

In [None]:
! pip install transformers==2.6.0
! pip install tqdm==4.43.0
! pip install mecab-python3==0.996.2
! pip install attrdict==2.0.1
! pip install tensorboard==2.1.1

## Parameters
Declare the default parameters; `papermill` can override them at execution time.

In [None]:
# --- General: run name and input/output locations ---
name = "test"
data_dir = "./data_sample/"
output_dir = "./output/"

# --- Model: GPT-2 architecture sizes ---
n_embd = 768
n_layer = 12
n_head = 12
n_ctx = 1024

# --- Training: optimisation and early-stopping settings ---
seed = 1234
num_epochs = 30
batch_size = 2
block_size = 1024  # tokens per training example
learning_rate = 5e-5
max_grad_norm = 1.0
warmup_rate = 0.1  # fraction of total steps used for learning-rate warmup
patience = 3

In [None]:
import attrdict

# Bundle every notebook parameter into a single attribute-accessible mapping.
params = attrdict.AttrDict(dict(
    name=name,
    data_dir=data_dir,
    output_dir=output_dir,
    n_embd=n_embd,
    n_layer=n_layer,
    n_head=n_head,
    n_ctx=n_ctx,
    seed=seed,
    num_epochs=num_epochs,
    batch_size=batch_size,
    block_size=block_size,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    warmup_rate=warmup_rate,
    patience=patience,
))

# Remove the loose globals so later cells can only read values via `params`.
del name, data_dir, output_dir
del n_embd, n_layer, n_head, n_ctx
del seed, num_epochs, batch_size, block_size
del learning_rate, max_grad_norm, warmup_rate, patience

## Test library

Test all the libraries used in this notebook.

## Preprocessor and tokenizer

In [None]:
import transformers


def build_tokenizer():
    """Load and return the pretrained "bert-base-japanese" tokenizer."""
    tokenizer = transformers.BertJapaneseTokenizer.from_pretrained("bert-base-japanese")
    return tokenizer

## Dataset

In [None]:
import torch


class Dataset(torch.utils.data.Dataset):
    """Fixed-length blocks of token ids cut from a concatenated corpus.

    All texts are tokenized, their ids are joined into one long sequence, and
    that sequence is split into consecutive, non-overlapping chunks of
    ``block_size`` ids. Trailing ids that do not fill a whole block are dropped.
    """

    def __init__(self, tokenizer, texts, block_size):
        """Tokenize ``texts`` and pre-compute the blocks.

        Args:
            tokenizer: object providing ``tokenize(text)`` and
                ``convert_tokens_to_ids(tokens)``.
            texts: iterable of strings.
            block_size: number of token ids per example.
        """
        # tokenize() followed by convert_tokens_to_ids() is used here;
        # presumably so that no extra special tokens get inserted - verify
        # against the tokenizer in use.
        token_ids = [
            token_id
            for text in texts
            for token_id in tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        ]

        # Only start offsets that leave room for a complete block are used.
        stop = len(token_ids) - block_size + 1
        self._inputs = [
            token_ids[start:start + block_size]
            for start in range(0, stop, block_size)
        ]

    def __len__(self):
        """Return the number of complete blocks."""
        return len(self._inputs)

    def __getitem__(self, item):
        """Return block ``item`` as a tensor of token ids."""
        block = self._inputs[item]
        return torch.tensor(block)


def build_dataloader(tokenizer, texts, batch_size, block_size, shuffle):
    """Wrap ``texts`` in a ``Dataset`` and return a ``DataLoader`` over it.

    Args:
        tokenizer: tokenizer forwarded to ``Dataset``.
        texts: iterable of strings forwarded to ``Dataset``.
        batch_size: number of blocks per batch.
        block_size: number of token ids per block.
        shuffle: whether the loader shuffles the blocks.
    """
    return torch.utils.data.DataLoader(
        Dataset(tokenizer, texts, block_size=block_size),
        batch_size=batch_size,
        shuffle=shuffle,
    )

## Model

## Train model

In [None]:
import torch
import numpy as np
import random


def set_seed(seed):
    """Seed Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: integer seed applied to every random number generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,  # covers the GPU generators
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Trade cuDNN auto-tuning for repeatable results.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
import tqdm


def train_model(net, config, tokenizer, dataloader_dict, train_config):
    """Train ``net`` with per-epoch validation, checkpointing and early stopping.

    Epoch 0 skips the training phase, so the validation loss of the untrained
    network is measured first. Whenever the validation loss improves, the
    model, tokenizer and config are saved under
    ``<train_config.model_output_dir>/epoch-<epoch>`` and a CPU snapshot of the
    weights is kept. Training stops early once the validation loss has failed
    to improve more than ``train_config.patience`` times in a row.

    Args:
        net: model called as ``net(inputs, labels=inputs)``; the first of its
            three return values is used as the loss. Must provide
            ``save_pretrained``.
        config: object whose ``save_pretrained(save_dir)`` is called on every
            checkpoint.
        tokenizer: object whose ``save_pretrained(save_dir)`` is called on
            every checkpoint.
        dataloader_dict: mapping with "train" and "val" data loaders.
        train_config: attribute-style object providing ``device``,
            ``optimizer``, ``scheduler``, ``writer``, ``num_epochs``,
            ``max_grad_norm``, ``patience`` and ``model_output_dir``.

    Returns:
        Tuple ``(best_model, best_metric, best_epoch)``. ``best_model`` is
        ``net`` with the weights of the best validation epoch restored (or
        ``None`` if the validation loss never improved), ``best_metric`` is
        ``{"loss": <best validation loss>}`` and ``best_epoch`` is the epoch
        index at which it was reached.
    """
    PHASE_TRAIN = "train"
    PHASE_VAL = "val"

    # keep the best model
    best_model = None
    # CPU copy of the best weights. `best_model = net` alone only aliases the
    # live network, which keeps changing in later epochs.
    best_state = None
    best_epoch = 0
    best_metric = {"loss": float("infinity")}

    # number of optimisation steps performed so far
    num_iters = 0

    # number of consecutive validations without improvement
    num_patience = 0

    net.to(train_config.device)

    early_stop = False
    for epoch in range(train_config.num_epochs+1):
        print("Epoch {}/{}".format(epoch, train_config.num_epochs))
        # training and validation loop
        for phase in [PHASE_TRAIN, PHASE_VAL]:
            # Switch the network mode per phase (affects Dropout etc.)
            if phase == PHASE_TRAIN:
                net.train()
            elif phase == PHASE_VAL:
                net.eval()
            else:
                raise Exception("got {} expected one of {}".format(phase, [PHASE_TRAIN, PHASE_VAL]))

            epoch_loss = 0

            # Epoch 0 only validates, to measure the untrained performance
            if epoch == 0 and phase == PHASE_TRAIN:
                continue

            for batch in tqdm.tqdm(dataloader_dict[phase], disable=False):

                # Move the batch to the configured device (GPU if available)
                inputs = batch.to(train_config.device)

                # Initialize optimizer
                train_config.optimizer.zero_grad()

                # Build the autograd graph only during the training phase
                with torch.set_grad_enabled(phase==PHASE_TRAIN):
                    # Passing labels makes the model return the loss
                    loss, _, _ = net(inputs, labels=inputs)

                    if phase == PHASE_TRAIN:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(net.parameters(), train_config.max_grad_norm)
                        train_config.optimizer.step()
                        train_config.scheduler.step()

                        num_iters += 1

                    # Accumulate the loss weighted by the batch size
                    epoch_loss += loss.item() * inputs.size()[0]

                    # Log to TensorBoard; per-step values only while training
                    if phase == PHASE_TRAIN:
                        train_config.writer.add_scalars("train/loss", {PHASE_TRAIN: loss.item()}, num_iters)
                        train_config.writer.add_scalars("train/lr", {PHASE_TRAIN: train_config.scheduler.get_lr()[0]}, num_iters)

            epoch_loss = epoch_loss / len(dataloader_dict[phase].dataset)
            print("phase {}, loss: {:.4f}".format(phase, epoch_loss))

            if phase == PHASE_VAL:
                train_config.writer.add_scalars("train/loss", {PHASE_VAL: epoch_loss}, num_iters)

                if best_metric["loss"] > epoch_loss:
                    best_model = net
                    # clone() guarantees a real copy even when net already
                    # lives on the CPU (where .cpu() returns the same tensor).
                    best_state = {
                        key: value.detach().cpu().clone()
                        for key, value in net.state_dict().items()
                    }
                    best_metric = {"loss": epoch_loss}
                    best_epoch = epoch
                    num_patience = 0
                    # save model
                    print("Save model, epoch:", epoch)
                    save_dir = os.path.join(train_config.model_output_dir, "epoch-{}".format(epoch))
                    mkdir(save_dir)

                    # [TODO] Save vocab dict
                    net.save_pretrained(save_dir)
                    tokenizer.save_pretrained(save_dir)
                    config.save_pretrained(save_dir)
                else:
                    num_patience += 1
                    print("Patience {}, epoch: {}".format(num_patience, epoch))

                if num_patience > train_config.patience:
                    early_stop = True

        # Validation is the last phase of an epoch, so stopping here is
        # equivalent to returning right after the patience check.
        if early_stop:
            break

    # Restore the best weights so the returned model matches best_metric and
    # best_epoch instead of holding the last-epoch weights.
    if best_state is not None:
        net.load_state_dict(best_state)
    return best_model, best_metric, best_epoch

In [None]:
import os
from torch.utils.tensorboard import SummaryWriter



def get_texts(filepath, encoding="utf-8"):
    """Read a text file and return its lines without newline characters.

    Args:
        filepath: path of the text file to read.
        encoding: text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale.

    Returns:
        List with one string per line; other whitespace is preserved.
    """
    # The context manager closes the file handle as soon as reading is done.
    with open(filepath, encoding=encoding) as text_file:
        return [line.strip("\n") for line in text_file]


def mkdir(path):
    """Create directory ``path`` (including missing parents) if it is absent.

    Does nothing when ``path`` already exists.

    Args:
        path: directory path to create.
    """
    if not os.path.exists(path):
        # makedirs also creates missing parent directories; exist_ok tolerates
        # another process creating the directory between the check and here.
        os.makedirs(path, exist_ok=True)

    
def train(params):
    """Build tokenizer, data loaders, model and optimizer, then run training.

    Args:
        params: attribute-style mapping providing ``seed``, ``output_dir``,
            ``name``, ``data_dir``, ``batch_size``, ``block_size``, ``n_ctx``,
            ``n_embd``, ``n_layer``, ``n_head``, ``learning_rate``,
            ``num_epochs``, ``warmup_rate``, ``max_grad_norm`` and
            ``patience``.

    Returns:
        The ``(best_model, best_metric, best_epoch)`` tuple returned by
        ``train_model``.

    Raises:
        AssertionError: if the context size, position count and block size
            do not all agree.
    """
    # Fix seed for reproducibility
    set_seed(seed=params.seed)

    # Create the directory to save the model in; parents (e.g. output_dir
    # itself on a first run) are created as needed.
    model_output_dir = os.path.join(params.output_dir, params.name)
    os.makedirs(model_output_dir, exist_ok=True)

    # Dataset and dataloader
    tokenizer = build_tokenizer()
    dataloader_dict = {
        "train": build_dataloader(
            tokenizer,
            get_texts(os.path.join(params.data_dir, "train.txt")),
            params.batch_size,
            params.block_size,
            shuffle=True,
        ),
        "val": build_dataloader(
            tokenizer,
            get_texts(os.path.join(params.data_dir, "valid.txt")),
            params.batch_size,
            params.block_size,
            shuffle=False,
        ),
    }

    # Model
    config = transformers.GPT2Config(
        vocab_size=len(tokenizer),
        n_ctx=params.n_ctx,
        n_positions=params.block_size,
        n_embd=params.n_embd,
        n_layer=params.n_layer,
        n_head=params.n_head,
        num_labels=1,  # binary label to classify whether generated sentence is valid or not.
    )
    # Check consistency before paying for model construction.
    assert config.n_ctx == config.n_positions == params.n_ctx == params.block_size
    net = transformers.GPT2LMHeadModel(config=config)

    # create train config
    optimizer = torch.optim.Adam(net.parameters(),  lr=params.learning_rate)
    total_steps = len(dataloader_dict["train"]) * params.num_epochs
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=total_steps*params.warmup_rate,
        num_training_steps=total_steps
    )
    writer = SummaryWriter(log_dir=os.path.join(params.output_dir, "runs", params.name))
    train_config = attrdict.AttrDict({
        "model_output_dir": model_output_dir,
        "optimizer": optimizer,
        "scheduler": scheduler,
        "writer": writer,
        "num_epochs": params.num_epochs,
        "max_grad_norm": params.max_grad_norm,
        "patience": params.patience,
        "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    })

    try:
        return train_model(net, config, tokenizer, dataloader_dict, train_config)
    finally:
        # Flush pending TensorBoard events and release the log file, even
        # when training raises.
        writer.close()

In [None]:
# Run the whole training pipeline and unpack the three values train() returns.
best_model, best_metric, best_epoch = train(params)

## Evaluate model

In [None]:
# Report the epoch index and the metric dict returned by train().
print("Best model in epoch", best_epoch)
print("Scores:", best_metric)