# GPT-2

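Fine-tunes a pretrained GPT-2 language model on tab-separated chat data (`train.tsv` / `valid.tsv`) and then evaluates the saved model on `test.tsv`.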

## Check environment

In [None]:
# Show the most recent commit so the executed notebook records the code version.
! git log -1

In [None]:
# Show the working-tree status (staged, modified and untracked files).
! git status

In [None]:
# Show uncommitted, unstaged changes in the working tree.
! git diff

In [None]:
# List the installed Python packages and their versions.
! pip list

In [None]:
#! pip install transformers==2.6.0
#! pip install tqdm==4.43.0
#! pip install mecab-python3==0.996.2
#! pip install attrdict==2.0.1
#! pip install tensorboard==2.1.1

## Test library

Test all the libraries used in this notebook.

## Parameters
Declare the parameters set by `papermill`.

In [None]:
# ---- General parameters ----
name = "model"                                # joined with output_dir to form the model directory
data_dir = "notebooks/chatlm/data_sample"     # read as <data_dir>/train.tsv, valid.tsv, test.tsv
pretrained_dir = "notebooks/gpt/output/model"  # source of the pretrained weights
output_dir = "output"                         # root directory for everything this notebook writes

# ---- Training parameters ----
seed = 1234
num_epochs = 10
batch_size = 2
learning_rate = 5e-5
max_grad_norm = 1.0
warmup_rate = 0.1   # multiplied by the total number of training steps
patience = 3

In [None]:
import attrdict

# Bundle every notebook parameter into one attribute-accessible object.
_params = attrdict.AttrDict(
    dict(
        name=name,
        data_dir=data_dir,
        pretrained_dir=pretrained_dir,
        output_dir=output_dir,
        seed=seed,
        num_epochs=num_epochs,
        batch_size=batch_size,
        learning_rate=learning_rate,
        max_grad_norm=max_grad_norm,
        warmup_rate=warmup_rate,
        patience=patience,
    )
)

# Remove the loose globals so later cells can only reach the values via _params.
del (
    name,
    data_dir,
    pretrained_dir,
    output_dir,
    seed,
    num_epochs,
    batch_size,
    learning_rate,
    max_grad_norm,
    warmup_rate,
    patience,
)

## Define preprocessor, tokenizer and dataset

## Define model

## Build and save vocabulary

In [None]:
import os

# Directory where the tokenizer is saved and which is handed to the Trainer
# as its model output directory.
_model_output_dir = os.path.join(_params.output_dir, _params.name)
# makedirs (unlike mkdir) also creates a missing parent such as output_dir,
# and exist_ok=True makes re-running the cell safe without an exists() check.
os.makedirs(_model_output_dir, exist_ok=True)

In [None]:
from gptchat.lib.chatlm import ChatLMTokenizerBuilder

# Build the tokenizer and save it into the model output directory;
# train() and evaluate() reload it from that directory via from_pretrained().
_tokenizer = ChatLMTokenizerBuilder().build()
_tokenizer.save_pretrained(_model_output_dir)

## Train and save model

In [None]:
import torch
import numpy as np
import random


def set_seed(seed):
    """Seed every random number generator used by this notebook.

    The same value is applied to Python's ``random`` module, NumPy and
    PyTorch (CPU generator and all CUDA devices). cuDNN is additionally
    switched to deterministic mode with benchmarking turned off.

    Args:
        seed: Integer seed shared by all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,  # relevant when running on GPU
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN settings for the GPU case
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [None]:
import os
import transformers
from torch.utils.tensorboard import SummaryWriter
from gptchat.lib.chatlm import ChatLMDataloaderBuilder
from gptchat.lib.chatlm import ChatLMDataset
from gptchat.lib.chatlm import ChatLMModelBuilder
from gptchat.lib.trainer import Trainer


def get_texts(filepath, encoding="utf-8"):
    """Read a tab-separated text file into a list of rows.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale.

    Returns:
        A list with one entry per line of the file; each entry is the list
        of tab-separated fields of that line with the newline removed.
    """
    # The context manager closes the file handle deterministically.
    with open(filepath, encoding=encoding) as f:
        return [line.strip("\n").split("\t") for line in f]


def train(model_output_dir, params):
    """Fine-tune the pretrained model on the training data.

    Args:
        model_output_dir: Directory the tokenizer is loaded from; it is also
            passed to the Trainer as its model output directory.
        params: Attribute-style parameter object. Reads ``seed``,
            ``data_dir``, ``batch_size``, ``pretrained_dir``,
            ``learning_rate``, ``num_epochs``, ``warmup_rate``,
            ``max_grad_norm``, ``patience``, ``output_dir`` and ``name``.
    """
    # Fix seed for reproducibility
    set_seed(seed=params.seed)

    # Load tokenizer
    tokenizer = transformers.BertJapaneseTokenizer.from_pretrained(model_output_dir)

    # Build dataloaders: the training split is shuffled, validation is not
    train_data = get_texts(os.path.join(params.data_dir, "train.tsv"))
    valid_data = get_texts(os.path.join(params.data_dir, "valid.tsv"))
    dataloader_dict = {
        "train": ChatLMDataloaderBuilder().build(
            dataset=ChatLMDataset(tokenizer, train_data),
            batch_size=params.batch_size,
            shuffle=True,
            pad_token_id=tokenizer.pad_token_id
        ),
        "val": ChatLMDataloaderBuilder().build(
            dataset=ChatLMDataset(tokenizer, valid_data),
            batch_size=params.batch_size,
            shuffle=False,
            pad_token_id=tokenizer.pad_token_id
        ),
    }

    # Model, sized to the tokenizer's vocabulary
    net = ChatLMModelBuilder().from_pretrained(
        pretrained_dir=params.pretrained_dir,
        vocab_size=len(tokenizer)
    )

    # Create train config
    optimizer = torch.optim.Adam(net.parameters(), lr=params.learning_rate)
    # One scheduler step per batch over all epochs; warmup_rate is the
    # fraction of those steps spent warming up.
    total_steps = len(dataloader_dict["train"]) * params.num_epochs
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=total_steps * params.warmup_rate,
        num_training_steps=total_steps
    )

    trainer = Trainer(
        model_output_dir=model_output_dir,
        net=net,
        dataloader_dict=dataloader_dict,
        num_epochs=params.num_epochs,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
        optimizer=optimizer,
        scheduler=scheduler,
        max_grad_norm=params.max_grad_norm,
        patience=params.patience,
        # TensorBoard logs go to <output_dir>/runs/<name>
        writer=SummaryWriter(log_dir=os.path.join(params.output_dir, "runs", params.name)),
        tqdm_disable=True,
    )

    trainer.train()

In [None]:
# Run training with the notebook parameters.
train(_model_output_dir, _params)

## Evaluate the best model

In [None]:
def evaluate(model_output_dir, params):
    """Run the model saved in ``model_output_dir`` over the test split.

    The test data is registered under the ``"val"`` key and the Trainer is
    created with ``num_epochs=0`` and no optimizer, scheduler or writer.
    NOTE(review): presumably the Trainer performs a single validation pass in
    this configuration -- confirm against ``gptchat.lib.trainer``.

    Args:
        model_output_dir: Directory the tokenizer and model are loaded from.
        params: Attribute-style parameter object. Reads ``data_dir`` and
            ``batch_size``.
    """
    # Load tokenizer and model
    tokenizer = transformers.BertJapaneseTokenizer.from_pretrained(model_output_dir)
    net = transformers.GPT2LMHeadModel.from_pretrained(model_output_dir)

    # Build dataloader for the test split
    test_data = get_texts(os.path.join(params.data_dir, "test.tsv"))
    dataloader_dict = {
        "val": ChatLMDataloaderBuilder().build(
            dataset=ChatLMDataset(tokenizer, test_data),
            batch_size=params.batch_size,
            shuffle=False,
            pad_token_id=tokenizer.pad_token_id
        ),
    }

    trainer = Trainer(
        model_output_dir=None,
        net=net,
        dataloader_dict=dataloader_dict,
        num_epochs=0,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
        optimizer=None,
        scheduler=None,
        max_grad_norm=None,
        patience=1,
        writer=None,
        tqdm_disable=True,
    )

    trainer.train()

In [None]:
# Evaluate the model stored in _model_output_dir on the test split.
evaluate(_model_output_dir, _params)