# GPT-2

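This notebook trains a GPT-2 language model on Japanese text tokenized with the `bert-base-japanese` tokenizer, saves the checkpoint with the best validation loss, and reports loss and perplexity on a held-out test set.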

## Install dependent libraries
This section installs the required packages. Versions are pinned for reproducibility.

In [None]:
# Each dependency is pinned to an exact version so that re-running the notebook installs the same packages.
! pip install transformers==2.6.0
! pip install tqdm==4.43.0
! pip install mecab-python3==0.996.2
! pip install attrdict==2.0.1
! pip install tensorboard==2.1.1

## Test library

Test all the libraries used in this notebook.

## Parameters
Declare the parameters that can be set by `papermill`.

In [None]:
# --- General parameters ---
name = "model"            # sub-directory (under output_dir) for the model and the TensorBoard run
data_dir = "data_sample"  # directory holding train.txt, valid.txt and test.txt
pretrained_dir = None     # when set, GPT-2 weights are loaded from here instead of being initialized from a config
output_dir = "output"     # root directory for everything this notebook writes

# --- Model parameters (only used when pretrained_dir is not set) ---
dataset_type = "block"    # "block" or "sentence"
n_embd = 768
n_layer = 12
n_head = 12
n_ctx = 1024              # must equal block_size (asserted when the model is built from a config)

# --- Training parameters ---
seed = 1234
num_epochs = 10
batch_size = 2
block_size = 1024         # tokens per example; also passed to the model config as n_positions
learning_rate = 5e-5
max_grad_norm = 1.0       # gradient-clipping threshold
warmup_rate = 0.1         # fraction of the total training steps used for learning-rate warm-up
patience = 3              # training stops once validation loss has failed to improve more than this many times in a row

In [None]:
import attrdict

# Bundle every notebook parameter into a single attribute-accessible object.
_params = attrdict.AttrDict(dict(
    # general
    name=name,
    data_dir=data_dir,
    pretrained_dir=pretrained_dir,
    output_dir=output_dir,
    # model
    dataset_type=dataset_type,
    n_embd=n_embd,
    n_layer=n_layer,
    n_head=n_head,
    n_ctx=n_ctx,
    # training
    seed=seed,
    num_epochs=num_epochs,
    batch_size=batch_size,
    block_size=block_size,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    warmup_rate=warmup_rate,
    patience=patience,
))

# Remove the loose globals so the values are only reachable through `_params`.
del name, data_dir, pretrained_dir, output_dir
del dataset_type, n_embd, n_layer, n_head, n_ctx
del seed, num_epochs, batch_size, block_size
del learning_rate, max_grad_norm, warmup_rate, patience

## Define preprocessor, tokenizer and dataset

In [None]:
import torch
import transformers

Define a builder class for BlockLM dataset and tokenizer

In [None]:
class BlockLMDataset(torch.utils.data.Dataset):
    """Language-model dataset made of fixed-length blocks of token ids.

    Every text is tokenized and the resulting ids are concatenated into one
    stream, which is then cut into consecutive, non-overlapping chunks of
    ``block_size`` ids. A trailing remainder shorter than ``block_size`` is
    dropped.
    """

    def __init__(self, tokenizer, texts, block_size):
        """Tokenize ``texts`` and split the id stream into blocks.

        Args:
            tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
            texts: Iterable of strings.
            block_size: Number of token ids per example.
        """
        # tokenize() + convert_tokens_to_ids() is used here; the intent is
        # that no additional special tokens end up in the id stream.
        stream = [
            token_id
            for text in texts
            for token_id in tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        ]

        last_start = len(stream) - block_size
        self._inputs = [
            stream[start:start + block_size]
            for start in range(0, last_start + 1, block_size)
        ]

    def __len__(self):
        """Return the number of complete blocks."""
        return len(self._inputs)

    def __getitem__(self, item):
        """Return block ``item`` as a tensor of token ids."""
        block = self._inputs[item]
        return torch.tensor(block)

    
class BlockLMBuilder:
    """Factory for the tokenizer and data loaders of the block-style dataset."""

    def build_tokenizer(self):
        """Return the pretrained ``bert-base-japanese`` tokenizer."""
        tokenizer = transformers.BertJapaneseTokenizer.from_pretrained("bert-base-japanese")
        return tokenizer

    def build_dataloader(self, tokenizer, texts, batch_size, block_size, shuffle):
        """Wrap ``texts`` in a ``BlockLMDataset`` and return a ``DataLoader`` over it.

        Args:
            tokenizer: Tokenizer handed to ``BlockLMDataset``.
            texts: Iterable of strings.
            batch_size: Number of blocks per batch.
            block_size: Number of token ids per block.
            shuffle: Whether the loader shuffles the blocks.
        """
        return torch.utils.data.DataLoader(
            BlockLMDataset(tokenizer, texts, block_size=block_size),
            batch_size=batch_size,
            shuffle=shuffle,
        )
        

Define a builder class for SentenceLM dataset and tokenizer

In [None]:
class SentenceLMDataset(torch.utils.data.Dataset):
    """Language-model dataset with one example per input text.

    Each text is tokenized, truncated to ``block_size - 2`` tokens and
    wrapped in the tokenizer's two additional special tokens (begin and end
    of sentence). Examples keep their individual lengths.
    """

    def __init__(self, tokenizer, texts, block_size):
        """Encode every text as ``[BOS] + tokens + [EOS]`` ids.

        Args:
            tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``
                and exactly two ``additional_special_tokens`` (BOS, EOS).
            texts: Iterable of strings.
            block_size: Maximum example length including the two special tokens.
        """
        bos_token, eos_token = tokenizer.additional_special_tokens
        # Two positions are reserved for the BOS/EOS markers.
        max_body_len = block_size - 2

        def encode(text):
            body = tokenizer.tokenize(text)[:max_body_len]
            return tokenizer.convert_tokens_to_ids([bos_token] + body + [eos_token])

        self._inputs = [encode(text) for text in texts]

    def __len__(self):
        """Return the number of encoded texts."""
        return len(self._inputs)

    def __getitem__(self, idx):
        """Return example ``idx`` as a tensor of token ids."""
        example = self._inputs[idx]
        return torch.tensor(example)

    
class SentenceLMBuilder:
    """Factory for the tokenizer and data loaders of the sentence-style dataset."""

    def build_tokenizer(self):
        """Return the ``bert-base-japanese`` tokenizer extended with BOS/EOS tokens."""
        tokenizer = transformers.BertJapaneseTokenizer.from_pretrained("bert-base-japanese")
        tokenizer.add_special_tokens({
            "additional_special_tokens": [
                "<BOS>",  # Begin of sentence
                "<EOS>",  # End of sentence
            ],
        })
        return tokenizer

    def build_dataloader(self, tokenizer, texts, batch_size, block_size, shuffle):
        """Wrap ``texts`` in a ``SentenceLMDataset`` and return a padding ``DataLoader``.

        Args:
            tokenizer: Tokenizer handed to ``SentenceLMDataset``; its
                ``pad_token_id`` is used as the padding value.
            texts: Iterable of strings.
            batch_size: Number of sentences per batch.
            block_size: Maximum example length including BOS/EOS.
            shuffle: Whether the loader shuffles the sentences.
        """

        def pad_batch(sequences):
            # Examples differ in length, so pad each batch to its longest member.
            return torch.nn.utils.rnn.pad_sequence(
                sequences,
                batch_first=True,
                padding_value=tokenizer.pad_token_id,
            )

        return torch.utils.data.DataLoader(
            SentenceLMDataset(tokenizer, texts, block_size),
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=pad_batch,
        )

## Define model

In [None]:
def build_model(tokenizer, params, model_dir=None):
    """Build a GPT-2 double-heads model whose embeddings match ``tokenizer``.

    Args:
        tokenizer: Tokenizer whose length determines the vocabulary size.
        params: Parameter object providing ``n_ctx``, ``block_size``,
            ``n_embd``, ``n_layer`` and ``n_head``; only read when
            ``model_dir`` is not given.
        model_dir: Directory (or model name) to load pretrained weights from.
            When falsy, a freshly initialized model is created from ``params``.

    Returns:
        The model, with its token embeddings resized to ``len(tokenizer)``.
    """
    # NOTE(review): train() builds transformers.GPT2LMHeadModel directly and
    # does not call this helper -- confirm the double-heads variant is intended.
    if model_dir:
        # The class must be qualified: only the `transformers` module itself
        # is imported in this notebook.
        model = transformers.GPT2DoubleHeadsModel.from_pretrained(model_dir)
    else:
        config = transformers.GPT2Config(
            vocab_size=len(tokenizer),
            n_ctx=params.n_ctx,
            n_positions=params.block_size,
            n_embd=params.n_embd,
            n_layer=params.n_layer,
            n_head=params.n_head,
        )
        model = transformers.GPT2DoubleHeadsModel(config)
    model.resize_token_embeddings(len(tokenizer))
    return model

## Build and save vocabulary

In [None]:
import os


# Directory that receives the tokenizer files and the best model checkpoint.
_model_output_dir = os.path.join(_params.output_dir, _params.name)
# makedirs (rather than mkdir) also creates a missing `output_dir` parent,
# and exist_ok=True keeps the cell safe to re-run.
os.makedirs(_model_output_dir, exist_ok=True)

In [None]:
# Select the dataset/tokenizer builder named by the `dataset_type` parameter.
if _params.dataset_type not in ("block", "sentence"):
    raise Exception("dataset_type, want block or sentence, but got {}".format(_params.dataset_type))
_builder = BlockLMBuilder() if _params.dataset_type == "block" else SentenceLMBuilder()

# Store the tokenizer in the model directory; training and evaluation load it from there.
_tokenizer = _builder.build_tokenizer()
_tokenizer.save_pretrained(_model_output_dir)

## Train and save model

In [None]:
import torch
import numpy as np
import random


def set_seed(seed):
    """Seed Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Integer seed applied to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,  # covers the GPU generators
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


def calc_ppl(loss):
    """Return the perplexity for a softmax cross-entropy loss, i.e. ``exp(loss)``.

    Args:
        loss: Scalar loss value.

    Returns:
        The perplexity as a Python number.
    """
    loss_tensor = torch.tensor(loss)
    return loss_tensor.exp().item()


In [None]:
import tqdm


def train_model(model_output_dir, net, dataloader_dict, train_config):
    """Run the train/validation loop with checkpointing and early stopping.

    Epoch 0 only validates the network as given; training starts at epoch 1.
    After every validation pass the mean loss is compared with the best one
    seen so far. An improvement resets the patience counter and, when
    ``model_output_dir`` is set, saves the model. Otherwise the counter is
    incremented, and the function returns as soon as it exceeds
    ``train_config.patience``.

    Args:
        model_output_dir: Directory passed to ``net.save_pretrained``; a falsy
            value disables saving.
        net: Model called as ``net(inputs, labels=inputs)`` whose first output
            is the loss.
        dataloader_dict: Maps ``"train"`` and ``"val"`` to data loaders. The
            ``"train"`` entry is only read when ``num_epochs`` is at least 1.
        train_config: Attribute-style config providing ``device``,
            ``num_epochs``, ``patience`` and ``writer`` (may be ``None``), plus
            ``optimizer``, ``scheduler`` and ``max_grad_norm`` when training.

    Raises:
        ValueError: If the dataset of a phase that is run contains no examples.
    """
    # NOTE(review): labels are the raw inputs, so padding added by a collate
    # function is scored as well -- confirm this is intended for padded batches.
    PHASE_TRAIN = "train"
    PHASE_VAL = "val"

    # The summary writer is optional; all logging below is skipped without one.
    writer = train_config.writer

    # Best validation result seen so far.
    best = {"model": None, "epoch": 0, "loss": float("infinity"), "ppl": float("infinity")}

    # Number of optimizer steps taken so far; used as the logging step.
    num_iters = 0

    # Number of consecutive validation passes without improvement.
    num_patience = 0

    net.to(train_config.device)

    for epoch in range(train_config.num_epochs + 1):
        print("Epoch {}/{}".format(epoch, train_config.num_epochs))
        # One training pass followed by one validation pass.
        for phase in [PHASE_TRAIN, PHASE_VAL]:
            is_train = phase == PHASE_TRAIN

            # Epoch 0 skips training so the untrained performance is measured first.
            if epoch == 0 and is_train:
                continue

            # Switch the network mode; this affects layers such as Dropout.
            if is_train:
                net.train()
            else:
                net.eval()

            num_samples = len(dataloader_dict[phase].dataset)
            if num_samples == 0:
                raise ValueError(
                    "dataset for phase '{}' is empty; cannot compute the epoch loss".format(phase)
                )

            epoch_loss = 0

            for batch in tqdm.tqdm(dataloader_dict[phase], disable=True):
                # Move the batch to the configured device (GPU when available).
                inputs = batch.to(train_config.device)

                # Reset gradients accumulated by the previous step.
                if is_train:
                    train_config.optimizer.zero_grad()

                # Build the autograd graph only while training.
                with torch.set_grad_enabled(is_train):
                    # Passing labels makes the model return the loss first.
                    loss = net(inputs, labels=inputs)[0]

                    if is_train:
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(net.parameters(), train_config.max_grad_norm)
                        train_config.optimizer.step()
                        train_config.scheduler.step()

                        num_iters += 1

                # Weight the batch loss by batch size so the epoch value is a per-example mean.
                batch_loss = loss.item()
                epoch_loss += batch_loss * inputs.size()[0]

                # Per-step curves are only written for the training phase.
                if is_train and writer:
                    writer.add_scalars("train/loss", {phase: batch_loss}, num_iters)
                    writer.add_scalars("train/lr", {phase: train_config.scheduler.get_lr()[0]}, num_iters)

            epoch_loss = epoch_loss / num_samples
            epoch_ppl = calc_ppl(epoch_loss)
            print("phase {}, loss: {:.4f}, ppl: {:.4f}".format(phase, epoch_loss, epoch_ppl))

            if is_train:
                continue

            if writer:
                writer.add_scalars("train/loss", {phase: epoch_loss}, num_iters)
                writer.add_scalars("metric/ppl", {phase: epoch_ppl}, num_iters)

            # Checkpointing and early stopping must not depend on a writer
            # being configured, so they run for every validation pass.
            if best["loss"] > epoch_loss:
                best = {"model": net, "epoch": epoch, "loss": epoch_loss, "ppl": epoch_ppl}
                num_patience = 0
                if model_output_dir:
                    print("Save model, epoch:", epoch)
                    net.save_pretrained(model_output_dir)
            else:
                num_patience += 1
                print("Patience {}, epoch: {}".format(num_patience, epoch))

            if num_patience > train_config.patience:
                return

In [None]:
import os
from torch.utils.tensorboard import SummaryWriter



def get_texts(filepath, encoding=None):
    """Read a text file and return its lines without line terminators.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default, as before.

    Returns:
        A list with one string per line of the file.
    """
    # The context manager closes the file handle once the lines are read.
    with open(filepath, encoding=encoding) as text_file:
        return [line.strip("\n") for line in text_file]



def train(model_output_dir, params, builder):
    """Train a GPT-2 language model and keep the best checkpoint.

    Loads the tokenizer saved in ``model_output_dir``, builds the train and
    validation data loaders from ``train.txt`` and ``valid.txt`` in
    ``params.data_dir``, creates the model (from ``params.pretrained_dir``
    when set, otherwise from a fresh config) and runs ``train_model``.

    Args:
        model_output_dir: Directory holding the saved tokenizer; the model is
            saved there as well.
        params: Parameter object with the general, model and training settings.
        builder: Object providing ``build_dataloader``.
    """
    # Fix the seed for reproducibility.
    set_seed(seed=params.seed)

    # Dataset and dataloader
    tokenizer = transformers.BertJapaneseTokenizer.from_pretrained(model_output_dir)
    dataloader_dict = {
        "train": builder.build_dataloader(tokenizer, get_texts(params.data_dir + "/train.txt"), params.batch_size, params.block_size, shuffle=True),
        "val": builder.build_dataloader(tokenizer, get_texts(params.data_dir + "/valid.txt"), params.batch_size, params.block_size, shuffle=False),
    }

    # Model
    if params.pretrained_dir:
        net = transformers.GPT2LMHeadModel.from_pretrained(params.pretrained_dir)
        net.resize_token_embeddings(len(tokenizer))
    else:
        config = transformers.GPT2Config(
            vocab_size=len(tokenizer),
            n_ctx=params.n_ctx,
            n_positions=params.block_size,
            n_embd=params.n_embd,
            n_layer=params.n_layer,
            n_head=params.n_head,
        )
        # Check the cheap consistency condition before allocating the model.
        assert config.n_ctx == config.n_positions == params.n_ctx == params.block_size
        net = transformers.GPT2LMHeadModel(config=config)

    # Optimizer with linear warm-up followed by linear decay over all training steps.
    optimizer = torch.optim.Adam(net.parameters(), lr=params.learning_rate)
    total_steps = len(dataloader_dict["train"]) * params.num_epochs
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=total_steps*params.warmup_rate,
        num_training_steps=total_steps
    )

    writer = SummaryWriter(log_dir=params.output_dir + "/runs/" + params.name)
    train_config = attrdict.AttrDict({
        "optimizer": optimizer,
        "scheduler": scheduler,
        "writer": writer,
        "num_epochs": params.num_epochs,
        "max_grad_norm": params.max_grad_norm,
        "patience": params.patience,
        "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    })

    try:
        train_model(model_output_dir, net, dataloader_dict, train_config)
    finally:
        # Flush pending events and release the event file even if training fails.
        writer.close()

In [None]:
# Train on train.txt / valid.txt from the data directory; the model is saved into _model_output_dir.
train(_model_output_dir, _params, _builder)

## Evaluate the best model

In [None]:
def evaluate(model_output_dir, params, builder):
    """Evaluate the saved model on ``test.txt`` from ``params.data_dir``.

    The tokenizer and model are loaded from ``model_output_dir`` and handed to
    ``train_model`` with ``num_epochs=0``, no output directory and no summary
    writer, with the test data registered under the ``"val"`` phase.

    Args:
        model_output_dir: Directory holding the saved tokenizer and model.
        params: Parameter object providing ``data_dir``, ``batch_size`` and
            ``block_size``.
        builder: Object providing ``build_dataloader``.

    Returns:
        Whatever ``train_model`` returns.
    """
    tokenizer = transformers.BertJapaneseTokenizer.from_pretrained(model_output_dir)
    net = transformers.GPT2LMHeadModel.from_pretrained(model_output_dir)

    test_texts = get_texts(params.data_dir + "/test.txt")
    test_loader = builder.build_dataloader(
        tokenizer,
        test_texts,
        params.batch_size,
        params.block_size,
        shuffle=False,
    )

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    eval_config = attrdict.AttrDict({
        "writer": None,
        "num_epochs": 0,
        "patience": 1,
        "device": device,
    })

    return train_model(None, net, {"val": test_loader}, eval_config)

In [None]:
# Evaluate the model saved in _model_output_dir on test.txt; loss and perplexity are printed.
evaluate(_model_output_dir, _params, _builder)