# GPT-2

Description here.

TODO
- Implement scheduler

## 0. Prepare Environment

### Install and import dependent libraries

In [None]:
import attrdict
import transformers
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import os

### Define parameters
Declare the parameters that can be set by `papermill`.

In [None]:
# Default values for the notebook parameters, kept as a plain dict.
_params = {
    # Locations of the input text files and of everything written out.
    "data_dir": "data_sample",
    "output_dir": "output",

    # Model parameters: tokenizer name and GPT-2 configuration sizes.
    "tokenizer_model_name": "cl-tohoku/bert-base-japanese",
    "n_embd": 768,
    "n_layer": 12,
    "n_head": 12,
    "n_ctx": 1024,

    # Training parameters.
    "seed": 1234,
    "num_epochs": 10,
    "batch_size": 2,
    "block_size": 1024,      # number of token ids per training block
    "learning_rate": 5e-5,
    "max_grad_norm": 1.0,    # passed to the optimizer as `clipnorm`
    "warmup_rate": 0.1,
    "patience": 1,           # early-stopping patience on `val_loss`
}

In [None]:
# Wrap the parameter dict in an AttrDict; the following cells read the
# values as attributes (for example `_params.seed`).
_params = attrdict.AttrDict(_params)

### Set seed for reproducibility

In [None]:
def set_seed(seed):
    """Seed the Python, NumPy and TensorFlow random number generators.

    The ``PYTHONHASHSEED`` environment variable is additionally set to
    the fixed value ``'0'`` (it does not depend on ``seed``).
    """
    import os
    import random

    import numpy as np
    import tensorflow as tf

    os.environ['PYTHONHASHSEED'] = '0'
    # Each library keeps its own generator, so every one is seeded in turn.
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)

In [None]:
# Seed all random number generators with the configured seed before any
# data or model code runs.
set_seed(_params.seed)

## 1. Define Problem

## 2. Create Dataset

## 3. Select Evaluation Metrics

Perplexity

## 4. Determine Evaluation Protocol

Hold-out validation

## 5. Prepare Data

In [None]:
def load_dataset(path):
    """Read a text file and return its lines as a list of strings.

    Args:
        path: Path of the text file to read.

    Returns:
        One string per line of the file, in file order, with newline
        characters stripped from both ends of each line.
    """
    # The context manager closes the file handle deterministically, even
    # when reading raises; the file is opened with the default text mode
    # and encoding, exactly as before.
    with open(path) as f:
        return [line.strip("\n") for line in f]

# Load the train / valid / test text files from the data directory,
# in that order.
_train_texts, _valid_texts, _test_texts = (
    load_dataset(f"{_params.data_dir}/{split}.txt")
    for split in ("train", "valid", "test")
)

Prepare tokenizer

In [None]:
def build_tokenizer(tokenizer_model_name):
    """Return the tokenizer `transformers.AutoTokenizer` loads for the name.

    Args:
        tokenizer_model_name: Name passed to
            `AutoTokenizer.from_pretrained`.
    """
    return transformers.AutoTokenizer.from_pretrained(tokenizer_model_name)

In [None]:
# Instantiate the tokenizer named in the parameters; it is shared by the
# data preparation and model building steps below.
_tokenizer = build_tokenizer(_params.tokenizer_model_name)

In [None]:
def build_data(tokenizer, texts, block_size):
    """Turn texts into fixed-size next-token-prediction samples.

    All texts are tokenized and their ids concatenated into one stream.
    The stream is cut into consecutive, non-overlapping blocks of
    `block_size` ids; a trailing remainder shorter than `block_size` is
    dropped. For each block the input is every id but the last and the
    label is every id but the first, so the label at position i is the id
    that follows the input at position i.

    Args:
        tokenizer: Object providing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)`.
        texts: Iterable of strings.
        block_size: Number of ids per block.

    Returns:
        A tuple `(inputs, labels)` of integer arrays, each of shape
        `(number_of_blocks, block_size - 1)`. When there are fewer than
        `block_size` ids in total, both arrays have zero rows.
    """
    ids = []
    for text in texts:
        # Only the two calls below are used, so the id stream contains
        # exactly the ids of the tokens returned by `tokenize`.
        ids.extend(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text)))

    blocks = [
        ids[start:start + block_size]
        for start in range(0, len(ids) - block_size + 1, block_size)
    ]

    # An explicit dtype and shape keep the result a 2-D integer array even
    # when no full block could be formed (NumPy would otherwise return a
    # 1-D float64 array for an empty list).
    width = max(block_size - 1, 0)
    inputs = np.array([block[:-1] for block in blocks], dtype=int)
    labels = np.array([block[1:] for block in blocks], dtype=int)
    return (
        inputs.reshape(len(blocks), width),
        labels.reshape(len(blocks), width),
    )

In [None]:
# Build (inputs, labels) arrays for each split from its own texts.
_x_train, _y_train = build_data(_tokenizer, _train_texts, _params.block_size)
_x_valid, _y_valid = build_data(_tokenizer, _valid_texts, _params.block_size)
# The test arrays must come from the test texts; they were previously
# built from `_valid_texts`, which made the test split a copy of validation.
_x_test, _y_test = build_data(_tokenizer, _test_texts, _params.block_size)

## 6-8. Develop and Tune Models

6. Develop a model that beats the baseline model
7. Develop an overfitting model
8. Regularize the model and tune hyperparameters

In [None]:
def build_model(tokenizer, params):
    """Create a fresh `TFGPT2LMHeadModel` from a `GPT2Config`.

    Args:
        tokenizer: Its length is used as the vocabulary size.
        params: Attribute-style parameters; reads `n_ctx`, `block_size`
            (used as `n_positions`), `n_embd`, `n_layer` and `n_head`.

    Returns:
        The model built from the configuration.
    """
    gpt2_config = transformers.GPT2Config(
        vocab_size=len(tokenizer),
        n_positions=params.block_size,
        n_ctx=params.n_ctx,
        n_embd=params.n_embd,
        n_head=params.n_head,
        n_layer=params.n_layer,
    )
    return transformers.TFGPT2LMHeadModel(config=gpt2_config)

In [None]:
# Read https://github.com/huggingface/transformers/issues/2169
# to know more about how to train TFGPT2LMHead

def train(params, tokenizer, x_train, y_train, x_valid, y_valid):
    """Build, compile and fit a GPT-2 language model.

    The tokenizer is saved into `<output_dir>/model`, the checkpoint with
    the lowest `val_loss` is written to `<output_dir>/model/model.h5`, and
    TensorBoard logs go to `<output_dir>/tensorboard`.

    Args:
        params: Attribute-style parameters; this function reads
            `output_dir`, `learning_rate`, `max_grad_norm`, `patience`,
            `num_epochs` and `batch_size`, and passes `params` on to
            `build_model`.
        tokenizer: Tokenizer handed to `build_model` and saved next to the
            checkpoint.
        x_train: Training inputs, fed to the model as `input_ids`.
        y_train: Training labels.
        x_valid: Validation inputs, fed to the model as `input_ids`.
        y_valid: Validation labels.

    Returns:
        The model attached to the `History` object returned by `fit`.
        NOTE(review): this is the model object that was trained, so it
        presumably carries the weights of the last epoch run rather than
        those of the best checkpoint — confirm before relying on it as
        the "best" model.
    """
    # Prepare model save directory. `makedirs` also creates a missing
    # `output_dir` and does not fail when the directory already exists.
    model_save_dir = os.path.join(params.output_dir, "model")
    os.makedirs(model_save_dir, exist_ok=True)

    # Compile model
    # Set from_logits=True because TFGPT2LMHeadModel returns the logits (before Softmax)
    model = build_model(tokenizer, params)
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(
        optimizer=keras.optimizers.Adam(
            learning_rate=params.learning_rate,
            epsilon=1e-08,
            clipnorm=params.max_grad_norm,
        ),
        # Only the first model output gets a loss; one `None` is supplied
        # per layer for the remaining outputs.
        loss=[loss, *[None] * model.config.n_layer],
        metrics=[
            keras.metrics.SparseCategoricalCrossentropy(from_logits=True),
            keras.metrics.SparseCategoricalAccuracy(),
        ],
    )

    callbacks_list = [
        keras.callbacks.EarlyStopping(
            monitor="val_loss",
            patience=params.patience,
        ),
        keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(model_save_dir, "model.h5"),
            monitor="val_loss",
            save_best_only=True,
        ),
        keras.callbacks.TensorBoard(
            log_dir=os.path.join(params.output_dir, "tensorboard"),
            histogram_freq=1,
            embeddings_freq=1,
        ),
    ]

    # Train model and save the best one.
    # Use the `tokenizer` and `params` arguments rather than the notebook
    # globals, so the function honors whatever the caller passes in.
    tokenizer.save_pretrained(model_save_dir)
    history = model.fit(
        {"input_ids": x_train},
        y_train,
        epochs=params.num_epochs,
        batch_size=params.batch_size,
        callbacks=callbacks_list,
        validation_data=({"input_ids": x_valid}, y_valid),
    )
    return history.model


# Train on the training arrays, using the validation arrays as validation
# data, then print the Keras summary of the returned model.
_val_best_model = train(_params, _tokenizer, _x_train, _y_train, _x_valid, _y_valid)
_val_best_model.summary()

## 9. Evaluate Best Model with Validation Data

In [None]:
# Evaluate the trained model on the validation inputs and labels.
_val_best_model.evaluate(_x_valid, _y_valid)