# Set up the runtime environment

In [None]:
RUN_FROM = 'LOCAL'
# RUN_FROM = 'COLAB'

if RUN_FROM == 'LOCAL':
    from os.path import expanduser
    HOME = expanduser("~")
elif RUN_FROM == 'COLAB':
    !pip install hanziconv
    from google.colab import drive
    drive.mount('/content/drive')
    %cd /content/drive/My Drive/projects/lm
    HOME = '/content/drive/My Drive'

# Import modules

In [None]:
import os
from data import Dataset
from model import Model
from routines import lr_range_test, train_and_evaluate, evaluate
from utils import data_utils_py3
# Publish the GPU selection (device indices 0-3) through the CUDA_VISIBLE_DEVICES environment variable.
# NOTE(review): this runs after `model`/`routines` are imported; if either of them
# initialises CUDA at import time the setting may come too late — confirm.
os.environ.update(CUDA_VISIBLE_DEVICES="0,1,2,3")

# Some presets

## Get datasets

In [None]:
# Character vocabulary: a vocab file plus an embedding file, both under HOME/Data/Vocab.
char_vocab_file = "{}/Data/Vocab/vocab_zh".format(HOME)
char_embedding_files = "{}/Data/Vocab/zh_char_300_nlpcc.txt".format(HOME)
char_vocab = data_utils_py3.Vocab(char_vocab_file, embedding_files=char_embedding_files)

# One Dataset per corpus directory under HOME/Data, all sharing the same vocabulary.
text_zh_dataset = Dataset("{}/Data/text_zh".format(HOME), char_vocab)
nlpcc2017_news_dataset = Dataset("{}/Data/nlpcc2017_news".format(HOME), char_vocab)
smp2017_ecdt1_dataset = Dataset("{}/Data/smp2017_ecdt1".format(HOME), char_vocab)

## Various model configs

In [None]:
# Two model presets built on the same character vocabulary. They share the
# vocabulary size, the embedding dimension (300) and the initial embedding
# values (converted to a plain list via .tolist()); only 'layer_size' and
# 'num_layers' differ.
small_model_config = dict(
    char_vocab_size=char_vocab.size(),
    char_vocab_dim=300,
    char_vocab_emb=char_vocab.embedding_init.tolist(),
    layer_size=256,
    num_layers=4,
)
big_model_config = dict(
    char_vocab_size=char_vocab.size(),
    char_vocab_dim=300,
    char_vocab_emb=char_vocab.embedding_init.tolist(),
    layer_size=768,
    num_layers=12,
)

def get_dir(dir_name):
    """Resolve a model directory name for the current runtime.

    Args:
        dir_name: Name of a model directory, e.g. 'pretrain_zh_small'.

    Returns:
        `dir_name` unchanged when RUN_FROM is 'LOCAL'; when RUN_FROM is
        'COLAB', `dir_name` joined onto HOME + '/Models/' with os.path.join.

    Raises:
        ValueError: If RUN_FROM is neither 'LOCAL' nor 'COLAB'.
    """
    if RUN_FROM == 'LOCAL':
        return dir_name
    if RUN_FROM == 'COLAB':
        return os.path.join(HOME+'/Models/', dir_name)
    # Previously an unknown RUN_FROM fell through to `return dir_path` with
    # dir_path never assigned, raising an opaque UnboundLocalError.
    raise ValueError("RUN_FROM must be 'LOCAL' or 'COLAB', got %r" % (RUN_FROM,))

## Various running configs

In [None]:
# Preset run config for pretraining on Chinese text. 'data' holds a single
# entry, flagged as the target, with token- and sequence-length limits.
pretrain_zh_run_config = dict(
    batch_size=32,
    max_lr=3e-4,
    max_train_steps=200000,
    pct_start=0.3,
    dropout=0.1,
    wd=1e-6,
    data=[
        dict(is_target=True, max_token_length=8, min_seq_length=2, max_seq_length=256),
    ],
)

# Preset run config for the sequence-classification ("seq2cls") tasks. 'data'
# holds two entries: a non-target entry with token- and sequence-length limits,
# followed by a target entry with only a token-length limit.
seq2cls_zh_run_config = dict(
    batch_size=64,
    max_lr=5e-5,
    max_train_steps=10000,
    pct_start=0.3,
    dropout=0.1,
    wd=1e-6,
    data=[
        dict(is_target=False, max_token_length=8, min_seq_length=2, max_seq_length=256),
        dict(is_target=True, max_token_length=8),
    ],
)

# Let's Begin!

## Settings

In [None]:
# Model preset used for both the pretrained and the fine-tuned model.
model_config = small_model_config

# Datasets: one for pretraining, one for the downstream task.
pretrain_dataset = text_zh_dataset
finetune_dataset = nlpcc2017_news_dataset

# Model directories, resolved for the current runtime by get_dir.
pretrain_dir = get_dir('pretrain_zh_small')
finetune_dir = get_dir('nlpcc2017_news_small')

# Pretraining run config: a shallow copy of the preset, then per-experiment overrides.
pretrain_run_config = dict(pretrain_zh_run_config)
pretrain_run_config['batch_size'] = 32
pretrain_run_config['max_train_steps'] = 200000

# Fine-tuning run config: same pattern, starting from the seq2cls preset.
finetune_run_config = dict(seq2cls_zh_run_config)
finetune_run_config['batch_size'] = 64
finetune_run_config['max_train_steps'] = 10000

## Pretrain model

### Create model

In [None]:
# Build the model to pretrain from the chosen model config, passing pretrain_dir as the second argument
# (presumably the directory the model saves to / restores from — confirm in model.Model).
pretrain_model = Model(model_config, pretrain_dir)

### LR range test and plot the curve

In [None]:
# Run the LR range test on the pretraining dataset for 1000 steps with the current
# run config; the result is meant to guide the 'max_lr' chosen in the next cell.
lr_range_test(pretrain_dataset, pretrain_model, pretrain_run_config, num_steps=1000)

### Adjust run config

In [None]:
# Set the learning rate and regularisation values chosen after the LR range test.
# These currently match the preset values; edit them here per experiment.
pretrain_run_config.update(max_lr=3e-4, dropout=0.1, wd=1e-6)

### Start train and eval loop

In [None]:
# Pretrain with the adjusted run config, using eval_every=50000 (presumably training steps
# between evaluations — confirm in routines) and distributed=True.
# NOTE(review): distributed=True presumably spreads training over the GPUs listed in
# CUDA_VISIBLE_DEVICES — confirm in routines.train_and_evaluate.
train_and_evaluate(
    pretrain_dataset, pretrain_model, pretrain_run_config, eval_every=50000, distributed=True)

## Fine-tune the downstream task model

### Create model

In [None]:
# Build the downstream model with the same model config, its own directory (finetune_dir)
# and pretrain_dir as a third argument (presumably the location of the pretrained
# weights to initialise from — confirm in model.Model).
finetune_model = Model(model_config, finetune_dir, pretrain_dir)

### LR range test and plot the curve

In [None]:
# Run the LR range test on the fine-tuning dataset for 1000 steps with the current
# run config; the result is meant to guide the 'max_lr' chosen in the next cell.
lr_range_test(finetune_dataset, finetune_model, finetune_run_config, num_steps=1000)

### Adjust run config

In [None]:
# Set the learning rate and regularisation values chosen after the LR range test.
# These currently match the preset values; edit them here per experiment.
finetune_run_config.update(max_lr=5e-5, dropout=0.1, wd=1e-6)

### Start train and eval loop

In [None]:
# Fine-tune with the adjusted run config, non-distributed. eval_every=10000 equals the
# configured 'max_train_steps' (presumably giving one evaluation at the end of training —
# confirm in routines.train_and_evaluate).
train_and_evaluate(
    finetune_dataset, finetune_model, finetune_run_config, eval_every=10000, distributed=False)

### Evaluation

In [None]:
# Evaluate the fine-tuned model on the fine-tuning dataset with the same run config used for training.
evaluate(finetune_dataset, finetune_model, finetune_run_config)