# Setting runtime environment

In [None]:
# Select where the notebook is executed: 'LOCAL' or 'COLAB'.
# NOTE: any other value matches neither branch below and leaves HOME undefined.
RUN_FROM = 'LOCAL'
# RUN_FROM = 'COLAB'

if RUN_FROM == 'LOCAL':
    # Local run: HOME is the current user's home directory.
    from os.path import expanduser
    HOME = expanduser("~")
elif RUN_FROM == 'COLAB':
    # Colab run: install hanziconv, mount Google Drive at /content/drive,
    # and change the working directory to the project folder on the drive.
    !pip install hanziconv
    from google.colab import drive
    drive.mount('/content/drive')
    %cd /content/drive/My Drive/projects/lm
    # HOME points at the root of the mounted drive.
    HOME = '/content/drive/My Drive'

# Import modules

In [None]:
import os
from data import Dataset
from model import Model
from routines import lr_range_test, train_and_evaluate, evaluate
from utils import data_utils_py3
# os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"

# Some presets

## Get datasets

In [None]:
# Paths of the character vocabulary and its embedding file, rooted at HOME.
char_vocab_file = HOME + "/Data/Vocab/char_vocab_zh"
char_embedding_files = HOME + "/Data/Vocab/zh_char_300_nlpcc.txt"

# A single vocabulary object is shared by every dataset below.
char_vocab = data_utils_py3.AtomicVocab(
    embedding_files=char_embedding_files,
    filename=char_vocab_file,
)

# Each dataset is read from HOME/Data/<name> and stored under that same name.
datasets = {
    name: Dataset(HOME + "/Data/" + name, char_vocab)
    for name in ('text_zh', 'nlpcc2017_news', 'smp2017_ecdt1')
}

## Various model configs

In [None]:
# Model size presets. Every preset uses the vocabulary size of `char_vocab`
# and a `char_vocab_dim` of 300; only layer size, layer count and head count
# differ between presets.
model_configs = {
    preset: {
        'char_vocab_size': len(char_vocab),
        'char_vocab_dim': 300,
        'layer_size': layer_size,
        'num_layers': num_layers,
        'num_heads': num_heads,
    }
    for preset, layer_size, num_layers, num_heads in (
        ('small', 256, 4, 8),
        ('medium', 640, 10, 10),
        ('large', 1024, 16, 16),
    )
}

def get_dir(dir_name, idx=None):
    """Return the model directory path for ``dir_name``.

    Args:
        dir_name: Base directory name.
        idx: Optional integer run index. When given, ``-<idx>`` is appended
            to ``dir_name`` (an index of ``0`` is appended as well).

    Returns:
        The (possibly suffixed) name unchanged when ``RUN_FROM`` is
        ``'LOCAL'``; that name joined onto ``HOME + '/Models/'`` when
        ``RUN_FROM`` is ``'COLAB'``.

    Raises:
        ValueError: If ``RUN_FROM`` is neither ``'LOCAL'`` nor ``'COLAB'``.
    """
    # Identity comparison is the correct test for "argument not supplied".
    if idx is not None:
        dir_name += "-{:d}".format(idx)
    if RUN_FROM == 'LOCAL':
        return dir_name
    if RUN_FROM == 'COLAB':
        return os.path.join(HOME+'/Models/', dir_name)
    # Fail loudly instead of hitting an unbound local variable.
    raise ValueError("Unsupported RUN_FROM value: {!r}".format(RUN_FROM))

## Various running configs

In [None]:
# Training presets keyed by run-config name. Each preset holds the batch
# size, learning-rate settings, step budget, regularisation values and a
# list of per-input `data` settings.
run_configs = dict(
    pretrain_zh=dict(
        batch_size=32,
        max_lr=5e-4,
        max_train_steps=200000,
        pct_start=0.3,
        dropout=0.1,
        wd=1e-6,
        data=[
            dict(
                target_level=1,
                max_token_length=8,
                min_seq_length=5,
                max_seq_length=256,
            ),
        ],
    ),
    seq2cls_zh=dict(
        batch_size=128,
        max_lr=5e-4,
        max_train_steps=10000,
        pct_start=0.3,
        dropout=0.1,
        wd=1e-6,
        data=[
            dict(target_level=0, max_token_length=8, max_seq_length=256),
            dict(target_level=1, max_token_length=8),
        ],
    ),
)

# Let's Begin!

## Settings

In [None]:
# Model preset to build (a key of `model_configs`).
model_config_name = 'small'

# Datasets for the two stages (keys of `datasets`).
pretrain_dataset_name = 'text_zh'
finetune_dataset_name = 'nlpcc2017_news'

# Run-config presets for the two stages (keys of `run_configs`).
pretrain_run_config_name = 'pretrain_zh'
finetune_run_config_name = 'seq2cls_zh'

# Model directories are named "<run config>-<dataset>-<model preset>".
pretrain_dir = get_dir(
    '-'.join((pretrain_run_config_name, pretrain_dataset_name, model_config_name)))


def finetune_dir(i):
    """Return the fine-tuning model directory for run index ``i``."""
    return get_dir(
        '-'.join((finetune_run_config_name, finetune_dataset_name, model_config_name)),
        i)


# Shallow-copy the chosen pretraining preset, then override selected fields.
pretrain_run_config = dict(
    run_configs[pretrain_run_config_name],
    batch_size=32,
    max_train_steps=200000,
)

# Same for the fine-tuning preset.
finetune_run_config = dict(
    run_configs[finetune_run_config_name],
    batch_size=128,
    max_train_steps=20000,
)

## Pretrain model

### Create model

In [None]:
# Instantiate the pretraining model from the selected preset, the shared
# vocabulary and the pretraining model directory.
pretrain_model = Model(
    model_configs[model_config_name],
    char_vocab,
    pretrain_dir,
)

### LR range test and plot the curve

In [None]:
# Run the LR range test for the pretraining model on the pretraining dataset.
lr_range_test(
    datasets[pretrain_dataset_name],
    pretrain_model,
    pretrain_run_config,
)

### Adjust run config

In [None]:
# Set the learning rate, dropout and weight decay used for pretraining
# (adjust after inspecting the LR range test).
pretrain_run_config.update(max_lr=5e-4, dropout=0.1, wd=1e-6)

### Start the training and evaluation loop

In [None]:
# Pretrain, evaluating every quarter of the configured step budget.
train_and_evaluate(
    datasets[pretrain_dataset_name],
    pretrain_model,
    pretrain_run_config,
    eval_every=int(pretrain_run_config['max_train_steps'] / 4),
    distributed=True,
)

In [None]:
# Evaluate the pretraining model on the pretraining dataset.
evaluate(
    datasets[pretrain_dataset_name],
    pretrain_model,
    pretrain_run_config,
)

## Fine-tune the downstream task model

### Create model

In [None]:
# Instantiate the fine-tuning model in the directory for run index 0.
# `pretrain_dir` is passed as the fourth argument -- presumably so the model
# starts from the pretrained weights; confirm against `Model`.
finetune_model = Model(
    model_configs[model_config_name],
    char_vocab,
    finetune_dir(0),
    pretrain_dir,
)

### LR range test and plot the curve

In [None]:
# Run the LR range test for the fine-tuning model on the fine-tuning dataset.
lr_range_test(
    datasets[finetune_dataset_name],
    finetune_model,
    finetune_run_config,
)

### Adjust run config

In [None]:
# Set the learning rate, dropout and weight decay used for fine-tuning
# (adjust after inspecting the LR range test).
finetune_run_config.update(max_lr=5e-4, dropout=0.1, wd=1e-6)

### Start the training and evaluation loop

In [None]:
# Fine-tune, evaluating at the halfway point and at the end of the step budget.
train_and_evaluate(
    datasets[finetune_dataset_name],
    finetune_model,
    finetune_run_config,
    eval_every=int(finetune_run_config['max_train_steps'] / 2),
    distributed=False,
)

### Evaluation

In [None]:
# Evaluate the fine-tuning model on the fine-tuning dataset.
evaluate(
    datasets[finetune_dataset_name],
    finetune_model,
    finetune_run_config,
)