# Setting runtime environment

In [None]:
RUN_FROM = 'LOCAL'
# RUN_FROM = 'COLAB'

if RUN_FROM == 'LOCAL':
    from os.path import expanduser
    HOME = expanduser("~")
elif RUN_FROM == 'COLAB':
    !pip install hanziconv
    from google.colab import drive
    drive.mount('/content/drive')
    %cd /content/drive/My Drive/projects/lm
    HOME = '/content/drive/My Drive'

# Import modules

In [None]:
import os

# Restrict the visible GPUs *before* the project modules are imported, so the
# setting is already in the environment when any GPU framework they pull in
# is loaded.
# NOTE(review): assumes none of the modules below sets this variable itself — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

from data import Dataset
from model import Model
from routines import lr_range_test, train_and_evaluate
from utils import data_utils_py3

# Setting parameters

## Dataset

In [None]:
# All corpora and vocabulary files live under HOME/Data.
data_root = HOME + "/Data"
vocab_root = data_root + "/Vocab"

# Character vocabulary plus the embedding file passed alongside it.
char_vocab_file = vocab_root + "/vocab_zh"
char_embedding_files = vocab_root + "/zh_char_300_nlpcc.txt"
char_vocab = data_utils_py3.Vocab(
    char_vocab_file, embedding_files=char_embedding_files)

# Both datasets share the same character vocabulary.
text_zh_dataset = Dataset(data_root + "/text_zh", char_vocab)
news_nlpcc2017_dataset = Dataset(data_root + "/news_nlpcc2017", char_vocab)

## Model

In [None]:
def _build_model_config(layer_size, num_layers):
    """Return a model config dict for the given layer size and layer count.

    The vocabulary-related entries are read from the module-level
    ``char_vocab`` and are evaluated afresh on every call.
    """
    return {
        'char_vocab_size': char_vocab.size(),
        'char_vocab_dim': 300,
        'char_vocab_emb': char_vocab.embedding_init.tolist(),
        'layer_size': layer_size,
        'num_layers': num_layers,
    }


# The two configs differ only in layer size and number of layers.
small_model_config = _build_model_config(layer_size=256, num_layers=3)
big_model_config = _build_model_config(layer_size=512, num_layers=16)

def get_dir(dir_name):
    """Resolve the directory path to use for ``dir_name`` in the current environment.

    Args:
        dir_name: Directory name, e.g. ``'pretrain_zh_small'``.

    Returns:
        ``dir_name`` unchanged when ``RUN_FROM`` is ``'LOCAL'``; the path
        ``HOME + '/Models/'`` joined with ``dir_name`` when ``RUN_FROM`` is
        ``'COLAB'``.

    Raises:
        ValueError: If ``RUN_FROM`` is neither ``'LOCAL'`` nor ``'COLAB'``.
    """
    if RUN_FROM == 'LOCAL':
        return dir_name
    if RUN_FROM == 'COLAB':
        return os.path.join(HOME+'/Models/', dir_name)
    # Previously this case fell through and hit an unbound local variable.
    raise ValueError(
        "RUN_FROM must be 'LOCAL' or 'COLAB', got {!r}".format(RUN_FROM))

## Run

In [None]:
# Initial run settings for pretraining; one entry in 'data', marked as the target.
pretrain_run_config = dict(
    batch_size=32,
    max_lr=1e-4,
    max_train_steps=200000,
    pct_start=0.3,
    dropout=0.1,
    wd=1e-6,
    data=[
        dict(is_target=True, max_token_length=8, min_seq_length=2, max_seq_length=256),
    ],
)

# Initial run settings for the seq2cls task: a non-target entry followed by
# the target entry.
seq2cls_run_config = dict(
    batch_size=128,
    max_lr=5e-5,
    max_train_steps=20000,
    pct_start=0.3,
    dropout=0.1,
    wd=1e-6,
    data=[
        dict(is_target=False, max_token_length=8, min_seq_length=2, max_seq_length=256),
        dict(is_target=True, max_token_length=8),
    ],
)

# Let's Begin!

## Pretrain model

### Create model

In [None]:
# Small model for pretraining, bound to the 'pretrain_zh_small' directory.
pretrain_small_model = Model(
    small_model_config,
    get_dir('pretrain_zh_small'),
)

### Run the LR range test and plot the curve

In [None]:
# LR range test for the pretraining setup, limited to 1000 steps.
lr_range_test(
    text_zh_dataset,
    pretrain_small_model,
    pretrain_run_config,
    num_steps=1000,
)

### Adjust run config

In [None]:
# Full redefinition of the pretraining run settings; edit the values here
# (e.g. max_lr) after inspecting the LR range test curve.
pretrain_run_config = dict(
    batch_size=32,
    max_lr=1e-4,
    max_train_steps=200000,
    pct_start=0.3,
    dropout=0.1,
    wd=1e-6,
    data=[
        dict(is_target=True, max_token_length=8, min_seq_length=2, max_seq_length=256),
    ],
)

### Start train and eval loop

In [None]:
# Pretraining run: evaluate every 10000 steps, with distributed=True.
train_and_evaluate(
    text_zh_dataset,
    pretrain_small_model,
    pretrain_run_config,
    eval_every=10000,
    distributed=True,
)

## Fine-tune the downstream task model

### Create model

In [None]:
# Small model for the news_nlpcc2017 task. The variable is named
# `news_nlpcc2017_small_model` because that is the name the LR range test and
# training cells below read; the previous `new_...` spelling left it undefined
# on a fresh kernel.
# NOTE(review): the third argument is the pretraining directory — presumably
# the run to initialise from; confirm against Model.
news_nlpcc2017_small_model = Model(
    small_model_config,
    get_dir('news_nlpcc2017_small'),
    get_dir('pretrain_zh_small'),
)

### Run the LR range test and plot the curve

In [None]:
# LR range test for the news_nlpcc2017 setup, limited to 1000 steps.
lr_range_test(
    news_nlpcc2017_dataset,
    news_nlpcc2017_small_model,
    seq2cls_run_config,
    num_steps=1000,
)

### Adjust run config

In [None]:
# Full redefinition of the seq2cls run settings; edit the values here
# (e.g. max_lr) after inspecting the LR range test curve.
seq2cls_run_config = dict(
    batch_size=128,
    max_lr=5e-5,
    max_train_steps=20000,
    pct_start=0.3,
    dropout=0.1,
    wd=1e-6,
    data=[
        dict(is_target=False, max_token_length=8, min_seq_length=2, max_seq_length=256),
        dict(is_target=True, max_token_length=8),
    ],
)

### Start train and eval loop

In [None]:
# Fine-tuning run: evaluate every 10000 steps, with distributed=False.
train_and_evaluate(
    news_nlpcc2017_dataset,
    news_nlpcc2017_small_model,
    seq2cls_run_config,
    eval_every=10000,
    distributed=False,
)