#### Installation

In [1]:
# Copy the MT-DNN sources from the input directory into the working directory
# so they can be installed in editable mode in the next cell.
!cp -r ../input/mtdnn0502/MT-DNN MT-DNN

In [2]:
# Editable install of the local MT-DNN checkout. `%pip` (rather than `!pip`) installs
# into the environment of the running kernel, matching the other install cells.
%pip install -e MT-DNN
# Sanity check: show the installed mtdnn entry.
!pip list | grep mtdnn

In [3]:
import os
import sys

# Make the local MT-DNN checkout importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
if 'MT-DNN' not in sys.path:
    sys.path.append('MT-DNN')
sys.path

In [4]:
# Install torch pinned to 1.5.0; torchvision, torchaudio and torchtext are left unpinned.
# NOTE(review): unpinned companions may resolve to releases built for a different
# torch version — consider pinning them to the releases matching torch 1.5.0.
%pip install torch==1.5.0 torchvision torchaudio torchtext

In [5]:
# Fetch NVIDIA apex and install it from the cloned directory.
# Installing from ./apex avoids changing the kernel's working directory: with the
# chdir('apex') / chdir('..') pair, an interrupted cell left the kernel inside
# apex/ and broke every relative path used later in the notebook.
!git clone https://github.com/NVIDIA/apex
%pip install -v --disable-pip-version-check --no-cache-dir ./apex

In [6]:
# Create the data directory; -p makes the cell safe to re-run when it already exists.
!mkdir -p data

In [7]:
# Stage every input dataset under data/, one sub-directory per dataset
# (the DATA_* constants defined later point at these sub-directories).
!cp -r ../input/clrusraw data/clrusraw
!cp -r ../input/clengraw data/clengraw
!cp -r ../input/clrusnorm data/clrusnorm
!cp -r ../input/clengnorm data/clengnorm
!cp -r ../input/extrus data/extrus
# Fixed copy-paste slip: the source used to be ../input/extrus, which filled
# data/exteng with the Russian data. NOTE(review): confirm ../input/exteng is attached.
!cp -r ../input/exteng data/exteng
!cp -r ../input/extrusnorm data/extrusnorm
!cp -r ../input/extengnorm data/extengnorm

#### Defining the tasks and building the data

In [8]:
import json
import os
import shutil
import sys
from tempfile import TemporaryDirectory

import pandas as pd
import torch

from mtdnn.common.types import EncoderModelType
from mtdnn.configuration_mtdnn import MTDNNConfig
from mtdnn.data_builder_mtdnn import MTDNNDataBuilder
from mtdnn.modeling_mtdnn import MTDNNModel
from mtdnn.process_mtdnn import MTDNNDataProcess
from mtdnn.tasks.config import MTDNNTaskDefs
from mtdnn.tokenizer_mtdnn import MTDNNTokenizer

In [9]:
# --- Output locations -------------------------------------------------------
# Checkpoints and TensorBoard logs both live under ROOT_DIR.
ROOT_DIR = 'result'
OUTPUT_DIR = os.path.join(ROOT_DIR, 'checkpoint')
LOG_DIR = os.path.join(ROOT_DIR, 'tensorboard_logdir')

# exist_ok=True makes the cell re-runnable and avoids the check-then-create race
# of the former `os.makedirs(...) if not os.path.exists(...) else ...` expression.
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)

# --- Input locations --------------------------------------------------------
# One sub-directory of DATA_DIR per dataset copied in during installation.
DATA_DIR = "data"
DATA_CL_RUS = os.path.join(DATA_DIR, "clrusraw")
DATA_CL_ENG = os.path.join(DATA_DIR, "clengraw")
DATA_CL_RUS_NORM = os.path.join(DATA_DIR, "clrusnorm")
DATA_CL_ENG_NORM = os.path.join(DATA_DIR, "clengnorm")
DATA_EXT_RUS = os.path.join(DATA_DIR, "extrus")
DATA_EXT_ENG = os.path.join(DATA_DIR, "exteng")
DATA_EXT_RUS_NORM = os.path.join(DATA_DIR, "extrusnorm")
DATA_EXT_ENG_NORM = os.path.join(DATA_DIR, "extengnorm")

# --- Parameters -------------------------------------------------------------
# Pre-trained encoder used for both the tokenizer and the model.
PRE_TRAINED_MODEL_NAME = 'cimm-kzn/enrudr-bert'
# Alternative encoder:
#PRE_TRAINED_MODEL_NAME = 'DeepPavlov/rubert-base-cased-conversational'

# Tasks for which test-set predictions are produced.
TEST_DATASET_LIST = ['clrus']

BATCH_SIZE = 32
MAX_SEQ_LEN = 128
NUM_EPOCHS = 5
LR = 3e-5

In [10]:
# Training configuration assembled from the constants of the parameters cell.
config = MTDNNConfig(
    init_checkpoint=PRE_TRAINED_MODEL_NAME,  # encoder to start from
    batch_size=BATCH_SIZE,
    max_seq_len=MAX_SEQ_LEN,
    cuda=True,  # NOTE(review): hard-coded; presumably requires a GPU runtime
    epochs=NUM_EPOCHS,
    learning_rate=LR,
)

In [11]:
def _classification_task(data_source_dir, split_names):
    """Build the settings shared by the two-class classification tasks.

    Args:
        data_source_dir: directory holding the task's source data.
        split_names: names of the data splits available for the task.

    Returns:
        A new settings dict; no mutable object is shared between calls.
    """
    return {
        "data_format": "PremiseOnly",
        "encoder_type": "BERT",
        "dropout_p": 0.3,
        "enable_san": False,
        "metric_meta": ["F1", "Precision", "Recall"],
        "loss": "CeCriterion",
        "kd_loss": "MseCriterion",
        "n_class": 2,
        "split_names": list(split_names),
        "data_process_opts": {"header": True, "is_train": True, "multi_snli": False},
        "task_type": "Classification",
        "data_source_dir": data_source_dir,
    }


def _sequence_labeling_task(data_source_dir):
    """Build the settings shared by the sequence-labeling tasks (train split only).

    Args:
        data_source_dir: directory holding the task's source data.

    Returns:
        A new settings dict; no mutable object is shared between calls.
    """
    labels = ["O", "B", "I", "X", "CLS", "SEP"]
    return {
        "data_format": "Sequence",
        "encoder_type": "BERT",
        "dropout_p": 0.3,
        "enable_san": False,
        "labels": labels,
        "metric_meta": ["SeqEval"],
        # One class per label (6), derived so the two cannot drift apart.
        "n_class": len(labels),
        "loss": "SeqCeCriterion",
        "split_names": ["train"],
        "data_process_opts": {"header": False, "is_train": True, "multi_snli": False},
        "task_type": "SequenceLabeling",
        "data_source_dir": data_source_dir,
    }


# Each *_task maps a task name to its settings. Only "clrus" declares dev and
# test splits; the other three tasks contribute training data only.
clrus_task = {"clrus": _classification_task(DATA_CL_RUS_NORM, ["train", "dev", "test"])}
cleng_task = {"cleng": _classification_task(DATA_CL_ENG_NORM, ["train"])}
extrus_task = {"extrus": _sequence_labeling_task(DATA_EXT_RUS_NORM)}
exteng_task = {"exteng": _sequence_labeling_task(DATA_EXT_ENG_NORM)}

In [12]:
# Merge the four single-task dicts into one mapping of task name -> settings.
# Later dicts would win on a key clash, exactly as with successive update() calls.
task_params = {
    **clrus_task,
    **cleng_task,
    **extrus_task,
    **exteng_task,
}

In [13]:
# Wrap the merged task settings in MT-DNN's task-definition object; it is passed
# to the data builder, the data processor and the model in the cells below.
task_defs = MTDNNTaskDefs(task_params)

In [14]:
# Tokenizer for the same pre-trained encoder the model starts from.
tokenizer = MTDNNTokenizer(
    model_name=PRE_TRAINED_MODEL_NAME,
    do_lower_case=False,  # keep the original casing of the input text
)

In [16]:
# Builder that turns the task source data under DATA_DIR into model inputs.
data_builder = MTDNNDataBuilder(
    task_defs=task_defs,
    tokenizer=tokenizer,
    data_dir=DATA_DIR,
    canonical_data_suffix="canonical_data",
    dump_rows=False,
)

# Run the builder; the result is handed to MTDNNDataProcess in the next cell.
vectorized_data = data_builder.vectorize()

In [17]:
# Processor that exposes data loaders and per-task options for the vectorized data.
data_processor = MTDNNDataProcess(
    config=config,
    task_defs=task_defs,
    vectorized_data=vectorized_data,
)

In [18]:
# Fetch the data loaders from the processor: one multi-task training loader plus
# the dev and test loader collections. All three are passed to MTDNNModel below.
multitask_train_dataloader = data_processor.get_train_dataloader()
dev_dataloaders_list = data_processor.get_dev_dataloaders()
test_dataloaders_list = data_processor.get_test_dataloaders()

In [19]:
# Per-task option lists read back from the processor; each one is forwarded
# unchanged to the MTDNNModel constructor under the keyword of the same name.
decoder_opts = data_processor.get_decoder_options_list()
task_types = data_processor.get_task_types_list()
dropout_list = data_processor.get_tasks_dropout_prob_list()
loss_types = data_processor.get_loss_types_list()
kd_loss_types = data_processor.get_kd_loss_types_list()
tasks_nclass_list = data_processor.get_task_nclass_list()

In [20]:
# Batch count reported by the processor; passed to the model as `num_train_step`.
num_all_batches = data_processor.get_num_all_batches()

#### Training the model and making predictions

In [33]:
# Assemble the multi-task model from everything prepared in the cells above.
model = MTDNNModel(
    config,
    task_defs,
    pretrained_model_name=PRE_TRAINED_MODEL_NAME,
    num_train_step=num_all_batches,
    # Per-task settings obtained from the data processor.
    task_types=task_types,
    tasks_nclass_list=tasks_nclass_list,
    decoder_opts=decoder_opts,
    dropout_list=dropout_list,
    loss_types=loss_types,
    kd_loss_types=kd_loss_types,
    # Data loaders.
    multitask_train_dataloader=multitask_train_dataloader,
    dev_dataloaders_list=dev_dataloaders_list,
    test_dataloaders_list=test_dataloaders_list,
    test_datasets_list=TEST_DATASET_LIST,
    # Where checkpoints and TensorBoard logs are written.
    output_dir=OUTPUT_DIR,
    log_dir=LOG_DIR,
)

In [34]:
# Train the model. NOTE(review): the next cell expects one checkpoint per epoch
# at OUTPUT_DIR/model_<epoch>.pt — presumably written by fit(); confirm.
model.fit()

In [35]:
# Evaluate the checkpoint of every epoch, with test metrics enabled.
for epoch in range(NUM_EPOCHS):
    checkpoint_path = f"{OUTPUT_DIR}/model_{epoch}.pt"
    model.predict(
        trained_model_chckpt=checkpoint_path,
        saved_epoch_idx=f'{epoch}',
        calc_test_metrics=True,
    )

#### Displaying the metrics

In [36]:
def _result_files(output_dir, split):
    """Return the sorted names of the JSON files in `output_dir` whose name contains `split`.

    Sorting makes the processing order (and hence the column order of the final
    table) independent of the arbitrary order returned by os.listdir.
    """
    return sorted(
        fname for fname in os.listdir(output_dir)
        if fname.endswith('.json') and split in fname
    )


def _epoch_sort_key(label):
    """Sort key for 'Epoch <n>' labels: numeric epochs in numeric order, other labels last."""
    suffix = label.rsplit(' ', 1)[-1]
    if suffix.isdigit():
        return (0, int(suffix), label)
    return (1, 0, label)


def _collect_metrics(output_dir, file_names, cl_table, ext_table):
    """Merge the metrics stored in `file_names` into `cl_table` / `ext_table` in place.

    Both tables map '<Task> <Split> <metric>' to {'Epoch <n>': value}. Files whose
    capitalized name contains 'Cl' go to `cl_table` with values formatted to three
    decimals; files containing 'Ext' go to `ext_table` with values stored as loaded.

    Args:
        output_dir: directory containing the result files.
        file_names: result file names; the first two '_'-separated parts name the
            task and split, the last part before the extension is the epoch
            (presumably '<task>_<split>_..._<epoch>.json' — verify against predict()).
        cl_table: dict collecting the classification metrics.
        ext_table: dict collecting the extraction metrics.
    """
    for fname in file_names:
        name = ' '.join(part.capitalize() for part in fname.split('_')[:2])
        epoch_label = f"Epoch {fname.split('.')[0].split('_')[-1]}"
        with open(os.path.join(output_dir, fname), 'r') as f:
            metrics = json.load(f)['metrics']
        for metric, value in metrics.items():
            name_and_metric = ' '.join([name, metric])
            if 'Cl' in name:
                cl_table.setdefault(name_and_metric, {})[epoch_label] = f"{value:.3f}"
            elif 'Ext' in name:
                ext_table.setdefault(name_and_metric, {})[epoch_label] = value


cl_results = {}
ext_results = {}
dev_result_files = _result_files(OUTPUT_DIR, 'dev')
test_result_files = _result_files(OUTPUT_DIR, 'test')

# Dev and test files share one code path instead of two copy-pasted loops.
_collect_metrics(OUTPUT_DIR, dev_result_files, cl_results, ext_results)
_collect_metrics(OUTPUT_DIR, test_result_files, cl_results, ext_results)

# One column per '<Task> <Split> <metric>', one row per epoch, in epoch order.
cl_results = pd.DataFrame(cl_results)
cl_results = cl_results.reindex(sorted(cl_results.index, key=_epoch_sort_key))
cl_results

In [37]:
def _epoch_item_order(item):
    """Sort key for ('Epoch <n>', data) pairs: numeric epochs in numeric order, other labels last."""
    label = item[0]
    suffix = label.rsplit(' ', 1)[-1]
    if suffix.isdigit():
        return (0, int(suffix), label)
    return (1, 0, label)


# Print the extraction metrics of every task/split, epoch by epoch.
for n_and_m, epochs in ext_results.items():
    print(n_and_m)
    # A plain sorted() compares the labels as strings, which would put
    # 'Epoch 10' before 'Epoch 2' once NUM_EPOCHS exceeds 10.
    for epoch, data in sorted(epochs.items(), key=_epoch_item_order):
        print(epoch)
        print(data)

In [38]:
# Delete the whole result directory, i.e. the checkpoint and TensorBoard
# sub-directories created in the configuration cell and everything in them.
!rm -r result