# Machine Learning Project

Article: *Multi-Task Deep Neural Networks for Natural Language Understanding*

<a href="https://arxiv.org/abs/1901.11504">https://arxiv.org/abs/1901.11504</a>

In [None]:
### Colab
!git clone -b insertMyNotebookAndScripts https://github.com/matteoghera/MT-DNN.git
!mkdir MT-DNN/models
!pip install path

In [None]:
import os

from path import Path

# Project root. The active line is the Colab location; for PyCharm and AWS
# switch to the commented alternative instead.
PROJ_DIR = Path("/content/MT-DNN")
#PROJ_DIR = Path().getcwd().parent
DATA_DIR, MODELS_DIR = PROJ_DIR / "data", PROJ_DIR / "models"

# Make the project root the working directory, then print it.
os.chdir(PROJ_DIR)
print(PROJ_DIR)

In [None]:
# Runs the project's GLUE download script for all tasks with ./data as the
# data directory (presumably where the datasets are written — confirm).
!python myscripts/download_glue_data.py --data_dir data --tasks all  ##uncomment

In [None]:
!pip install -e .


## Multi-task Deep Neural Network

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all modules before
# running each cell, so edits to imported sources are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import json
import os
import shutil
import sys
from tempfile import TemporaryDirectory

import pandas as pd
import torch

from mtdnn.common.types import EncoderModelType
from mtdnn.configuration_mtdnn import MTDNNConfig
from mtdnn.data_builder_mtdnn import MTDNNDataBuilder
from mtdnn.modeling_mtdnn import MTDNNModel
from mtdnn.process_mtdnn import MTDNNDataProcess
from mtdnn.tasks.config import MTDNNTaskDefs, MTDNNTaskConfig
from mtdnn.tokenizer_mtdnn import MTDNNTokenizer


In [None]:
# Define Configuration, Tasks and Model Objects

# TemporaryDirectory() is used here only to obtain a unique path name: the
# object itself is not kept, so the directories are created explicitly below.
ROOT_DIR = TemporaryDirectory().name

# Checkpoints and prediction outputs.
OUTPUT_DIR = os.path.join(ROOT_DIR, 'checkpoint')
os.makedirs(OUTPUT_DIR, exist_ok=True)

# TensorBoard event files.
LOG_DIR = os.path.join(ROOT_DIR, 'tensorboard_logdir')
os.makedirs(LOG_DIR, exist_ok=True)

# Training parameters
BATCH_SIZE = 16
MULTI_GPU_ON = False
MAX_SEQ_LEN = 128
NUM_EPOCHS = 5

# Task list
# ALL_TASKS records every task name considered for this project; `tasks` is
# the subset actually used by the rest of the notebook.
ALL_TASKS = ["cola", "sst", "mnli", "rte", "wnli", "qqp", "mrpc", "snli", "stsb", "qnli"]
tasks = ["rte", "wnli"]

In [None]:
# Show where checkpoints and TensorBoard logs go, one path per line.
print(OUTPUT_DIR, LOG_DIR, sep="\n")

In [None]:
# Collect the settings passed to MTDNNConfig, then build the config object
# from them.
mtdnn_settings = dict(
    batch_size=BATCH_SIZE,
    max_seq_len=MAX_SEQ_LEN,
    multi_gpu_on=MULTI_GPU_ON,
)
config = MTDNNConfig(**mtdnn_settings)


In [None]:
# Tokenizer handed to the data builder further down. do_lower_case=True
# presumably matches the uncased BERT checkpoint used for the model — confirm.
tokenizer = MTDNNTokenizer(do_lower_case=True)


In [None]:
# Load the GLUE task definitions shipped with the project scripts
# (json is imported in the notebook's import cell).
with open(str(PROJ_DIR / "myscripts" / "glue_config.json")) as f:
    glue_config = json.load(f)

# Fail early, with a readable message, if a requested task has no definition.
unknown_tasks = [task for task in tasks if task not in glue_config]
if unknown_tasks:
    raise KeyError(f"Tasks missing from glue_config.json: {unknown_tasks}")

my_task_config = {}
for task in tasks:
    # Copy the entry so that adding "data_source_dir" leaves glue_config intact.
    task_config = dict(glue_config[task])
    # The first component of the first data path names the task's data folder.
    task_folder = task_config["data_paths"][0].split("/")[0]
    task_config["data_source_dir"] = str(DATA_DIR / task_folder)
    my_task_config[task] = task_config

task_defs = MTDNNTaskDefs(my_task_config)
    

In [None]:
# Build one MTDNNModel per configured task and collect them in `models`.
models = []
for key, task_cfg in my_task_config.items():
    # Data folder of this task: first component of its first data path.
    task_folder = task_cfg["data_paths"][0].split("/")[0]
    data_source_dir = str(DATA_DIR / task_folder)

    # Vectorize the task data found in that folder.
    data_builder = MTDNNDataBuilder(
        tokenizer=tokenizer,
        task_defs=task_defs,
        data_dir=data_source_dir,
        canonical_data_suffix="canonical_data",
        dump_rows=False,
    )
    vectorized_data = data_builder.vectorize()

    data_processor = MTDNNDataProcess(
        config=config,
        task_defs=task_defs,
        vectorized_data=vectorized_data,
    )

    # Data loaders for the three splits.
    multitask_train_dataloader = data_processor.get_train_dataloader()
    dev_dataloaders_list = data_processor.get_dev_dataloaders()
    test_dataloaders_list = data_processor.get_test_dataloaders()

    # Per-task options forwarded to the model constructor.
    decoder_opts = data_processor.get_decoder_options_list()
    task_types = data_processor.get_task_types_list()
    dropout_list = data_processor.get_tasks_dropout_prob_list()
    loss_types = data_processor.get_loss_types_list()
    kd_loss_types = data_processor.get_kd_loss_types_list()
    tasks_nclass_list = data_processor.get_task_nclass_list()

    # Test set names: take every "*test*.tsv" file in the data folder, keep the
    # part before the first dot, and substitute the task key for "test".
    test_datasets_list = [
        filename.split(".")[0].replace("test", key)
        for filename in os.listdir(data_source_dir)
        if "test" in filename and ".tsv" in filename
    ]

    num_all_batches = data_processor.get_num_all_batches()

    model = MTDNNModel(
        config,
        task_defs,
        pretrained_model_name="bert-base-uncased",
        num_train_step=num_all_batches,
        decoder_opts=decoder_opts,
        task_types=task_types,
        dropout_list=dropout_list,
        loss_types=loss_types,
        kd_loss_types=kd_loss_types,
        tasks_nclass_list=tasks_nclass_list,
        multitask_train_dataloader=multitask_train_dataloader,
        dev_dataloaders_list=dev_dataloaders_list,
        test_dataloaders_list=test_dataloaders_list,
        test_datasets_list=test_datasets_list,
        output_dir=OUTPUT_DIR,
        log_dir=LOG_DIR,
    )
    models.append(model)
    

In [None]:
# Train every model, run prediction from its last checkpoint, and collect the
# dev-set accuracy reported in the JSON result files found in OUTPUT_DIR.
results = {}

# Checkpoint written for the last epoch. The index is derived from NUM_EPOCHS
# rather than hard-coded, so changing the epoch count cannot leave it pointing
# at a stale or missing file. Assumes checkpoints are named model_<epoch>.pt
# with a zero-based epoch index (model_4.pt for 5 epochs) — TODO confirm.
final_checkpoint = f"{OUTPUT_DIR}/model_{NUM_EPOCHS - 1}.pt"

for model in models:
    model.fit(epochs=NUM_EPOCHS)
    model.predict(trained_model_chckpt=final_checkpoint)

    # Result files of interest: JSON files whose name contains "dev".
    dev_result_files = [
        fname for fname in os.listdir(OUTPUT_DIR)
        if fname.endswith('.json') and 'dev' in fname
    ]
    for d in dev_result_files:
        # Column label: first three "_"-separated name parts, capitalized.
        name = ' '.join(part.capitalize() for part in d.split('_')[:3])
        file_name = os.path.join(OUTPUT_DIR, d)
        with open(file_name, 'r') as f:
            res = json.load(f)
        # The file is closed at this point; record the accuracy to 3 decimals.
        results[name] = {'ACCURACY': f"{res['metrics']['ACC']:.3f}"}

df_results = pd.DataFrame(results)
df_results

