In [1]:
# import json
# import os
# import sys
# from tempfile import TemporaryDirectory

# import numpy as np
# import pandas as pd
# import scrapbook as sb
# import torch
# import torch.nn as nn
# from sklearn.metrics import accuracy_score, classification_report
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
# from tqdm import tqdm
# from utils_nlp.common.timer import Timer
# from utils_nlp.dataset.multinli import load_pandas_df
# from utils_nlp.models.transformers.sequence_classification import (
#     Processor, SequenceClassifier)
# from utils_nlp.models.mtdnn.configuration_mtdnn import MTDNNConfig
# from utils_nlp.models.mtdnn.modeling_mtdnn import MTDNNModel

import os

from utils_nlp.models.mtdnn.common.types import EncoderModelType
from utils_nlp.models.mtdnn.configuration_mtdnn import MTDNNConfig
from utils_nlp.models.mtdnn.modeling_mtdnn import MTDNNModel
from utils_nlp.models.mtdnn.process_mtdnn import MTDNNDataProcess, MTDNNPipelineProcess
from utils_nlp.models.mtdnn.tasks.config import MTDNNTaskDefs

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [2]:
# Device index handed to MTDNNConfig as `cuda_device` in the next cell.
cuda_device = 0

In [3]:
# Base configuration: run on the selected CUDA device with multi-GPU turned off.
config = MTDNNConfig(cuda_device=cuda_device, multi_gpu_on=False)

# Task parameters can be given as a Python dict, yaml or json; a dict is
# assembled here from a few named pieces so each part is easy to read.
mnli_labels = ["contradiction", "neutral", "entailment"]
mnli_split_names = [
    "train",
    "matched_dev",
    "mismatched_dev",
    "matched_test",
    "mismatched_test",
]
mnli_task = dict(
    data_format="PremiseAndOneHypothesis",
    encoder_type="BERT",
    dropout_p=0.3,
    enable_san=True,
    labels=mnli_labels,
    metric_meta=["ACC"],
    loss="CeCriterion",
    kd_loss="MseCriterion",
    n_class=3,
    split_names=mnli_split_names,
    task_type="Classification",
)
tasks_params = {"mnli": mnli_task}

# Turn the raw parameter mapping into task definitions.
task_defs = MTDNNTaskDefs(tasks_params)

INFO - Mapping Task attributes
INFO - Configured task definitions - ['mnli']


In [4]:
# Make the Data Preprocess step and update the config with training data updates.
#
# The canonical-data directory was previously hard-coded to an absolute path
# under one developer's home directory, so the notebook could only run on that
# machine. It can now be overridden with the MTDNN_DATA_DIR environment
# variable; the original location stays as the default, so an environment
# without the variable behaves exactly as before.
DATA_DIR = os.environ.get(
    "MTDNN_DATA_DIR",
    "/home/useradmin/sources/mt-dnn/data/canonical_data/bert_uncased_lower",
)

data_processor = MTDNNDataProcess(
    config=config,
    task_defs=task_defs,
    batch_size=8,
    data_dir=DATA_DIR,
    train_datasets_list=["mnli"],
    test_datasets_list=["mnli_mismatched", "mnli_matched"],
)

INFO - Starting to process the training data sets
INFO - Loading /home/useradmin/sources/mt-dnn/data/canonical_data/bert_uncased_lower/mnli_train.json as task 0


Loaded 392702 samples out of 392702


INFO - Starting to process the testing data sets


Loaded 9832 samples out of 9832
Loaded 9847 samples out of 9847
Loaded 9815 samples out of 9815
Loaded 9796 samples out of 9796


In [5]:
# Fetch the train, dev and test dataloaders from the data processor.
# The calls are evaluated left to right, i.e. train, then dev, then test.
multitask_train_dataloader, dev_dataloaders_list, test_dataloaders_list = (
    data_processor.get_train_dataloader(),
    data_processor.get_dev_dataloaders(),
    data_processor.get_test_dataloaders(),
)

In [6]:
# Display the object returned by data_processor.get_train_dataloader().
multitask_train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f5f52cf6748>

In [7]:
# Update config with data preprocess params.
# `config` is re-bound to the value returned by update_config, so the model
# and pipeline cells further down receive that returned object.
config = data_processor.update_config(config)

In [8]:
# Display the configuration returned by data_processor.update_config.
config

{
  "adam_eps": 1e-06,
  "answer_opt": 0,
  "attention_probs_dropout_prob": 0.1,
  "batch_size": 8,
  "batch_size_eval": 8,
  "bert_dropout_p": 0.1,
  "bert_l2norm": 0.0,
  "cuda": true,
  "cuda_device": 0,
  "decoder_opts": [
    0
  ],
  "dropout_p": 0.1,
  "dropout_w": 0.0,
  "dump_feature": false,
  "embedding_opt": 0,
  "enable_variational_dropout": true,
  "encoder_type": 1,
  "epochs": 5,
  "finetuning_task": null,
  "fp16": false,
  "fp16_opt_level": "01",
  "freeze_layers": 1,
  "global_grad_clipping": 1.0,
  "grad_accumulation_step": 1,
  "grad_clipping": 0.0,
  "have_lr_scheduler": true,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "init_checkpoint": "bert-base-uncased",
  "init_ratio": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "kd_loss_types": [
    1
  ],
  "kwargs": {},
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_n

In [9]:
# Instantiate the model from the updated config, then print its `network`
# attribute so the layer structure can be inspected in the cell output.
model = MTDNNModel(config)
print("Network: ", model.network)

idx: 0, number of task labels: 3
Network:  SANBERTNetwork(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [10]:
# Display the `weight` parameter of network.bert.embeddings.word_embeddings.
# NOTE(review): presumably a sanity check that the initial checkpoint weights
# were loaded - confirm; if only the dimensions matter, `.shape` is a lighter display.
model.network.bert.embeddings.word_embeddings.weight

Parameter containing:
tensor([[-0.0102, -0.0615, -0.0265,  ..., -0.0199, -0.0372, -0.0098],
        [-0.0117, -0.0600, -0.0323,  ..., -0.0168, -0.0401, -0.0107],
        [-0.0198, -0.0627, -0.0326,  ..., -0.0165, -0.0420, -0.0032],
        ...,
        [-0.0218, -0.0556, -0.0135,  ..., -0.0043, -0.0151, -0.0249],
        [-0.0462, -0.0565, -0.0019,  ...,  0.0157, -0.0139, -0.0095],
        [ 0.0015, -0.0821, -0.0160,  ..., -0.0081, -0.0475,  0.0753]],
       device='cuda:0', requires_grad=True)

In [11]:
# Assemble everything the training/inference pipeline needs in one mapping,
# then build the pipeline from it.
pipeline_kwargs = {
    "model": model,
    "config": config,
    "task_defs": task_defs,
    "multitask_train_dataloader": multitask_train_dataloader,
    "dev_dataloaders_list": dev_dataloaders_list,
    "test_dataloaders_list": test_dataloaders_list,
}
pipeline_process = MTDNNPipelineProcess(**pipeline_kwargs)

In [12]:
# Train the model; `epochs=1` is passed explicitly to fit().
# NOTE(review): the config displayed earlier lists "epochs": 5 - confirm how
# that value interacts with the argument given here.
# NOTE(review): in the recorded log the training loss only moves from about
# 1.16 to about 1.09 across the epoch - worth checking the training setup.
pipeline_process.fit(epochs=1)

INFO - Total number of params: 109484547
INFO - At epoch 0
INFO - Amount of data to go over: 49088
INFO - Task - [ 0] Updates - [     1] Training Loss - [1.15971] Time Remaining - [9:36:01]
INFO - Task - [ 0] Updates - [   500] Training Loss - [1.10957] Time Remaining - [5:26:22]
INFO - Task - [ 0] Updates - [  1000] Training Loss - [1.10850] Time Remaining - [5:23:16]
INFO - Task - [ 0] Updates - [  1500] Training Loss - [1.10725] Time Remaining - [5:19:38]
INFO - Task - [ 0] Updates - [  2000] Training Loss - [1.10652] Time Remaining - [5:15:28]
INFO - Task - [ 0] Updates - [  2500] Training Loss - [1.10586] Time Remaining - [5:11:12]
INFO - Task - [ 0] Updates - [  3000] Training Loss - [1.10539] Time Remaining - [5:08:03]
INFO - Task - [ 0] Updates - [  3500] Training Loss - [1.10474] Time Remaining - [5:04:19]
INFO - Task - [ 0] Updates - [  4000] Training Loss - [1.10387] Time Remaining - [5:01:14]
INFO - Task - [ 0] Updates - [  4500] Training Loss - [1.10350] Time Remaining - [

INFO - Task - [ 0] Updates - [ 45000] Training Loss - [1.09114] Time Remaining - [0:27:19]
INFO - Task - [ 0] Updates - [ 45500] Training Loss - [1.09116] Time Remaining - [0:23:59]
INFO - Task - [ 0] Updates - [ 46000] Training Loss - [1.09110] Time Remaining - [0:20:38]
INFO - Task - [ 0] Updates - [ 46500] Training Loss - [1.09103] Time Remaining - [0:17:18]
INFO - Task - [ 0] Updates - [ 47000] Training Loss - [1.09095] Time Remaining - [0:13:57]
INFO - Task - [ 0] Updates - [ 47500] Training Loss - [1.09093] Time Remaining - [0:10:37]
INFO - Task - [ 0] Updates - [ 48000] Training Loss - [1.09093] Time Remaining - [0:07:16]
INFO - Task - [ 0] Updates - [ 48500] Training Loss - [1.09091] Time Remaining - [0:03:55]
INFO - Task - [ 0] Updates - [ 49000] Training Loss - [1.09093] Time Remaining - [0:00:35]


In [14]:
# Run prediction with the pipeline; the recorded output reports Dev ACC for
# mnli_mismatched and mnli_matched and saves test scores and a checkpoint.
# NOTE(review): the recorded log repeats this for epochs 0-4 with identical
# Dev ACC each time - presumably predict() loops over the configured epoch
# count without further training; confirm whether one pass is intended.
pipeline_process.predict()

INFO - At epoch 0


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - Task mnli_mismatched -- epoch 0 -- Dev ACC: 34.856


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - [new test scores saved.]


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - Task mnli_matched -- epoch 0 -- Dev ACC: 36.383


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - [new test scores saved.]
INFO - model saved to checkpoint/model_0.pt
INFO - At epoch 1


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - Task mnli_mismatched -- epoch 1 -- Dev ACC: 34.856


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - [new test scores saved.]


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - Task mnli_matched -- epoch 1 -- Dev ACC: 36.383


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - [new test scores saved.]
INFO - model saved to checkpoint/model_1.pt
INFO - At epoch 2


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - Task mnli_mismatched -- epoch 2 -- Dev ACC: 34.856


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - [new test scores saved.]


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - Task mnli_matched -- epoch 2 -- Dev ACC: 36.383


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - [new test scores saved.]
INFO - model saved to checkpoint/model_2.pt
INFO - At epoch 3


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - Task mnli_mismatched -- epoch 3 -- Dev ACC: 34.856


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - [new test scores saved.]


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - Task mnli_matched -- epoch 3 -- Dev ACC: 36.383


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - [new test scores saved.]
INFO - model saved to checkpoint/model_3.pt
INFO - At epoch 4


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - Task mnli_mismatched -- epoch 4 -- Dev ACC: 34.856


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - [new test scores saved.]


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - Task mnli_matched -- epoch 4 -- Dev ACC: 36.383


predicting 0
predicting 100
predicting 200
predicting 300
predicting 400
predicting 500
predicting 600
predicting 700
predicting 800
predicting 900
predicting 1000
predicting 1100
predicting 1200


INFO - [new test scores saved.]
INFO - model saved to checkpoint/model_4.pt
