# Single Modeling Notebook

Run the following cells to train the joint classifier one task at a time (single-task mode, i.e. `train_jointly` is `False`).

## 0. Imports

In [1]:
%load_ext autoreload
%autoreload 2
    
%load_ext tensorboard

import sys
sys.path.append('../jointclassifier/')
from joint_args import ModelArguments, DataTrainingArguments, TrainingArguments
from joint_dataloader import load_dataset
from joint_trainer import JointTrainer
from single_trainer import SingleTrainer
from joint_model_v1 import JointSeqClassifier

from transformers import HfArgumentParser, AutoConfig, AutoTokenizer
import os

## 1. Initialize the Arguments

In [2]:
# Experiment settings. Every value is a string because it is handed to the
# argument parser below as a command-line token.
task = "formality_toy+jokes_toy"  # task names joined with '+'
data_dir = "../data/processed/"
model_name = "distilbert-base-cased"
model_nick = "distilbert"
output_dir = "../models/"
freeze_encoder = "True"
skip_preclassifier = "True"
train_jointly = "False"
epochs = "1"

# One tuple per command-line option: (flag, value), or (flag,) for a bare switch.
cli_options = [
    ("--model_name_or_path", model_name),
    ("--model_nick", model_nick),
    ("--task", task),
    ("--data_dir", data_dir),
    ("--output_dir", os.path.join(output_dir, model_nick, task, 'single')),
    ("--cache_dir", os.path.join(output_dir, model_nick, "cache")),
    ("--freeze_encoder", freeze_encoder),
    ("--skip_preclassifier", skip_preclassifier),
    ("--train_jointly", train_jointly),
    ("--overwrite_cache",),
    ("--per_device_train_batch_size", "16"),
    ("--per_device_eval_batch_size", "16"),
    ("--max_seq_len", "64"),
    ("--gradient_accumulation_steps", "1"),
    ("--num_train_epochs", epochs),
    ("--logging_steps", "50"),
    ("--save_steps", "50"),
]
# Flatten the tuples into the flat token list the parser expects.
cli_args = [token for option in cli_options for token in option]

# Parse the tokens into the three argument dataclasses used by the rest of the notebook.
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses(cli_args)


PyTorch: setting up devices
  return torch._C._cuda_getDeviceCount() > 0


## 2. Load the Tokenizer

In [3]:
# Fetch the pretrained config and tokenizer for the selected checkpoint.
# Both lookups go through the cache directory set in the arguments cell, and
# the tokenizer's maximum length is tied to the configured max sequence length.
pretrained_name = model_args.model_name_or_path
model_config = AutoConfig.from_pretrained(pretrained_name, cache_dir=model_args.cache_dir)
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_name,
    cache_dir=model_args.cache_dir,
    model_max_length=data_args.max_seq_len,
)
    

loading configuration file https://huggingface.co/distilbert-base-cased/resolve/main/config.json from cache at ../models/distilbert/cache/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a
Model config DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.3.3",
  "vocab_size": 28996
}

loading configuration file https://huggingface.co/distilbert-base-cased/resolve/main/config.json from cache at ../models/distilbert/cache/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e1

## 3. Load the datasets 
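Note: joint training uses a single combined dataset, whereas separate (per-task) training uses a dict of datasets keyed by task name. This notebook builds the dicts.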

In [4]:
# Build one train and one dev dataset per task, keyed by task name.
# The loop variable is named `task_name` (not `task`) so that the '+'-joined
# `task` setting from the arguments cell is not overwritten by this cell.
tasks = data_args.task.split('+')
train_datasets = {}
dev_datasets = {}
for task_name in tasks:
    # Train split first, then dev split, for each task in turn.
    for mode, datasets in (("train", train_datasets), ("dev", dev_datasets)):
        datasets[task_name] = load_dataset(
            data_args.data_dir,
            tokenizer,
            model_name=model_args.model_name_or_path,
            tasks=[task_name],
            mode=mode,
        )

1000it [00:00, 2438.69it/s]
200it [00:00, 2755.01it/s]
289it [00:00, 2881.95it/s]

torch.Size([1000, 64]) torch.Size([1000, 64]) torch.Size([1000, 1]) torch.Size([1000])
torch.Size([200, 64]) torch.Size([200, 64]) torch.Size([200, 1]) torch.Size([200])


1000it [00:00, 2422.96it/s]
200it [00:00, 2576.65it/s]

torch.Size([1000, 64]) torch.Size([1000, 64]) torch.Size([1000, 1]) torch.Size([1000])
torch.Size([200, 64]) torch.Size([200, 64]) torch.Size([200, 1]) torch.Size([200])





## 4. Initialize Trainer in a Loop (with the model)

In [5]:
# Open TensorBoard inside the notebook, pointed at the `runs` log directory
# (relative to the notebook's working directory).
# NOTE(review): presumably the trainer writes its event files under `runs` — confirm.
%tensorboard --logdir runs

Reusing TensorBoard on port 6006 (pid 387229), started 3:20:36 ago. (Use '!kill 387229' to kill it.)

In [6]:
# Train the tasks one after another with a SingleTrainer per task.
# The loop variable is named `task_name` (not `task`) so that the '+'-joined
# `task` setting from the arguments cell is not overwritten by this cell.
for t, task_name in enumerate(tasks):
    print(f"Processing Single Task : {task_name}")
    train_dataset = train_datasets[task_name]
    dev_dataset = dev_datasets[task_name]

    # The first task starts from the pretrained checkpoint; every later task
    # reloads from the output directory so it continues from the model saved
    # while training the previous task.
    # NOTE(review): this relies on the previous run having saved a checkpoint
    # to training_args.output_dir — confirm against SingleTrainer.
    checkpoint = model_args.model_name_or_path if t == 0 else training_args.output_dir

    model = JointSeqClassifier.from_pretrained(
        checkpoint,
        tasks=tasks,
        model_args=model_args,
        task_if_single=task_name,
        joint=training_args.train_jointly,
        # Use the same cache directory as the config/tokenizer cell instead of
        # the default per-user Hugging Face cache.
        cache_dir=model_args.cache_dir,
    )

    trainer = SingleTrainer(
        [training_args, model_args, data_args],
        model,
        train_dataset,
        dev_dataset,
        task_name,
    )
    trainer.train()

Processing Task : formality_toy


loading configuration file https://huggingface.co/distilbert-base-cased/resolve/main/config.json from cache at /home/nuwandavek/.cache/huggingface/transformers/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a
Model config DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.3.3",
  "vocab_size": 28996
}

loading weights file https://huggingface.co/distilbert-base-cased/resolve/main/pytorch_model.bin from cache at /home/nuwandavek/.cache/huggingface/transformers/9c9f39769dba4c5fe379b4bc82973eb01297bd607954621434eb9f1bc85a23a0.06b428c8

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=63.0, style=ProgressStyle(description_wid…

***** Running Evaluation *****
Num examples = 200
Total eval batch size = 16


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=13.0, style=ProgressStyle(description_wid…

Configuration saved in ../models/distilbert/formality_toy+jokes_toy/config.json





Model weights saved in ../models/distilbert/formality_toy+jokes_toy/pytorch_model.bin
Saving model checkpoint to ../models/distilbert/formality_toy+jokes_toy
New best model saved at step 50, epoch 0: f1 = 0.542713567839196
loading configuration file ../models/distilbert/formality_toy+jokes_toy/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "architectures": [
    "JointSeqClassifier"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.3.3",
  "vocab_size": 28996
}

loading weights file ../models/distilbert/formality_toy+jokes_toy/pytorch_model.bin




Processing Task : jokes_toy


All model checkpoint weights were used when initializing JointSeqClassifier.

All the weights of JointSeqClassifier were initialized from the model checkpoint at ../models/distilbert/formality_toy+jokes_toy.
If your task is similar to the task the model of the checkpoint was trained on, you can already use JointSeqClassifier for predictions without further training.
***** Running training *****
Num examples = 1000
Num Epochs = 1.0
Total train batch size = 16
Gradient Accumulation steps = 1
Total optimization steps = 63.0
Logging steps = 50
Save steps = 50


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=63.0, style=ProgressStyle(description_wid…

***** Running Evaluation *****
Num examples = 200
Total eval batch size = 16


HBox(children=(FloatProgress(value=0.0, description='Iteration', max=13.0, style=ProgressStyle(description_wid…

Configuration saved in ../models/distilbert/formality_toy+jokes_toy/config.json





Model weights saved in ../models/distilbert/formality_toy+jokes_toy/pytorch_model.bin
Saving model checkpoint to ../models/distilbert/formality_toy+jokes_toy
New best model saved at step 50, epoch 0: f1 = 0.7138263665594855




