# Joint Modeling Notebook

Run the following cells in order to train a joint classifier.

## 0. Imports

In [2]:
%load_ext autoreload
%autoreload 2
    
import sys
sys.path.append('../jointclassifier/')
from joint_args import ModelArguments, DataTrainingArguments, TrainingArguments
from joint_dataloader import load_dataset
from joint_trainer import JointTrainer
from single_trainer import SingleTrainer
from joint_model_v1 import JointSeqClassifier

from transformers import HfArgumentParser, AutoConfig, AutoTokenizer
import os

## 1. Initialize the Arguments

In [3]:
# --- Run configuration -------------------------------------------------------
# Every value is kept as a string because it is forwarded to the argument
# parser below as a command-line style token.
task = "formality_toy"
data_dir = "../data/processed/"
model_name = "distilbert-base-cased"
model_nick = "distilbert"
output_dir = "../models/"
freeze_encoder = "False"
skip_preclassifier = "False"
train_jointly = "True"
epochs = "5"

# --- Parse into the three argument dataclasses --------------------------------
# The list mimics a command line: each flag is followed by its value, except
# for value-less switches such as --overwrite_cache.
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_args_into_dataclasses([
    # Model and data selection
    "--model_name_or_path", model_name,
    "--model_nick", model_nick,
    "--task", task,
    "--data_dir", data_dir,
    # Outputs and cache are both placed under <output_dir>/<model_nick>
    "--output_dir", os.path.join(output_dir, model_nick),
    "--cache_dir", os.path.join(output_dir, model_nick, "cache"),
    # Training-mode switches
    "--freeze_encoder", freeze_encoder,
    "--skip_preclassifier", skip_preclassifier,
    "--train_jointly", train_jointly,
    "--overwrite_cache",
    # Batching and sequence length
    "--per_device_train_batch_size", "16",
    "--per_device_eval_batch_size", "16",
    "--max_seq_len", "64",
    "--gradient_accumulation_steps", "1",
    # Schedule, logging and checkpointing
    "--num_train_epochs", epochs,
    "--logging_steps", "100",
    "--save_steps", "100",
])


PyTorch: setting up devices
  return torch._C._cuda_getDeviceCount() > 0


## 2. Load the Tokenizer

In [4]:
# The config and the tokenizer are resolved from the same checkpoint name and
# share the same cache directory.
model_config = AutoConfig.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
)
# The tokenizer's maximum length is set from the max_seq_len data argument.
tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
    model_max_length=data_args.max_seq_len,
)
    

loading configuration file https://huggingface.co/distilbert-base-cased/resolve/main/config.json from cache at ../models/distilbert/cache/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a
Model config DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.3.3",
  "vocab_size": 28996
}

loading configuration file https://huggingface.co/distilbert-base-cased/resolve/main/config.json from cache at ../models/distilbert/cache/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e1

## 3. Load the datasets 
Note: joint training uses a single dataset, while separate training uses a dict of datasets.

In [5]:
# Multiple tasks are encoded in a single string, separated by '+'.
tasks = data_args.task.split('+')

# Load the train and dev splits with otherwise identical settings; the
# generator is consumed in order, so "train" is loaded before "dev".
train_dataset, dev_dataset = (
    load_dataset(
        data_args.data_dir,
        tokenizer,
        model_name=model_args.model_name_or_path,
        tasks=tasks,
        mode=split,
    )
    for split in ("train", "dev")
)

1000it [00:00, 2535.43it/s]
200it [00:00, 3007.23it/s]

torch.Size([1000, 64]) torch.Size([1000, 64]) torch.Size([1000, 1]) torch.Size([1000])
torch.Size([200, 64]) torch.Size([200, 64]) torch.Size([200, 1]) torch.Size([200])





## 4. Initialize the Model

In [None]:
# Build the classifier from the same checkpoint that the config and tokenizer
# were loaded from, instead of a hard-coded name, so that changing `model_name`
# in the arguments cell keeps the model and tokenizer in sync.
# NOTE(review): `tasks` receives the raw task string (data_args.task), not the
# '+'-split list used for the datasets — confirm which form the model expects.
model = JointSeqClassifier.from_pretrained(
    model_args.model_name_or_path,
    tasks=data_args.task,
    model_args=model_args,
)

## 5. Initialize Trainer

In [None]:
# Pick the trainer implementation based on the train_jointly argument.
trainer_class = JointTrainer if training_args.train_jointly else SingleTrainer

# NOTE(review): the trainer is constructed without any arguments here —
# confirm whether the model, datasets and arguments need to be passed in.
trainer = trainer_class()