In [1]:
import dataclasses
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Dict, Optional

import numpy as np

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, EvalPrediction, GlueDataset
from transformers import GlueDataTrainingArguments as DataTrainingArguments
from transformers import (
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    glue_compute_metrics,
    glue_output_modes,
    glue_tasks_num_labels,
    set_seed,
)

logging.basicConfig(level=logging.INFO)

In [2]:
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )

In [8]:
model_args = ModelArguments(
    model_name_or_path="roberta-base",
)
data_args = DataTrainingArguments(task_name="mnli", data_dir="/nfs/research/regan/nntest/data/glue_data/MNLI")
training_args = TrainingArguments(
    output_dir="./models/model_name",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_gpu_train_batch_size=32,
    per_gpu_eval_batch_size=128,
    num_train_epochs=1,
    logging_steps=500,
    logging_first_step=True,
    save_steps=1000,
    evaluate_during_training=True,
)

In [9]:
set_seed(training_args.seed)

In [10]:
num_labels = glue_tasks_num_labels[data_args.task_name]
num_labels

3

In [11]:
config = AutoConfig.from_pretrained(
    model_args.model_name_or_path,
    num_labels=num_labels,
    finetuning_task=data_args.task_name,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_args.model_name_or_path,
    config=config,
)

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json from cache at /nfs/research/regan/.cache/torch/transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.117c81977c5979de8c088352e74ec6e70f5c66096c28b61d3c50101609b39690
INFO:transformers.configuration_utils:Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "finetuning_task": "mnli",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_t

In [12]:
# Get datasets
train_dataset = GlueDataset(data_args, tokenizer=tokenizer, limit_length=100_000)
eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode='dev')

INFO:filelock:Lock 140611966721872 acquired on /nfs/research/regan/nntest/data/glue_data/MNLI/cached_train_RobertaTokenizer_128_mnli.lock
INFO:transformers.data.datasets.glue:Creating features from dataset file at /nfs/research/regan/nntest/data/glue_data/MNLI
INFO:transformers.data.processors.glue:*** Example ***
INFO:transformers.data.processors.glue:guid: train-0
INFO:transformers.data.processors.glue:features: InputFeatures(input_ids=[0, 32253, 13851, 6353, 2972, 16364, 34, 80, 3280, 22735, 111, 1152, 8, 18947, 4, 2, 2, 7702, 8, 18947, 32, 99, 146, 6353, 2972, 16364, 173, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0

INFO:transformers.data.processors.glue:*** Example ***
INFO:transformers.data.processors.glue:guid: dev_matched-2
INFO:transformers.data.processors.glue:features: InputFeatures(input_ids=[0, 37463, 939, 218, 75, 216, 939, 939, 33, 4281, 8597, 59, 123, 37463, 2128, 939, 101, 123, 53, 23, 5, 276, 498, 939, 657, 7, 192, 4909, 1451, 123, 2, 2, 38, 101, 123, 13, 5, 144, 233, 6, 53, 74, 202, 2254, 1782, 951, 1451, 123, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [13]:
def compute_metrics(p: EvalPrediction) -> Dict:
    preds = np.argmax(p.predictions, axis=1)
    return glue_compute_metrics(data_args.task_name, preds, p.label_ids)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

INFO:transformers.training_args:PyTorch: setting up devices
INFO:transformers.trainer:You are instantiating a Trainer but W&B is not installed. To use wandb logging, run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface.


In [15]:
%%time
trainer.train()

INFO:transformers.trainer:***** Running training *****
INFO:transformers.trainer:  Num examples = 100000
INFO:transformers.trainer:  Num Epochs = 1
INFO:transformers.trainer:  Instantaneous batch size per device = 8
INFO:transformers.trainer:  Total train batch size (w. parallel, distributed & accumulation) = 128
INFO:transformers.trainer:  Gradient Accumulation steps = 1
INFO:transformers.trainer:  Total optimization steps = 782


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=782.0, style=ProgressStyle(description_wi…

INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 9815
INFO:transformers.trainer:  Batch size = 512


{"loss": 0.002216914415359497, "learning_rate": 4.993606138107417e-05, "epoch": 0.0012787723785166241, "step": 1}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=20.0, style=ProgressStyle(description_wi…


{"eval_loss": 1.1065552234649658, "eval_mnli/acc": 0.31818644931227713, "epoch": 0.0012787723785166241, "step": 1}


INFO:transformers.trainer:***** Running Evaluation *****
INFO:transformers.trainer:  Num examples = 9815
INFO:transformers.trainer:  Batch size = 512


{"loss": 0.5593169026374817, "learning_rate": 1.80306905370844e-05, "epoch": 0.639386189258312, "step": 500}


HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=20.0, style=ProgressStyle(description_wi…


{"eval_loss": 0.41381365060806274, "eval_mnli/acc": 0.8393275598573612, "epoch": 0.639386189258312, "step": 500}


INFO:transformers.trainer:

Training completed. Do not forget to share your model on huggingface.co/models =)






CPU times: user 10min 49s, sys: 19.2 s, total: 11min 8s
Wall time: 4min 2s


TrainOutput(global_step=782, training_loss=0.5172810772495806)

In [16]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [23]:
!kill 39926
#%tensorboard --logdir runs

In [24]:
### Now let's save our model and tokenizer to a directory
model.save_pretrained('/nfs/research/regan/bin/roberta_models')
tokenizer.save_pretrained('/nfs/research/regan/bin/roberta_models')

INFO:transformers.configuration_utils:Configuration saved in /nfs/research/regan/bin/roberta_models/config.json
INFO:transformers.modeling_utils:Model weights saved in /nfs/research/regan/bin/roberta_models/pytorch_model.bin


('/nfs/research/regan/bin/roberta_models/vocab.json',
 '/nfs/research/regan/bin/roberta_models/merges.txt',
 '/nfs/research/regan/bin/roberta_models/special_tokens_map.json',
 '/nfs/research/regan/bin/roberta_models/added_tokens.json')