In [1]:
from nlp import load_dataset
from sklearn.metrics import accuracy_score

from transformers import (
BertForSequenceClassification, 
BertTokenizerFast, 
Trainer, 
TrainingArguments,
AutoModelForSequenceClassification,
AutoTokenizer,
pipeline )

### Part 1: Data Loaders
(Using a local `nlp` dataset loading script, `emo_data.py`)

In [2]:
# Load the 'train' and 'test' splits through the local dataset script
# 'emo_data.py'; passing a list of split names returns one dataset per split.
train_dataset, test_dataset = load_dataset('emo_data.py', split = ['train', 'test'])

Downloading and preparing dataset emo_dataset/tuples of labels and text (download: Unknown size, generated: Unknown size, total: Unknown size) to /home/tanmay/.cache/huggingface/datasets/emo_dataset/tuples of labels and text/1.0.0/9d27b5d223aa169b7d5aea4fa220e554e8c73ae2a637b38a8e4d6190350f0124...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset emo_dataset downloaded and prepared to /home/tanmay/.cache/huggingface/datasets/emo_dataset/tuples of labels and text/1.0.0/9d27b5d223aa169b7d5aea4fa220e554e8c73ae2a637b38a8e4d6190350f0124. Subsequent calls will reuse this data.


In [3]:
# Tokenizer and 4-class sequence classifier both start from the same
# pretrained 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=4,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [4]:
# Map every class index the model predicts to the name that the dataset's
# "label" feature assigns to that index.
id2label = {
    class_id: train_dataset.features["label"].int2str(class_id)
    for class_id in range(model.num_labels)
}

In [5]:
# Display the configuration as loaded; at this point id2label / label2id
# still hold the generic LABEL_n placeholder names.
model.config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

In [6]:
# Store the index -> label-name mapping on the model config.
model.config.id2label = id2label

In [7]:
# label2id is the inverse of id2label: label name -> class index.
model.config.label2id = {
    label_name: class_id
    for class_id, label_name in id2label.items()
}

In [8]:
# Display the configuration again to check that id2label / label2id now
# carry the dataset's label names instead of the LABEL_n placeholders.
model.config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "others",
    "1": "happy",
    "2": "sad",
    "3": "angry"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "angry": 3,
    "happy": 1,
    "others": 0,
    "sad": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

In [9]:
def tokenize(batch):
    """Tokenize the ``'text'`` entries of ``batch`` with the notebook's ``tokenizer``.

    Padding and truncation are both enabled, so the encoded sequences of one
    call share a common length.

    Args:
        batch: Mapping with a ``'text'`` key holding the raw input text(s).

    Returns:
        The tokenizer's encoding for ``batch['text']``.
    """
    texts = batch['text']
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

In [10]:
# Tokenize each split as ONE batch: `tokenize` pads within a batch, so a
# single batch per split gives every example in that split the same length.
# Each split uses its own length as the batch size; previously the test split
# reused len(train_dataset), which only worked while the test split happened
# to be the smaller of the two.
train_dataset = train_dataset.map(tokenize, batched = True, batch_size = len(train_dataset))
test_dataset = test_dataset.map(tokenize, batched = True, batch_size = len(test_dataset))

# Return only the model inputs and the label, formatted as torch tensors.
train_dataset.set_format('torch', columns = ['input_ids', 'attention_mask', 'label'])
test_dataset.set_format('torch', columns = ['input_ids', 'attention_mask', 'label'])

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [11]:
def compute_metrics(pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (scores whose last axis is
            arg-maxed to obtain the predicted class) and ``label_ids``
            (the reference labels).

    Returns:
        dict: ``{'accuracy': <accuracy_score of labels vs. predicted classes>}``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_classes)}

### Part 2: Fine-Tune

In [12]:
# Settings for a single fine-tuning epoch. Checkpoints go to ./results and
# logs to ./logs.
# NOTE(review): `evaluate_during_training` is specific to older transformers
# releases; newer ones configure this through an evaluation-strategy
# argument instead -- confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    evaluate_during_training=True,
)

# The test split doubles as the evaluation set; accuracy is reported through
# compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [13]:
# Run fine-tuning with the settings in training_args; the returned
# TrainOutput (global step and training loss) is shown as the cell result.
trainer.train()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1885.0, style=ProgressStyle(description_w…



HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=87.0, style=ProgressStyle(description_wi…






TrainOutput(global_step=1885, training_loss=0.4493328989461183)

In [14]:
# Evaluate on the eval_dataset given to the Trainer (the test split) and
# show the resulting metrics dict, including the accuracy from compute_metrics.
trainer.evaluate()

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=87.0, style=ProgressStyle(description_wi…




{'eval_loss': 0.37193206010450575,
 'eval_accuracy': 0.8740243238337266,
 'epoch': 1.0}

In [15]:
# Write the fine-tuned model to disk so it can be reloaded later with
# from_pretrained on the same directory.
trainer.save_model("../models/emo-fine-tune/")

In [16]:
# Save the tokenizer files to their own directory; the call returns the
# paths of the files it wrote, which are shown as the cell result.
tokenizer.save_pretrained("../tokenizer/emo-fine-tune/")

('../tokenizer/emo-fine-tune/vocab.txt',
 '../tokenizer/emo-fine-tune/special_tokens_map.json',
 '../tokenizer/emo-fine-tune/added_tokens.json')

### Part 3: Pipeline

In [17]:
# Reload the fine-tuned classifier from the directory written by
# trainer.save_model.
transformer = AutoModelForSequenceClassification.from_pretrained("../models/emo-fine-tune/")

In [18]:
# The pipeline tokenizer is loaded from the base 'bert-base-uncased'
# checkpoint rather than from the tokenizer directory saved earlier. This
# also rebinds the `tokenizer` name used during training.
# NOTE(review): presumably equivalent because nothing in this notebook changes
# the vocabulary -- confirm if the tokenizer is ever modified before saving.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [19]:
# Wrap the reloaded model and tokenizer in a classification pipeline and
# try it on one example sentence; the prediction is the cell result.
nlp_sentence_classif = pipeline(
    'sentiment-analysis',
    model=transformer,
    tokenizer=tokenizer,
)
nlp_sentence_classif("I've never had such a bad day in my life")

[{'label': 'sad', 'score': 0.9890854954719543}]