<a href="https://colab.research.google.com/github/mobarakol/tutorial_notebooks/blob/main/Text_Classification_HF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Installing libraries

In [2]:
! pip -q install datasets transformers

[K     |████████████████████████████████| 306 kB 4.2 MB/s 
[K     |████████████████████████████████| 3.4 MB 42.6 MB/s 
[K     |████████████████████████████████| 243 kB 49.6 MB/s 
[K     |████████████████████████████████| 67 kB 5.0 MB/s 
[K     |████████████████████████████████| 1.1 MB 35.8 MB/s 
[K     |████████████████████████████████| 133 kB 53.1 MB/s 
[K     |████████████████████████████████| 895 kB 37.5 MB/s 
[K     |████████████████████████████████| 596 kB 33.9 MB/s 
[K     |████████████████████████████████| 3.3 MB 22.9 MB/s 
[K     |████████████████████████████████| 271 kB 47.3 MB/s 
[K     |████████████████████████████████| 144 kB 43.0 MB/s 
[K     |████████████████████████████████| 160 kB 52.4 MB/s 
[?25h

#Dataset

In [1]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer

#SST-2 (Stanford Sentiment Treebank) Determine if the sentence has a positive(1) or negative(0) sentiment
data = "sst2"
model_checkpoint = "distilbert-base-uncased"
batch_size = 256
dataset = load_dataset("glue", data)
metric = load_metric('glue', data)
print('sample data:',dataset["train"][2])
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def preprocess_function(examples):
    return tokenizer(examples["sentence"], truncation=True)
preprocess_function(dataset['train'][:5])

encoded_dataset = dataset.map(preprocess_function, batched=True)

Reusing dataset glue (/root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

sample data: {'sentence': 'that loves its characters and communicates something rather beautiful about human nature ', 'label': 1, 'idx': 2}


Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-3579cee9a16cdd8d.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-890a06cbede8501a.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-c8db0441bf104b36.arrow


# Fine-tuning the transformer model

In [4]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np

num_labels = 2
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

metric_name = "accuracy"
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{model_name}-finetuned-{data}",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.15.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/9c169103d7e5a

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.263776,0.881881


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: idx, sentence.
***** Running Evaluation *****
  Num examples = 872
  Batch size = 256
Saving model checkpoint to distilbert-base-uncased-finetuned-sst2/checkpoint-264
Configuration saved in distilbert-base-uncased-finetuned-sst2/checkpoint-264/config.json
Model weights saved in distilbert-base-uncased-finetuned-sst2/checkpoint-264/pytorch_model.bin
tokenizer config file saved in distilbert-base-uncased-finetuned-sst2/checkpoint-264/tokenizer_config.json
Special tokens file saved in distilbert-base-uncased-finetuned-sst2/checkpoint-264/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from distilbert-base-uncased-finetuned-sst2/checkpoint-264 (score: 0.8818807339449541).


TrainOutput(global_step=264, training_loss=0.28710911490700464, metrics={'train_runtime': 523.5597, 'train_samples_per_second': 128.637, 'train_steps_per_second': 0.504, 'total_flos': 902739449301840.0, 'train_loss': 0.28710911490700464, 'epoch': 1.0})

#Evaluation

In [5]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: idx, sentence.
***** Running Evaluation *****
  Num examples = 872
  Batch size = 256


{'epoch': 1.0,
 'eval_accuracy': 0.8818807339449541,
 'eval_loss': 0.2637762129306793,
 'eval_runtime': 2.8348,
 'eval_samples_per_second': 307.608,
 'eval_steps_per_second': 1.411}