<a href="https://colab.research.google.com/github/nickprock/appunti_data_science/blob/master/appunti_hugging_face/fine_tuning_sst2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from datasets import load_dataset

In [None]:
# Load the SST-2 configuration of the GLUE benchmark.
sst2 = load_dataset("glue", "sst2")

# Show the available splits with their columns and row counts.
sst2



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [None]:
# Temporarily switch the output format to pandas for the exploratory cells
# that follow (slicing a split then yields a DataFrame).
sst2.set_format(type = "pandas")

In [None]:
# Pull every row of the training split into a single table for inspection.
train_df = sst2["train"][:]

In [None]:
# Preview the training table: one row per sentence with its label and idx.
train_df

Unnamed: 0,sentence,label,idx
0,hide new secretions from the parental units,0,0
1,"contains no wit , only labored gags",0,1
2,that loves its characters and communicates som...,1,2
3,remains utterly satisfied to remain the same t...,0,3
4,on the worst revenge-of-the-nerds clichés the ...,0,4
...,...,...,...
67344,a delightful comedy,1,67344
67345,"anguish , anger and frustration",0,67345
67346,"at achieving the modest , crowd-pleasing goals...",1,67346
67347,a patient viewer,1,67347


In [None]:
# Class balance of the training labels (count of rows per label value).
train_df.label.value_counts()

1    37569
0    29780
Name: label, dtype: int64

In [None]:
# Undo the pandas output format set for exploration before tokenizing.
sst2.reset_format()

In [None]:
# Keep a direct handle on the training split.
train = sst2["train"]

In [None]:
# Inspect the column schema; `label` is a ClassLabel whose names give the
# meaning of the integer classes.
train.features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [None]:
from transformers import AutoTokenizer

In [None]:
# Pretrained checkpoint shared by the tokenizer and the classification model.
# NOTE(review): the recorded outputs of later cells mention
# `BertTokenizerFast` and `bert-base-uncased`, so they appear to come from a
# run with a different checkpoint value - re-run to refresh them.
checkpoint = "distilbert-base-uncased"

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def tokenize(batch):
  """Tokenize the ``sentence`` column of a batch of examples.

  Args:
    batch: Mapping with a ``'sentence'`` entry holding the texts to encode
      (a batch of rows when used with ``map(..., batched=True)``).

  Returns:
    The result of calling the module-level ``tokenizer`` on those texts with
    truncation enabled. No padding is requested here.
  """
  sentences = batch['sentence']
  encoded = tokenizer(sentences, truncation=True)
  return encoded

In [None]:
# Disabled code, kept as a bare string literal so that it never executes (the
# cell merely echoes the string).
# NOTE(review): `validation` and `test` are not defined in any visible cell;
# the DatasetDict-level `sst2.map(tokenize, batched=True)` call further down
# tokenizes all three splits instead, so this cell looks safe to delete.
"""
train_tkn = train.map(tokenize, batched = True)
valid_tkn = validation.map(tokenize, batched = True)
test_tkn = test.map(tokenize, batched = True)
"""

'\ntrain_tkn = train.map(tokenize, batched = True)\nvalid_tkn = validation.map(tokenize, batched = True)\ntest_tkn = test.map(tokenize, batched = True)\n'

In [None]:
# train_tkn

In [None]:
# Tokenize every split in one call; `batched=True` hands `tokenize` a batch
# of rows at a time instead of a single example.
sst2_tkn = sst2.map(tokenize, batched=True)

  0%|          | 0/68 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [None]:
# Check that the tokenizer outputs were added as new columns on each split.
sst2_tkn

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [None]:
from transformers import DataCollatorWithPadding

In [None]:
# Collator that pads the examples of a batch to a common length when the
# batch is assembled (the tokenize step above requested no padding).
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [None]:
# Take the first ten tokenized training rows and keep only the columns the
# collator can turn into tensors (drop the raw text and the row index).
samples = {
    column: values
    for column, values in sst2_tkn["train"][:10].items()
    if column not in ("idx", "sentence")
}

# Token count of each of the ten examples before padding.
list(map(len, samples["input_ids"]))

[10, 11, 15, 10, 22, 13, 29, 6, 13, 16]

In [None]:
# Collate the ten samples into a single batch and show the shape of each
# resulting tensor.
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': torch.Size([10, 29]),
 'token_type_ids': torch.Size([10, 29]),
 'attention_mask': torch.Size([10, 29]),
 'labels': torch.Size([10])}

In [None]:
from transformers import TrainingArguments

In [None]:
# Name passed as the first argument of TrainingArguments.
model_out = 'distilbert-base-uncased-sst2'
# Per-device batch size, used for both training and evaluation.
batch_size = 32
# Number of full batches in the training split, so that logging happens
# roughly once per pass over the data.
# NOTE(review): this assumes a single device - confirm if run on several.
logging_steps = len(sst2_tkn["train"])//batch_size

In [None]:
# Training configuration: evaluate at the end of every epoch, train for five
# epochs, use the same per-device batch size for training and evaluation, log
# every `logging_steps` steps, keep progress bars on, do not push to the Hub
# and only surface error-level log messages.
# Fix: a comma was missing after `push_to_hub = False`, which made the whole
# cell a SyntaxError.
training_args = TrainingArguments(
    output_dir=model_out,
    evaluation_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error",
)

In [None]:
from transformers import AutoModelForSequenceClassification
import torch

In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Echo the selected device.
device

device(type='cuda')

In [None]:
# Load the pretrained checkpoint with a two-class sequence-classification
# head (SST-2 labels are negative / positive) and move it to the selected
# device.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = 2).to(device)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
from datasets import load_metric
import numpy as np
def compute_metrics(eval_preds):
    """Compute SST-2 accuracy for one evaluation pass.

    The previous version re-loaded the GLUE metric through the deprecated
    ``load_metric`` helper on every call, i.e. once per evaluation. For SST-2
    that metric is plain accuracy, so it is computed directly with NumPy and
    the returned dictionary keeps the same ``"accuracy"`` key.

    Args:
        eval_preds: Pair ``(logits, labels)``; ``logits`` holds one score per
            class along the last axis and ``labels`` the reference class ids.

    Returns:
        dict: ``{"accuracy": <fraction of predictions equal to the labels>}``
        with the value as a Python ``float``.
    """
    logits, labels = eval_preds
    # The predicted class is the index of the highest score on the last axis.
    predictions = np.argmax(np.asarray(logits), axis=-1)
    references = np.asarray(labels)
    return {"accuracy": float((predictions == references).mean())}

In [None]:
from transformers import Trainer


In [None]:
# Wire everything into a Trainer: the model and its training configuration,
# the tokenized train / validation splits, the padding collator, the
# tokenizer and the metric function used at evaluation time.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=sst2_tkn["train"],
    eval_dataset=sst2_tkn["validation"],
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop; with the per-epoch evaluation configured above,
# the progress table reports training loss, validation loss and accuracy for
# each epoch.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.2145,0.276529,0.912844
2,0.1136,0.327405,0.90367
3,0.0715,0.297265,0.909404
4,0.0445,0.32775,0.912844


  metric = load_metric("glue", "sst2")
