<a href="https://colab.research.google.com/github/nik0laygavrIlov/DA_ML_repo/blob/main/BERT_fine_tuning(sms_spam_collection).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning модели BERT на датасете sms-spam-collection (анализ настроений)

Загружаем датасет

In [None]:
!pip install transformers
!pip install datasets



In [None]:
import pandas as pd
import numpy as np

In [None]:
from datasets import load_dataset

dataset = load_dataset("codesignal/sms-spam-collection")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

categories = dataset['train']['label']

encoded_categories = label_encoder.fit_transform(categories)

def add_encoded_category(example, idx):
    example['label'] = encoded_categories[idx]
    return example

# Применяем функцию к набору данных с использованием метода map с индексами
dataset = dataset['train'].map(add_encoded_category, with_indices=True)


Map:   0%|          | 0/5572 [00:00<?, ? examples/s]

In [None]:
dataset

Dataset({
    features: ['label', 'message'],
    num_rows: 5572
})

In [None]:
# Activate GPU for faster training by clicking on 'Runtime' > 'Change runtime type' and then selecting GPU as the Hardware accelerator
# Then check if GPU is available
import torch
torch.cuda.is_available()

True

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")



In [None]:
def tokenize_function(examples):
    return tokenizer(examples['message'], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/5572 [00:00<?, ? examples/s]

In [None]:
# Define the evaluation metrics
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def compute_metrics(eval_pred):
    # Извлекаем логиты и метки из eval_pred
    logits, labels = eval_pred

    # Преобразуем логиты в предсказания, выбирая наибольший логит по каждому примеру
    predictions = np.argmax(logits, axis=-1)

    # Вычисляем точность
    accuracy = accuracy_score(labels, predictions)

    # Вычисляем F1 с учетом многоклассовой задачи (изменяем параметр average)
    f1 = f1_score(labels, predictions)

    roc_auc = roc_auc_score(labels, predictions)

    return {"accuracy": accuracy, "f1": f1, "ROC-AUC": roc_auc}

In [None]:
# Use data_collector to convert our samples to PyTorch tensors and concatenate them with the correct amount of padding
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Define DistilBERT as our base model:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Поскольку датасет несбалансирован используем k-fold валидацию

In [None]:
# Define a new Trainer with all the objects we constructed so far
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import KFold

metrics = []

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=4e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="epoch",
)

kf = KFold(n_splits=5)
for fold, (train_index, test_index) in enumerate(kf.split(tokenized_datasets)):
    train_dataset = tokenized_datasets.select(train_index)
    eval_dataset = tokenized_datasets.select(test_index)

    trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    )

    trainer.train()

    metric = trainer.evaluate()
    metrics.append(metric)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mnikolaygavrilov6454[0m ([33mnikolaygavrilov6454-bmstu[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


Step,Training Loss


Step,Training Loss


Step,Training Loss


Step,Training Loss


In [None]:
metrics

[{'eval_loss': 0.03970605134963989,
  'eval_accuracy': 0.9901345291479821,
  'eval_f1': 0.9671641791044776,
  'eval_ROC-AUC': 0.9795029416201539,
  'eval_runtime': 18.6556,
  'eval_samples_per_second': 59.767,
  'eval_steps_per_second': 1.876,
  'epoch': 1.0},
 {'eval_loss': 0.018185630440711975,
  'eval_accuracy': 0.9955156950672646,
  'eval_f1': 0.9823321554770318,
  'eval_ROC-AUC': 0.991367760350678,
  'eval_runtime': 20.2455,
  'eval_samples_per_second': 55.074,
  'eval_steps_per_second': 1.729,
  'epoch': 1.0},
 {'eval_loss': 0.027253124862909317,
  'eval_accuracy': 0.9937163375224417,
  'eval_f1': 0.9739776951672863,
  'eval_ROC-AUC': 0.9775904190543074,
  'eval_runtime': 19.3819,
  'eval_samples_per_second': 57.476,
  'eval_steps_per_second': 1.806,
  'epoch': 1.0},
 {'eval_loss': 0.011745789088308811,
  'eval_accuracy': 0.9973070017953322,
  'eval_f1': 0.9902912621359223,
  'eval_ROC-AUC': 0.9903846153846154,
  'eval_runtime': 19.3183,
  'eval_samples_per_second': 57.666,
  'ev

In [None]:
print(f'Fold\taccuracy\tf1\t\tROC-AUC')
accuracy_list = []
f1_list = []
ROC_AUC_list = []
for id, metric in enumerate(metrics):
  accuracy_list.append(metric["eval_accuracy"])
  f1_list.append(metric["eval_f1"])
  ROC_AUC_list.append(metric["eval_ROC-AUC"])
  print(f'{id}\t{metric["eval_accuracy"]:2f}\t{metric["eval_f1"]:2f}\t{metric["eval_ROC-AUC"]:2f}')

Fold	accuracy	f1		ROC-AUC
0	0.990135	0.967164	0.979503
1	0.995516	0.982332	0.991368
2	0.993716	0.973978	0.977590
3	0.997307	0.990291	0.990385
4	1.000000	1.000000	1.000000


In [None]:
np.array(accuracy_list).mean()

0.995334712706604

In [None]:
np.array(f1_list).mean()

0.9827530583769437

In [None]:
np.array(ROC_AUC_list).mean()

0.987769147281951

Видим, что точность на всех фолдах получилась близкая к 1.