# Fine-tuning BERT text classification

References : 

- https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb#scrollTo=chq_3nUz73ib

- https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb


In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
!pip install -q transformers datasets
!pip install accelerate -U

## Load dataset

In [3]:
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split

# Read train and test CSV files
train_csv_path = "/content/AUG.csv"
validation_csv_path = "/content/eval_data.csv"

# Function to read CSV into pandas DataFrame
def read_csv_to_dict(csv_path):
    import pandas as pd
    df = pd.read_csv(csv_path)
    return df.to_dict(orient='list')

# Read train and test data from CSV
train_data = read_csv_to_dict(train_csv_path)
validation_data = read_csv_to_dict(validation_csv_path)

# Create DatasetDict
dataset = DatasetDict({
    "train": Dataset.from_dict({"text": train_data["text"], "label": train_data["label"]}),
    "validation": Dataset.from_dict({"text": validation_data["text"], "label": validation_data["label"]}),
})


In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 19383
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 60
    })
})

In [5]:
example = dataset['train'][0]
example

{'text': 'in the same manner as the nonlinear svm for classification approach a nonlinear mapping can be used to map the data into a high dimensional feature space where linear regression is performed',
 'label': 'prediction'}

In [6]:
import numpy as np
labels = train_data["label"]
labels = np.unique(labels).tolist()
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['clustering', 'pattern_mining', 'prediction']

## Preprocess data

In [7]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def preprocess_data(batch):
    texts = batch['text']
    label = batch['label']

    # encode texts
    encodings = tokenizer(texts, padding="max_length", truncation=True, max_length=128)

    # create list for label
    label_idx = labels.index(label.strip().lower())  # Assuming label is a single string
    label_list = [0.0] * len(labels)
    label_list[label_idx] = 1.0

    # Assign label list to each example
    encodings["labels"] = label_list


    return encodings




In [None]:
encoded_dataset = {}
for split in dataset.keys():
    encoded_dataset[split] = dataset[split].map(preprocess_data)

# Remove "text" and "label" columns
for split in encoded_dataset.keys():
    encoded_dataset[split] = encoded_dataset[split].remove_columns(["text", "label"])

print(encoded_dataset)

In [11]:
example = encoded_dataset['train'][0]
print(example)

{'input_ids': [101, 1999, 1996, 2168, 5450, 2004, 1996, 27400, 17917, 2213, 2005, 5579, 3921, 1037, 27400, 12375, 2064, 2022, 2109, 2000, 4949, 1996, 2951, 2046, 1037, 2152, 8789, 3444, 2686, 2073, 7399, 26237, 2003, 2864, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [12]:
tokenizer.decode(example['input_ids'])

'[CLS] in the same manner as the nonlinear svm for classification approach a nonlinear mapping can be used to map the data into a high dimensional feature space where linear regression is performed [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [13]:
example['labels']

[0.0, 0.0, 1.0]

In [14]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['prediction']

In [9]:
encoded_dataset = DatasetDict({key: encoded_dataset[key] for key in encoded_dataset})


In [10]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 19383
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 60
    })
})

In [11]:
encoded_dataset.set_format("torch")

## Define model

In [12]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train the model!

In [13]:
batch_size = 3
metric_name = "f1"

In [14]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.5,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=True,
)

In [15]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [16]:
encoded_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [23]:
encoded_dataset['train']['input_ids'][0]

tensor([  101,  1999,  1996,  2168,  5450,  2004,  1996, 27400, 17917,  2213,
         2005,  5579,  3921,  1037, 27400, 12375,  2064,  2022,  2109,  2000,
         4949,  1996,  2951,  2046,  1037,  2152,  8789,  3444,  2686,  2073,
         7399, 26237,  2003,  2864,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [17]:
#forward pass
outputs = model(input_ids=encoded_dataset['train']['input_ids'][0].unsqueeze(0), labels=encoded_dataset['train'][0]['labels'].unsqueeze(0))
outputs

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


SequenceClassifierOutput(loss=tensor(0.8199, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.0741,  0.3057, -0.3291]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

Let's start training!

In [18]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [19]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.0805,0.457839,0.866667,0.9,0.866667
2,0.0545,0.720657,0.85,0.8875,0.85
3,0.0033,0.991023,0.739496,0.804167,0.733333
4,0.0137,1.123375,0.773109,0.829167,0.766667
5,0.0028,1.195982,0.773109,0.829167,0.766667


TrainOutput(global_step=32305, training_loss=0.03390735011640633, metrics={'train_runtime': 2545.8498, 'train_samples_per_second': 38.068, 'train_steps_per_second': 12.689, 'total_flos': 6374909219777280.0, 'train_loss': 0.03390735011640633, 'epoch': 5.0})

## Evaluate

After training, we evaluate our model on the validation set.

In [20]:
trainer.evaluate()

{'eval_loss': 0.45783936977386475,
 'eval_f1': 0.8666666666666667,
 'eval_roc_auc': 0.9,
 'eval_accuracy': 0.8666666666666667,
 'eval_runtime': 0.2648,
 'eval_samples_per_second': 226.576,
 'eval_steps_per_second': 75.525,
 'epoch': 5.0}

In [None]:
trainer.push_to_hub()

## Inference


In [None]:
dataset['validation']

Dataset({
    features: ['text', 'label'],
    num_rows: 60
})

In [22]:
predicted_labels = []
truth =[]
for instance in dataset['validation']:
    text = instance['text']
    true_label = instance['label']
    truth.append(true_label)
    encoding = tokenizer(text, return_tensors="pt")
    encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

    outputs = trainer.model(**encoding)
    logits = outputs.logits
    # apply sigmoid + threshold
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze().cpu())
    predictions = np.zeros(probs.shape)
    predictions[np.where(probs >= 0.5)] = 1

    if np.any(predictions == 1):
        # Get the index of the predicted label
        predicted_label_index = np.argmax(predictions)
        # Map index to label using id2label
        predicted_label = id2label[predicted_label_index]
    else:
        predicted_label = 'unknown'

    predicted_labels.append(predicted_label)






In [39]:
from sklearn.metrics import classification_report
from datetime import datetime
import os


timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

dir = f"evaluation-results/bert"
os.makedirs(dir, exist_ok=True)  # Ensure directory creation or skip if it already exists

str_1 = "# Evaluation of {} \n".format(timestamp)
str_2 = "## Classification report: \n"
class_report = classification_report(y_true=truth, y_pred=predicted_labels)

file_name = os.path.join(dir, timestamp + "_" + "eval.md")

file = open(file_name, "w")
file.write(str_1 + str_2 + class_report)
file.close()