Step 1: Install/Import Necessary Libraries

In [5]:
!pip install -U transformers datasets scikit-learn




In [6]:
import numpy as np
import torch
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments, DataCollatorWithPadding)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


Load the IMDb Dataset
Here we load the IMDb dataset, a binary sentiment dataset with labels 0 (negative) and 1 (positive).

In [7]:
# Fetch the IMDb movie-review corpus from the Hugging Face Hub and show its splits
dataset = load_dataset(path="imdb")
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


Preprocess and Tokenize the Dataset

We use the distilbert-base-uncased tokenizer and map it over the dataset.

In [8]:
# Tokenizer that matches the DistilBERT checkpoint fine-tuned in this notebook
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(example):
    """Tokenize the ``"text"`` field of ``example`` with truncation enabled.

    This function is handed to ``dataset.map(..., batched=True)``, so
    ``example`` is expected to be a batch whose ``"text"`` entry holds several
    reviews at once. No padding is requested here; padding is left to the
    data collator at batch-construction time.

    Returns:
        The tokenizer's output for the given text(s).
    """
    reviews = example["text"]
    return tokenizer(reviews, truncation=True)


# Run the tokenizer over every split of the dataset, one batch per call
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Create Training, Validation, and Test Splits

The IMDb dataset comes with a predefined test split. We'll hold out 20% of the training data to serve as a validation set.

In [9]:
# Split the original train set into training and validation sets (80% train, 20% validation).
# A fixed seed makes the shuffle deterministic, so the validation set (and every
# metric computed on it) is reproducible across kernel restarts.
split_datasets = tokenized_datasets["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split_datasets["train"]
val_dataset = split_datasets["test"]   # the "test" half of the split is our validation set
test_dataset = tokenized_datasets["test"]  # IMDb's predefined test split, untouched

print(f"Train dataset: {len(train_dataset)} samples")
print(f"Validation dataset: {len(val_dataset)} samples")
print(f"Test dataset: {len(test_dataset)} samples")

Train dataset: 20000 samples
Validation dataset: 5000 samples
Test dataset: 25000 samples


Prepare a Data Collator

We use a data collator that will dynamically pad our inputs to the maximum length in each batch

In [10]:
# Collator that pads the inputs of each batch to that batch's longest sequence,
# using the same tokenizer that produced the inputs.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Load a Pre-trained DistilBERT Model for Sequence Classification

In [11]:
# Load the pre-trained DistilBERT checkpoint with a 2-class classification head
# (IMDb labels are 0 and 1). The classifier weights are newly initialized, so the
# model must be fine-tuned before its predictions are meaningful.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Define the Evaluation Metrics

We use accuracy and weighted-average F1, precision, and recall as our metrics.

In [12]:
def compute_metrics(eval_pred):
    """Turn raw evaluation outputs into a dictionary of classification metrics.

    Args:
        eval_pred: A ``(logits, labels)`` pair as supplied by the Trainer
            during evaluation.

    Returns:
        dict with keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``; the last three use weighted averaging over classes.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predictions = np.argmax(logits, axis=-1)

    weighted_scorers = {
        "f1": f1_score,
        "precision": precision_score,
        "recall": recall_score,
    }
    metrics = {"accuracy": accuracy_score(labels, predictions)}
    for metric_name, scorer in weighted_scorers.items():
        metrics[metric_name] = scorer(labels, predictions, average='weighted')
    return metrics

Set Up Training Arguments and Initialize the Trainer

We configure the training arguments (including learning rate, batch sizes, number of epochs, etc.) and then initialize the Trainer.

In [13]:
# Hyperparameters and checkpointing policy for the fine-tuning run.
# Note: early stopping is not a TrainingArguments option; to use it when training
# for more epochs, pass an EarlyStoppingCallback via Trainer(callbacks=[...]).
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",               # Evaluate at the end of each epoch (replaces deprecated `evaluation_strategy`)
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,         # Reload the best checkpoint into the model when training finishes
    metric_for_best_model="accuracy",    # "Best" = highest validation accuracy
    save_strategy="epoch",               # Must match eval_strategy for load_best_model_at_end
    save_total_limit=2,                  # Cap the number of checkpoints kept on disk
)

# Wire the model, data, collator and metrics into a Trainer.
# `processing_class` replaces the deprecated `tokenizer` argument; passing the
# tokenizer here also makes the Trainer save it alongside the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,            # used for the per-epoch evaluation
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


Fine-tune the Model

Run the training loop

In [14]:
# Run fine-tuning as configured in `training_args`; the returned TrainOutput
# (global step, training loss, runtime metrics) is displayed as the cell result.
trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mniklasroeker[0m ([33mniklasroeker-personal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2341,0.203139,0.9242,0.9242,0.9242,0.9242


TrainOutput(global_step=1250, training_loss=0.2624372383117676, metrics={'train_runtime': 2085.0757, 'train_samples_per_second': 9.592, 'train_steps_per_second': 0.599, 'total_flos': 2623392642195840.0, 'train_loss': 0.2624372383117676, 'epoch': 1.0})

Evaluate the Model on the Test Set

After training, evaluate the performance on the test set

In [15]:
# Score the fine-tuned model on IMDb's predefined test split and report the metrics
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test set results:", test_results)


Test set results: {'eval_loss': 0.1996726244688034, 'eval_accuracy': 0.92584, 'eval_f1': 0.9258399957283838, 'eval_precision': 0.9258400981135586, 'eval_recall': 0.92584, 'eval_runtime': 367.6458, 'eval_samples_per_second': 68.0, 'eval_steps_per_second': 4.251, 'epoch': 1.0}


Save the Fine-tuned Model

In [16]:
# Write the fine-tuned model to ./sentiment_model so it can be reloaded later
# with `from_pretrained("./sentiment_model")`.
trainer.save_model("./sentiment_model")


In [17]:
!pip install transformers



In [22]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; authentication is required before the
# model can be pushed to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Reload the fine-tuned artifacts written by trainer.save_model("./sentiment_model").
# The Trainer was given the tokenizer, so its files live in the same directory.
model = AutoModelForSequenceClassification.from_pretrained("./sentiment_model")
hub_tokenizer = AutoTokenizer.from_pretrained("./sentiment_model")

# Push the tokenizer as well as the model: without the tokenizer files the Hub
# repo cannot be loaded on its own (e.g. via `pipeline(...)`).
hub_tokenizer.push_to_hub("Niklas21/sentiment_model")

# Push the model to the Hugging Face Model Hub (last, so its CommitInfo is the cell output)
model.push_to_hub("Niklas21/sentiment_model")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Niklas21/sentiment_model/commit/5d355c411d90a92f6748e86e125e6380048dc843', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='5d355c411d90a92f6748e86e125e6380048dc843', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Niklas21/sentiment_model', endpoint='https://huggingface.co', repo_type='model', repo_id='Niklas21/sentiment_model'), pr_revision=None, pr_num=None)