In [1]:
# Mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the project's notebooks folder; later cells
# use relative paths (e.g. "tokenized_imdb", "./results_optimized_lr_...").
%cd /content/drive/MyDrive/Lighthouse Labs/LLM-Project/notebooks

Mounted at /content/drive
/content/drive/MyDrive/Lighthouse Labs/LLM-Project/notebooks


In [2]:
# Install necessary libraries
!pip install datasets evaluate transformers

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31

In [3]:
import os

import numpy as np
import evaluate
from datasets import load_from_disk
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)

In [4]:
# Turn off Weights & Biases reporting and configure the CUDA caching allocator
# (expandable segments) through environment variables.
os.environ.update(
    {
        "WANDB_DISABLED": "true",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    }
)


In [5]:
# Load the tokenized dataset from disk, the DistilBERT tokenizer, and the two
# evaluation metrics (accuracy and F1) used by compute_metrics.
tokenized_ds = load_from_disk("tokenized_imdb")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
accuracy, f1 = (evaluate.load(metric_name) for metric_name in ("accuracy", "f1"))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

In [6]:
# Define compute_metrics
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for a batch of evaluation predictions.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class for each
            example is the argmax of ``logits`` along the last axis.

    Returns:
        A single dict merging the result of ``accuracy.compute(...)`` with the
        result of ``f1.compute(..., average='weighted')``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    # Each entry pairs a loaded metric with the extra keyword arguments it needs.
    metric_calls = (
        (accuracy, {}),
        (f1, {"average": "weighted"}),
    )

    merged = {}
    for metric, extra_kwargs in metric_calls:
        merged.update(
            metric.compute(predictions=predicted_classes, references=labels, **extra_kwargs)
        )
    return merged

In [7]:
# Use a small fixed-size subset (800 training / 300 evaluation examples) so the
# learning-rate sweep stays cheap.
N_TRAIN_SAMPLES = 800
N_EVAL_SAMPLES = 300
SUBSET_SEED = 42  # fixed so the same examples are drawn on every run

train_split = tokenized_ds["train"]
eval_split = tokenized_ds["test"]

# Clamp the requested sizes to the split lengths so select() does not raise
# when a split holds fewer examples than requested.
small_train_ds = train_split.shuffle(seed=SUBSET_SEED).select(
    range(min(N_TRAIN_SAMPLES, len(train_split)))
)
small_eval_ds = eval_split.shuffle(seed=SUBSET_SEED).select(
    range(min(N_EVAL_SAMPLES, len(eval_split)))
)

In [9]:
# Try multiple learning rates and keep the best-scoring model.
#
# For each learning rate a fresh DistilBERT classifier is fine-tuned for one
# epoch on the small training subset and evaluated on the small eval subset.
# `results` maps learning rate -> evaluation metrics dict. After the loop,
# `model` is rebound to the model with the highest eval accuracy, so later
# cells that use `model` operate on the best run rather than on whichever
# learning rate happened to be trained last.
SWEEP_SEED = 42
learning_rates = [2e-5, 5e-5, 1e-4]
results = {}
best_lr, best_model, best_accuracy = None, None, float("-inf")

for lr in learning_rates:
    print(f"\nTraining with learning rate: {lr}")

    # Seed before building the model: the classification head is newly
    # initialised on every from_pretrained() call, so without this each
    # learning rate would start from a different random head.
    set_seed(SWEEP_SEED)
    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

    training_args = TrainingArguments(
        output_dir=f"./results_optimized_lr_{lr}",
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=lr,
        num_train_epochs=1,
        evaluation_strategy="epoch",
        save_strategy="no",  # no checkpoints are written during the sweep
        logging_dir="./logs_optimized",
        logging_steps=100,
        fp16=True,
        load_best_model_at_end=False,
        seed=SWEEP_SEED,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=small_train_ds,
        eval_dataset=small_eval_ds,
        compute_metrics=compute_metrics
    )

    trainer.train()
    eval_result = trainer.evaluate()
    results[lr] = eval_result
    print(f"Results for lr={lr}: {eval_result}")

    # Remember the best run so far (strict '>' keeps the earlier run on ties).
    if eval_result["eval_accuracy"] > best_accuracy:
        best_lr, best_model, best_accuracy = lr, model, eval_result["eval_accuracy"]

# Rebind `model` to the winner; guarded so an empty sweep leaves `model` untouched.
if best_model is not None:
    model = best_model
    print(f"\nBest learning rate: {best_lr} (eval_accuracy={best_accuracy:.4f})")


Training with learning rate: 2e-05


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.424,0.424337,0.823333,0.823316


Results for lr=2e-05: {'eval_loss': 0.4243372082710266, 'eval_accuracy': 0.8233333333333334, 'eval_f1': 0.8233156648998233, 'eval_runtime': 280.0072, 'eval_samples_per_second': 1.071, 'eval_steps_per_second': 0.268, 'epoch': 1.0}

Training with learning rate: 5e-05


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.524,0.509094,0.82,0.819712


Results for lr=5e-05: {'eval_loss': 0.5090937614440918, 'eval_accuracy': 0.82, 'eval_f1': 0.8197115384615384, 'eval_runtime': 280.0066, 'eval_samples_per_second': 1.071, 'eval_steps_per_second': 0.268, 'epoch': 1.0}

Training with learning rate: 0.0001


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7002,0.693122,0.5,0.333333


Results for lr=0.0001: {'eval_loss': 0.6931217312812805, 'eval_accuracy': 0.5, 'eval_f1': 0.3333333333333333, 'eval_runtime': 282.5261, 'eval_samples_per_second': 1.062, 'eval_steps_per_second': 0.265, 'epoch': 1.0}


In [11]:
# Print one line per learning rate with its evaluation accuracy and F1.
summary_rows = [
    f"LR: {rate} -> Accuracy: {scores['eval_accuracy']:.4f}, F1: {scores['eval_f1']:.4f}"
    for rate, scores in results.items()
]
print("\n".join(["\nSummary of optimization results:", *summary_rows]))



Summary of optimization results:
LR: 2e-05 -> Accuracy: 0.8233, F1: 0.8233
LR: 5e-05 -> Accuracy: 0.8200, F1: 0.8197
LR: 0.0001 -> Accuracy: 0.5000, F1: 0.3333


# Hyperparameter Optimization Summary
To optimize the performance of our DistilBERT model, we experimented with three learning rates (2e-5, 5e-5, and 1e-4) using a small fixed-size subset of the IMDb dataset (800 training examples and 300 evaluation examples). We trained each configuration for 1 epoch to reduce compute time.

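The best performance was achieved with a learning rate of 2e-5 (accuracy 0.8233, F1 0.8233), which we selected for the final model; 5e-5 was close behind (accuracy 0.8200, F1 0.8197). The highest learning rate, 1e-4, resulted in unstable training and poor performance: accuracy dropped to 0.5000 with a weighted F1 of 0.3333 and an evaluation loss of about 0.693 (≈ ln 2), which is consistent with the model predicting a single class for every example. Because each configuration was evaluated on only 300 examples after a single epoch, the small gap between 2e-5 and 5e-5 should be treated as tentative. This simple hyperparameter sweep helped fine-tune the training process and identify a learning rate that trains stably.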

In [12]:
# Export the fine-tuned model together with its tokenizer so the directory can
# be loaded on its own with from_pretrained().
FINAL_MODEL_DIR = "/content/drive/MyDrive/Lighthouse Labs/LLM-Project/notebooks/final_distilbert_model"

# NOTE(review): `model` is whatever that name is bound to once the
# learning-rate sweep cell has finished — confirm it is the run you intend to
# ship before relying on this export.
model.save_pretrained(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)