<a href="https://colab.research.google.com/github/patelvishwa112/Machine-Learning-Project/blob/master/Fine_tune_SmolLM2_360M_Instruct_on_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install transformers datasets evaluate accelerate pandas numpy

In [None]:
# Full Python script to fine-tune the HuggingFaceTB/SmolLM2-360M-Instruct model
# for text classification using the Hugging Face Transformers and Datasets libraries.

# --- 1. Import Libraries ---
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
import evaluate # Using the 'evaluate' library (successor to datasets.load_metric)

print("Libraries imported.")

# --- 2. Configuration ---
# What gets fine-tuned, and on which data.
MODEL_CHECKPOINT = "HuggingFaceTB/SmolLM2-360M-Instruct"
# Example dataset: IMDB movie reviews.
DATASET_NAME = "stanfordnlp/imdb"

# Where artifacts are written.
# Results and checkpoints:
OUTPUT_DIR = "./smollm2-imdb-finetuned"
# Logs:
LOGGING_DIR = "./smollm2-imdb-logs"

# Training hyperparameters -- tune these for your hardware and dataset.
NUM_EPOCHS = 3
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
# Lower the training batch size first if you hit CUDA Out-of-Memory errors.
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 16

print(f"Configuration set: Model={MODEL_CHECKPOINT}, Dataset={DATASET_NAME}")

# --- 3. Load and Prepare Dataset ---

# Fetch every split published under DATASET_NAME (IMDB by default).
print(f"Loading dataset '{DATASET_NAME}'...")
raw_datasets = load_dataset(DATASET_NAME)
print("Dataset loaded:", raw_datasets)

# Expected schema: a 'text' column and an integer 'label' column.
# For IMDB: label 0 = negative, label 1 = positive.
# Adjust the preprocessing function if your dataset uses other column names or label types.

# When the dataset ships without a validation split, hold out 10% of the
# training data (fixed seed, so the split is reproducible).
if "validation" not in raw_datasets:
    print("Creating validation split...")
    train_val_split = raw_datasets["train"].train_test_split(test_size=0.1, seed=42)
    raw_datasets["train"], raw_datasets["validation"] = (
        train_val_split["train"],
        train_val_split["test"],
    )
    print("Validation split created.")

# --- 4. Load Tokenizer ---
print(f"Loading tokenizer for '{MODEL_CHECKPOINT}'...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Decoder-only checkpoints often ship without a padding token; batching needs
# one, so fall back to the end-of-sequence token in that case.
pad_token_missing = tokenizer.pad_token is None
if pad_token_missing:
    tokenizer.pad_token = tokenizer.eos_token
    print(f"Tokenizer pad token set to eos_token ({tokenizer.eos_token})")

# --- 5. Preprocess Data ---
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        examples: Batch mapping whose ``"text"`` entry holds the raw strings.

    Returns:
        The output of the module-level ``tokenizer`` for those strings, with
        truncation enabled and ``max_length=512`` (adjust if needed).
    """
    # Labels are not touched here: IMDB already stores them as integers (0 or 1)
    # in a column named 'label', which is what the Trainer expects.
    # If your dataset has string labels, map them to integers before returning, e.g.
    # tokenized["label"] = [label_map[lbl] for lbl in examples["label_text_column"]]
    return tokenizer(examples["text"], truncation=True, max_length=512)

print("Preprocessing dataset...")
# Tokenize only the splits this script consumes (train / validation / test).
# IMDB also ships an "unsupervised" split that is never used for training or
# evaluation below, so tokenizing it is wasted work. `raw_datasets` itself is
# left untouched; a filtered copy of the same dict type is mapped instead.
splits_to_tokenize = type(raw_datasets)(
    {name: split for name, split in raw_datasets.items() if name != "unsupervised"}
)
tokenized_datasets = splits_to_tokenize.map(preprocess_function, batched=True)

# Drop every original column the model cannot consume (for IMDB: the raw 'text'),
# keeping 'input_ids', 'attention_mask', and 'label'. The guard is "is there
# anything to remove", so datasets whose leftover columns are named differently
# are cleaned up too instead of reaching the data collator.
columns_to_remove = [
    col
    for col in raw_datasets["train"].column_names
    if col not in ("input_ids", "attention_mask", "label")
]
if columns_to_remove:
    tokenized_datasets = tokenized_datasets.remove_columns(columns_to_remove)

print("Dataset preprocessing complete.")
print("Tokenized dataset features:", tokenized_datasets['train'].features)

# --- 6. Data Collator ---
# Dynamic padding: each batch is padded only up to its own longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print("Data collator initialized.")

# --- 7. Load Model ---
# The number of output classes is read from the dataset's label feature.
label_feature = raw_datasets["train"].features["label"]
num_labels = label_feature.num_classes
print(f"Number of labels in dataset: {num_labels}")

print(f"Loading model '{MODEL_CHECKPOINT}' for sequence classification...")
model_kwargs = {
    "num_labels": num_labels,
    # Tolerate checkpoint weights whose shape differs from the model being built.
    # The classification head itself is absent from the base checkpoint and is
    # freshly initialized either way.
    "ignore_mismatched_sizes": True,
}
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, **model_kwargs)

# Mirror the tokenizer's padding token into the model config so the model knows
# which token id is padding when it receives padded batches.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id
    print(f"Model pad_token_id set to: {model.config.pad_token_id}")

# Optional: Freeze base model layers (uncomment the following lines to freeze)
# print("Freezing base model layers...")
# for param in model.base_model.parameters(): # Adjust 'base_model' if model structure differs
#     param.requires_grad = False
# print("Base model layers frozen.")

print("Model loaded successfully.")

# --- 8. Define Metrics ---
# Accuracy comes from the 'evaluate' library.
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of the predictions in ``eval_pred``.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Whatever ``accuracy_metric.compute`` returns when the arg-max over the
        last logits axis is compared against ``labels``.
    """
    scores, gold_labels = eval_pred
    return accuracy_metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=gold_labels,
    )


print("Metrics function defined.")

# --- 9. Define Training Arguments ---
print("Defining training arguments...")

# Choose the mixed-precision mode.
# The recorded fp16 run reported `eval_loss = nan` while accuracy stayed sane,
# which looks like float16 overflow (NOTE(review): presumed cause -- confirm).
# bfloat16 keeps float32's exponent range, so prefer it on GPUs with native
# support (compute capability >= 8.0) and fall back to fp16 on older GPUs.
# On CPU both flags stay off, as before.
cuda_available = torch.cuda.is_available()
use_bf16 = cuda_available and torch.cuda.get_device_capability()[0] >= 8
use_fp16 = cuda_available and not use_bf16

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,                 # Directory to save model checkpoints and results
    logging_dir=LOGGING_DIR,               # Directory for TensorBoard logs
    num_train_epochs=NUM_EPOCHS,           # Total number of training epochs
    per_device_train_batch_size=TRAIN_BATCH_SIZE,  # Batch size per device during training
    per_device_eval_batch_size=EVAL_BATCH_SIZE,    # Batch size for evaluation
    learning_rate=LEARNING_RATE,           # Initial learning rate
    weight_decay=WEIGHT_DECAY,             # Strength of weight decay regularization
    # NOTE(review): newer transformers releases name this `eval_strategy` --
    # confirm against the installed version before renaming.
    evaluation_strategy="epoch",           # Evaluate performance at the end of each epoch
    save_strategy="epoch",                 # Save model checkpoint at the end of each epoch
    logging_strategy="epoch",              # Log metrics at the end of each epoch
    load_best_model_at_end=True,           # Load the best model checkpoint found during training
    metric_for_best_model="accuracy",      # Metric used to identify the best model
    greater_is_better=True,                # Higher accuracy is better
    push_to_hub=False,                     # Set to True to push model to Hugging Face Hub (requires login)
    fp16=use_fp16,                         # float16 mixed precision on GPUs without native bf16
    bf16=use_bf16,                         # bfloat16 mixed precision where the GPU supports it
    report_to="tensorboard",               # Log results to TensorBoard
)

# --- 10. Initialize Trainer ---
print("Initializing Trainer...")
# NOTE(review): the recorded run emitted a warning pointing at this call;
# presumably about the `tokenizer=` argument, which newer transformers releases
# replace with `processing_class=` -- confirm against the installed version.
trainer = Trainer(
    model=model,                           # The instantiated Transformers model to be trained
    args=training_args,                    # Training arguments defined above
    train_dataset=tokenized_datasets["train"], # Training dataset
    eval_dataset=tokenized_datasets["validation"], # Validation dataset
    tokenizer=tokenizer,                   # Tokenizer used for preprocessing
    data_collator=data_collator,           # Data collator for dynamic padding
    compute_metrics=compute_metrics,       # Function to compute evaluation metrics
)

# --- 11. Start Training ---
print("Starting training...")
try:
    train_result = trainer.train()
except Exception as e:
    # Report the failure (e.g. CUDA Out-of-Memory), then stop: continuing would
    # evaluate and save a model that was never (fully) fine-tuned.
    print(f"An error occurred during training: {e}")
    raise
print("Training finished.")

# Persisting metrics is best-effort: a failure here must not prevent the
# trained model from being evaluated and saved further down.
try:
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()  # Writes the Trainer's state to the output directory

    print("Training metrics saved.")
except Exception as e:
    print(f"An error occurred while saving training metrics: {e}")

# --- 12. Evaluate on Test Set ---
if "test" not in tokenized_datasets:
    print("No test set found in the dataset. Skipping final evaluation.")
else:
    print("Evaluating on the test set...")
    try:
        eval_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
        print("Test set evaluation results:", eval_results)
        # Log first, then persist, both under the "eval" name.
        trainer.log_metrics("eval", eval_results)
        trainer.save_metrics("eval", eval_results)
    except Exception as e:
        # Evaluation is best-effort; report the problem and carry on to saving.
        print(f"An error occurred during evaluation: {e}")

# --- 13. Save Final Model and Tokenizer ---
print(f"Saving final model and tokenizer to {OUTPUT_DIR}...")
# Model first, tokenizer second, both into the same directory.
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(OUTPUT_DIR)
print("Model and tokenizer saved.")

# --- 14. Optional: Inference Example ---
print("\n--- Inference Example ---")
try:
    # Reload the fine-tuned model and tokenizer from disk through a pipeline.
    from transformers import pipeline

    # The pipeline reads both model and tokenizer from the directory saved above.
    fine_tuned_model_path = OUTPUT_DIR

    # Device index 0 selects the first GPU; -1 keeps the pipeline on the CPU.
    if torch.cuda.is_available():
        pipeline_device = 0
    else:
        pipeline_device = -1

    classifier_pipeline = pipeline(
        "text-classification",
        model=fine_tuned_model_path,
        tokenizer=fine_tuned_model_path,
        device=pipeline_device,
    )

    # Sample reviews: one positive, one negative, one lukewarm.
    test_texts = [
        "This movie was absolutely fantastic, heartwarming, and beautifully shot!",
        "What a complete waste of time, the plot was predictable and the acting was terrible.",
        "It was an okay film, nothing special but not bad either.",
    ]

    predictions = classifier_pipeline(test_texts)

    # The pipeline reports generic labels (e.g. 'LABEL_1') plus a score.
    # IMDB-specific mapping: 'LABEL_1' is positive, anything else counts as negative.
    readable_labels = {"LABEL_1": "POSITIVE"}

    print("Predictions on example texts:")
    for text, prediction in zip(test_texts, predictions):
        label_name = readable_labels.get(prediction['label'], "NEGATIVE")
        print(f"Text: '{text}'")
        print(f"Predicted Label: {prediction['label']} ({label_name}), Score: {prediction['score']:.4f}\n")

except Exception as e:
    # The example is optional, so a failure is reported rather than raised.
    print(f"Could not run inference example: {e}")

print("Script finished.")

Libraries imported.
Configuration set: Model=HuggingFaceTB/SmolLM2-360M-Instruct, Dataset=stanfordnlp/imdb
Loading dataset 'stanfordnlp/imdb'...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset loaded: DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})
Creating validation split...
Validation split created.
Loading tokenizer for 'HuggingFaceTB/SmolLM2-360M-Instruct'...


tokenizer_config.json:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

Preprocessing dataset...


Map:   0%|          | 0/22500 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Dataset preprocessing complete.
Tokenized dataset features: {'label': ClassLabel(names=['neg', 'pos'], id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}
Data collator initialized.
Number of labels in dataset: 2
Loading model 'HuggingFaceTB/SmolLM2-360M-Instruct' for sequence classification...


config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/724M [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM2-360M-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Metrics function defined.
Defining training arguments...
Initializing Trainer...


  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2113,0.212113,0.9472
2,0.0765,,0.952
3,0.0218,,0.9556


Training finished.
***** train metrics *****
  epoch                    =        3.0
  total_flos               = 56984405GF
  train_loss               =     0.1032
  train_runtime            = 2:15:26.55
  train_samples_per_second =      8.306
  train_steps_per_second   =      1.038
Training metrics saved.
Evaluating on the test set...


Test set evaluation results: {'eval_loss': nan, 'eval_accuracy': 0.95832, 'eval_runtime': 809.7394, 'eval_samples_per_second': 30.874, 'eval_steps_per_second': 1.93, 'epoch': 3.0}
***** eval metrics *****
  epoch                   =        3.0
  eval_accuracy           =     0.9583
  eval_loss               =        nan
  eval_runtime            = 0:13:29.73
  eval_samples_per_second =     30.874
  eval_steps_per_second   =       1.93
Saving final model and tokenizer to ./smollm2-imdb-finetuned...
Model and tokenizer saved.

--- Inference Example ---


Device set to use cuda:0


Predictions on example texts:
Text: 'This movie was absolutely fantastic, heartwarming, and beautifully shot!'
Predicted Label: LABEL_1 (POSITIVE), Score: 0.9994

Text: 'What a complete waste of time, the plot was predictable and the acting was terrible.'
Predicted Label: LABEL_0 (NEGATIVE), Score: 0.9999

Text: 'It was an okay film, nothing special but not bad either.'
Predicted Label: LABEL_0 (NEGATIVE), Score: 0.9479

Script finished.


In [None]:
# Code snippet to save the fine-tuned model and tokenizer to Google Drive in a Colab environment.
# Place this snippet AFTER the trainer.train() and trainer.evaluate() steps
# in the previous script, assuming 'trainer' and 'tokenizer' objects exist.

import os
from google.colab import drive

print("Mounting Google Drive...")
# Colab asks for authorization the first time the drive is mounted.
drive.mount("/content/drive")
print("Google Drive mounted successfully.")

# --- Define Path in Google Drive ---
# IMPORTANT: change the path below to wherever you want the model stored.
# '/content/drive/MyDrive' corresponds to the root of your Google Drive.
gdrive_save_path = "/content/drive/MyDrive/smollm2-imdb-finetuned-colab"  # Example path

# --- Create Directory if it doesn't exist ---
print(f"Ensuring directory exists: {gdrive_save_path}")
os.makedirs(gdrive_save_path, exist_ok=True)

# --- Save Model and Tokenizer to Google Drive ---
print(f"Saving model and tokenizer to Google Drive path: {gdrive_save_path} ...")
try:
    # The trainer writes the final model, then the tokenizer writes its own
    # files, both into the same Drive directory.
    for save_to_drive in (trainer.save_model, tokenizer.save_pretrained):
        save_to_drive(gdrive_save_path)

    print("Model and tokenizer successfully saved to Google Drive.")
    print(f"Files saved in: {gdrive_save_path}")

except Exception as e:
    print(f"An error occurred while saving to Google Drive: {e}")

# Optional: You can unmount Drive afterwards if desired, though it's often not necessary
# drive.flush_and_unmount()
# print("Google Drive unmounted.")

Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted successfully.
Ensuring directory exists: /content/drive/MyDrive/smollm2-imdb-finetuned-colab
Saving model and tokenizer to Google Drive path: /content/drive/MyDrive/smollm2-imdb-finetuned-colab ...
Model and tokenizer successfully saved to Google Drive.
Files saved in: /content/drive/MyDrive/smollm2-imdb-finetuned-colab


In [None]:
# Display the architecture of the model held by the inference pipeline
# (the pipeline was built from the fine-tuned checkpoint directory).
classifier_pipeline.model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 960, padding_idx=2)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=960, out_features=960, bias=False)
          (k_proj): Linear(in_features=960, out_features=320, bias=False)
          (v_proj): Linear(in_features=960, out_features=320, bias=False)
          (o_proj): Linear(in_features=960, out_features=960, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=960, out_features=2560, bias=False)
          (up_proj): Linear(in_features=960, out_features=2560, bias=False)
          (down_proj): Linear(in_features=2560, out_features=960, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((960,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((960,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((960,), eps=1e-05)
  