<a href="https://colab.research.google.com/github/osama-kheshaifaty/SPE-KSA-WORKSHOP-2025/blob/main/fine_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fine-tuning a Pretrained LLM (DistilBERT) on a Small Text Classification Task
# -------------------------------------------------------------------------------

# Step 1: Install required libraries
# Uncomment and run this line if you're missing the libraries
# !pip install transformers datasets scikit-learn torch accelerate

# Step 2: Import libraries
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
import torch

# Step 3: Prepare a Small Synthetic Dataset
# --------------------------------------------
# We'll create a simple dataset: classify text as positive (1) or negative (0)

# Each entry pairs a sentence with its label (1 = positive, 0 = negative),
# so the text and its class can be read side by side.
_labeled_examples = [
    ("The well production increased significantly", 1),
    ("We encountered severe water breakthrough", 0),
    ("The ESP failed unexpectedly after 2 months", 0),
    ("Drilling was faster than expected", 1),
    ("Production rate dropped by 40%", 0),
    ("Well intervention improved flow rate", 1),
    ("Unexpected sand production started", 0),
    ("Field redevelopment was successful", 1),
]

# Split the pairs into the column-oriented layout passed to Dataset.from_dict:
# one list of sentences and one parallel list of labels.
data = {
    "text": [sentence for sentence, _ in _labeled_examples],
    "label": [sentiment for _, sentiment in _labeled_examples],
}

# Wrap the columns in a HuggingFace Dataset
dataset = Dataset.from_dict(data)

# Step 4: Load Pre-trained Model and Tokenizer
# ----------------------------------------------
# Tokenizer turns raw text into model-readable numbers (input_ids, attention_mask)
# Model is a DistilBERT (small version of BERT)

# One checkpoint name is shared by the tokenizer and the model so that both
# are loaded from the same pretrained release.
model_name = "distilbert-base-uncased"

# Tokenizer that converts raw text into the numeric inputs the model consumes.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

# num_labels=2 matches the two classes (0 and 1) used in the dataset.
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Step 5: Tokenize the Dataset
# ------------------------------
def preprocess(batch):
    """Tokenize the ``text`` column of a batch of examples.

    Intended to be passed to ``Dataset.map(..., batched=True)``, so ``batch``
    is a mapping whose ``'text'`` entry holds the sentences to encode.

    Returns the tokenizer output for those sentences, with padding and
    truncation enabled.
    """
    sentences = batch['text']
    encoded = tokenizer(sentences, truncation=True, padding=True)
    return encoded

# Apply `preprocess` to the whole dataset in batches; the tokenizer outputs
# are added as new columns next to the original "text" and "label" columns.
tokenized_dataset = dataset.map(preprocess, batched=True)

# Step 6: Define Training Arguments
# -----------------------------------
# These control how fine-tuning will happen:
# - where outputs are saved
# - batch size
# - number of epochs (full passes through data)
# - learning rate (how fast model updates)
#
# NOTE: no per-epoch evaluation is requested here. The Trainer below is only
# given a training set, and asking for evaluation without an `eval_dataset`
# makes the Trainer raise an error. To evaluate during training, hold out some
# examples, pass them to the Trainer as `eval_dataset=...`, and set the
# evaluation strategy to "epoch" (the keyword is `eval_strategy` in recent
# transformers releases and `evaluation_strategy` in older ones).

training_args = TrainingArguments(
    output_dir="./results",         # folder for saving results
    learning_rate=2e-5,             # small learning rate (important for fine-tuning)
    per_device_train_batch_size=4,  # batch size
    num_train_epochs=5,             # small number of epochs (since tiny dataset)
    weight_decay=0.01,              # small regularization
    logging_dir="./logs",           # folder for logs
    logging_steps=5,                # log the training loss every 5 optimizer steps
)

# Step 7: Define Trainer
# ------------------------
# Trainer is HuggingFace's simple way to manage training.
# Only a training set is supplied; see the note above about evaluation.

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Step 8: Fine-tune the Model
# -----------------------------
# This is where fine-tuning actually happens.
# The model weights will adjust slightly to better fit this new specific task.

trainer.train()

# Step 9: Test the Fine-tuned Model
# -----------------------------------
# Let's manually test a new sentence.

test_text = "The new well intervention was highly successful."
inputs = tokenizer(test_text, return_tensors="pt", truncation=True, padding=True)

# Training may have moved the model to a GPU while the freshly tokenized
# tensors are created on the CPU. Put the inputs on the model's device so the
# forward pass does not fail with a device-mismatch error.
inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

# Set model to evaluation mode (disables dropout so predictions are deterministic)
model.eval()

# Predict (no_grad: gradients are not needed for inference)
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    # Pick the highest-scoring class along the class axis; the batch holds a
    # single sentence, so .item() yields one integer class id.
    predicted_class = logits.argmax(dim=-1).item()

# Map prediction to label
label_map = {0: "Negative", 1: "Positive"}
print(f"\nPrediction for: \"{test_text}\"")
print("Predicted label:", label_map[predicted_class])