# Fine-tuning DistilBERT

In [None]:
!pip install transformers[torch] datasets evaluate accelerate bitsandbytes peft -q
!pip install -U datasets

In [None]:
import torch
from datasets import load_dataset
import numpy as np
import evaluate

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType
)

# --- Model / dataset identifiers ---
# Checkpoint name used to load both the tokenizer and the base model.
BASE_MODEL = "distilbert-base-uncased"
# GLUE configuration name passed to load_dataset (Stanford Sentiment Treebank).
DATASET_NAME = "sst2"
# Used as the Trainer output directory and as the repository name in the Hub URL.
# Plain string literal: there are no placeholders, so no f-prefix is needed.
HUB_MODEL_ID = "distilbert-base-sst2-lora"

# --- LoRA Configuration ---
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # This is a sequence classification task
    r=16,  # The dimension of the low-rank matrices
    lora_alpha=32,  # The scaling factor for the low-rank matrices
    lora_dropout=0.1,  # Dropout probability for LoRA layers
    # Target the query and value layers in the attention blocks
    target_modules=["q_lin", "v_lin"],
)

print("✅ Configuration and LoRA setup complete.")

✅ Configuration and LoRA setup complete.


In [None]:
# GLUE configuration selected by DATASET_NAME, and the tokenizer that matches
# the base checkpoint.
raw_datasets = load_dataset("glue", DATASET_NAME)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)


def tokenize_function(examples):
    """Tokenize the "sentence" field of a batch with truncation enabled."""
    sentences = examples["sentence"]
    return tokenizer(sentences, truncation=True)


# Tokenize in batches, then drop the raw text column in the same expression.
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
).remove_columns("sentence")

# Collator built from the tokenizer; it is handed to the Trainer later on.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Show the first raw training example as a sanity check.
first_example = raw_datasets["train"][0]
print(first_example)
print("\n✅ Dataset loaded and prepared.")

In [None]:
# Use the GPU when one is visible and fall back to the CPU otherwise, so this
# cell does not crash on CPU-only runtimes.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Base checkpoint with a two-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=2,
).to(device)

# Wrap the base model with the adapters described by `lora_config` and report
# how many parameters remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("\n✅ PEFT model created.")

In [None]:
# Accuracy metric used by compute_metrics below
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max class predictions.

    `eval_pred` unpacks into `(logits, labels)`; the predicted class for each
    row is the index of the largest logit along the last axis.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return accuracy_metric.compute(
        predictions=predicted_classes,
        references=references,
    )

# Hyper-parameters and bookkeeping options handed to the Trainer
training_args = TrainingArguments(
    # Checkpoint directory (shares its name with the Hub repository)
    output_dir=HUB_MODEL_ID,
    # Optimisation settings
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # No experiment tracker; upload results to the Hub
    report_to="none",
    push_to_hub=True,
)

print("✅ Metrics and Training Arguments defined.")

In [None]:
import inspect

# Newer transformers releases accept the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword. Pick whichever name the installed
# Trainer exposes so the cell works on both sides of that change.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = (
    "processing_class" if "processing_class" in _trainer_params else "tokenizer"
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# Start fine-tuning
print("🚀 Starting PEFT/LoRA fine-tuning...")
trainer.train()
print("🎉 Fine-tuning complete!")

# The explicit `trainer.push_to_hub()` call lives in the evaluation cell further
# down, so nothing is pushed manually here.

In [None]:
from peft import AutoPeftModelForSequenceClassification
from transformers import AutoTokenizer, pipeline

# Load the fine-tuned PEFT model and its tokenizer for inference.
# NOTE(review): HUB_MODEL_ID is also the Trainer's output_dir, so this
# presumably resolves to the local directory when it exists rather than to a
# Hub repository — confirm which source is intended.
model = AutoPeftModelForSequenceClassification.from_pretrained(HUB_MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(HUB_MODEL_ID)

# Create a pipeline
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    # Return the score of every label. `top_k=None` is the replacement for the
    # deprecated `return_all_scores=True`.
    top_k=None,
)

# Test sentences
test_sentences = [
    "This movie was fantastic, a true masterpiece of cinema!",
    "The acting was wooden and the plot was predictable.",
    "The movie was painfully slow and boring.",
    "I'm not sure how I feel about this film.",
]

# Run predictions: one list of {"label", "score"} dicts per input sentence
results = classifier(test_sentences)

# Map the generic label names emitted by the pipeline to readable names
label_map = {"LABEL_0": "Negative", "LABEL_1": "Positive"}

print("\n--- Inference Results from PEFT model ---")
for sentence, result in zip(test_sentences, results):
    # Find the prediction with the highest score
    prediction = max(result, key=lambda x: x['score'])
    predicted_label = label_map[prediction['label']]
    confidence = prediction['score']

    print(f"Input: '{sentence}'")
    print(f"  -> Predicted Label: {predicted_label} | Confidence: {confidence:.2%}")
    print("-" * 20)

In [None]:
# Get evaluation results after training (runs on the Trainer's eval_dataset)
eval_results = trainer.evaluate()

# Print accuracy and loss
print("\n--- Evaluation Results ---")
print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
print(f"Loss: {eval_results['eval_loss']:.4f}")
print("-" * 20)

# Push the model and tokenizer to the Hub
trainer.push_to_hub()

# The repository name is used verbatim. The previous `.replace('.', '/')` did
# nothing for the current ID and would have broken the URL for any repository
# name containing a dot.
print(f"\n✅ Model and tokenizer pushed to Hub: https://huggingface.co/myselfmankar/{HUB_MODEL_ID}")
