<a href="https://colab.research.google.com/github/manmustbecool/Experiment/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from datasets import load_dataset
import torch

# Step 1: Pick a small pretrained checkpoint and load its tokenizer.
# "bigscience/bloom-560m" serves as the example small causal LM here.
model_name = "bigscience/bloom-560m"
# model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 2: Describe the LoRA (Low-Rank Adaptation) adapter.
# Only the low-rank adapter matrices are trained, which keeps fine-tuning cheap.
peft_config = LoraConfig(
    r=4,                           # Rank of the LoRA update matrices
    lora_alpha=16,                 # LoRA scaling factor
    lora_dropout=0.1,              # Dropout applied inside the LoRA layers
    inference_mode=False,          # False: the adapter is set up for training
    task_type=TaskType.CAUSAL_LM,  # Causal language modeling
)

# Load the base model and wrap it with the LoRA configuration in one go.
model = get_peft_model(
    AutoModelForCausalLM.from_pretrained(model_name),
    peft_config,
)


In [None]:
# Load every IMDb split and show a summary of the training split.
dataset = load_dataset("imdb")
print(dataset["train"])

In [None]:
# Step 3: Load the IMDb dataset and create a small sample
dataset = load_dataset("imdb", split="train")  # Load the full training split
subset_size = int(0.005 * len(dataset))  # Calculate 0.5% of the dataset size
small_sample = dataset.select(range(subset_size))  # Keep the first `subset_size` rows
print(small_sample)


# Step 4: Tokenize the sample
# Convert text data into the token-level format a causal LM is trained on.
def tokenize_function(examples):
    """Tokenize a batch of reviews and attach causal-LM training targets.

    Args:
        examples: A batch as passed by ``Dataset.map(batched=True)``;
            ``examples["text"]`` holds the review strings.

    Returns:
        The tokenizer output (truncated/padded to 128 tokens per example)
        with an extra ``labels`` entry: a copy of ``input_ids`` where padding
        positions are replaced by -100 so the loss ignores them.
    """
    encoded = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
    # A causal LM predicts its own input, so the targets are the input ids.
    # -100 is the ignore index of the cross-entropy loss; use it on padding.
    encoded["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
    ]
    return encoded


# Tokenize the small sample (not the full split). The integer sentiment
# column "label" is dropped: the default collator would otherwise turn it into
# a `labels` tensor of the wrong shape for language modeling. "text" is kept
# for later cells that re-tokenize the raw reviews.
tokenized_dataset = small_sample.map(
    tokenize_function,
    batched=True,
    batch_size=512,  # Rows per tokenizer call; independent of the training batch size
    remove_columns=["label"],
)
print(tokenized_dataset)

print(tokenized_dataset[0])

In [None]:
# Step 5: Define training arguments
# Specify hyperparameters and settings for the training process.
training_args = TrainingArguments(
    output_dir="./results",          # Directory to save training results
    # eval_strategy="epoch",    # Needs an eval_dataset on the Trainer before it can be enabled
    learning_rate=2e-5,             # Learning rate for the optimizer
    per_device_train_batch_size=4,  # Batch size per device
    num_train_epochs=1,             # Number of training epochs
    weight_decay=0.01,              # Weight decay for regularization
    save_total_limit=1,             # Limit the number of saved checkpoints
    # The model's forward() takes its targets as `labels`; label_names must use
    # that argument name (the singular "label" matches no model input).
    label_names=["labels"],
    report_to="none"                # Disable integration with W&B
)

# Step 6: Initialize the Trainer
# The Trainer class handles the training loop and evaluation.
trainer = Trainer(
    model=model,                    # Model to be trained
    args=training_args,             # Training arguments
    train_dataset=tokenized_dataset # Training dataset
)

# Step 7: Fine-tune the model
trainer.train()

# Step 8: Save the fine-tuned model
# Save the model and tokenizer for future use.
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

print("training finished")

In [None]:
# Import necessary libraries
from datasets import Dataset
from sklearn.metrics import accuracy_score
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from datasets import load_dataset
import torch

# Toy prompt/response pairs used as fine-tuning data
sample_data = [
    {"prompt": prompt, "response": response}
    for prompt, response in (("ww", "ssss"), ("dd", "ss"), ("ss", "sss"))
]

# Wrap the records in a Hugging Face Dataset object
dataset = Dataset.from_list(sample_data)

# Load the tokenizer and the small causal LM (BLOOM-560m)
model_name = "bigscience/bloom-560m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Step 2: LoRA (Low-Rank Adaptation) configuration.
# NOTE: the wrapping call below is commented out, so this config is currently
# unused and the base model itself is what gets trained.
peft_config = LoraConfig(
    r=4,                           # Rank of the LoRA update matrices
    lora_alpha=16,                 # LoRA scaling factor
    lora_dropout=0.1,              # Dropout applied inside the LoRA layers
    inference_mode=False,          # False: the adapter is set up for training
    task_type=TaskType.CAUSAL_LM,  # Causal language modeling
)
# model = get_peft_model(model, peft_config)  # Wrap the base model with the PEFT configuration


# Tokenize the sample dataset for fine-tuning
def preprocess_function(example, max_length=64):
    """Turn one prompt/response pair into a fixed-length causal-LM example.

    The prompt and response are tokenized separately and concatenated into a
    single sequence. ``labels`` has the same length as ``input_ids`` (required
    for the shifted next-token loss); prompt and padding positions are set to
    -100 so only the response tokens contribute to the loss.

    Args:
        example: Mapping with string fields ``"prompt"`` and ``"response"``.
        max_length: Length every returned sequence is truncated/padded to.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``, each a
        list of ``max_length`` ints.
    """
    prompt_ids = tokenizer(example["prompt"], add_special_tokens=False)["input_ids"]
    response_ids = tokenizer(example["response"], add_special_tokens=False)["input_ids"]
    # Terminate the response so the model can learn where an answer ends.
    if tokenizer.eos_token_id is not None:
        response_ids = response_ids + [tokenizer.eos_token_id]

    input_ids = (prompt_ids + response_ids)[:max_length]
    # NOTE: a prompt of max_length or more tokens leaves no supervised position.
    labels = ([-100] * len(prompt_ids) + response_ids)[:max_length]
    attention_mask = [1] * len(input_ids)

    # Pad every field to the same length so examples can be stacked in a batch.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id
    padding = max_length - len(input_ids)
    return {
        "input_ids": input_ids + [pad_id] * padding,
        "attention_mask": attention_mask + [0] * padding,
        "labels": labels + [-100] * padding,
    }


tokenized_dataset = dataset.map(preprocess_function)

for row in tokenized_dataset:
    print(row)

# Step 5: Training configuration
training_args = TrainingArguments(
    # Where results and checkpoints go
    output_dir="./results",
    save_total_limit=1,             # Keep at most one checkpoint
    # Optimisation hyperparameters
    num_train_epochs=1,
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    # eval_strategy="epoch",    # Evaluate the model at the end of each epoch
    # Bookkeeping
    label_names=["labels"],         # Name of the target field in each batch
    report_to="none",               # No external experiment tracking (e.g. W&B)
)

# Step 6: Build the Trainer, which runs the training loop
trainer = Trainer(
    train_dataset=tokenized_dataset,
    args=training_args,
    model=model,
)

# Step 7: Run fine-tuning
trainer.train()

In [None]:
# Step 5: Define training arguments
# Specify hyperparameters and settings for the training process.
training_args = TrainingArguments(
    output_dir="./results",          # Directory to save training results
    # Per-epoch evaluation is left off: the Trainer below gets no eval_dataset,
    # and requesting evaluation without one makes the Trainer raise.
    # eval_strategy="epoch",
    learning_rate=2e-5,             # Learning rate for the optimizer
    per_device_train_batch_size=8,  # Batch size per device
    num_train_epochs=1,             # Number of training epochs
    weight_decay=0.01,              # Weight decay for regularization
    save_total_limit=1,             # Limit the number of saved checkpoints
    label_names=["labels"],         # Target field name, as in the other training cells
    report_to="none"                # Disable integration with W&B
)

# Step 6: Initialize the Trainer
# The Trainer class handles the training loop and evaluation.
trainer = Trainer(
    model=model,                    # Model to be trained
    args=training_args,             # Training arguments
    train_dataset=tokenized_dataset # Training dataset
)

# Step 7: Fine-tune the model
trainer.train()

# Step 8: Save the fine-tuned model
# Save the model and tokenizer for future use.
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

# Step 9: Compare the fine-tuned model with the original model
# Evaluate both models on the same dataset and compare their losses.
def evaluate_model(model, tokenizer, dataset, text_column="text", max_length=128):
    """Return the average language-modeling loss of ``model`` over ``dataset``.

    Each example's text is tokenized on its own, padded to ``max_length`` and
    scored against itself. Padding positions are excluded from the loss, and
    the inputs are moved to whatever device the model's parameters live on.

    Args:
        model: Causal LM whose forward accepts ``labels`` and returns an
            object with a ``loss`` attribute.
        tokenizer: Tokenizer callable matching ``model``.
        dataset: Sized iterable of mappings that contain ``text_column``.
        text_column: Key of the raw text inside each example.
        max_length: Truncation/padding length in tokens.

    Returns:
        Mean per-example loss as a float; NaN when ``dataset`` is empty.
    """
    model.eval()  # Set the model to evaluation mode
    if len(dataset) == 0:
        return float("nan")  # Nothing to average over

    # After training the model may sit on an accelerator while the tokenizer
    # always produces CPU tensors, so look up where the inputs must go.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0.0
    for example in dataset:
        encoded = tokenizer(
            example[text_column],
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
        inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

        # Targets are the inputs themselves; -100 removes padding from the
        # loss so that padded length does not distort the score.
        labels = inputs["input_ids"].clone()
        attention_mask = inputs.get("attention_mask")
        if attention_mask is not None:
            labels[attention_mask == 0] = -100

        with torch.no_grad():  # Disable gradient computation for evaluation
            outputs = model(**inputs, labels=labels)
        total_loss += outputs.loss.item()  # Accumulate loss
    return total_loss / len(dataset)  # Return average loss

# Reload the untouched pretrained weights as the comparison baseline
original_model = AutoModelForCausalLM.from_pretrained(model_name)

# Score the baseline first, then the fine-tuned model, on the same data
original_loss, fine_tuned_loss = (
    evaluate_model(candidate, tokenizer, tokenized_dataset)
    for candidate in (original_model, model)
)

# Report both losses, one per line
print(
    f"Original Model Loss: {original_loss}",
    f"Fine-Tuned Model Loss: {fine_tuned_loss}",
    sep="\n",
)