<a href="https://colab.research.google.com/github/manmustbecool/Experiment/blob/main/llm_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from datasets import load_dataset
import torch

# Step 1: Load the smallest LLM and tokenizer
# Using "bigscience/bloom-560m" as an example of a small LLM.
model_name = "bigscience/bloom-560m"
# model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Step 2: Configure PEFT with LoRA (Low-Rank Adaptation)
# LoRA reduces the number of trainable parameters, making fine-tuning efficient.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Specify the task type (causal language modeling)
    inference_mode=False,          # Set to False for training mode
    r=4,                           # Rank of the LoRA matrices (smaller for efficiency)
    lora_alpha=16,                 # Scaling factor for LoRA
    lora_dropout=0.1               # Dropout rate for LoRA layers
)
model = get_peft_model(model, peft_config)  # Wrap the base model with the PEFT configuration


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

ValueError: Unrecognized configuration class <class 'transformers.models.t5.configuration_t5.T5Config'> for this kind of AutoModel: AutoModelForCausalLM.
Model type should be one of AriaTextConfig, BambaConfig, BartConfig, BertConfig, BertGenerationConfig, BigBirdConfig, BigBirdPegasusConfig, BioGptConfig, BlenderbotConfig, BlenderbotSmallConfig, BloomConfig, CamembertConfig, LlamaConfig, CodeGenConfig, CohereConfig, Cohere2Config, CpmAntConfig, CTRLConfig, Data2VecTextConfig, DbrxConfig, DeepseekV3Config, DiffLlamaConfig, ElectraConfig, Emu3Config, ErnieConfig, FalconConfig, FalconMambaConfig, FuyuConfig, GemmaConfig, Gemma2Config, Gemma3Config, Gemma3TextConfig, GitConfig, GlmConfig, Glm4Config, GotOcr2Config, GPT2Config, GPT2Config, GPTBigCodeConfig, GPTNeoConfig, GPTNeoXConfig, GPTNeoXJapaneseConfig, GPTJConfig, GraniteConfig, GraniteMoeConfig, GraniteMoeSharedConfig, HeliumConfig, JambaConfig, JetMoeConfig, LlamaConfig, Llama4Config, Llama4TextConfig, MambaConfig, Mamba2Config, MarianConfig, MBartConfig, MegaConfig, MegatronBertConfig, MistralConfig, MixtralConfig, MllamaConfig, MoshiConfig, MptConfig, MusicgenConfig, MusicgenMelodyConfig, MvpConfig, NemotronConfig, OlmoConfig, Olmo2Config, OlmoeConfig, OpenLlamaConfig, OpenAIGPTConfig, OPTConfig, PegasusConfig, PersimmonConfig, PhiConfig, Phi3Config, Phi4MultimodalConfig, PhimoeConfig, PLBartConfig, ProphetNetConfig, QDQBertConfig, Qwen2Config, Qwen2MoeConfig, Qwen3Config, Qwen3MoeConfig, RecurrentGemmaConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, RwkvConfig, Speech2Text2Config, StableLmConfig, Starcoder2Config, TransfoXLConfig, TrOCRConfig, WhisperConfig, XGLMConfig, XLMConfig, XLMProphetNetConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig, ZambaConfig, Zamba2Config.

In [8]:
pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl 

In [None]:
dataset = load_dataset("imdb")
print(dataset['train'])

In [None]:
# Step 3: Load the IMDb dataset and create a small sample
dataset = load_dataset("imdb", split="train")  # Load the full training split
subset_size = int(0.005 * len(dataset))  # Calculate 0.5% of the dataset size
small_sample = dataset.select(range(subset_size))  # select a subset
print(small_sample)

# Step 4: Tokenize the dataset
# Convert text data into tokenized format suitable for the model.
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=512  # Align batch size to match your training configuration
)
print(tokenized_dataset)

print(tokenized_dataset[0])

In [None]:
# Step 5: Define training arguments
# Specify hyperparameters and settings for the training process.
training_args = TrainingArguments(
    output_dir="./results",          # Directory to save training results
    # eval_strategy="epoch",    # Evaluate the model at the end of each epoch
    learning_rate=2e-5,             # Learning rate for the optimizer
    per_device_train_batch_size=4,  # Batch size per device
    num_train_epochs=1,             # Number of training epochs
    weight_decay=0.01,              # Weight decay for regularization
    save_total_limit=1,             # Limit the number of saved checkpoints
    label_names=["label"],          # Explicitly set label_names
    report_to="none"                # Disable integration with W&B
)

# Step 6: Initialize the Trainer
# The Trainer class handles the training loop and evaluation.
trainer = Trainer(
    model=model,                    # Model to be trained
    args=training_args,             # Training arguments
    train_dataset=tokenized_dataset # Training dataset
)

# Step 7: Fine-tune the model
trainer.train()

# Step 8: Save the fine-tuned model
# Save the model and tokenizer for future use.
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

print("training finished")

In [None]:
# Import necessary libraries
from datasets import Dataset
from sklearn.metrics import accuracy_score
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from datasets import load_dataset
import torch

# Sample dataset provided in the prompt
sample_data = [
    {"prompt": "ww", "response": "ssss"},
    {"prompt": "dd", "response": "ss"},
    {"prompt": "ss", "response": "sss"}
]

# Load the dataset into a Hugging Face Dataset object
dataset = Dataset.from_list(sample_data)

# Initialize the tokenizer and the small LLM (e.g., GPT-2-small)
model_name = "bigscience/bloom-560m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Step 2: Configure PEFT with LoRA (Low-Rank Adaptation)
# LoRA reduces the number of trainable parameters, making fine-tuning efficient.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # Specify the task type (causal language modeling)
    inference_mode=False,          # Set to False for training mode
    r=4,                           # Rank of the LoRA matrices (smaller for efficiency)
    lora_alpha=16,                 # Scaling factor for LoRA
    lora_dropout=0.1               # Dropout rate for LoRA layers
)
# model = get_peft_model(model, peft_config)  # Wrap the base model with the PEFT configuration


# Tokenize the sample dataset for fine-tuning
def preprocess_function(example):
    inputs = tokenizer(example["prompt"], truncation=True, padding=True, max_length=64, return_tensors="pt")
    labels = tokenizer(example["response"], truncation=True, padding=True, max_length=64, return_tensors="pt")["input_ids"]
    return {"input_ids": inputs["input_ids"][0], "labels": labels[0]}

tokenized_dataset = dataset.map(preprocess_function)

for i in range(len(tokenized_dataset)):
    print(tokenized_dataset[i])

# Step 5: Define training arguments
# Specify hyperparameters and settings for the training process.
training_args = TrainingArguments(
    output_dir="./results",          # Directory to save training results
    # eval_strategy="epoch",    # Evaluate the model at the end of each epoch
    learning_rate=2e-5,             # Learning rate for the optimizer
    per_device_train_batch_size=8,  # Batch size per device
    num_train_epochs=1,             # Number of training epochs
    weight_decay=0.01,              # Weight decay for regularization
    save_total_limit=1,             # Limit the number of saved checkpoints
    label_names=["labels"],          # Explicitly set label_names
    report_to="none"                # Disable integration with W&B
)

# Step 6: Initialize the Trainer
# The Trainer class handles the training loop and evaluation.
trainer = Trainer(
    model=model,                    # Model to be trained
    args=training_args,             # Training arguments
    train_dataset=tokenized_dataset # Training dataset
)

# Step 7: Fine-tune the model
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

ValueError: expected sequence of length 2 at dim 1 (got 1)

In [None]:
# Step 5: Define training arguments
# Specify hyperparameters and settings for the training process.
training_args = TrainingArguments(
    output_dir="./results",          # Directory to save training results
    evaluation_strategy="epoch",    # Evaluate the model at the end of each epoch
    learning_rate=2e-5,             # Learning rate for the optimizer
    per_device_train_batch_size=8,  # Batch size per device
    num_train_epochs=1,             # Number of training epochs
    weight_decay=0.01,              # Weight decay for regularization
    save_total_limit=1,             # Limit the number of saved checkpoints
)

# Step 6: Initialize the Trainer
# The Trainer class handles the training loop and evaluation.
trainer = Trainer(
    model=model,                    # Model to be trained
    args=training_args,             # Training arguments
    train_dataset=tokenized_dataset # Training dataset
)

# Step 7: Fine-tune the model
trainer.train()

# Step 8: Save the fine-tuned model
# Save the model and tokenizer for future use.
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

# Step 9: Compare the fine-tuned model with the original model
# Evaluate both models on the same dataset and compare their losses.
def evaluate_model(model, tokenizer, dataset):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    for example in dataset:
        inputs = tokenizer(example["text"], return_tensors="pt", truncation=True, padding="max_length", max_length=128)
        with torch.no_grad():  # Disable gradient computation for evaluation
            outputs = model(**inputs, labels=inputs["input_ids"])  # Compute loss
        total_loss += outputs.loss.item()  # Accumulate loss
    return total_loss / len(dataset)  # Return average loss

# Load the original model for comparison
original_model = AutoModelForCausalLM.from_pretrained(model_name)

# Evaluate both models
original_loss = evaluate_model(original_model, tokenizer, tokenized_dataset)
fine_tuned_loss = evaluate_model(model, tokenizer, tokenized_dataset)

# Print the comparison results
print(f"Original Model Loss: {original_loss}")
print(f"Fine-Tuned Model Loss: {fine_tuned_loss}")

In [22]:
## test

# Import necessary libraries
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding
import torch


# Initialize tokenizer and model
model_name = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


# Sample dataset
sample_data = [
    {"prompt": "wer?", "response": "no"},
    {"prompt": "wer wer?", "response": "no no no"},
]*300

# Load dataset
dataset = Dataset.from_list(sample_data)

# Tokenization function with uniform padding
def preprocess_function(example):
    inputs = tokenizer(example["prompt"], truncation=True, padding="max_length", max_length=64, return_tensors="pt")
    labels = tokenizer(example["response"], truncation=True, padding="max_length", max_length=64, return_tensors="pt")["input_ids"]
    return {"input_ids": inputs["input_ids"].squeeze(0), "labels": labels.squeeze(0)}

tokenized_dataset = dataset.map(preprocess_function)

# Define data collator for consistent tensor shapes
data_collator = DataCollatorWithPadding(tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=30,
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=1,
    label_names=["labels"],
    report_to="none"
)

# Initialize Trainer with data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator  # Handles padding dynamically
)

# Fine-tune the model
trainer.train()

model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

Model Output: wer?
I'm not sure if it


Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Step,Training Loss


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

In [25]:
# Initialize tokenizer and model
model_name = "./fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Test the model with a sample prompt
input_text = "wer?"
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(inputs["input_ids"], max_length=20) # Generate response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Model Output:", response)

Model Output: wer?

wer

wer

wer

wer

wer




In [24]:
# Initialize tokenizer and model
model_name = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Test the model with a sample prompt
input_text = "wer?"
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(inputs["input_ids"], max_length=20) # Generate response
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Model Output:", response)

Model Output: wer?
I'm not sure if it's a joke or not, but I'm pretty
