<a href="https://colab.research.google.com/github/manmustbecool/Experiment/blob/main/llm_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install datasets

In [None]:
# A simple example of fine-tuning an LLM
# with supervised learning.
#

from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding
import torch

# Checkpoint shared by the causal-LM weights and the tokenizer
model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Toy prompt/response pairs, repeated ten times to form the training set
base_examples = (
    {"prompt": "wer?", "response": "no "},
    {"prompt": "wer wer?", "response": "no no no"},
)
sample_data = [example for _ in range(10) for example in base_examples]

# Wrap the list of dicts in a Hugging Face Dataset
dataset = Dataset.from_list(sample_data)

# Tokenization function for processing prompts and responses
def preprocess_function(example):
    """Turn prompt/response pairs into causal-LM training features.

    A causal language model is trained by next-token prediction over a single
    sequence, and its ``labels`` must line up position-by-position with
    ``input_ids``. Each prompt is therefore joined with its response (plus the
    tokenizer's EOS token, when it has one) into one text, and the labels are
    a copy of the resulting token ids with padding positions replaced by -100
    so they are ignored by the loss.

    Args:
        example: A mapping with "prompt" and "response" entries. Each entry
            is either a list of strings (``Dataset.map(..., batched=True)``)
            or a single string (``batched=False``).

    Returns:
        A dict with "input_ids", "attention_mask" and "labels", each padded or
        truncated to 128 tokens. Values are lists of lists for batched input
        and flat lists for a single example.
    """
    prompts, responses = example["prompt"], example["response"]

    # Accept a single (non-batched) example as well as a batch
    is_single = isinstance(prompts, str)
    if is_single:
        prompts, responses = [prompts], [responses]

    # End each sequence with EOS so the model learns where a response stops
    eos = tokenizer.eos_token or ""
    texts = [
        f"{prompt} {response}{eos}"
        for prompt, response in zip(prompts, responses)
    ]

    tokenized = tokenizer(
        texts,
        truncation=True,  # Ensures sequences longer than max_length are truncated
        padding="max_length",  # Pads sequences to a fixed max_length (128)
        max_length=128,  # Maximum token length; should not exceed the model's token limit
    )

    # Labels mirror input_ids; -100 marks padding so it does not contribute
    # to the loss
    labels = [
        [token if keep == 1 else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]

    features = {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "labels": labels,
    }
    if is_single:
        features = {key: rows[0] for key, rows in features.items()}
    return features

# Tokenize the whole dataset in batches and show the resulting columns
tokenized_dataset = dataset.map(preprocess_function, batched=True)
print(tokenized_dataset)

# Padding collator; constructed here but not handed to the Trainer below,
# since preprocess_function already pads every example to a fixed length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Hyperparameters for the fine-tuning run
training_config = {
    "output_dir": "./results",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "num_train_epochs": 1,
    "weight_decay": 0.01,
    "save_total_limit": 1,
    "label_names": ["labels"],
    "report_to": "none",
}
training_args = TrainingArguments(**training_config)

# Trainer using its default collation
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset)

# Run fine-tuning
trainer.train()

# Persist weights and tokenizer side by side so they can be reloaded together
output_path = "./fine_tuned_model"
model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)

In [None]:
from transformers import pipeline

# Directory holding the saved fine-tuned model and tokenizer
model_name = "./fine_tuned_model"

# Build a text-generation pipeline from that directory and try a training prompt
generator = pipeline(task="text-generation", model=model_name)
generator("wer?")

In [None]:
from transformers import pipeline

# Baseline for comparison: the same prompt against the original checkpoint
generator = pipeline(task="text-generation", model="facebook/opt-125m")
generator("wer?")