<a href="https://colab.research.google.com/github/maybeanns/ML-AI_Projects/blob/main/FineTune_YT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install unsloth peft transformers datasets accelerate bitsandbytes --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.9/46.9 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.5/294.5 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.8/375.8 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.2/154.2 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes


# Task
Fine-tune the Llama 3.1 LLM model using the Unsloth library and a provided dataset.

## Load llama 3.1 model

### Subtask:
Load the Llama 3.1 model using Unsloth's optimized loading function.


In [3]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048 # You can change this
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.12: Fast Llama patching. Transformers: 4.53.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

## Add LoRA **Adapter**

In [4]:
from peft import get_peft_model, LoraConfig
import torch

lora_config = LoraConfig(
    r = 16,
    lora_alpha = 16,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
        "embed_tokens", "lm_head",
    ],
    lora_dropout = 0,
    bias = "none",
    use_rslora = False,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 46,178,304 || all params: 8,076,439,552 || trainable%: 0.5718


## Load dataset

### Subtask:
Load the dataset you want to use for fine-tuning. Ensure the dataset is in a format suitable for training.


In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset

# 1. Define the Alpaca Prompt Template*************************************************************

# This template is designed to guide the language model during fine-tuning.
# It clearly separates the instruction, input, response, and explanation.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Given the company database schema, generate a SQL query based on the SQL prompt. Also provide an explanation of the generated SQL.

### Company database:
{}

### Input:
SQL Prompt: {}

### Response:
SQL: {}
Explanation: {}
"""


# Get the End-Of-Sequence token. This is crucial for the model to know
# where the generated response should stop during inference.
EOS_TOKEN = tokenizer.eos_token

# 3. Define the formatting function*************************************************************

# This function takes examples from the dataset and formats them
# according to the alpaca_prompt template.
def formatting_prompts_func(examples):
    # Extract the relevant data fields from the dataset examples
    company_databases = examples["sql_context"]
    sql_prompts = examples["sql_prompt"]
    sqls = examples["sql"]
    explanations = examples["sql_explanation"]

    texts = []
    # Iterate through each example in the batch
    for company_database, sql_prompt, sql, explanation in zip(company_databases, sql_prompts, sqls, explanations):
        # Format the alpaca_prompt with the current example's data
        # and append the EOS_TOKEN.
        # The EOS_TOKEN is essential to prevent the model from generating indefinitely.
        formatted_text = alpaca_prompt.format(
            company_database,
            sql_prompt,
            sql,
            explanation
        ) + EOS_TOKEN
        texts.append(formatted_text)

    # Return a dictionary with a "text" key, containing all the formatted strings.
    return { "text" : texts }

# 4. Load the dataset*************************************************************

# Load the "gretelai/synthetic_text_to_sql" dataset from Hugging Face Hub.
# We specify 'train' split for training.
print("Loading dataset...")
dataset = load_dataset("gretelai/synthetic_text_to_sql", split="train")
print(f"Dataset loaded. Number of examples: {len(dataset)}")

# 5. Apply the formatting function to the dataset
# The .map() function applies our formatting_prompts_func to each example
# in the dataset. 'batched=True' is more efficient for processing.
print("Formatting dataset examples...")
formatted_dataset = dataset.map(formatting_prompts_func, batched=True)
print("Dataset formatting complete.")

# 6. Display a sample formatted example
print("\n--- Sample Formatted Example ---")
if len(formatted_dataset) > 0:
    print(formatted_dataset[0]["text"])
else:
    print("No examples in the formatted dataset to display.")

# You can now use 'formatted_dataset' for fine-tuning your language model.
# The 'formatted_dataset' will now have a single 'text' column, where each entry
# is a fully prepared instruction-following string.
print(f"\nFinal formatted dataset structure: {formatted_dataset}")

Loading dataset...


README.md: 0.00B [00:00, ?B/s]

(…)nthetic_text_to_sql_train.snappy.parquet:   0%|          | 0.00/32.4M [00:00<?, ?B/s]

(…)ynthetic_text_to_sql_test.snappy.parquet:   0%|          | 0.00/1.90M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5851 [00:00<?, ? examples/s]

Dataset loaded. Number of examples: 100000
Formatting dataset examples...


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Dataset formatting complete.

--- Sample Formatted Example ---
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Given the company database schema, generate a SQL query based on the SQL prompt. Also provide an explanation of the generated SQL.

### Company database:
CREATE TABLE salesperson (salesperson_id INT, name TEXT, region TEXT); INSERT INTO salesperson (salesperson_id, name, region) VALUES (1, 'John Doe', 'North'), (2, 'Jane Smith', 'South'); CREATE TABLE timber_sales (sales_id INT, salesperson_id INT, volume REAL, sale_date DATE); INSERT INTO timber_sales (sales_id, salesperson_id, volume, sale_date) VALUES (1, 1, 120, '2021-01-01'), (2, 1, 150, '2021-02-01'), (3, 2, 180, '2021-01-01');

### Input:
SQL Prompt: What is the total volume of timber sold by each salesperson, sorted by salesperson?

### Response:
SQL: SELECT salesperson_id, name, SUM(volume) a

## Train the model

### Subtask:
Train the model using Unsloth's training function.


In [10]:
import torch
from transformers import TrainingArguments # <--- ADD THIS IMPORT
from trl import SFTTrainer
from unsloth import is_bfloat16_supported # Assuming unsloth is installed and used

# Assume 'model', 'tokenizer', 'dataset', and 'max_seq_length' are already defined
# based on your previous code snippets.
# For demonstration purposes, I'll add placeholder definitions if they're not explicitly shown here.

# --- Placeholder definitions (replace with your actual model, tokenizer, and dataset) ---
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from datasets import load_dataset
# from unsloth import FastLanguageModel

# if torch.cuda.is_available():
#     # For a real scenario, load your actual model
#     model, tokenizer = FastLanguageModel.from_pretrained(
#         model_name = "unsloth/llama-2-7b-Instruct",
#         max_seq_length = 2048, # Example max_seq_length
#         dtype = None, # Auto detects bfloat16 or float16
#         load_in_4bit = True,
#     )
# else:
#     # Fallback for CPU, if you're running without GPU
#     model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
#     tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# # Example dataset (replace with your actual formatted_dataset)
# dataset = load_dataset("gretelai/synthetic_text_to_sql", split="train")
# # Assuming you've run the formatting_prompts_func on this dataset as per previous steps
# # dataset = dataset.map(formatting_prompts_func, batched=True, remove_columns=dataset.column_names)
# max_seq_length = 2048 # Example length, adjust as needed

# --- END Placeholder definitions ---


trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = formatted_dataset, # Use formatted_dataset instead of dataset
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False,

    args = TrainingArguments( # Configuration for the training process
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    )
)

# You would then typically start training with:
# trainer.train()

Map (num_proc=2):   0%|          | 0/100000 [00:00<?, ? examples/s]

In [11]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 46,178,304 of 8,000,000,000 (0.58% trained)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mm1annskhan[0m ([33mm1annskhan-ghulam-ishaq-khan-institute-of-engineering-sc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,1.8963
2,2.2155
3,2.008
4,1.8986
5,1.7915
6,2.1875
7,1.9598


RuntimeError: PassManager::run failed