In [None]:
%%capture

!pip install unsloth transformers  trl==0.14.0
!pip install triton

In [None]:
from unsloth import FastLanguageModel
import torch 
from trl import SFTTrainer 
from unsloth import is_bfloat16_supported
from huggingface_hub import login
from transformers import TrainingArguments 
from datasets import load_dataset 
import wandb
from kaggle_secrets import UserSecretsClient
import json
import os
from datasets import Dataset
from unsloth import add_new_tokens
from pathlib import Path


In [None]:
# Scratch directory, located next to the notebook's starting working directory.
TMP_DIR = Path('../temp')
# exist_ok=True keeps this cell safe to re-run once the directory exists.
TMP_DIR.mkdir(exist_ok=True)



In [None]:

# Make the scratch directory the working directory so that relative paths used
# later (e.g. the trainer's output_dir) resolve inside it. os.chdir is used
# instead of a bare `cd`, which only works while IPython automagic is enabled.
os.chdir(TMP_DIR)

In [None]:
# Fetch both API credentials from the Kaggle secret store; nothing is
# hard-coded in the notebook.
user_secrets = UserSecretsClient()
hugging_face_token, wnb_token = (
    user_secrets.get_secret(secret_name)
    for secret_name in ("Hugging_Face_Token", "wnb")
)

# Authenticate against the Hugging Face Hub and Weights & Biases.
login(hugging_face_token)
wandb.login(key=wnb_token)

# Open the W&B run that is closed again by wandb.finish() after training.
run = wandb.init(
    anonymous="allow",
    job_type="training",
    project='Fine-tune-DeepSeek-R1-Distill-Llama-8B on Function Calling',
)

In [None]:
# Loading options shared with later cells (max_seq_length is reused by the trainer).
# dtype=None is forwarded unchanged; presumably Unsloth then selects the dtype
# itself -- confirm against the Unsloth documentation.
max_seq_length, dtype, load_in_4bit = 8192, None, True

# Load the base checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/DeepSeek-R1-Distill-Llama-8B",
    token=hugging_face_token,
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
    dtype=dtype,
)

In [None]:
# Prompt template used to render every training example. It is filled with
# str.format using two positional values (the question, then the response),
# which is why the literal braces of the JSON example are escaped as `{{` / `}}`.
# formatting_prompts_func collapses runs of spaces after formatting, so the
# indentation of the JSON example below does not survive into the training text.
train_prompt_style = """Below is an instruction that describes a task, paired with input context. Your goal is to analyze the user's question, generate the necessary queries, and return the final output in a structured JSON format. In your response, include your full chain-of-thought (CoT) process—including any function calls—inside `<think>` tags. Both your reasoning and function call details must be visible in the final output.
### Instruction:
1. Analyze the user's question thoroughly.
2. Within `<think>` tags, generate a step-by-step chain-of-thought explaining your reasoning process. Also include any function calls (using the `<query>` tag to issue queries and `<query_res>` tags to represent received results) within the `<think>` block.
3. Based on your reasoning, determine the necessary queries.
4. After the `<think>` block, produce a final JSON object inside the `<results>` block that summarizes the queries and their corresponding search results. The JSON should be structured as follows:
{{
  "Query One": {{
      "query": "First query text",
      "query_res": [
          {{
              "distance": 0.85,
              "metadata": {{
                  "publisher": "Generic News",
                  "date_published": "2025-01-15",
                  "url": "https://www.genericnews.com/article1"
              }},
              "content": "content_of article"
          }},
          {{
              "distance": 0.80,
              "metadata": {{
                  "publisher": "Tech Daily",
                  "date_published": "2025-01-16",
                  "url": "https://www.techdaily.com/article2"
              }},
              "content": "content_of article"
          }},
          {{
              "distance": 0.78,
              "metadata": {{
                  "publisher": "World Report",
                  "date_published": "2025-01-17",
                  "url": "https://www.worldreport.com/article3"
              }},
             "content": "content_of article"
          }}
      ]
  }},
  "Query Two": {{
      "query": "Second query text",
      "query_res": [
          {{
              "distance": 0.88,
              "metadata": {{
                  "publisher": "Generic News",
                  "date_published": "2025-01-18",
                  "url": "https://www.genericnews.com/article4"
              }},
              "content": "content_of article"
          }},
          {{
              "distance": 0.83,
              "metadata": {{
                  "publisher": "Tech Daily",
                  "date_published": "2025-01-19",
                  "url": "https://www.techdaily.com/article5"
              }},
              "content": "content_of article"
          }},
          {{
              "distance": 0.80,
              "metadata": {{
                  "publisher": "World Report",
                  "date_published": "2025-01-20",
                  "url": "https://www.worldreport.com/article6"
              }},
              "content": "content_of article"
          }}
      ]
  }},
  "Query Three": {{
      "query": "Third query text",
      "query_res": [
          {{
              "distance": 0.86,
              "metadata": {{
                  "publisher": "Generic News",
                  "date_published": "2025-01-21",
                  "url": "https://www.genericnews.com/article7"
              }},
              "content": "content_of article"
          }},
          {{
              "distance": 0.82,
              "metadata": {{
                  "publisher": "Tech Daily",
                  "date_published": "2025-01-22",
                  "url": "https://www.techdaily.com/article8"
              }},
              "content": "content_of article"
          }},
          {{
              "distance": 0.79,
              "metadata": {{
                  "publisher": "World Report",
                  "date_published": "2025-01-23",
                  "url": "https://www.worldreport.com/article9"
              }},
              "content": "content_of article"
          }}
      ]
  }}
}}
Final output structure:
1. A `<think>` block that contains your complete chain-of-thought process, including any `<query>` and `<query_res>` function call details.
2. Following the `<think>` block, output the final JSON object with your queries and associated results inside the `<results>` block.
### Question:
{}
### Response:
{}"""

### Step 2 — Load the synthetic fine-tuning dataset and format it for training


In [None]:


# Path of the previously generated synthetic dataset (a file, despite the name).
DATA_DIR = "/kaggle/input/db-query-reasoning/train.jsonl"


def _load_records(path):
    """Read the training records stored at ``path``.

    The file is first parsed as one JSON document (the original behaviour).
    If that fails, it is treated as JSON Lines -- one JSON value per non-empty
    line -- which is what the ``.jsonl`` extension suggests.

    Args:
        path: Location of the dataset file.

    Returns:
        The parsed records. A single top-level JSON object is wrapped in a
        list so that callers can always iterate over records.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is neither valid JSON nor valid
            JSON Lines.
    """
    with open(path, encoding="utf-8") as handle:
        raw = handle.read()
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # Split on "\n" only: str.splitlines() would also break on Unicode line
        # separators that may legally appear inside JSON strings.
        parsed = [json.loads(line) for line in raw.split("\n") if line.strip()]
    if isinstance(parsed, dict):
        # A one-line JSONL file parses as a bare object; normalise to a list.
        parsed = [parsed]
    return parsed


dataset = _load_records(DATA_DIR)

In [None]:
# Markup tags used by the prompt format; every opening tag is listed first,
# followed by the matching closing tags.
tag_names = ("query", "query_res", "think", "results")
tokens = [f"<{tag}>" for tag in tag_names] + [f"</{tag}>" for tag in tag_names]

# Register the tags with the tokenizer/model, then make sure the embedding
# matrix has one row per tokenizer entry.
add_new_tokens(model, tokenizer, new_tokens=tokens)
model.resize_token_embeddings(len(tokenizer))

# End-of-sequence marker appended to every training example so the model
# learns where a response ends.
EOS_TOKEN = tokenizer.eos_token


In [None]:
import re


def clean_text(txt):
    """Flatten a raw answer into a single line of text.

    Markdown code fences (an opening "```text" fence together with its
    newline, or a bare "```") are removed. Each run of line breaks becomes a
    single space, so words on adjacent lines are no longer glued together.
    Runs of spaces are then collapsed and surrounding whitespace is trimmed.

    Args:
        txt: Raw answer text.

    Returns:
        The cleaned, single-line string.
    """
    without_fences = re.sub(r"```text\n|```", "", txt)
    # Replace line breaks with a space instead of deleting them outright.
    single_line = re.sub(r"[\r\n]+", " ", without_fences)
    return re.sub(r" +", " ", single_line).strip()
    

In [None]:

def formatting_prompts_func(dataset):
    """Render raw question/answer records into training texts.

    For each record the answer is passed through ``clean_text``. Records whose
    cleaned answer does not contain the ``<results>`` tag exactly once are
    skipped. The remaining ones are inserted into ``train_prompt_style``,
    terminated with ``EOS_TOKEN`` and have runs of spaces collapsed.

    Args:
        dataset: Iterable of mappings with "question" and "answer" keys.

    Returns:
        A list of ``{"text": ...}`` dicts, one per kept record.
    """
    rendered = []
    for sample in dataset:
        question = sample["question"]
        answer = clean_text(sample["answer"])
        # Equivalent to "splitting on the tag yields exactly two parts".
        if answer.count("<results>") != 1:
            continue
        prompt = train_prompt_style.format(question, answer) + EOS_TOKEN
        rendered.append({"text": re.sub(r" +", " ", prompt)})
    return rendered


# Build the Hugging Face dataset consumed by the trainer.
formatted_examples = formatting_prompts_func(dataset)
dataset_finetune = Dataset.from_list(formatted_examples)


In [None]:
# Projection layers that receive LoRA adapters.
# NOTE(review): new tokens were added to the vocabulary earlier, but neither
# "embed_tokens" nor "lm_head" is targeted here -- confirm whether the new
# token embeddings are meant to stay untrained.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the base model with LoRA adapters (rank 16, alpha 16, no dropout).
model_lora = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

In [None]:
# Pick the mixed-precision mode once: BF16 when available, otherwise FP16.
use_bf16 = is_bfloat16_supported()

# Optimisation settings for the run.
training_args = TrainingArguments(
    output_dir="outputs",  # checkpoints are written here (relative to the cwd)
    seed=3407,
    # 2 examples per device x 4 accumulation steps = 8 examples per update.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Training stops after 120 optimizer steps. Per the Transformers docs a
    # positive max_steps takes precedence over num_train_epochs.
    num_train_epochs=1,
    max_steps=120,
    # Learning-rate schedule: 5 warmup steps, then the "linear" scheduler.
    learning_rate=2e-4,
    warmup_steps=5,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    weight_decay=0.01,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=10,  # log metrics every 10 steps
)

# Supervised fine-tuning on the "text" column of the prepared dataset.
trainer = SFTTrainer(
    model=model_lora,
    tokenizer=tokenizer,
    train_dataset=dataset_finetune,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=training_args,
)


## Step 4 — Model training! 



In [None]:
# Run the fine-tuning loop; the value returned by train() is kept for later inspection.
trainer_stats = trainer.train()

In [None]:
# Close the W&B run that was opened with wandb.init() earlier in the notebook.
wandb.finish()

In [None]:
# Authenticate again, then upload the LoRA adapters followed by the tokenizer
# to the same Hub repository.
login(hugging_face_token)
for publishable in (model_lora, tokenizer):
    publishable.push_to_hub(
        "Martingkc/llama_lora_adapters_v3",
        token=hugging_face_token,
    )

In [None]:

# Publish the merged 16-bit model.
model_lora.push_to_hub_merged(
    "Martingkc/llama_lora_merged_model_v3",
    tokenizer,
    save_method="merged_16bit",
    token=hugging_face_token,
)

# Publish a q4_k_m-quantized GGUF export.
model_lora.push_to_hub_gguf(
    "Martingkc/llama_lora_q4_k_m_GGUF_v3",
    tokenizer,
    quantization_method="q4_k_m",
    token=hugging_face_token,
)
