In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextStreamer, QuantoConfig
import torch
from dotenv import load_dotenv
from datasets import load_dataset

# Load only the SQuAD training split; with `split=` given, `data` is that
# single split rather than a mapping keyed by split name.
data = load_dataset("squad", split="train")
# Read environment variables from the project-level .env file.
# NOTE(review): the recorded output shows python-dotenv failing to parse
# line 2 of this file — check the .env syntax.
load_dotenv("../.env")

# Checkpoint shared by the tokenizer and the model loads below.
base_model_path = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
# int8 weight-quantization settings consumed by fetch_model.
quant_config = QuantoConfig(weights="int8")

def fetch_model(is_quant: bool=False):
    """Load the causal LM at ``base_model_path``, optionally quantized.

    Args:
        is_quant: When True, the module-level ``quant_config`` is passed as
            ``quantization_config``. When False (default) the model is loaded
            without any quantization config.

    Returns:
        The loaded model, with ``config.pad_token_id`` filled in from the EOS
        token id when the checkpoint does not define one.
    """
    load_kwargs = dict(
        pretrained_model_name_or_path=base_model_path,
        return_dict=True,
        low_cpu_mem_usage=True,
        # torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True,
    )
    # Quantization is strictly opt-in: it used to live in the base kwargs,
    # which made the "unquantized" model quantized as well.
    if is_quant:
        load_kwargs["quantization_config"] = quant_config

    model = AutoModelForCausalLM.from_pretrained(**load_kwargs)
    if model.config.pad_token_id is None:
        eos_token_id = model.config.eos_token_id
        # A config may list several EOS ids; the pad id must be a single
        # value, so take the first one.
        if isinstance(eos_token_id, (list, tuple)):
            eos_token_id = eos_token_id[0] if eos_token_id else None
        model.config.pad_token_id = eos_token_id
    return model

def get_trainable_params(model) -> None:
    """
    Print how many parameters of ``model`` currently require gradients.

    Nothing is returned; the total is only written to stdout.
    """
    trainable_total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad False) are not counted.
        if param.requires_grad:
            trainable_total += param.numel()
    print(f"Total trainable parameters: {trainable_total}")

# Request one model with quantization enabled and one without, so the two
# can be compared in the cells below.
quant_model = fetch_model(is_quant=True)
base_model = fetch_model(is_quant=False)

python-dotenv could not parse statement starting at line 2


In [None]:
# Show which checkpoint the base model was loaded from, via the public
# config attribute rather than the private `_name_or_path` field.
base_model.config.name_or_path

In [None]:
# Text-generation pipeline wrapping the already-loaded base model and the
# shared tokenizer.
pipe = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    # torch_dtype=torch.float16,
    device_map="auto",
)

In [None]:
# Print the trainable-parameter count of the base model.
get_trainable_params(base_model)

In [None]:
# Print the trainable-parameter count of the quantized model.
get_trainable_params(quant_model)

In [None]:
from peft import (
    get_peft_model,
    prepare_model_for_kbit_training,
    LoraConfig
)

# LoRA adapter settings: rank 16, alpha 16, no dropout, applied to the listed
# projection modules, with bias terms left out of training.
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=[
        "q_proj","k_proj","v_proj",
        "o_proj", "gate_proj", "up_proj",
        "down_proj"
    ],
    bias="none",
    task_type="CAUSAL_LM",
)

# Prepare the model for adapter training, then wrap it with the LoRA config.
# NOTE(review): get_peft_model is documented to modify the wrapped model in
# place, so `base_model` (and `pipe`, which was built on it) presumably now
# contain the injected adapter layers — confirm before using them as a baseline.
lora_base_model = prepare_model_for_kbit_training(base_model)
lora_base_model = get_peft_model(lora_base_model, lora_config)

In [None]:
# Print the trainable-parameter count after wrapping with LoRA.
get_trainable_params(lora_base_model)

In [None]:
# Print the quantized model's trainable-parameter count again for comparison
# with the LoRA figure above.
get_trainable_params(quant_model)

In [None]:
# NOTE(review): hard-coded ratio; presumably the LoRA trainable-parameter
# count divided by the quantized model's count printed by the cells above —
# confirm, and consider computing it from the models instead.
11272192/262735872

In [None]:
# Single-turn conversation used as a smoke test for generation.
messages = [{"role": "user", "content": "Who is Vincent van Gogh?"}]

# Render the conversation to a prompt string that ends with the assistant
# header, so the model continues as the assistant.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Sampling is enabled and no seed is set in this cell, so the text can differ
# between runs.
outputs = pipe(prompt, max_new_tokens=120, do_sample=True)

print(outputs[0]["generated_text"])

In [None]:
from IPython.display import Markdown, display

# Two-message conversation: a system persona followed by the user request.
messages = [
    {
        "role": "system",
        "content": "You are a skilled Python developer specializing in database management and optimization.",
    },
    {
        "role": "user",
        "content": "I'm experiencing a sorting issue in my database. Could you please provide Python code to help resolve this problem?",
    },
]

# Render to a prompt string ending with the assistant header.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

outputs = pipe(prompt, max_new_tokens=512, do_sample=True)

# The generated text includes the prompt; keep only what follows the
# assistant header and render it as Markdown.
display(
    Markdown(
        outputs[0]["generated_text"].split(
            "<|start_header_id|>assistant<|end_header_id|>"
        )[1]
    )
)

In [None]:
# `data` was loaded with split="train", so it is indexed by row directly;
# there is no "train" key to go through.
data[0]

In [None]:
def apply_chat_template(example):
    """Render one question/answer row as a chat-formatted training string.

    Args:
        example: A row with a ``question`` string and an ``answers`` dict
            whose ``text`` entry is either a string or a list of strings.

    Returns:
        ``{"prompt": <rendered conversation>}`` built with the module-level
        ``tokenizer``'s chat template.
    """
    answer = example['answers']['text']
    # Message content must be a string; when several reference answers are
    # given, use the first one (empty string if the list is empty).
    if isinstance(answer, list):
        answer = answer[0] if answer else ""
    messages = [
        {"role": "user", "content": example['question']},
        {"role": "assistant", "content": answer}
    ]
    # The conversation already ends with the assistant's answer, so no
    # generation prompt is appended; doing so would leave an empty assistant
    # header at the end of every training example.
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    return {"prompt": prompt}

# Apply apply_chat_template to every row; each row gains the "prompt" field
# the function returns.
templated_dataset = data.map(apply_chat_template)

In [None]:
from transformers import PreTrainedTokenizerFast


def apply_chat_template(
        input_dataset: dict, tokeniser: "PreTrainedTokenizerFast"
) -> dict:
    """Preprocess one dataset row into the conversational ``messages`` format.

    See: https://huggingface.co/docs/trl/en/sft_trainer for details.

    Args:
        input_dataset: A row with ``context`` and ``question`` strings and an
            ``answers`` dict whose ``text`` entry is a string or list of strings.
        tokeniser: Kept for signature compatibility; currently unused because
            the messages are returned as role/content dicts rather than being
            rendered to a string.

    Returns:
        ``{"messages": [system, user, assistant]}`` where the system message
        holds the context, the user message the full question, and the
        assistant message the first reference answer.
    """
    answer = input_dataset['answers']['text']
    # Use a single string answer; the first one when a list is given.
    if isinstance(answer, list):
        answer = answer[0] if answer else ""
    message_template = [
        {"role": "system", "content": input_dataset['context']},
        # Use the whole question, not just its first character.
        {"role": "user", "content": input_dataset['question']},
        {"role": "assistant", "content": answer}
    ]
    # Return the list built above. The previous `messages` name was never
    # assigned here, so it either raised NameError or silently picked up an
    # unrelated notebook-global conversation.
    return {"messages": message_template}

def apply_message_template(
    input_dataset: dict
) -> dict:
    """Turn one dataset row into a three-message conversation.

    See: https://huggingface.co/docs/trl/en/sft_trainer for details.

    The row's context becomes the system message, its question the user
    message, and its answer the assistant message. When the answer text is a
    list, only its first element is used.
    """
    answer_text = input_dataset['answers']['text']
    assistant_reply = answer_text[0] if isinstance(answer_text, list) else answer_text
    role_content_pairs = (
        ("system", input_dataset['context']),
        ("user", input_dataset['question']),
        ("assistant", assistant_reply),
    )
    conversation = [
        {"role": role, "content": content} for role, content in role_content_pairs
    ]
    return {"messages": conversation}

# Apply apply_message_template to every row; each row gains a "messages"
# field. This rebinds `templated_dataset`.
templated_dataset = data.map(apply_message_template)

In [None]:
# Inspect the first templated row.
templated_dataset[0]

In [None]:
# tokenise the prompts
def tokenize_function(example):
    """Tokenise one row's ``prompt`` and build loss labels for it.

    Args:
        example: A row with a ``prompt`` string that has already been
            rendered through the chat template.

    Returns:
        The tokenizer output (padded/truncated to 128 tokens) with an added
        ``labels`` list: a copy of ``input_ids`` where padding positions are
        replaced by -100 so they are ignored in the loss.
    """
    tokens = tokenizer(
        example['prompt'],
        padding="max_length",
        truncation=True,
        max_length=128,
        # The chat template already emitted the special tokens; adding them
        # again here would duplicate the beginning-of-text token.
        add_special_tokens=False,
    )
    # Mask by attention_mask rather than by comparing against pad_token_id:
    # the pad token may be aliased to EOS, and matching on the id would also
    # hide genuine end-of-sequence tokens from the loss.
    tokens['labels'] = [
        token if mask == 1 else -100
        for token, mask in zip(tokens['input_ids'], tokens['attention_mask'])
    ]
    return tokens

# Tokenise every row, then drop the raw text columns.
# NOTE(review): tokenize_function reads a "prompt" column, but the cell that
# last assigns `templated_dataset` (via apply_message_template) produces
# "messages" instead — confirm which templating cell should feed this step.
tokenised_dataset = templated_dataset.map(tokenize_function)
tokenised_dataset = tokenised_dataset.remove_columns(["title", "context", "question", "answers", "prompt"])

In [None]:
# # https://www.mercity.ai/blog-post/guide-to-fine-tuning-llms-with-lora-and-qlora
# # https://huggingface.co/docs/diffusers/en/quantization/quanto
# for name, module in model.named_modules(): # https://www.datacamp.com/tutorial/fine-tuning-llama-3-2
#     print(f"name: {name}")
#     print(f"module: {module}")


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer
# Put the LoRA-wrapped model into training mode before building the trainer.
lora_base_model.train()
training_args = SFTConfig(
    output_dir = "cp",
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 8,
    per_device_eval_batch_size = 8,
    eval_accumulation_steps = 8,
    # optim = "paged_adamw_32bit",
    save_steps = 10,
    logging_steps = 10,
    learning_rate = 5e-5,
    max_grad_norm = 0.3,
    max_steps = 50,
    warmup_ratio = 0.03,
    # No eval_dataset is passed to the trainer below, so periodic evaluation
    # is switched off; requesting it without an eval set is an error.
    eval_strategy="no",
    lr_scheduler_type = "linear",
)
# The tokenised data derives from a single split (split="train"), so there is
# no "train" key to index; a split mapping is still accepted for robustness.
train_split = (
    tokenised_dataset["train"]
    if isinstance(tokenised_dataset, dict)
    else tokenised_dataset
)
trainer = SFTTrainer(
    model=lora_base_model,
    args=training_args,
    train_dataset=train_split,
)

# Initialize Trainer
# trainer = Trainer(
#     model=lora_base_model,
#     args=training_args,
#     train_dataset=tokenised_dataset["train"],
#     eval_dataset=tokenised_dataset["validation"],
#     tokenizer=tokenizer)
#
# # Train the model
# trainer.train()
#
# # Save the model and tokenizer
# trainer.save_model("./fine-tuned-model")
# tokenizer.save_pretrained("./fine-tuned-model")

In [None]:
import pandas as pd
import json

# Load the ConvFinQA training file from a path relative to the notebook.
# NOTE(review): this rebinds `data`, which earlier held the SQuAD split.
path = "raw_data/ConvFinQA/train.json"
with open(path, "r") as file:
    data = json.load(file)

In [None]:
# Inspect the first record of the loaded JSON.
data[0]

In [None]:
from peft import PeftModel
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
def load_lora_model(base_model_path, adapter_path):
    """Load a base model, a LoRA-adapted copy of it, and the tokenizer.

    Args:
        base_model_path: Hub id or local path of the base checkpoint.
        adapter_path: Path to the saved LoRA adapter weights.

    Returns:
        A ``(fine_tuned_model, base_model, tokenizer)`` tuple. The two models
        are separate instances, so ``base_model`` is an unmodified baseline.
    """
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)

    def _load_base():
        # Fresh instance of the base checkpoint.
        return AutoModelForCausalLM.from_pretrained(
            base_model_path,
            device_map="auto",
            trust_remote_code=True
        )

    # Untouched copy returned as the comparison baseline.
    base_model = _load_base()

    # Attaching the adapter injects LoRA layers into the model it wraps, so
    # it gets its own copy; reusing `base_model` here would make the baseline
    # carry the adapter too. This costs a second copy of the weights in memory.
    fine_tuned_model = PeftModel.from_pretrained(_load_base(), adapter_path)
    return fine_tuned_model, base_model, tokenizer
# NOTE(review): the SFTConfig earlier in this notebook uses output_dir "cp"
# with max_steps=50, which would not yield a "checkpoint-60" — presumably this
# adapter comes from a different run; confirm.
model_path = "../cp/checkpoint-60"  # Your output directory from training
# NOTE(review): this is the 3B checkpoint, whereas earlier cells use the 1B
# one; the adapter must match the base it was trained on — confirm.
base_model_path = "meta-llama/Llama-3.2-3B-Instruct"
# Rebinds `base_model_path` and `base_model` defined earlier in the notebook.
ft_model, base_model, tokeniser = load_lora_model(base_model_path, model_path)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset
# Load SQuAD with no split argument, so `val_data` is keyed by split name.
val_data = load_dataset("squad")

# Pipeline around the LoRA-adapted model.
ft_pipe = pipeline(
    "text-generation",
    model=ft_model,
    tokenizer=tokeniser,
    # torch_dtype=torch.float16,
    device_map="auto",
)

# Pipeline around the base model, for side-by-side comparison.
base_pipe = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokeniser,
    # torch_dtype=torch.float16,
    device_map="auto",
)

In [None]:
# Inspect the second row of the train split.
val_data["train"][1]

In [55]:

# Single-turn question used to compare the base and fine-tuned models.
messages = [{"role": "user", "content": "What is in front of the Notre Dame Main Building?"}]

# NOTE(review): the recorded output is a NameError for `tokeniser`, i.e. this
# cell was run in a kernel where the cell that defines it had not been run.
prompt = tokeniser.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
# Generate with the base pipeline; gradients are disabled for inference.
with torch.no_grad():
    outputs = base_pipe(prompt, max_new_tokens=120, do_sample=True)

print(outputs[0]["generated_text"])

NameError: name 'tokeniser' is not defined

In [None]:
# Same prompt as the previous cell, generated with the fine-tuned pipeline.
with torch.no_grad():
    outputs = ft_pipe(prompt, max_new_tokens=120, do_sample=True)

print(outputs[0]["generated_text"])

In [57]:
from train.fine_tune import CustomFineTuner
from transformers import QuantoConfig
from peft import LoraConfig
# Checkpoint handed to the project's CustomFineTuner wrapper.
HUGGING_FACE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
# Same LoRA settings as the `lora_config` defined earlier in the notebook.
lora_cfg = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    bias="none",
    task_type="CAUSAL_LM",
)

# NOTE(review): `q_cfg` is created but not passed to CustomFineTuner below —
# confirm whether quantization was meant to be applied here.
q_cfg = QuantoConfig(weights="int8")
fine_tuner = CustomFineTuner(
    huggingface_model=HUGGING_FACE_MODEL,
    lora_config=lora_cfg,
)

# The recorded log line reports the trainable-parameter total for this model.
model = fine_tuner.fetch_model()

[32m2025-05-08 22:08:34.026[0m | [1mINFO    [0m | [36mtrain.fine_tune[0m:[36mlog_trainable_params[0m:[36m98[0m - [1mTotal trainable parameters for meta-llama/Llama-3.2-1B-Instruct: 1235814400[0m


In [58]:
# Quick smoke test of the wrapper's generation helper.
fine_tuner.generate_predictions(model, input="hello")

Device set to use mps


'Hello! How can I assist you today?'

In [62]:
from transformers import pipeline
# Fresh tokenizer for the same checkpoint, with no pad token assigned here.
tokenizer_no_pad = AutoTokenizer.from_pretrained(
    HUGGING_FACE_MODEL,
    add_bos_token=True
)

# Rebinds `base_pipe` to a pipeline around the CustomFineTuner model.
base_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer_no_pad,
    # torch_dtype=torch.float16,
    device_map="auto",
)

messages = [
    {"role": "user", "content": "Hello"}]

# Render to a prompt string ending with the assistant header.
prompt = tokenizer_no_pad.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
with torch.no_grad():
    outputs = base_pipe(prompt, max_new_tokens=120, do_sample=True)

# The printed text includes the rendered prompt followed by the reply.
print(outputs[0]["generated_text"])

Device set to use mps
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 08 May 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

Hello<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Hello. Is there something I can help you with or would you like to chat?


In [60]:
from mlflow.models import infer_signature
import mlflow
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
from transformers import pipeline
# Set before the tokenizer is used below.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
mlflow.set_experiment("Llama-3.2-1B-Instruct")
# Everything inside this block is recorded under a single MLflow run.
with mlflow.start_run():
    tokenizer_no_pad = AutoTokenizer.from_pretrained(
        HUGGING_FACE_MODEL,
        add_bos_token=True
    )
    # Reload the checkpoint directly; this rebinds `model`.
    model = AutoModelForCausalLM.from_pretrained(
        HUGGING_FACE_MODEL,
        device_map="auto",

    )
    base_pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer_no_pad,
        # torch_dtype=torch.float16,
        device_map="auto",
    )
    # Ensure pad_token_id is an integer before logging
    # if isinstance(model.generation_config.pad_token_id, list):
    #     model.generation_config.pad_token_id = model.generation_config.pad_token_id[0]
    # Log the whole pipeline as the run's "model" artifact, with an input
    # signature inferred from one example payload.
    logged_model = mlflow.transformers.log_model(
        transformers_model=base_pipe,
        task="text-generation",
        artifact_path="model",
        signature=infer_signature(
            model_input={
                "content": "What is in front of the Notre Dame Main Building?",
            },
        ),
    )

Device set to use mps


🏃 View run rare-grouse-884 at: http://127.0.0.1:8080/#/experiments/821275711990790082/runs/4f907bbfd18b4475ab15b8a37870b1d6
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/821275711990790082


In [63]:
# Load the pipeline back from the URI of the model logged above.
loaded_model = mlflow.transformers.load_model(
    logged_model.model_uri,
)

Downloading artifacts: 100%|██████████| 22/22 [00:18<00:00,  1.20it/s]
2025/05/08 22:13:41 INFO mlflow.transformers: 'runs:/4f907bbfd18b4475ab15b8a37870b1d6/model' resolved as 'mlflow-artifacts:/821275711990790082/4f907bbfd18b4475ab15b8a37870b1d6/artifacts/model'
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 173.22it/s]
Loading checkpoint shards: 100%|██████████| 9/9 [00:00<00:00, 68.23it/s]
Device set to use mps:0


In [65]:
messages = [
    {"role": "user", "content": "Hello"}]

# The prompt is rendered with the notebook's `tokenizer_no_pad`, not the
# tokenizer bundled with the loaded pipeline.
prompt = tokenizer_no_pad.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
with torch.no_grad():
    outputs = loaded_model(prompt, max_new_tokens=120, do_sample=True)
print(outputs[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 08 May 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

Hello<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Hello! How can I assist you today?


In [66]:
# Raw string input without the chat template; the recorded output shows the
# model continuing the text rather than replying as an assistant.
loaded_model("hello", do_sample=False)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': 'hello, how can i get a free gift card to a popular restaurant chain?\nHere are some ways to'}]

In [68]:
messages = [
    {"role": "user", "content": "Hello"}]

# Same as the earlier loaded-model cell, but the prompt is rendered with the
# tokenizer carried by the loaded pipeline itself.
prompt = loaded_model.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
with torch.no_grad():
    outputs = loaded_model(prompt, max_new_tokens=120, do_sample=True)
print(outputs[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 08 May 2025

<|eot_id|><|start_header_id|>user<|end_header_id|>

Hello<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Hello! How can I assist you today?
