In [1]:
!pip install bitsandbytes accelerate peft trl cut_cross_entropy ipywidgets sentencepiece protobuf huggingface_hub hf_transfer transformers packaging ninja matplotlib torch datasets pandas numpy

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [2]:
import json

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer
from peft import LoraConfig

In [3]:
HF_TOKEN = "hf_"
local_base_meta_merged_lora = "models/Llama-3.2-3B-lora-pubmed-qa"

# Llama-3.2-3B-Instruct
llama_base_model = "meta-llama/Llama-3.2-3B-Instruct"

device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda:0


In [4]:
base_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=llama_base_model,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True,
    # quantization_config=quantization_config,
    token=HF_TOKEN,
).to(device)

base_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=llama_base_model,
    token=HF_TOKEN,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### The template for llama3 models

"""
We now use the Llama-3.1 format for conversation style fine tuning. 
But we convert it to HuggingFace's normal multiturn format ("role", "content"). 
Llama-3 renders multi turn conversations like below:

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Hello!<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Hey there! How Can i assist you?<|eot_id|><|start_header_id|>user<|end_header_id|>

I'm great thanks!<|eot_id|>
"""

In [5]:
def formatting_prompts_func(examples):
    """
    Formats each chat conversation in examples["messages"] into a prompt string
    using the tokenizer's chat template. Returns a dict with the formatted texts.
    """
    messages = examples["messages"]
    texts = [
        base_tokenizer.apply_chat_template(
            message, tokenize=False, add_generation_prompt=False
        )
        for message in messages
    ]
    return {
        "text": texts,
    }

In [6]:
dataset_raw = load_dataset(
    "json", data_files="ft_pubmedqa.jsonl", split="train"
)

dataset_llama_format = dataset_raw.map(
    formatting_prompts_func,
    batched=True,
)


In [7]:
messages = dataset_llama_format[49]["messages"]
print(json.dumps(messages, indent=2))

[
  {
    "role": "user",
    "content": "Does circulating atrial natriuretic peptide genetic association study identify a novel gene cluster associated with stroke in whites?"
  },
  {
    "role": "assistant",
    "content": "This is the first large-scale genetic association study of circulating NT-proANP levels performed with replication and functional assessment that identified genetic variants in the MTHFR-CLCN6-NPPA-NPPB cluster to be significantly associated with NT-proANP levels. The clinical significance of this variation is related to lower NT-proANP levels, higher blood pressures, and an increased risk of stroke in the general community."
  }
]


In [26]:
# training args for model fine tuning
training_args = SFTConfig(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=5,
    num_train_epochs=1,
    output_dir="./output",
    save_strategy="epoch",
    learning_rate=2e-5,
    bf16=True, # according to your GPU config
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    hub_model_id="pavanmantha/Llama-3.2-3B-pubmed-LoRA",
)


In [27]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules="all-linear",
    bias="none",
    use_rslora=True,
    task_type="CAUSAL_LM",
)

In [28]:

trainer = SFTTrainer(
    model=base_model,
    train_dataset=dataset_llama_format.select(range(15000)),
    args=training_args,
    peft_config=peft_config,
)



Tokenizing train dataset:   0%|          | 0/15000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [29]:
# show current GPU stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA H100 80GB HBM3. Max memory = 79.209 GB.
74.771 GB of memory reserved.


In [30]:
# trigger finetuning
trainer_stats = trainer.train()

Step,Training Loss
1,3.9254
2,3.664
3,4.0546
4,3.7501
5,3.7069
6,3.361
7,3.4085
8,2.9707
9,2.8425
10,2.7193



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-3B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-3B-Instruct.


In [31]:
# push lora adapter to huggingface
# trainer.push_to_hub(
#     'pavanmantha/Llama-3.2-3B-LoRA-adapter',
#     token=HF_TOKEN,
#     blocking=True,
# )


Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.2-3B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.2-3B-Instruct.


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /workspace/output/tokenizer.json      : 100%|##########| 17.2MB / 17.2MB            

  ...ce/output/adapter_model.safetensors:   0%|          | 45.8kB / 97.3MB            

  /workspace/output/training_args.bin   :   2%|1         |   113B / 6.16kB            

CommitInfo(commit_url='https://huggingface.co/pavanmantha/Llama-3.2-3B-pubmed-LoRA/commit/4c66b7c693a376aa042ed576a0e08955249f50b9', commit_message='pavanmantha/Llama-3.2-3B-LoRA-adapter', commit_description='', oid='4c66b7c693a376aa042ed576a0e08955249f50b9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/pavanmantha/Llama-3.2-3B-pubmed-LoRA', endpoint='https://huggingface.co', repo_type='model', repo_id='pavanmantha/Llama-3.2-3B-pubmed-LoRA'), pr_revision=None, pr_num=None)

# Merge LoRA with base model

In [34]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

hf_base_meta_merged_lora = 'pavanmantha/Llama-3.2-3B-pubmed-qa-LoRA'

model = AutoModelForCausalLM.from_pretrained(llama_base_model)
model = PeftModel.from_pretrained(model, 'pavanmantha/Llama-3.2-3B-pubmed-LoRA')
merged_model = model.merge_and_unload()
merged_model.save_pretrained(local_base_meta_merged_lora)

merged_model.push_to_hub(
    hf_base_meta_merged_lora,
    token=HF_TOKEN,
    commit_message="Pushing merged model to Hugging Face Hub",
    blocking=True,
)

tokenizer = AutoTokenizer.from_pretrained(llama_base_model)

tokenizer.push_to_hub(
    hf_base_meta_merged_lora,
    token=HF_TOKEN,
    commit_message="Pushing tokenizer to Hugging Face Hub",
    blocking=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/943 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/97.3M [00:00<?, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...6a/model-00003-of-00003.safetensors:   0%|          |  555kB / 2.92GB            

  ...6a/model-00002-of-00003.safetensors:   0%|          |  555kB / 4.93GB            

  ...6a/model-00001-of-00003.safetensors:   2%|2         |  117MB / 5.00GB            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmpq30q5sn2/tokenizer.json       : 100%|##########| 17.2MB / 17.2MB            

CommitInfo(commit_url='https://huggingface.co/pavanmantha/Llama-3.2-3B-pubmed-qa-LoRA/commit/d88faa3f15cc66a5f5c081abf7f3ba7a62a34c74', commit_message='Pushing tokenizer to Hugging Face Hub', commit_description='', oid='d88faa3f15cc66a5f5c081abf7f3ba7a62a34c74', pr_url=None, repo_url=RepoUrl('https://huggingface.co/pavanmantha/Llama-3.2-3B-pubmed-qa-LoRA', endpoint='https://huggingface.co', repo_type='model', repo_id='pavanmantha/Llama-3.2-3B-pubmed-qa-LoRA'), pr_revision=None, pr_num=None)