In [1]:
!pip uninstall transformers bitsandbytes
!pip install --quiet -U transformers accelerate peft bitsandbytes trl

Found existing installation: transformers 4.48.3
Uninstalling transformers-4.48.3:
  Would remove:
    /usr/local/bin/transformers-cli
    /usr/local/lib/python3.11/dist-packages/transformers-4.48.3.dist-info/*
    /usr/local/lib/python3.11/dist-packages/transformers/*
Proceed (Y/n)? Y
  Successfully uninstalled transformers-4.48.3
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m95.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m342.1/342.1 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.9/318.9 kB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.

In [2]:
!pip install -U bitsandbytes



In [3]:
import bitsandbytes
import torch

In [4]:
import os
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments, pipeline, logging

In [5]:
from huggingface_hub import notebook_login
from IPython import get_ipython
from IPython.display import display

In [6]:
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [27]:
##Parameters
# The model that you want to train from the Hugging Face hub
model_name = "meta-llama/Llama-3.2-1B-Instruct"

# The instruction dataset to use
dataset_name = "Josephgflowers/Finance-Instruct-500k"

# Fine-tuned model name
new_model = "Llama-3.2-1B-Instruct-Finance"


# https://huggingface.co/docs/autotrain/en/llm_finetuning_params
# https://rentry.org/llm-training#low-rank-adaptation-lora_1

################################################################################
# QLoRA parameters https://lightning.ai/pages/community/tutorial/lora-llm/
# https://lightning.ai/pages/community/lora-insights/#toc5
################################################################################

# LoRA attention dimension. Set the 'r' parameter for Low-Rank Adaptation (LoRA). Default is 16.
lora_r = 8

# Alpha parameter for LoRA scaling. Specify the 'alpha' parameter for LoRA. Default is 32.
# As a thumb rule, choose Alpha that is twice as large as rank
lora_alpha = 16

# Dropout probability for LoRA layers. Set the dropout rate within the LoRA layers to help prevent overfitting during adaptation. Default is 0.05
lora_dropout = 0.05

################################################################################
# bitsandbytes parameters # used when loading a base model in 4-bit precision
# https://huggingface.co/blog/4bit-transformers-bitsandbytes
# https://lightning.ai/pages/community/article/what-is-quantization/
# https://generativeai.pub/practical-guide-of-llm-quantization-gptq-awq-bitsandbytes-and-unsloth-bdeaa2c0bbf6#255c
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16" # or use "bfloat16" for faster training

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 1

# Batch size per GPU for evaluation
per_device_eval_batch_size = 1

# number of steps before performing a backward/update pass
gradient_accumulation_steps = 1

# Enable gradient checkpointing. use gradient checkpointing to save memory
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
# Set the maximum norm for gradient clipping, which is critical for preventing gradients from exploding during backpropagation. Default is 1.0.
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
# Define the weight decay rate for regularization, which helps prevent overfitting by penalizing larger weights. Default is 0.0
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 25

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# max sequence length for model and packing of the dataset
max_seq_length = 2048

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}  # 0 or 'auto'


In [12]:
# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train[0:10000]")

In [13]:
print(dataset)

Dataset({
    features: ['system', 'user', 'assistant'],
    num_rows: 10000
})


In [14]:
print(dataset[10])

{'system': '\n', 'user': 'Explain the factors that influence interest rates and how central banks use monetary policy to affect interest rates.', 'assistant': '• Supply and demand for loans: When demand for loans is high, interest rates tend to rise as lenders seek to balance supply and demand. When demand is low, interest rates tend to fall.\n\n• Expected inflation: Higher expected inflation tends to push up interest rates, as lenders seek to offset the erosion of the purchasing power of their principal.\n\n• Economic growth: Strong economic growth tends to increase demand for loans and thus push up interest rates. Weak growth has the opposite effect.\n\n• Central bank policies: Central banks, like the Federal Reserve in the U.S., use monetary policy tools to affect interest rates. Their goals are typically to promote economic growth, full employment, and stable prices.\n\nCentral banks influence interest rates mainly through two types of monetary policy actions:\n\n1) Changing the di

In [15]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `kali.mane26` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `kali.m

# Inferencing with a Llama model

In [16]:
## Inferencing with a llama model
import torch
from transformers import pipeline

model_id = "meta-llama/Llama-3.2-1B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
messages = [
    {"role": "system", "content": "You are a finance chatbot who always responds in finance language with simple terms!"},
    {"role": "user", "content": "Explain the meaning and significance of money supply for an economy?"},
]
outputs = pipe(
    messages,
    max_new_tokens=256,
)
print(outputs[0]["generated_text"][-1])


config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'role': 'assistant', 'content': "**Money Supply: The Lifeblood of the Economy**\n\nIn simple terms, the money supply refers to the total amount of money circulating in an economy. It's like the water supply for a city - just as too little water can cause a drought, too much water can cause flooding.\n\n**Why is the Money Supply Important?**\n\nThe money supply plays a crucial role in an economy's performance. Here are some key reasons:\n\n1. **Economic Growth**: An increase in the money supply can lead to economic growth, as more money chases a constant quantity of goods and services. This can stimulate investment, consumption, and job creation.\n2. **Inflation**: A high money supply can cause inflation, which is when prices rise due to increased demand for goods and services. This can erode the purchasing power of consumers and reduce the value of savings.\n3. **Interest Rates**: The money supply affects interest rates, which are the interest rates charged on loans and savings accoun

In [17]:
print(dataset[20])

{'system': '\n', 'user': 'Explain the meaning and significance of money supply for an economy.', 'assistant': "Money supply refers to the total amount of money available in an economy. It includes things like cash in circulation and funds held in checking and savings accounts. The money supply has important implications for an economy:\n\n1. It affects inflation. If the money supply grows too quickly relative to the production of goods and services, it can lead to too much money chasing too few goods, putting upward pressure on prices and causing inflation. Central banks try to manage money supply growth to keep inflation in check.\n\n2. It influences interest rates. A high money supply relative to the demand for loans tends to put downward pressure on interest rates, and vice versa. This is because a large supply of available funds means banks can lend at lower rates.\n\n3. It impacts economic activity and growth. Easy access to money through a large money supply can help fuel economi


* In the above inference, we can clearly see that Llama-3.2-1B-Instruct has a different language of the answer than the one mentioned in our finance dataset.

# Fine-tuning the Llama model using the Finance dataset

In [18]:
# Combine user and assistant into a new column named 'text'
def combine_prompt_and_completion(example):
    example['text'] = example['user'] + example['assistant']
    return example

dataset = dataset.map(combine_prompt_and_completion)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [19]:
print(dataset)

Dataset({
    features: ['system', 'user', 'assistant', 'text'],
    num_rows: 10000
})


In [20]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [21]:
# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

In [28]:
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

In [29]:
# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

In [30]:
# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

In [31]:
# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

In [32]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    #dataset_text_field="text",
    #max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    #packing=packing,
)

  trainer = SFTTrainer(


Converting train dataset to ChatML:   0%|          | 0/10000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [33]:
# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)

Step,Training Loss
25,1.7303
50,1.6798
75,1.625
100,1.8955
125,1.7448
150,1.7154
175,1.6972
200,1.6729
225,1.5006
250,1.768


# Load and inference using the Fine-tuned LLM Model

In [2]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Replace with the directory where you saved the model
model_dir = "Llama-3.2-1B-Instruct-Finance"
base_model_id = "meta-llama/Llama-3.2-1B-Instruct"

#Load base model and tokenizer
model = AutoModelForCausalLM.from_pretrained(base_model_id)
tokenizer = AutoTokenizer.from_pretrained(base_model_id)

#Load finetuned model
peft_config = PeftConfig.from_pretrained(model_dir)
model = PeftModel.from_pretrained(model, model_dir)

In [8]:
finetuned_pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device_map="auto",
        )

messages = [
    {"role": "system", "content": "You are a finance chatbot who always responds in finance language with simple terms!"},
    {"role": "user", "content": "Explain the meaning and significance of money supply for an economy?"},
]
outputs = finetuned_pipe(
    messages,
    max_new_tokens=2048,
)
print(outputs[0]["generated_text"][-1])

Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausa

{'role': 'assistant', 'content': "Money supply is the total amount of money in existence in an economy. It's a measure of how much money is circulating around the economy. The money supply can impact economic growth, inflation, and employment.\n\nThe money supply is typically controlled by the central bank, which is like the bank of the government. The central bank can increase or decrease the money supply through various means like:\n\n1. Printing money: The central bank can print more money by issuing new banknotes or coins. This increases the money supply and causes inflation. However, it also puts more money in circulation and can stimulate economic activity.\n\n2. Buying and selling government bonds: The central bank can buy and sell government bonds to influence the money supply. When the central bank buys bonds, it increases the money supply and stimulates economic activity. When it sells bonds, it decreases the money supply and can reduce inflation.\n\n3. Reserve requirements: 

In [9]:
messages = [
    {"role": "system", "content": "You are a finance chatbot who always responds in finance language!"},
    {"role": "user", "content": "Explain the meaning and significance of money supply for an economy?"},
]
outputs = finetuned_pipe(
    messages,
    max_new_tokens=2048,
)
print(outputs[0]["generated_text"][-1])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'role': 'assistant', 'content': "The money supply is a critical component of an economy's monetary system, and its significance can be understood through the lens of supply and demand. Here's a breakdown:\n\n**The Money Supply:**\nThe money supply refers to the total amount of money circulating in an economy. It's essentially the total amount of money in existence, including physical currency, digital money, and deposits in bank accounts.\n\n**The Role of Money Supply:**\nThe money supply affects an economy's growth and stability in several ways:\n\n1. **Inflation:** An increase in the money supply can lead to inflation, as more money chases a constant amount of goods and services. This causes prices to rise. However, if the money supply grows too quickly, inflation can become unsustainable. Central banks aim to maintain a moderate money supply growth rate to control inflation.\n\n2. **Economic Growth:** An increase in the money supply can stimulate economic activity by making it easi