## 0.Install required dependencies

In [31]:
# Core training libraries
!pip install -q \
    transformers==4.44.2 \
    datasets==2.20.0 \
    tokenizers==0.19.1 \
    accelerate==1.0.0 \
    peft==0.18.1 \
    trl==0.9.6 \
    bitsandbytes==0.49.1 \
    evaluate==0.4.2

# Utilities
!pip install -q \
    numpy==2.0.0 \
    pandas==3.0.0 \
    scikit-learn \
#     rich \
#     pyyaml \
#     python-dotenv \
#     tqdm

# Evaluation (requires pydantic v2)
# !pip install -q --upgrade pydantic
!pip install -q google-genai rouge-score

print("Installation complete!")
print("All dependencies compatible (pydantic v2 + google-genai)")

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 3.0.0 which is incompatible.
jax 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
bqplot 0.12.45 requires pandas<3.0.0,>=1.0.0, but you have pandas 3.0.0 which is incompatible.
opencv-python-headless 4.13.0.90 requires numpy>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
shap 0.50.0 requires numpy>=2, but you have numpy 1.26.4 which is incompatible.
cudf-cu12 25.10.0 requires pandas<2.4.0dev0,>=2.0, but you have pandas 3.0.0 which is incompatible.
opencv-python 4.13.0.90 requires numpy>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
db-dtypes 1.5.0 requires pandas<3.0.0,>=1.5.3, but you have pandas 3.0.0 which is incompatible.
gradio 5.50.0 requires pandas<3.0,

## 01.Setup environmental variables


In [32]:
from google.colab import userdata
key = userdata.get('GEMINI_API_KEY')[:10]
print(f"{key}*****")

AIzaSyDSmx*****


In [33]:
import os
import sys
import torch

print('='*60)
print(f'Environment check')
print('='*60)
print(f'Python Version:{sys.version}')
print(f'Torch Version:{torch.__version__}')
print(f'Cuda availability:{torch.cuda.is_available()}')

if torch.cuda.is_available():
  print(f'Cuda version: {torch.version.cuda}')
  print(f'Device name: {torch.cuda.get_device_name(0)}')
  print(f'Device properties: {torch.cuda.get_device_properties(0)}')
  print(f"Total VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
else:
    print("WARNING: CUDA not available. Training will be VERY slow on CPU.")

print("="*60)

Environment check
Python Version:3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
Torch Version:2.9.0+cu126
Cuda availability:True
Cuda version: 12.6
Device name: Tesla T4
Device properties: _CudaDeviceProperties(name='Tesla T4', major=7, minor=5, total_memory=15095MB, multi_processor_count=40, uuid=6e052747-78c6-7d26-dd45-a1e78074a483, pci_bus_id=0, pci_device_id=4, pci_domain_id=0, L2_cache_size=4MB)
Total VRAM: 14.74 GB


## 2.Setting the reproducability

In [34]:
import os
import random
import torch
import numpy as np

SEED = 42

os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

if torch.cuda.is_available():
  torch.cuda.manual_seed(SEED)
  torch.cuda.manual_seed_all(SEED)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

print(f"Seeds set to {SEED} for reproducibility")
print("Note: Full determinism on GPU is not guaranteed due to non-deterministic operations")

Seeds set to 42 for reproducibility
Note: Full determinism on GPU is not guaranteed due to non-deterministic operations


In [35]:
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")
!hf auth login --token $HF_TOKEN

print("Hugging Face login skipped. Uncomment login() to push models to Hub.")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `zuucrew-ai-eng` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
Hugging Face login skipped. Uncomment login() to push models to Hub.


### Configuration

In [36]:
from dataclasses import dataclass,field
from typing import List

@dataclass
class Config:
  # Model
  base_model : str = "Qwen/Qwen2.5-1.5B-Instruct"

  # Dataset
  dataset_name : str = "virattt/financial-qa-10K"
  dataset_split : str = "train"
  dataset_subsample : int = 300
  train_val_split : float = 0.9

  # Tokenization
  max_length: int = 512

  # Training
  num_train_epochs : int = 1
  max_steps :int = 250
  per_device_train_batch_size: int = 1
  gradient_accumulation_steps: int = 64
  learning_rate : float = 2e-5
  warmup_ratio : float = 0.03
  logging_steps : int = 10
  save_steps : int = 200
  eval_steps :int = 100
  save_total_limit: int = 2

  # LoRA
  lora_r : int =  16
  lora_alpha :int = 32
  lora_dropout : float = 0.05
  lora_target_modules: List[str]=field(
        default_factory=lambda: [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj"
        ]
    )

  # Quantization
  load_in_4bit : bool = True
  bnb_4bit_quant_type : str = "nf4"
  bnb_4bit_use_double_quant: bool = True
  bnb_4bit_compute_dtype: torch.dtype = torch.float16

  # Output
  output_dir: str = "outputs/adapter"
  push_to_hub :bool = False

  # Generation
  max_new_tokens :int = 128
  temperature : float = 0.0  # Deterministic
  do_sample : bool = True

  # HF credentials
  hf_username : str = 'madhurabe'
  hub_model_name :str = 'fin-bot'


In [37]:
from pprint import pprint
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
compute_dtype = torch.bfloat16 if use_bf16 else torch.float16
config = Config(bnb_4bit_compute_dtype=compute_dtype)
print("="*60)
print("CONFIGURATION (COLAB FREE TIER)")
print("="*60)
pprint(config)
print("="*60)

CONFIGURATION (COLAB FREE TIER)
Config(base_model='Qwen/Qwen2.5-1.5B-Instruct',
       dataset_name='virattt/financial-qa-10K',
       dataset_split='train',
       dataset_subsample=300,
       train_val_split=0.9,
       max_length=512,
       num_train_epochs=1,
       max_steps=250,
       per_device_train_batch_size=1,
       gradient_accumulation_steps=64,
       learning_rate=2e-05,
       warmup_ratio=0.03,
       logging_steps=10,
       save_steps=200,
       eval_steps=100,
       save_total_limit=2,
       lora_r=16,
       lora_alpha=32,
       lora_dropout=0.05,
       lora_target_modules=['q_proj',
                            'k_proj',
                            'v_proj',
                            'o_proj',
                            'gate_proj',
                            'up_proj',
                            'down_proj'],
       load_in_4bit=True,
       bnb_4bit_quant_type='nf4',
       bnb_4bit_use_double_quant=True,
       bnb_4bit_compute_dtype=torch.float16,

In [38]:
from datasets import load_dataset, Dataset
import json

def load_financial_dataset(dataset_name:str,split:float,subsample:int,seed:int=42):
  """Load dataset with robust field mapping and fallback."""
  try:
    print(f"Loading dataset: {dataset_name}...")
    dataset = load_dataset(dataset_name,split=split)
    dataset = dataset.shuffle(seed=seed).select(range(min(subsample,len(dataset))))
    print(f"Loaded {len(dataset)} examples from Hugging Face")
    return dataset
  except Exception as e:
    print(f"Failed to load from Hugging Face: {e}")


def map_dataset_fields(example):
  instruction = None
  for key in ["instruction", "question", "prompt", "task"]:
    if key in example and example[key]:
      instruction = str(example[key]).strip()
      break

  input_text = ""
  for key in ["input", "context", "passage", "history"]:
    if key in example and example[key]:
      input_text = str(example[key]).strip()
      break

  output = None
  for key in ["output", "response", "answer", "target", "completion"]:
    if key in example and example[key]:
      output = str(example[key]).strip()
      break

  return {
        "instruction": instruction,
        "input": input_text,
        "output": output
    }



In [42]:
# Load dataset
dataset = load_financial_dataset(dataset_name=config.dataset_name,split=config.dataset_split,subsample=config.dataset_subsample,seed=SEED)
print(f"\n Dataset before cleaning: {len(dataset)} examples")
# Map fields
dataset = dataset.map(map_dataset_fields)
#Delete the previous headers
dataset = dataset.remove_columns(
    [c for c in dataset.column_names if c not in ["instruction", "input", "output"]]
)
# Drop raws with missing values
dataset = dataset.filter(lambda x: x["instruction"] is not None and x["output"] is not None)
print(f"Dataset after cleaning: {len(dataset)} examples")
print(f"Dropped {config.dataset_subsample - len(dataset)} examples with missing data\n")

# Split into train and validation data
split_dataset = dataset.train_test_split(train_size = config.train_val_split,seed=SEED)
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

print(f"Train: {len(train_dataset)} | Validation: {len(val_dataset)}")
print("\nSample example:")
print(train_dataset[0])

Loading dataset: virattt/financial-qa-10K...
Loaded 300 examples from Hugging Face

 Dataset before cleaning: 300 examples


Filter:   0%|          | 0/300 [00:00<?, ? examples/s]

Dataset after cleaning: 300 examples
Dropped 0 examples with missing data

Train: 270 | Validation: 30

Sample example:
{'instruction': 'How much did the special items impact the 2023 adjusted earnings before taxes?', 'input': 'For 2023, the impact of special items deducted from the GAAP income before taxes ($17,780 million) was $9,583 million, resulting in adjusted earnings before taxes of $8,197 million (Non-GAAP).', 'output': '$9,583 million'}


### Visualization

In [43]:
import pandas as pd

df_preview = pd.DataFrame(train_dataset[:50])

pd.set_option('display.max_colwidth', 100)  # Limit column width for readability
print(f"Displaying first 50 samples out of {len(dataset)} total examples\n")
df_preview


Displaying first 50 samples out of 300 total examples



Unnamed: 0,instruction,input,output
0,How much did the special items impact the 2023 adjusted earnings before taxes?,"For 2023, the impact of special items deducted from the GAAP income before taxes ($17,780 millio...","$9,583 million"
1,What was the equity interest percentage in the joint venture with AGC Equity Partners at the end...,"The equity interest in the joint venture with AGC Equity Partners was 20.00% at December 31, 2023.",20.00%
2,How did the transition to a distributor operating model impact NIKE Brand's APLA revenue growth ...,"Within our CASA territory, the transition of our Chile, Argentina and Uruguay entities to a thir...",The transition to a distributor operating model reduced NIKE Brand's APLA revenue growth by appr...
3,How did the goodwill value change over the period in the data provided?,"Throughout the period, the goodwill value maintained a constant value of $69,021 before a slight...","The goodwill value remained constant at $69,021 before decreasing by $1 to $69,022."
4,How are Google™ and Google Chrome™ related to Google LLC?,Google™ and Google Chrome™ are trademarks of Google LLC.,They are trademarks owned by Google LLC.
5,How has the European market compared to the U.S. market in terms of annual spend per customer?,"The European market lags the U.S. market across a number of factors, including annual spend per ...",The European market lags the U.S. market in annual spend per customer.
6,"What documents are referenced in Part IV, Item 15(a)(1) of the Annual Report on Form 10-K?","The consolidated financial statements and accompanying notes listed in Part IV, Item 15(a)(1) of...",The consolidated financial statements and accompanying notes are referenced.
7,What is the impact of structural changes on the unit case volume and concentrate sales volume of...,"Typically, structural changes do not impact the Company’s unit case volume or concentrate sales ...","Typically, structural changes do not impact the company's unit case volume or concentrate sales ..."
8,How did HP's long-term debt change from 2022 to 2023?,"Long-term debt was $9,254 million as of October 31, 2023, compared to $10,796 million as of Octo...","HP's long-term debt decreased from $10,796 million in 2022 to $9,254 million in 2023."
9,"What was the total depreciation and amortization expense for the fiscal year ended June 30, 2023?","Total depreciation and amortization expense for the fiscal year ended June 30, 2023 amounted to ...","The total depreciation and amortization expense for the fiscal year ended June 30, 2023 was $124..."


### Token length and truncation diagnostics

In [44]:
from transformers import AutoTokenizer
import numpy as np

print(f"Loading Tokenizer: {config.base_model}")
tokenizer =  AutoTokenizer.from_pretrained(config.base_model,trust_remote_code =True)

if tokenizer.pad_token is None:
  tokenizer.pad_taken = tokenizer.eos_token

sample_size = min(300,len(train_dataset))
print(f"Sampling {sample_size} examples for diagnostics")
sample_dataset = train_dataset.select(range(sample_size))

token_lengths = []
for example in sample_dataset:
  text = f"{example['instruction']} {example['input']} {example['output']}"
  tokens = tokenizer(text,add_special_tokens=True)
  token_lengths.append(len(tokens["input_ids"]))
token_lengths = np.array(token_lengths)

print("="*60)
print("TOKEN LENGTH DIAGNOSTICS")
print("="*60)
print(f"Sample size: {sample_size}")
print(f"Average token length: {token_lengths.mean():.1f}")
print(f"Median token length: {np.median(token_lengths):.1f}")
print(f"Min token length: {token_lengths.min()}")
print(f"Max token length: {token_lengths.max()}")
print(f"95th percentile: {np.percentile(token_lengths, 95):.1f}")
print(f"99th percentile: {np.percentile(token_lengths, 99):.1f}")
print()
truncated = (token_lengths > config.max_length).sum()
truncation_rate = truncated / len(token_lengths) * 100
print(f"Truncation at max_length={config.max_length}: {truncated}/{len(token_lengths)} ({truncation_rate:.1f}%)")
print("="*60)


Loading Tokenizer: Qwen/Qwen2.5-1.5B-Instruct


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Sampling 270 examples for diagnostics
TOKEN LENGTH DIAGNOSTICS
Sample size: 270
Average token length: 99.3
Median token length: 84.0
Min token length: 28
Max token length: 809
95th percentile: 205.0
99th percentile: 338.4

Truncation at max_length=512: 1/270 (0.4%)


### Build SFT Prompts

In [45]:
def chatml_format(user_text, system_text="You are a financial assistant.", assistant_text=None ):
  messages = [
      {"role": "system", "content": system_text},
      {"role": "user", "content": user_text},
  ]
  if assistant_text:
      messages.append({"role": "assistant", "content": assistant_text})

  formatted = ""
  for message in messages:
    formatted += f"<|im_start|>{message['role']}\n{message['content']}<|im_end|>\n"

  return formatted

def build_sft_prompt(row):
  user_text = row["instruction"]
  if row["input"]:
    user_text += f"\n\nInput: {row['input']}"
  prompt = chatml_format(
        user_text=user_text,
        system_text="You are a financial assistant.",
        assistant_text=row["output"]
    )
  return {"text": prompt}

  # Map datasets to text format
train_dataset = train_dataset.map(build_sft_prompt)
val_dataset = val_dataset.map(build_sft_prompt)

print("Prompts built for SFT")
print("Sample formatted prompt:")
print(train_dataset[0]["text"][:500] + "...")

Map:   0%|          | 0/270 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Prompts built for SFT
Sample formatted prompt:
<|im_start|>system
You are a financial assistant.<|im_end|>
<|im_start|>user
How much did the special items impact the 2023 adjusted earnings before taxes?

Input: For 2023, the impact of special items deducted from the GAAP income before taxes ($17,780 million) was $9,583 million, resulting in adjusted earnings before taxes of $8,197 million (Non-GAAP).<|im_end|>
<|im_start|>assistant
$9,583 million<|im_end|>
...


### Baseline inference

In [46]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import time
import torch

bnb_config = BitsAndBytesConfig(
    load_in_8bit = config.load_in_4bit,
    bnb_4bit_compute_dtype = config.bnb_4bit_compute_dtype,
    bnb_4bit_quant_type = config.bnb_4bit_quant_type,
    bnb_4bit_use_double_quant= config.bnb_4bit_use_double_quant
    )
print(f"Loading the base model:{config.base_model} for training")
base_model = AutoModelForCausalLM.from_pretrained(
    config.base_model,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
print("Base model loaded in 4-bit")

test_prompts = [
    {
        "title": "Hard Fact Extraction",
        "prompt": "Answer the following question using only the provided context: The company reported total revenue of $12.5 billion in 2024, reflecting a 7% year-over-year increase. What was the total revenue reported?"
    },
    {
        "title": "Numerical Detail Check",
        "prompt": "Extract the key financial figure from the following text: Operating income reached $3.2 billion, while net profit margins improved compared to the prior year."
    },
    {
        "title": "Strategic Summary",
        "prompt": "Summarize the main reason for cost increases based on the following context: Operating expenses increased primarily due to higher research and development spending and expanded international operations."
    },
    {
        "title": "Risk Identification",
        "prompt": "Identify one major risk mentioned in the following passage: The company faces potential revenue volatility due to fluctuations in global demand and foreign exchange rates."
    },
    {
        "title": "Stylistic Simplification",
        "prompt": "Rewrite the following statement in simple terms for a non-financial audience: The company improved profitability through operational efficiencies and disciplined cost management."
    },
    {
        "title": "Bullet Point Conversion",
        "prompt": "Convert the following paragraph into two concise bullet points: The organization focused on expanding its digital services portfolio and strengthening customer retention strategies during the year."
    },
    {
        "title": "Grounding / No Hallucination Test",
        "prompt": "Answer the following question using only the given context. If the information is missing, say so clearly. Context: The company invested heavily in infrastructure upgrades and workforce expansion. Question: What was the company’s net income?"
    }
]

print("\n" + "="*60)
print("BASELINE OUTPUTS (PRE-FINETUNE)")
print("="*60)
if torch.cuda.is_available():
  vram_before = torch.cuda.memory_allocated()/1024**3
  print(f"VRAM before generation: {vram_before:.2f} GB\n")

for i, test in enumerate(test_prompts,1):
  formatted_prompt = chatml_format(test["prompt"])
  inputs = tokenizer(formatted_prompt, return_tensors="pt").to(base_model.device)

  start_time = time.time()
  with torch.no_grad():
    outputs = base_model.generate(
        **inputs,
        max_new_tokens = config.max_new_tokens,
        temperature = config.temperature,
        do_sample =  False, # Set to False when temperature is 0.0 for greedy decoding
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id

    )
  elapsed = time.time() - start_time
  generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

  # Extract only the assistant response
  if "<|im_start|>assistant" in generated_text:
      response = generated_text.split("<|im_start|>assistant")[-1].split("<|im_end|>")[0].strip()
  else:
      response = generated_text[len(formatted_prompt):].strip()

  tokens_generated = outputs.shape[1] - inputs["input_ids"].shape[1]
  tokens_per_sec = tokens_generated / elapsed if elapsed > 0 else 0

  print(f"\n[{i}] {test['title']}")
  print("-" * 60)
  print(f"Prompt: {test['prompt'][:100]}...")
  print(f"\nResponse:\n{response}")
  print(f"\nLatency: {elapsed:.2f}s | Tokens: {tokens_generated} | Speed: {tokens_per_sec:.1f} tok/s")
  print("-" * 60)

if torch.cuda.is_available():
  vram_after = torch.cuda.memory_allocated() / 1024**3
  print(f"\nVRAM after generation: {vram_after:.2f} GB")
  print(f"VRAM delta: {vram_after - vram_before:.2f} GB")
print("\n" + "="*60)

Loading the base model:Qwen/Qwen2.5-1.5B-Instruct for training


config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Base model loaded in 4-bit

BASELINE OUTPUTS (PRE-FINETUNE)
VRAM before generation: 2.16 GB






[1] Hard Fact Extraction
------------------------------------------------------------
Prompt: Answer the following question using only the provided context: The company reported total revenue of...

Response:
pany for 2024 is $12.5 billion.

Latency: 14.16s | Tokens: 25 | Speed: 1.8 tok/s
------------------------------------------------------------





[2] Numerical Detail Check
------------------------------------------------------------
Prompt: Extract the key financial figure from the following text: Operating income reached $3.2 billion, whi...

Response:
om the given text is:

Operating income: $3.2 billion

Latency: 6.01s | Tokens: 23 | Speed: 3.8 tok/s
------------------------------------------------------------

[3] Strategic Summary
------------------------------------------------------------
Prompt: Summarize the main reason for cost increases based on the following context: Operating expenses incr...

Response:
mentioned in the context is that operating expenses have risen mainly because of greater investment in research and development (R&D) activities, as well as expanding into additional international markets.

Latency: 9.90s | Tokens: 44 | Speed: 4.4 tok/s
------------------------------------------------------------

[4] Risk Identification
------------------------------------------------------------
Prompt: Identify 

### Model + LoRA Setup and Training

In [None]:
import os
os.environ["ACCELERATE_USE_DEEPSPEED"] = "false"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer,SFTConfig

print("Preparing model for k-bit training...")
base_model = prepare_model_for_kbit_training(base_model)

lora_config = LoraConfig(
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    target_modules=config.lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

print("Applying LoRA adapters...")
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

training_args =SFTConfig(
    output_dir = config.output_dir,
    num_train_epochs = config.num_train_epochs,
    max_steps = config.max_steps,
    per_device_train_batch_size=config.per_device_train_batch_size,
    per_device_eval_batch_size=config.per_device_train_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    learning_rate=config.learning_rate,
    warmup_ratio=config.warmup_ratio,
    logging_steps=config.logging_steps,
    save_steps=config.save_steps,
    eval_steps=config.eval_steps,
    save_total_limit=config.save_total_limit,
    fp16=not use_bf16,
    bf16=use_bf16,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    report_to="none",
    max_seq_length=config.max_length,
    packing=False,
    dataset_text_field="text",
)

trainer = SFTTrainer(
    model = model,
    args = training_args,
    train_dataset =  train_dataset,
    eval_dataset = val_dataset,
    tokenizer = tokenizer
)

print("\nStarting training...\n")
train_result = trainer.train()

print("\n" + "="*60)
print("TRAINING COMPLETE")
print("="*60)
print(f"Total training time: {train_result.metrics.get('train_runtime', 0):.2f}s")
print(f"Samples per second: {train_result.metrics.get('train_samples_per_second', 0):.2f}")
print(f"Steps per second: {train_result.metrics.get('train_steps_per_second', 0):.4f}")


Preparing model for k-bit training...
Applying LoRA adapters...
trainable params: 18,464,768 || all params: 1,562,179,072 || trainable%: 1.1820


Map:   0%|          | 0/270 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...



Starting training...



  return fn(*args, **kwargs)


In [None]:
import os

# Save adapter
print(f"Saving adapter to {config.output_dir}...")
trainer.model.save_pretrained(config.output_dir)
tokenizer.save_pretrained(config.output_dir)
print("Adapter and tokenizer saved")

# Save model card
model_card = f"""---
license: Check base model license
tags:
- financial
- qlora
- finetuned
---

# Financial QLoRA Adapter

This is a LoRA adapter finetuned on financial data.

## Base Model
{config.base_model}

## Dataset
{config.dataset_name} (subsample: {config.dataset_subsample})

## Training Hyperparameters
- LoRA rank: {config.lora_r}
- LoRA alpha: {config.lora_alpha}
- Learning rate: {config.learning_rate}
- Max steps: {config.max_steps}
- Max length: {config.max_length}

## Usage

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained("{config.base_model}")
model = PeftModel.from_pretrained(base_model, "path/to/adapter")
tokenizer = AutoTokenizer.from_pretrained("path/to/adapter")
```

## Limitations

- **Educational use only** - NOT a financial advice! Do Your Own Research
- Trained on limited data subsample
- May produce incorrect or harmful information


## License & Attribution

This adapter inherits the license of the base model and dataset. Check those licenses before use or redistribution.
"""

with open(os.path.join(config.output_dir, "README.md"), "w") as f:
    f.write(model_card)

print("Model card saved")

# Push to Hub (if enabled)
push_to_hub = True
if push_to_hub:
    # Create full repo name: username/model-name
    repo_name = f"{config.hf_username}/{config.hub_model_name}"

    print(f"\n Pushing to Hugging Face Hub: {repo_name}")
    print("   This will create a new model repository if it doesn't exist.")

    trainer.model.push_to_hub(repo_name)
    tokenizer.push_to_hub(repo_name)

    print(f"Pushed to Hub: https://huggingface.co/{repo_name}")
else:
    print("\push_to_hub=False, skipping Hub upload")
    print(f"   To push to Hub, set CONFIG['push_to_hub'] = True")
    print(f"   Model will be pushed as: {config.hf_username}/{config.hub_model_name}")