<a href="https://colab.research.google.com/github/nhi1e/llama3.2-1B-lora-banking/blob/main/llama3_2_1B_lora_banking_1_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files

# Opens the browser file picker; the returned dict maps each uploaded file
# name to its raw bytes.
uploaded = files.upload()

# Colab does not overwrite an existing file: a second upload of the same
# workbook is saved as e.g. "NghiepVu5 (1).xlsx", while later cells keep
# reading the stale "NghiepVu5.xlsx". Writing the fresh bytes back under the
# uploaded name keeps the path used downstream pointing at the newest data.
# NOTE(review): relies on the dict keys being the original (un-suffixed)
# file names, as described in the Colab docs - confirm.
for _name, _content in uploaded.items():
    with open(_name, "wb") as _fh:
        _fh.write(_content)

Saving NghiepVu5.xlsx to NghiepVu5 (1).xlsx


In [2]:
from google.colab import userdata
from huggingface_hub import login


hf_token = userdata.get('HF_llama')
login(token=hf_token)


# Make sure Colab runtime = GPU (Runtime > Change runtime type > GPU)
!nvidia-smi

# Clean installs for Colab
!pip -q install -U transformers accelerate peft bitsandbytes datasets evaluate openpyxl


Fri Aug 15 04:26:40 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [7]:
import os, torch, platform, numpy as np, pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer, AutoConfig, LlamaForSequenceClassification,
    BitsAndBytesConfig, DataCollatorWithPadding, TrainingArguments, Trainer
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import evaluate

# --------- Input workbook, base checkpoint, output folder ---------
xls_path = "NghiepVu5.xlsx"      # must be present in Colab's working dir (upload via the left panel)
base_model_name = "meta-llama/Llama-3.2-1B"
output_dir = "llama-3.2-1b-colab-lora-cls"

# --------- GPU check & mixed-precision selection ----------
if not torch.cuda.is_available():
    raise AssertionError("No GPU detected. In Colab: Runtime > Change runtime type > GPU.")
gpu_name = torch.cuda.get_device_name(0)
cap_major, _cap_minor = torch.cuda.get_device_capability(0)
use_fp16 = cap_major < 8     # pre-Ampere cards such as the T4
use_bf16 = not use_fp16      # compute capability 8+ (A100/L4)
mp_dtype = torch.float16 if use_fp16 else torch.bfloat16

print(f"GPU: {gpu_name} | bf16={use_bf16} fp16={use_fp16}")

# --------- Tuning knobs (speed / memory trade-offs) ---------
MAX_LEN = 128        # token budget per prompt; 128 is fast, raise to 256 for more context
ATTN_ONLY = False    # True restricts LoRA to the attention projections (use if you OOM)
LORA_R = 16          # adapter rank; 8 or 16 are good starting points
LORA_ALPHA = 32      # scaling factor, typically about 2x the rank
LORA_DROPOUT = 0.05


GPU: Tesla T4 | bf16=False fp16=True


In [8]:
# Read all sheets: each sheet name is the BIG category (target); each row has
# 'question' and 'label' (the sub-category, kept as a free-text hint).
#
# NOTE(review): the hint is the sheet's own sub-category and is later appended
# to the prompt. If a sub-category largely determines the big category, the
# reported metrics may overstate what the model achieves on questions that
# arrive without a hint - confirm hints are available at inference time.
dfs = []
# Open the workbook once and close it deterministically; every sheet is parsed
# from the same handle instead of re-opening the file per sheet.
with pd.ExcelFile(xls_path) as xls:
    for sheet in xls.sheet_names:
        df = xls.parse(sheet)
        # str() guards against non-string headers (e.g. numeric column titles).
        df.columns = [str(c).lower().strip() for c in df.columns]
        if not {"question", "label"}.issubset(df.columns):
            raise ValueError(f"Sheet '{sheet}' must have columns: question, label")
        df = (
            df[["question", "label"]]
            .rename(columns={"label": "hint"})   # subcategory -> hint
            .assign(label=sheet.strip())         # big 5 category (target)
        )
        dfs.append(df)

# Drop rows without a question and keep only the first occurrence of each
# question text so the same question cannot land in more than one split.
data = (
    pd.concat(dfs, ignore_index=True)
    .dropna(subset=["question", "label"])
    .drop_duplicates(subset=["question"])
    .reset_index(drop=True)
)

# Encode big categories as integer ids (pandas assigns codes in sorted
# category order).
cat = data["label"].astype("category")
data["label_id"] = cat.cat.codes
id2label = dict(enumerate(cat.cat.categories))
label2id = {v: k for k, v in id2label.items()}

# Stratified split: 15% held out for test, then 15% of the remainder for validation.
train_df, test_df = train_test_split(data, test_size=0.15, stratify=data["label_id"], random_state=42)
train_df, val_df  = train_test_split(train_df, test_size=0.15, stratify=train_df["label_id"], random_state=42)

print("Label mapping:", label2id)
print("Train/Val/Test sizes:", len(train_df), len(val_df), len(test_df))


Label mapping: {'Chuyển tiền': 0, 'Others': 1, 'TKTT': 2, 'Thẻ': 3, 'ipay': 4}
Train/Val/Test sizes: 3272 578 680


In [9]:
# Comma-separated list of category names, in id order, shown to the model in every prompt.
CHOICES = ", ".join(id2label.values())

def make_prompt(q: str, hint: str | None) -> str:
    extra = f"\nAdditional hint (may help classification): {hint.strip()}" if (isinstance(hint, str) and hint.strip()) else ""
    return (
        "You are a text classification system. "
        "Read the following user question and choose exactly ONE category from the list that best matches its meaning.\n"
        f"Categories: {CHOICES}\n"
        f"User Question: {q.strip()}{extra}"
    )


# Convert each pandas split into a Hugging Face Dataset. The integer target
# column is renamed to "labels"; the text category stays under "label".
_DS_COLUMNS = ["question", "hint", "label_id", "label"]


def _to_dataset(frame):
    """Select the modelling columns of `frame` and wrap them in a Dataset."""
    selected = frame[_DS_COLUMNS]
    return Dataset.from_pandas(selected.rename(columns={"label_id": "labels"}))


train_ds = _to_dataset(train_df)
val_ds = _to_dataset(val_df)
test_ds = _to_dataset(test_df)
raw_ds = DatasetDict({"train": train_ds, "validation": val_ds, "test": test_ds})

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tok_fn(batch):
    """Turn a batch of rows into tokenized prompts plus their integer labels.

    Reads the "question" and "labels" columns, and "hint" when present (empty
    hints are substituted otherwise). Sequences are truncated to MAX_LEN and
    left unpadded; padding is applied later by the data collator.
    """
    questions = batch["question"]
    hints = batch.get("hint", [""] * len(questions))
    prompts = [make_prompt(question, hint) for question, hint in zip(questions, hints)]
    encoded = tokenizer(prompts, truncation=True, max_length=MAX_LEN, padding=False)
    encoded["labels"] = batch["labels"]
    return encoded

# Tokenize every split and drop all original columns, keeping only what tok_fn returns.
_source_columns = raw_ds["train"].column_names
tokenized_ds = raw_ds.map(tok_fn, batched=True, remove_columns=_source_columns)
# Pads each batch dynamically, rounding the padded length up to a multiple of 8.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)


Map:   0%|          | 0/3272 [00:00<?, ? examples/s]

Map:   0%|          | 0/578 [00:00<?, ? examples/s]

Map:   0%|          | 0/680 [00:00<?, ? examples/s]

In [10]:
# Seed torch before any weights are created: the classification head
# ('score.weight') and the LoRA matrices are randomly initialised, so without
# a seed every run starts from a different model. 42 matches the random_state
# used for the data split.
INIT_SEED = 42
torch.manual_seed(INIT_SEED)

num_labels = len(id2label)
# Reuse the label maps built with the data (copies, so the config cannot
# alias the notebook-level dicts).
config = AutoConfig.from_pretrained(
    base_model_name,
    num_labels=num_labels,
    id2label=dict(id2label),
    label2id=dict(label2id),
)

# 4-bit quantized base weights (fast & memory light on CUDA)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=mp_dtype,
)

base_model = LlamaForSequenceClassification.from_pretrained(
    base_model_name,
    config=config,
    quantization_config=bnb_config,
    torch_dtype=mp_dtype,
    device_map="auto",
)

# Prepare for k-bit training (cast norms, etc.)
base_model = prepare_model_for_kbit_training(base_model)

# LoRA target modules: attention projections only, or attention + MLP projections.
if ATTN_ONLY:
    target_modules = ["q_proj","k_proj","v_proj","o_proj"]
else:
    target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

lora_cfg = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=target_modules,
    bias="none",
    task_type="SEQ_CLS",
)

model = get_peft_model(base_model, lora_cfg)

# Optional memory saver on GPU; small slowdown.
# The KV cache is disabled because it is not used together with gradient checkpointing.
model.config.use_cache = False
model.gradient_checkpointing_enable()

model.print_trainable_parameters()


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 11,282,432 || all params: 1,247,107,072 || trainable%: 0.9047


In [11]:
# Per-GPU batch plan: (train batch, eval batch, gradient-accumulation steps).
# Each plan multiplies out to an effective train batch of 64.
if any(tag in gpu_name for tag in ("A100", "L4")):
    per_device_train_bsz, per_device_eval_bsz, grad_accum = 16, 32, 4
elif "T4" in gpu_name:
    per_device_train_bsz, per_device_eval_bsz, grad_accum = 8, 16, 8
else:
    per_device_train_bsz, per_device_eval_bsz, grad_accum = 4, 8, 16

args = TrainingArguments(
    output_dir=output_dir,

    # Schedule: start with 2 epochs; go to 3 if time/VRAM allows
    num_train_epochs=2,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,

    # Batching
    per_device_train_batch_size=per_device_train_bsz,
    per_device_eval_batch_size=per_device_eval_bsz,
    gradient_accumulation_steps=grad_accum,
    dataloader_num_workers=2,

    # Speed mode: no evaluation or checkpoints during training
    eval_strategy="no",
    save_strategy="no",
    logging_steps=50,
    report_to="none",

    # Precision follows the detected GPU; 8-bit AdamW from bitsandbytes saves memory
    bf16=use_bf16,
    fp16=use_fp16,
    optim="adamw_bnb_8bit",
)


In [13]:
# If tokenizer has no pad token, assign one
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # or another unused token

# Make sure the model knows about it too
model.config.pad_token_id = tokenizer.pad_token_id

# Load the metrics BEFORE training so a failed download surfaces immediately
# rather than after the training run has finished.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and macro/weighted F1 from a (logits, labels) pair.

    The predicted class is the arg-max over the last axis of the logits.
    Returns a dict with the keys "accuracy", "f1_macro" and "f1_weighted".
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=labels)["accuracy"],
        "f1_macro": f1.compute(predictions=preds, references=labels, average="macro")["f1"],
        "f1_weighted": f1.compute(predictions=preds, references=labels, average="weighted")["f1"],
    }


def _make_eval_trainer(eval_dataset):
    """Return a Trainer that scores the current model on `eval_dataset`."""
    return Trainer(
        model=model,
        args=args,  # reuse device/precision settings
        eval_dataset=eval_dataset,
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )


# Train only (fast pass).
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer,
# which is scheduled for removal and would break under the unpinned install.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds["train"],
    processing_class=tokenizer,
    data_collator=data_collator,
)
train_result = trainer.train()

# Evaluate ONCE after training, first on validation, then on the held-out test split.
eval_trainer = _make_eval_trainer(tokenized_ds["validation"])
val_metrics = eval_trainer.evaluate()
print("Validation:", val_metrics)

test_trainer = _make_eval_trainer(tokenized_ds["test"])
test_metrics = test_trainer.evaluate()
print("Test:", test_metrics)


  trainer = Trainer(


Step,Training Loss
50,4.1562


Step,Training Loss
50,4.1562
100,0.0849


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  eval_trainer = Trainer(


Validation: {'eval_loss': 0.03529631346464157, 'eval_model_preparation_time': 0.0058, 'eval_accuracy': 0.9913494809688581, 'eval_f1_macro': 0.9921265518950204, 'eval_f1_weighted': 0.9913307654445727, 'eval_runtime': 14.5319, 'eval_samples_per_second': 39.775, 'eval_steps_per_second': 2.546}


  test_trainer = Trainer(


Test: {'eval_loss': 0.05206017196178436, 'eval_model_preparation_time': 0.0033, 'eval_accuracy': 0.9955882352941177, 'eval_f1_macro': 0.9961963146215116, 'eval_f1_weighted': 0.995582870888568, 'eval_runtime': 16.8931, 'eval_samples_per_second': 40.253, 'eval_steps_per_second': 2.545}


In [14]:
# Confusion matrix + per-class report on TEST
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

test_output = test_trainer.predict(tokenized_ds["test"])
logits = test_output.predictions
preds  = np.argmax(logits, axis=-1)
# Use the labels returned alongside the predictions so both arrays are
# guaranteed to be aligned row by row.
y_true = np.asarray(test_output.label_ids)

# Pass the full list of class ids explicitly: without `labels`, the report
# raises when a class is missing from the test split (the number of
# target_names no longer matches), and the confusion matrix would silently
# shrink.
class_ids = list(range(len(id2label)))

print(classification_report(
    y_true, preds, labels=class_ids, target_names=[id2label[i] for i in class_ids], digits=4
))
print("Confusion matrix:\n", confusion_matrix(y_true, preds, labels=class_ids))


              precision    recall  f1-score   support

 Chuyển tiền     0.9948    1.0000    0.9974       192
      Others     1.0000    1.0000    1.0000        37
        TKTT     1.0000    0.9831    0.9915       177
         Thẻ     0.9844    1.0000    0.9921       126
        ipay     1.0000    1.0000    1.0000       148

    accuracy                         0.9956       680
   macro avg     0.9958    0.9966    0.9962       680
weighted avg     0.9956    0.9956    0.9956       680

Confusion matrix:
 [[192   0   0   0   0]
 [  0  37   0   0   0]
 [  1   0 174   2   0]
 [  0   0   0 126   0]
 [  0   0   0   0 148]]


In [15]:
import os

# Write the LoRA adapter weights and the tokenizer files side by side
# into <output_dir>/lora_adapter.
adapter_dir = os.path.join(output_dir, "lora_adapter")
os.makedirs(adapter_dir, exist_ok=True)
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(adapter_dir)
print("Saved to:", adapter_dir)

Saved to: llama-3.2-1b-colab-lora-cls/lora_adapter


In [16]:
!zip -r lora_adapter.zip {output_dir}/lora_adapter
from google.colab import files; files.download("lora_adapter.zip")

  adding: llama-3.2-1b-colab-lora-cls/lora_adapter/ (stored 0%)
  adding: llama-3.2-1b-colab-lora-cls/lora_adapter/adapter_model.safetensors (deflated 7%)
  adding: llama-3.2-1b-colab-lora-cls/lora_adapter/tokenizer.json (deflated 85%)
  adding: llama-3.2-1b-colab-lora-cls/lora_adapter/adapter_config.json (deflated 57%)
  adding: llama-3.2-1b-colab-lora-cls/lora_adapter/special_tokens_map.json (deflated 64%)
  adding: llama-3.2-1b-colab-lora-cls/lora_adapter/tokenizer_config.json (deflated 96%)
  adding: llama-3.2-1b-colab-lora-cls/lora_adapter/README.md (deflated 66%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
import os
from huggingface_hub import HfApi, create_repo

# Folder produced by the save step; it has to exist in THIS runtime.
adapter_dir = os.path.join(output_dir, "lora_adapter")
if not os.path.isdir(adapter_dir):
    raise AssertionError(f"Not found: {adapter_dir}. If you restarted Colab, re-save or re-upload.")

# Local source folder and destination repository on the Hub.
folder_path = adapter_dir
repo_id = "nhi-le/llama3.2-1B-lora-banking"

# Authentication uses the `hf_token` value read from the Colab secret earlier.
# exist_ok=True makes the repo creation a no-op when the repo already exists,
# so this cell can be re-run safely.
create_repo(repo_id, token=hf_token, repo_type="model", private=True, exist_ok=True)
api = HfApi(token=hf_token)

api.upload_folder(
    repo_id=repo_id,
    repo_type="model",
    folder_path=folder_path,
    path_in_repo="",                 # repo root; use "adapter/" to place it under a subfolder
    commit_message="Upload/update LoRA adapter and tokenizer",
)

print("Uploaded:", folder_path, "→", repo_id)


tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

Uploaded: llama-3.2-1b-colab-lora-cls/lora_adapter → nhi-le/llama3.2-1B-lora-banking
