# Fine-tune LLaMA 3.1 on the CDCP dataset for the argument component classification (ACC) task

## Libraries

In [1]:
# %cd ..
# %rm -rf LLaMA-Factory
# !git clone https://github.com/hiyouga/LLaMA-Factory.git
# %cd LLaMA-Factory
# %ls
# !pip install -e .[torch,bitsandbytes]

In [2]:
# !pip uninstall -y pydantic
# !pip install pydantic==1.10.9 # 

# !pip uninstall -y gradio
# !pip install gradio==3.48.0

# !pip uninstall -y bitsandbytes
# !pip install --upgrade bitsandbytes

# !pip install tqdm
# !pip install ipywidgets
# !pip install scikit-learn

# Restart kernel afterwards.

In [3]:
import os
import ast
import sys
import json
import torch
import pickle
import inspect
import argparse
import subprocess

# sys.path.append('../')

import pandas as pd
from tqdm import tqdm
from pathlib import Path

from llamafactory.chat import ChatModel
from llamafactory.extras.misc import torch_gc
from sklearn.metrics import classification_report

In [4]:
# Warn early when no CUDA device is visible to PyTorch.
# An explicit conditional is used instead of `assert` so that the check still
# runs when Python is started with -O (which strips assert statements).
if not torch.cuda.is_available():
    print("Please set up a GPU before using LLaMA Factory...")

In [5]:
# Show the visible GPU(s) together with driver/CUDA versions and memory usage.
!nvidia-smi

Mon Sep  9 15:50:10 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA H100 PCIe               Off |   00000000:82:00.0 Off |                    0 |
| N/A   51C    P0             85W /  350W |       4MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [6]:
# Optional Hugging Face login (needed only for gated models).
# Never hardcode the access token in the notebook: read it from the environment.
# NOTE(review): any token that was ever committed in this cell should be revoked.
# from huggingface_hub import login
# key = os.environ["HF_TOKEN"]
# login(key)

## BASE MODEL

In [7]:
# Hugging Face model id of the base checkpoint to fine-tune.
BASE_MODEL = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit"
# Task key: it is interpolated into the dataset/output names and selects the
# post-processing routine applied to the predictions.
TASK = "acc"

## Paths

In [8]:
# `__file__` is not defined inside a notebook kernel, so the notebook location
# is taken to be the kernel's working directory. This is the same directory that
# os.path.dirname(os.path.abspath("__file__")) yields, stated explicitly.
current_dir = Path.cwd().as_posix()
cdcp_dir = Path(current_dir).parent.absolute().as_posix()
parent_dir = Path(cdcp_dir).parent.absolute().as_posix()

# Make the project's `utils` package importable. The membership guard keeps
# re-runs of this cell from appending the same entry to sys.path again.
if os.path.abspath(cdcp_dir) not in sys.path:
    sys.path.append(os.path.abspath(cdcp_dir))

# Wildcard import kept on purpose: later cells call the post_process_* helpers
# by their bare names.
from utils.post_processing import *

In [9]:
# Project layout derived from the directories resolved above.
ROOT_DIR = parent_dir
DATASET_DIR = os.path.join(cdcp_dir, "datasets")
LLAMA_FACTORY_DIR = os.path.join(ROOT_DIR, "LLaMA-Factory")

# The adapter output folder is named after the task and the part of the model
# id that follows the first "/".
model_short_name = BASE_MODEL.split("/")[1]
OUTPUT_DIR = os.path.join(cdcp_dir, "finetuned_models", f"CDCP_{TASK}_{model_short_name}")

## Load Dataset

In [10]:
# Train and test splits follow one naming scheme: CDCP_<TASK>_<split>.json.
train_dataset_name = f"CDCP_{TASK}_train.json"
test_dataset_name = f"CDCP_{TASK}_test.json"

# Full paths of both splits inside DATASET_DIR.
train_dataset_file = os.path.join(DATASET_DIR, train_dataset_name)
test_dataset_file = os.path.join(DATASET_DIR, test_dataset_name)

In [11]:
# Sanity check: show the model id and the resolved dataset paths.
print(BASE_MODEL, train_dataset_file, test_dataset_file)

unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit /nfs/scratch/umushtaq/coling_2025/cdcp/datasets/CDCP_acc_train.json /nfs/scratch/umushtaq/coling_2025/cdcp/datasets/CDCP_acc_test.json


## Fine-tune Model

### Update dataset info file in LLaMA Factory

In [12]:
# Directory that holds the generated fine-tuning argument files.
# `exist_ok=True` keeps the cell safe to re-run and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(os.path.join(cdcp_dir, "ft_arg_files"), exist_ok=True)

In [13]:
# Argument-file name = the text before "train" in the dataset file stem,
# followed by the part of the model id after the first "/".
dataset_prefix = train_dataset_name.split(".")[0].split("train")[0]
model_short_name = BASE_MODEL.split("/")[1]
train_file = os.path.join(cdcp_dir, "ft_arg_files", f"{dataset_prefix}{model_short_name}.json")

In [14]:
# Registry entry for the training set: where the file lives and which of its
# fields play the prompt / query / response roles.
dataset_info_line = {
    "file_name": train_dataset_file,
    "columns": {"prompt": "instruction", "query": "input", "response": "output"},
}

In [15]:
# Register (or overwrite) the "cdcp" entry in LLaMA-Factory's dataset registry.
dataset_info_path = os.path.join(LLAMA_FACTORY_DIR, "data/dataset_info.json")

# Explicit UTF-8 so reading/writing does not depend on the platform locale.
with open(dataset_info_path, "r", encoding="utf-8") as jsonFile:
    data = json.load(jsonFile)

data["cdcp"] = dataset_info_line

# Indented output keeps the registry human-readable instead of collapsing it
# to a single line; ensure_ascii=False leaves any non-ASCII text untouched.
with open(dataset_info_path, "w", encoding="utf-8") as jsonFile:
    json.dump(data, jsonFile, indent=2, ensure_ascii=False)

### Model Download and Args File

In [16]:
# Directory used as `cache_dir` for downloaded model files (training and inference).
model_downloads = os.path.join(ROOT_DIR, "model_downloads")

In [17]:
# Display the resolved cache directory.
model_downloads

'/nfs/scratch/umushtaq/coling_2025/model_downloads'

In [18]:
# Value passed as `num_train_epochs`; it is also embedded in the results file name.
NB_EPOCHS = 0.2

In [19]:
# Training configuration; it is serialized to JSON and handed to `llamafactory-cli train`.
args = {
    "stage": "sft",                        # supervised fine-tuning
    "do_train": True,
    "model_name_or_path": BASE_MODEL,      # base checkpoint selected above
    "dataset": "cdcp",                     # entry registered in dataset_info.json
    "cache_dir": model_downloads,          # where downloaded model files are cached
    "template": "llama3",                  # llama3 prompt template
    "finetuning_type": "lora",             # train LoRA adapters
    "lora_target": "all",                  # attach LoRA adapters to all linear layers
    "output_dir": OUTPUT_DIR,              # where the LoRA adapters are saved
    "overwrite_output_dir": True,          # replace existing output contents
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,      # effective batch size = 2 * 4 per device
    "lr_scheduler_type": "cosine",
    "logging_steps": 100,                  # log every 100 steps
    "warmup_ratio": 0.1,
    "save_steps": 3000,                    # save a checkpoint every 3000 steps
    "learning_rate": 5e-5,
    "num_train_epochs": NB_EPOCHS,
    "max_samples": 2000,                   # use at most 2000 examples per dataset
    "max_grad_norm": 1.0,                  # clip gradient norm to 1.0
    "quantization_bit": 4,                 # 4-bit QLoRA
    "loraplus_lr_ratio": 16.0,             # LoRA+ with lambda = 16.0
    "fp16": True,                          # float16 mixed-precision training
    "report_to": "none",                   # no external experiment tracker
}

In [20]:
# Write the training arguments to disk. The context manager guarantees the file
# is flushed and closed before the training subprocess reads it.
with open(train_file, "w", encoding="utf-8") as args_fh:
    json.dump(args, args_fh, indent=2)

### Run Fine-tune 

In [21]:
# Launch fine-tuning as a child process, run from inside the LLaMA-Factory checkout.
train_command = ["llamafactory-cli", "train", train_file]
p = subprocess.Popen(train_command, cwd=LLAMA_FACTORY_DIR)

In [22]:
# Block until training finishes. A non-zero exit code stops the notebook here,
# so the inference cells never run against a stale or missing adapter.
return_code = p.wait()
if return_code != 0:
    raise RuntimeError(f"llamafactory-cli train exited with code {return_code}")
return_code

09/09/2024 15:50:16 - INFO - llamafactory.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, compute dtype: torch.float16


[INFO|configuration_utils.py:733] 2024-09-09 15:50:16,225 >> loading configuration file config.json from cache at /nfs/scratch/umushtaq/coling_2025/model_downloads/models--unsloth--Meta-Llama-3.1-8B-Instruct-bnb-4bit/snapshots/90ff2083c372e1c422abd1b5596cbba1b994a170/config.json
[INFO|configuration_utils.py:800] 2024-09-09 15:50:16,226 >> Model config LlamaConfig {
  "_name_or_path": "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 128004,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,


09/09/2024 15:50:17 - INFO - llamafactory.data.template - Replace eos token: <|eot_id|>
09/09/2024 15:50:17 - INFO - llamafactory.data.loader - Loading dataset /nfs/scratch/umushtaq/coling_2025/cdcp/datasets/CDCP_acc_train.json...


[INFO|configuration_utils.py:733] 2024-09-09 15:50:17,812 >> loading configuration file config.json from cache at /nfs/scratch/umushtaq/coling_2025/model_downloads/models--unsloth--Meta-Llama-3.1-8B-Instruct-bnb-4bit/snapshots/90ff2083c372e1c422abd1b5596cbba1b994a170/config.json
[INFO|configuration_utils.py:800] 2024-09-09 15:50:17,813 >> Model config LlamaConfig {
  "_name_or_path": "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 128004,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,


training example:
input_ids:
[128000, 128006, 882, 128007, 271, 14711, 1472, 527, 459, 6335, 304, 14138, 26917, 13, 1472, 527, 2728, 264, 1495, 902, 5727, 49926, 5811, 6956, 44910, 555, 366, 1741, 1500, 1741, 29, 9681, 13, 4718, 3465, 374, 311, 49229, 1855, 5811, 3777, 304, 279, 1495, 439, 3060, 330, 34210, 498, 330, 35890, 498, 330, 16690, 498, 330, 1985, 65556, 1, 477, 330, 970, 3343, 1472, 2011, 471, 264, 1160, 315, 5811, 3777, 4595, 11, 26549, 315, 3160, 220, 18, 11, 304, 2768, 4823, 3645, 25, 5324, 8739, 9962, 794, 4482, 8739, 1857, 320, 496, 11844, 330, 8739, 1857, 320, 496, 11844, 330, 8739, 1857, 320, 496, 8, 93546, 1405, 1855, 2449, 330, 8739, 1857, 320, 496, 10143, 374, 12860, 555, 3060, 330, 34210, 498, 330, 35890, 498, 330, 16690, 498, 330, 1985, 65556, 1, 477, 330, 970, 3343, 4815, 14711, 5810, 374, 279, 1495, 25, 366, 1741, 16, 29, 1423, 323, 2254, 5590, 5718, 7170, 1304, 1670, 59358, 1790, 810, 4461, 4005, 1741, 16, 1822, 1741, 17, 29, 1789, 3187, 11, 994, 264, 1732, 889

[INFO|modeling_utils.py:4507] 2024-09-09 15:50:19,865 >> All model checkpoint weights were used when initializing LlamaForCausalLM.

[INFO|modeling_utils.py:4515] 2024-09-09 15:50:19,866 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.
[INFO|configuration_utils.py:993] 2024-09-09 15:50:20,054 >> loading configuration file generation_config.json from cache at /nfs/scratch/umushtaq/coling_2025/model_downloads/models--unsloth--Meta-Llama-3.1-8B-Instruct-bnb-4bit/snapshots/90ff2083c372e1c422abd1b5596cbba1b994a170/generation_config.json
[INFO|configuration_utils.py:1038] 2024-09-09 15:50:20,054 >> Generate config GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "max_length": 13

09/09/2024 15:50:20 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
09/09/2024 15:50:20 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
09/09/2024 15:50:20 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
09/09/2024 15:50:20 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA
09/09/2024 15:50:20 - INFO - llamafactory.model.model_utils.misc - Found linear modules: k_proj,q_proj,o_proj,up_proj,gate_proj,down_proj,v_proj
09/09/2024 15:50:20 - INFO - llamafactory.model.loader - trainable params: 20,971,520 || all params: 8,051,232,768 || trainable%: 0.2605


[INFO|trainer.py:648] 2024-09-09 15:50:20,338 >> Using auto half precision backend


09/09/2024 15:50:21 - INFO - llamafactory.train.trainer_utils - Using LoRA+ optimizer with loraplus lr ratio 16.00.


[INFO|trainer.py:2134] 2024-09-09 15:50:21,104 >> ***** Running training *****
[INFO|trainer.py:2135] 2024-09-09 15:50:21,104 >>   Num examples = 580
[INFO|trainer.py:2136] 2024-09-09 15:50:21,104 >>   Num Epochs = 1
[INFO|trainer.py:2137] 2024-09-09 15:50:21,104 >>   Instantaneous batch size per device = 2
[INFO|trainer.py:2140] 2024-09-09 15:50:21,104 >>   Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:2141] 2024-09-09 15:50:21,104 >>   Gradient Accumulation steps = 4
[INFO|trainer.py:2142] 2024-09-09 15:50:21,104 >>   Total optimization steps = 15
[INFO|trainer.py:2143] 2024-09-09 15:50:21,107 >>   Number of trainable parameters = 20,971,520
100%|██████████| 15/15 [00:26<00:00,  1.73s/it][INFO|trainer.py:3503] 2024-09-09 15:50:47,561 >> Saving model checkpoint to /nfs/scratch/umushtaq/coling_2025/cdcp/finetuned_models/CDCP_acc_Meta-Llama-3.1-8B-Instruct-bnb-4bit/checkpoint-15
[INFO|configuration_utils.py:733] 2024-09-09 15:50:47,831 >> loading 

{'train_runtime': 27.1974, 'train_samples_per_second': 4.265, 'train_steps_per_second': 0.552, 'train_loss': 0.2880144437154134, 'epoch': 0.21}
***** train metrics *****
  epoch                    =     0.2069
  total_flos               =  2251414GF
  train_loss               =      0.288
  train_runtime            = 0:00:27.19
  train_samples_per_second =      4.265
  train_steps_per_second   =      0.552


0

## Inference on the fine-tuned model

In [23]:
# List the files currently present in the adapter output directory.
os.listdir(OUTPUT_DIR)

['checkpoint-15',
 'README.md',
 'adapter_model.safetensors',
 'adapter_config.json',
 'tokenizer_config.json',
 'special_tokens_map.json',
 'tokenizer.json',
 'training_args.bin',
 'train_results.json',
 'all_results.json',
 'trainer_state.json',
 'CDCP_acc_results_0.2.pickle',
 'classification_report.pickle',
 'trainer_log.jsonl']

In [24]:
# Inference configuration: the same base model and template as in training,
# plus the LoRA adapter saved in OUTPUT_DIR.
args = {
    "model_name_or_path": BASE_MODEL,      # base checkpoint
    "cache_dir": model_downloads,
    "adapter_name_or_path": OUTPUT_DIR,    # load the saved LoRA adapters
    "template": "llama3",                  # same as in training
    "finetuning_type": "lora",             # same as in training
    "quantization_bit": 4,                 # load the model 4-bit quantized
}

In [25]:
# Build the chat model from the inference arguments (base model + saved adapter).
model = ChatModel(args)

[INFO|configuration_utils.py:733] 2024-09-09 15:50:50,201 >> loading configuration file config.json from cache at /nfs/scratch/umushtaq/coling_2025/model_downloads/models--unsloth--Meta-Llama-3.1-8B-Instruct-bnb-4bit/snapshots/90ff2083c372e1c422abd1b5596cbba1b994a170/config.json
[INFO|configuration_utils.py:800] 2024-09-09 15:50:50,202 >> Model config LlamaConfig {
  "_name_or_path": "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 128004,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,


09/09/2024 15:50:51 - INFO - llamafactory.data.template - Replace eos token: <|eot_id|>


[INFO|configuration_utils.py:733] 2024-09-09 15:50:51,582 >> loading configuration file config.json from cache at /nfs/scratch/umushtaq/coling_2025/model_downloads/models--unsloth--Meta-Llama-3.1-8B-Instruct-bnb-4bit/snapshots/90ff2083c372e1c422abd1b5596cbba1b994a170/config.json
[INFO|configuration_utils.py:800] 2024-09-09 15:50:51,584 >> Model config LlamaConfig {
  "_name_or_path": "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 128004,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,


09/09/2024 15:50:51 - INFO - llamafactory.model.model_utils.quantization - Loading ?-bit BITSANDBYTES-quantized model.
09/09/2024 15:50:51 - INFO - llamafactory.model.patcher - Using KV cache for faster generation.


[INFO|modeling_utils.py:3678] 2024-09-09 15:50:51,623 >> loading weights file model.safetensors from cache at /nfs/scratch/umushtaq/coling_2025/model_downloads/models--unsloth--Meta-Llama-3.1-8B-Instruct-bnb-4bit/snapshots/90ff2083c372e1c422abd1b5596cbba1b994a170/model.safetensors
[INFO|modeling_utils.py:1606] 2024-09-09 15:50:51,642 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.
[INFO|configuration_utils.py:1038] 2024-09-09 15:50:51,645 >> Generate config GenerationConfig {
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "pad_token_id": 128004
}

[INFO|quantizer_bnb_4bit.py:106] 2024-09-09 15:50:51,946 >> target_dtype {target_dtype} is replaced by `CustomDtype.INT4` for 4-bit BnB quantization
[INFO|modeling_utils.py:4507] 2024-09-09 15:50:53,047 >> All model checkpoint weights were used when initializing LlamaForCausalLM.

[INFO|modeling_utils.py:4515] 2024-09-09 15:50:53,049 >> All the weights of LlamaForCausalLM we

09/09/2024 15:50:53 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
09/09/2024 15:50:53 - INFO - llamafactory.model.adapter - Loaded adapter(s): /nfs/scratch/umushtaq/coling_2025/cdcp/finetuned_models/CDCP_acc_Meta-Llama-3.1-8B-Instruct-bnb-4bit
09/09/2024 15:50:53 - INFO - llamafactory.model.loader - all params: 8,051,232,768


In [26]:
# Load the test split. The file is only read, so it is opened read-only
# ("r+" would needlessly require write permission) with an explicit encoding.
with open(test_dataset_file, "r", encoding="utf-8") as fh:
    test_dataset = json.load(fh)


In [28]:

# One prompt per test sample (instruction followed directly by the input) and
# the matching gold answer.
# NOTE(review): the "\nUser:" prefix is presumably not part of the prompts seen
# during training -- confirm that this train/inference difference is intended.
test_prompts = [
    "\nUser:" + sample["instruction"] + sample["input"]
    for sample in test_dataset
]
test_grounds = [sample["output"] for sample in test_dataset]

In [29]:
# Both lists must have one entry per test sample.
len(test_prompts), len(test_grounds)

(150, 150)

In [30]:
# Query the fine-tuned model once per test prompt and keep the full reply.
test_predictions = []

for prompt in tqdm(test_prompts):
    # Single-turn conversation; the streamed chunks are concatenated into one string.
    messages = [{"role": "user", "content": prompt}]
    response = "".join(model.stream_chat(messages))
    test_predictions.append({"role": "assistant", "content": response})

100%|██████████| 150/150 [02:54<00:00,  1.16s/it]


In [31]:
# Persist gold answers and raw model replies side by side for later scoring.
results_d = {"ground_truths": test_grounds, "predictions": test_predictions}
results_path = os.path.join(OUTPUT_DIR, f"CDCP_{TASK}_results_{NB_EPOCHS}.pickle")

with open(results_path, "wb") as fh:
    pickle.dump(results_d, fh)


In [32]:
# Reload the results written by the previous cell.
# (pickle is acceptable here only because the file is produced by this notebook.)
results_path = os.path.join(OUTPUT_DIR, f"CDCP_{TASK}_results_{NB_EPOCHS}.pickle")

with open(results_path, "rb") as fh:
    results = pickle.load(fh)

In [33]:
# Inspect the reloaded dict (keys: "ground_truths" and "predictions").
results

{'ground_truths': ['{"component_types": ["fact", "value", "policy"]}',
  '{"component_types": ["value", "policy", "policy", "policy", "policy"]}',
  '{"component_types": ["policy", "policy"]}',
  '{"component_types": ["value", "value"]}',
  '{"component_types": ["value", "value", "value", "testimony", "value"]}',
  '{"component_types": ["testimony", "testimony", "testimony", "value", "value", "policy"]}',
  '{"component_types": ["value", "fact", "testimony", "testimony", "testimony"]}',
  '{"component_types": ["value", "testimony", "value", "value", "policy", "policy"]}',
  '{"component_types": ["testimony", "fact", "fact", "policy", "fact", "value", "fact", "fact", "fact", "value"]}',
  '{"component_types": ["value", "value", "value", "value"]}',
  '{"component_types": ["fact", "value", "testimony", "testimony", "testimony", "policy", "value", "value"]}',
  '{"component_types": ["fact", "value", "policy", "value", "policy", "policy"]}',
  '{"component_types": ["value", "value", "fact"

In [34]:
# Pick the post-processing helper that matches TASK. Each helper receives the
# results dict and returns a (ground truths, predictions) pair.
if TASK == 'acc':
    task_grounds, task_preds = post_process_acc(results)

elif TASK == 'ari':
    task_grounds, task_preds = post_process_ari(results)

elif TASK == 'arc':
    task_grounds, task_preds = post_process_arc(results)

elif TASK == 'joint':
    task_grounds, task_preds = post_process_joint(results)

else:
    # Fail here with a clear message instead of a NameError in the scoring cells.
    raise ValueError(f"Unsupported TASK {TASK!r}; expected one of 'acc', 'ari', 'arc', 'joint'.")

In [35]:
# Per-class precision / recall / F1. `zero_division=0` scores undefined metrics
# (e.g. a predicted label with no true samples) as 0 explicitly, which yields
# the same numbers as the default without the repeated warnings.
print(classification_report(task_grounds, task_preds, digits=3, zero_division=0))

              precision    recall  f1-score   support

        fact      0.305     0.765     0.436       132
   inference      0.000     0.000     0.000         0
      policy      0.650     0.752     0.697       153
   reference      0.500     1.000     0.667         1
   testimony      0.878     0.652     0.748       244
       value      0.820     0.552     0.660       496

    accuracy                          0.634      1026
   macro avg      0.526     0.620     0.535      1026
weighted avg      0.742     0.634     0.658      1026



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [36]:
# Store the report as a dict next to the adapter. `zero_division=0` keeps the
# same values as the default while avoiding the undefined-metric warnings.
report_path = os.path.join(OUTPUT_DIR, "classification_report.pickle")
report_dict = classification_report(task_grounds, task_preds, output_dict=True, zero_division=0)

with open(report_path, 'wb') as fh:
    pickle.dump(report_dict, fh)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
