# Fine-tuning on the Comics dataset for emotion classification

## Libraries

In [1]:
# Run this cell only once to install LLaMA-Factory

# %cd ..
# %rm -rf LLaMA-Factory
# !git clone https://github.com/hiyouga/LLaMA-Factory.git
# %cd LLaMA-Factory
# %ls
# !pip install -e .[torch,bitsandbytes]

In [2]:
# !pip uninstall -y pydantic
# !pip install pydantic==1.10.9 # 

# !pip uninstall -y gradio
# !pip install gradio==3.48.0

# !pip uninstall -y bitsandbytes
# !pip install --upgrade bitsandbytes

# !pip install tqdm
# !pip install ipywidgets
# !pip install scikit-learn

# Restart kernel afterwards.

In [3]:
import os
import ast
import sys
import json
import torch
import pickle
import subprocess

sys.path.append('../')

import pandas as pd

from tqdm.notebook import tqdm
from llamafactory.chat import ChatModel
from llamafactory.extras.misc import torch_gc
from sklearn.metrics import classification_report
# from utils.post_processing import post_process_acc

In [4]:
# Warn (without stopping the notebook) when PyTorch cannot see a CUDA device.
if torch.cuda.is_available() is not True:
    print("Please set up a GPU before using LLaMA Factory...")

## Parameters

In [5]:
# Project root: the parent of the current working directory, as a normalised absolute path.
ROOT_DIR = os.path.normpath(os.path.join(os.getcwd(), os.pardir))

In [6]:
# "am_work" directory located next to the project root.
AM_DIR = os.path.abspath(os.path.join(ROOT_DIR, os.pardir, "am_work"))

In [7]:
DATASET_DIR = os.path.join(ROOT_DIR, "datasets")

In [8]:
LLAMA_FACTORY_DIR = os.path.join(AM_DIR, "coling_2025/LLaMA-Factory")

In [9]:
LLAMA_FACTORY_DIR

'/Utilisateurs/umushtaq/am_work/coling_2025/LLaMA-Factory'

In [10]:
BASE_MODEL = "unsloth/Qwen2-7B-Instruct-bnb-4bit"

In [11]:
TASK = "er"

In [12]:
# TAGS = 1
# TAGS = "wtags" if TAGS == 1 else "wotags"

In [13]:
# CONTEXT = "essay" # essay or paragraph

In [14]:
OUTPUT_DIR = os.path.join(ROOT_DIR, "finetuned_models", f"""comics_{TASK}_{BASE_MODEL.split("/")[1]}""")

In [15]:
OUTPUT_DIR

'/Utilisateurs/umushtaq/er_work/finetuned_models/comics_er_Qwen2-7B-Instruct-bnb-4bit'

## Load Dataset

In [16]:
# File names of the train / test splits.
train_dataset_name = "comics_utterance_train.json"
test_dataset_name = "comics_utterance_test.json"

# Full paths of both splits inside DATASET_DIR.
train_dataset_file, test_dataset_file = (
    os.path.join(DATASET_DIR, split_name)
    for split_name in (train_dataset_name, test_dataset_name)
)

In [17]:
train_dataset_file, test_dataset_file

('/Utilisateurs/umushtaq/er_work/datasets/comics_utterance_train.json',
 '/Utilisateurs/umushtaq/er_work/datasets/comics_utterance_test.json')

## Fine-tune Model

In [18]:
# Directory that receives the generated fine-tuning argument files.
# exist_ok=True makes the cell safe to re-run and removes the check-then-create
# race; makedirs also creates any missing parent directory.
os.makedirs(os.path.join(ROOT_DIR, "ft_arg_files"), exist_ok=True)

In [19]:
# *** TRAIN FILE ***

# model_name = f"""{train_dataset_name.split(".")[0].split("train")[0]}{BASE_MODEL.split("/")[1]}"""

# Path of the JSON file holding the training arguments:
#   <ROOT_DIR>/ft_arg_files/<prefix><model>.json
# where <prefix> is the train file name up to (excluding) the word "train"
# and <model> is the part of BASE_MODEL after the "/".
train_file = os.path.join(ROOT_DIR, "ft_arg_files", f"""{train_dataset_name.split(".")[0].split("train")[0]}{BASE_MODEL.split("/")[1]}.json""")

In [20]:
# Registry entry for the training set: location of the file and which JSON
# fields are used as prompt / query / response.
dataset_info_line = dict(
    file_name=train_dataset_file,
    columns=dict(prompt="instruction", query="input", response="output"),
)

In [21]:
dataset_info_line

{'file_name': '/Utilisateurs/umushtaq/er_work/datasets/comics_utterance_train.json',
 'columns': {'prompt': 'instruction', 'query': 'input', 'response': 'output'}}

In [22]:
# Register the training set under the key "comics_er" in LLaMA-Factory's
# dataset registry (data/dataset_info.json inside the LLaMA-Factory checkout).
dataset_info_path = os.path.join(LLAMA_FACTORY_DIR, "data/dataset_info.json")

# JSON is read and written as UTF-8 explicitly instead of the locale default.
with open(dataset_info_path, "r", encoding="utf-8") as jsonFile:
    data = json.load(jsonFile)

data["comics_er"] = dataset_info_line

# indent=2 keeps the shared registry readable and diff-able; a plain dump would
# collapse the whole file onto a single line on every run.
with open(dataset_info_path, "w", encoding="utf-8") as jsonFile:
    json.dump(data, jsonFile, indent=2)

### Training Args

In [23]:
NB_EPOCHS = 3

In [24]:
# Training arguments; they are written to `train_file` as JSON and handed to
# `llamafactory-cli train` in the following cells.
args = dict(
  stage="sft",                           # do supervised fine-tuning
  do_train=True,
  model_name_or_path=BASE_MODEL,         # base model to fine-tune (BASE_MODEL, a bnb-4bit Qwen2-7B-Instruct checkpoint)
  dataset="comics_er",           # dataset key registered in data/dataset_info.json
  template="qwen",                     # use the qwen prompt template
  finetuning_type="lora",                # use LoRA adapters to save memory
  lora_target="all",                     # attach LoRA adapters to all linear layers
  output_dir=OUTPUT_DIR,                 # the path to save LoRA adapters
  overwrite_output_dir=True,             # overrides existing output contents
  per_device_train_batch_size=2,         # the batch size
  gradient_accumulation_steps=4,         # the gradient accumulation steps
  lr_scheduler_type="cosine",            # use cosine learning rate scheduler
  logging_steps=10,                      # log every 10 steps
  warmup_ratio=0.1,                      # use warmup scheduler
  save_steps=3000,                       # save checkpoint every 3000 steps
  learning_rate=5e-5,                    # the learning rate
  num_train_epochs=NB_EPOCHS,            # the epochs of training
  max_samples=5000,                       # use at most 5000 examples in each dataset
  max_grad_norm=1.0,                     # clip gradient norm to 1.0
  quantization_bit=4,                    # use 4-bit QLoRA
  loraplus_lr_ratio=16.0,                # use LoRA+ algorithm with lambda=16.0
  fp16=True,                             # use float16 mixed precision training
  report_to="none"                       # discards wandb
)

In [25]:
# Write the training arguments to `train_file` for llamafactory-cli.
# The context manager guarantees the file is flushed and closed before the
# training subprocess reads it, instead of relying on garbage collection of an
# anonymous file object.
with open(train_file, "w", encoding="utf-8") as fh:
    json.dump(args, fh, indent=2)

In [26]:
train_file

'/Utilisateurs/umushtaq/er_work/ft_arg_files/comics_utterance_Qwen2-7B-Instruct-bnb-4bit.json'

In [27]:
# Launch `llamafactory-cli train <train_file>` as a child process whose working
# directory is the LLaMA-Factory checkout. Popen returns immediately; the next
# cell blocks on p.wait() and shows the exit code.
# NOTE(review): the cwd is presumably needed so the CLI finds its relative
# "data/dataset_info.json" — confirm against the LLaMA-Factory docs.
p = subprocess.Popen(["llamafactory-cli", "train", train_file], cwd=LLAMA_FACTORY_DIR)

In [28]:
p.wait()

09/17/2024 20:36:11 - INFO - llamafactory.cli - Initializing distributed tasks at: 127.0.0.1:28716


W0917 20:36:12.734000 139658647946560 torch/distributed/run.py:757] 
W0917 20:36:12.734000 139658647946560 torch/distributed/run.py:757] *****************************************
W0917 20:36:12.734000 139658647946560 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0917 20:36:12.734000 139658647946560 torch/distributed/run.py:757] *****************************************


09/17/2024 20:36:21 - INFO - llamafactory.hparams.parser - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, compute dtype: torch.float16
09/17/2024 20:36:21 - INFO - llamafactory.hparams.parser - Process rank: 1, device: cuda:1, n_gpu: 1, distributed training: True, compute dtype: torch.float16


[INFO|tokenization_utils_base.py:2269] 2024-09-17 20:36:22,157 >> loading file vocab.json from cache at /Utilisateurs/umushtaq/.cache/huggingface/hub/models--unsloth--Qwen2-7B-Instruct-bnb-4bit/snapshots/26ac6deb2f1f4929ecc7ece49178be31aa1e0755/vocab.json
[INFO|tokenization_utils_base.py:2269] 2024-09-17 20:36:22,157 >> loading file merges.txt from cache at /Utilisateurs/umushtaq/.cache/huggingface/hub/models--unsloth--Qwen2-7B-Instruct-bnb-4bit/snapshots/26ac6deb2f1f4929ecc7ece49178be31aa1e0755/merges.txt
[INFO|tokenization_utils_base.py:2269] 2024-09-17 20:36:22,157 >> loading file tokenizer.json from cache at /Utilisateurs/umushtaq/.cache/huggingface/hub/models--unsloth--Qwen2-7B-Instruct-bnb-4bit/snapshots/26ac6deb2f1f4929ecc7ece49178be31aa1e0755/tokenizer.json
[INFO|tokenization_utils_base.py:2269] 2024-09-17 20:36:22,158 >> loading file added_tokens.json from cache at /Utilisateurs/umushtaq/.cache/huggingface/hub/models--unsloth--Qwen2-7B-Instruct-bnb-4bit/snapshots/26ac6deb2f1f4

09/17/2024 20:36:22 - INFO - llamafactory.data.template - Replace eos token: <|im_end|>
09/17/2024 20:36:22 - INFO - llamafactory.data.template - Replace eos token: <|im_end|>
09/17/2024 20:36:22 - INFO - llamafactory.data.loader - Loading dataset /Utilisateurs/umushtaq/er_work/datasets/comics_utterance_train.json...


[INFO|tokenization_utils_base.py:2513] 2024-09-17 20:36:22,408 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Converting format of dataset: 100%|██████████| 3506/3506 [00:00<00:00, 47574.50 examples/s]


09/17/2024 20:36:23 - INFO - llamafactory.data.loader - Loading dataset /Utilisateurs/umushtaq/er_work/datasets/comics_utterance_train.json...


Running tokenizer on dataset: 100%|██████████| 3506/3506 [00:03<00:00, 1030.49 examples/s]


training example:
input_ids:
[151644, 8948, 198, 2610, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 14374, 1446, 525, 458, 6203, 304, 5748, 5956, 18320, 13, 1446, 525, 2661, 458, 8621, 4160, 578, 504, 264, 19724, 2311, 43810, 553, 366, 1381, 1472, 1381, 29, 9492, 13, 4615, 3383, 374, 311, 48129, 1817, 21532, 681, 438, 825, 476, 803, 279, 2701, 19772, 6846, 25, 330, 10743, 261, 1, 320, 1093, 701, 330, 4839, 70, 590, 1, 320, 17625, 701, 330, 90371, 1, 320, 11419, 701, 330, 59665, 2090, 1, 320, 7778, 701, 330, 23043, 9671, 1, 320, 59782, 8, 476, 330, 79771, 1, 320, 26145, 568, 1446, 1969, 470, 264, 1140, 315, 19772, 6846, 304, 2701, 4718, 3561, 25, 5212, 1607, 22504, 5956, 16833, 788, 4383, 73353, 4790, 320, 495, 11583, 330, 73353, 4790, 320, 495, 9940, 2503, 330, 73353, 4790, 320, 495, 8, 92446, 1380, 1817, 2392, 330, 73353, 16833, 320, 495, 9940, 374, 12575, 553, 825, 16144, 803, 315, 279, 2701, 94775, 19772, 536, 9201, 25, 330, 1093, 497, 330, 17625, 497, 330, 11419, 497,

[INFO|configuration_utils.py:733] 2024-09-17 20:36:27,058 >> loading configuration file config.json from cache at /Utilisateurs/umushtaq/.cache/huggingface/hub/models--unsloth--Qwen2-7B-Instruct-bnb-4bit/snapshots/26ac6deb2f1f4929ecc7ece49178be31aa1e0755/config.json
[INFO|configuration_utils.py:800] 2024-09-17 20:36:27,061 >> Model config Qwen2Config {
  "_name_or_path": "unsloth/Qwen2-7B-Instruct-bnb-4bit",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 28,
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "pad_token_id": 151643,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "

09/17/2024 20:37:23 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
09/17/2024 20:37:23 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
09/17/2024 20:37:23 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
09/17/2024 20:37:23 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA
09/17/2024 20:37:23 - INFO - llamafactory.model.model_utils.misc - Found linear modules: v_proj,o_proj,q_proj,down_proj,gate_proj,k_proj,up_proj
09/17/2024 20:37:24 - INFO - llamafactory.model.loader - trainable params: 20,185,088 || all params: 7,635,801,600 || trainable%: 0.2643
09/17/2024 20:37:24 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
09/17/2024 20:37:24 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
09/17/2024 20:37:24 - INFO - llamafactory.model.adapter - Upcasting traina

[INFO|trainer.py:648] 2024-09-17 20:37:24,784 >> Using auto half precision backend


09/17/2024 20:37:25 - INFO - llamafactory.train.trainer_utils - Using LoRA+ optimizer with loraplus lr ratio 16.00.


[INFO|trainer.py:2134] 2024-09-17 20:37:25,248 >> ***** Running training *****
[INFO|trainer.py:2135] 2024-09-17 20:37:25,261 >>   Num examples = 3,506
[INFO|trainer.py:2136] 2024-09-17 20:37:25,261 >>   Num Epochs = 3
[INFO|trainer.py:2137] 2024-09-17 20:37:25,261 >>   Instantaneous batch size per device = 2
[INFO|trainer.py:2140] 2024-09-17 20:37:25,261 >>   Total train batch size (w. parallel, distributed & accumulation) = 16
[INFO|trainer.py:2141] 2024-09-17 20:37:25,261 >>   Gradient Accumulation steps = 4
[INFO|trainer.py:2142] 2024-09-17 20:37:25,261 >>   Total optimization steps = 657
[INFO|trainer.py:2143] 2024-09-17 20:37:25,264 >>   Number of trainable parameters = 20,185,088
  2%|▏         | 10/657 [00:26<28:37,  2.65s/it]

{'loss': 0.5513, 'grad_norm': 1.1023401021957397, 'learning_rate': 7.5757575757575764e-06, 'epoch': 0.05}


  3%|▎         | 20/657 [00:53<28:03,  2.64s/it]

{'loss': 0.2996, 'grad_norm': 1.0976431369781494, 'learning_rate': 1.5151515151515153e-05, 'epoch': 0.09}


  5%|▍         | 30/657 [01:19<27:48,  2.66s/it]

{'loss': 0.2764, 'grad_norm': 1.1571595668792725, 'learning_rate': 2.272727272727273e-05, 'epoch': 0.14}


  6%|▌         | 40/657 [01:46<27:13,  2.65s/it]

{'loss': 0.2651, 'grad_norm': 1.1237519979476929, 'learning_rate': 3.0303030303030306e-05, 'epoch': 0.18}


  8%|▊         | 50/657 [02:12<26:35,  2.63s/it]

{'loss': 0.2523, 'grad_norm': 0.5649328827857971, 'learning_rate': 3.787878787878788e-05, 'epoch': 0.23}


  9%|▉         | 60/657 [02:39<26:38,  2.68s/it]

{'loss': 0.2438, 'grad_norm': 0.552849292755127, 'learning_rate': 4.545454545454546e-05, 'epoch': 0.27}


 11%|█         | 70/657 [03:05<25:39,  2.62s/it]

{'loss': 0.2467, 'grad_norm': 1.1220053434371948, 'learning_rate': 4.999434882941783e-05, 'epoch': 0.32}


 12%|█▏        | 80/657 [03:31<25:16,  2.63s/it]

{'loss': 0.245, 'grad_norm': 0.7973174452781677, 'learning_rate': 4.993080249767159e-05, 'epoch': 0.36}


 14%|█▎        | 90/657 [03:58<25:01,  2.65s/it]

{'loss': 0.2366, 'grad_norm': 0.9576094150543213, 'learning_rate': 4.979682598982912e-05, 'epoch': 0.41}


 15%|█▌        | 100/657 [04:24<24:36,  2.65s/it]

{'loss': 0.2421, 'grad_norm': 1.114205241203308, 'learning_rate': 4.959279779306672e-05, 'epoch': 0.46}


 17%|█▋        | 110/657 [04:51<24:09,  2.65s/it]

{'loss': 0.2407, 'grad_norm': 0.6632283926010132, 'learning_rate': 4.931929429243368e-05, 'epoch': 0.5}


 18%|█▊        | 120/657 [05:17<23:47,  2.66s/it]

{'loss': 0.2234, 'grad_norm': 0.5057310461997986, 'learning_rate': 4.8977088142549285e-05, 'epoch': 0.55}


 20%|█▉        | 130/657 [05:44<23:02,  2.62s/it]

{'loss': 0.2275, 'grad_norm': 0.4836157262325287, 'learning_rate': 4.856714608483312e-05, 'epoch': 0.59}


 21%|██▏       | 140/657 [06:10<22:35,  2.62s/it]

{'loss': 0.2254, 'grad_norm': 0.6469013690948486, 'learning_rate': 4.8090626216434944e-05, 'epoch': 0.64}


 23%|██▎       | 150/657 [06:36<22:29,  2.66s/it]

{'loss': 0.2149, 'grad_norm': 1.969303011894226, 'learning_rate': 4.754887471857969e-05, 'epoch': 0.68}


 24%|██▍       | 160/657 [07:03<22:14,  2.68s/it]

{'loss': 0.2551, 'grad_norm': 1.0166857242584229, 'learning_rate': 4.694342205356988e-05, 'epoch': 0.73}


 26%|██▌       | 170/657 [07:29<21:25,  2.64s/it]

{'loss': 0.2194, 'grad_norm': 1.0679407119750977, 'learning_rate': 4.627597864118919e-05, 'epoch': 0.78}


 27%|██▋       | 180/657 [07:56<20:55,  2.63s/it]

{'loss': 0.2377, 'grad_norm': 1.2205135822296143, 'learning_rate': 4.554843002672129e-05, 'epoch': 0.82}


 29%|██▉       | 190/657 [08:22<20:30,  2.63s/it]

{'loss': 0.2251, 'grad_norm': 0.7987896203994751, 'learning_rate': 4.476283155423465e-05, 'epoch': 0.87}


 30%|███       | 200/657 [08:48<20:14,  2.66s/it]

{'loss': 0.235, 'grad_norm': 0.7591536045074463, 'learning_rate': 4.3921402560181175e-05, 'epoch': 0.91}


 32%|███▏      | 210/657 [09:15<19:41,  2.64s/it]

{'loss': 0.222, 'grad_norm': 0.6437086462974548, 'learning_rate': 4.302652010371205e-05, 'epoch': 0.96}


 33%|███▎      | 220/657 [09:41<19:15,  2.64s/it]

{'loss': 0.2074, 'grad_norm': 0.789050817489624, 'learning_rate': 4.208071225142282e-05, 'epoch': 1.0}


 35%|███▌      | 230/657 [10:08<18:55,  2.66s/it]

{'loss': 0.1949, 'grad_norm': 0.6989786028862, 'learning_rate': 4.108665093549844e-05, 'epoch': 1.05}


 37%|███▋      | 240/657 [10:34<18:22,  2.64s/it]

{'loss': 0.2118, 'grad_norm': 0.6337319612503052, 'learning_rate': 4.0047144405434175e-05, 'epoch': 1.09}


 38%|███▊      | 250/657 [11:01<17:53,  2.64s/it]

{'loss': 0.1844, 'grad_norm': 0.5998358726501465, 'learning_rate': 3.896512929465659e-05, 'epoch': 1.14}


 40%|███▉      | 260/657 [11:27<17:29,  2.64s/it]

{'loss': 0.2047, 'grad_norm': 0.7406709790229797, 'learning_rate': 3.7843662324456466e-05, 'epoch': 1.19}


 41%|████      | 270/657 [11:53<16:56,  2.63s/it]

{'loss': 0.2071, 'grad_norm': 1.0060415267944336, 'learning_rate': 3.668591166867035e-05, 'epoch': 1.23}


 43%|████▎     | 280/657 [12:20<16:30,  2.63s/it]

{'loss': 0.1986, 'grad_norm': 1.3233298063278198, 'learning_rate': 3.5495148003505717e-05, 'epoch': 1.28}


 44%|████▍     | 290/657 [12:46<16:04,  2.63s/it]

{'loss': 0.2183, 'grad_norm': 0.4204199016094208, 'learning_rate': 3.4274735267794245e-05, 'epoch': 1.32}


 46%|████▌     | 300/657 [13:12<15:34,  2.62s/it]

{'loss': 0.1777, 'grad_norm': 0.4864746332168579, 'learning_rate': 3.3028121159775656e-05, 'epoch': 1.37}


 47%|████▋     | 310/657 [13:39<15:09,  2.62s/it]

{'loss': 0.1799, 'grad_norm': 0.9694530963897705, 'learning_rate': 3.1758827397259074e-05, 'epoch': 1.41}


 49%|████▊     | 320/657 [14:05<14:47,  2.63s/it]

{'loss': 0.1873, 'grad_norm': 0.5777468681335449, 'learning_rate': 3.0470439768677116e-05, 'epoch': 1.46}


 50%|█████     | 330/657 [14:31<14:20,  2.63s/it]

{'loss': 0.2075, 'grad_norm': 0.6403128504753113, 'learning_rate': 2.9166598003138766e-05, 'epoch': 1.51}


 52%|█████▏    | 340/657 [14:58<13:56,  2.64s/it]

{'loss': 0.1824, 'grad_norm': 1.1541911363601685, 'learning_rate': 2.785098548809844e-05, 'epoch': 1.55}


 53%|█████▎    | 350/657 [15:24<13:22,  2.61s/it]

{'loss': 0.1875, 'grad_norm': 1.0687892436981201, 'learning_rate': 2.652731886368906e-05, 'epoch': 1.6}


 55%|█████▍    | 360/657 [15:51<13:01,  2.63s/it]

{'loss': 0.1956, 'grad_norm': 0.5424371361732483, 'learning_rate': 2.5199337523115418e-05, 'epoch': 1.64}


 56%|█████▋    | 370/657 [16:17<12:43,  2.66s/it]

{'loss': 0.1954, 'grad_norm': 0.5226247906684875, 'learning_rate': 2.3870793048769537e-05, 'epoch': 1.69}


 58%|█████▊    | 380/657 [16:44<12:15,  2.65s/it]

{'loss': 0.1603, 'grad_norm': 0.4974578619003296, 'learning_rate': 2.254543861391121e-05, 'epoch': 1.73}


 59%|█████▉    | 390/657 [17:10<11:41,  2.63s/it]

{'loss': 0.2102, 'grad_norm': 0.7681999802589417, 'learning_rate': 2.1227018379854383e-05, 'epoch': 1.78}


 61%|██████    | 400/657 [17:36<11:15,  2.63s/it]

{'loss': 0.1968, 'grad_norm': 0.6189435720443726, 'learning_rate': 1.991925691861241e-05, 'epoch': 1.82}


 62%|██████▏   | 410/657 [18:03<10:52,  2.64s/it]

{'loss': 0.1895, 'grad_norm': 0.6123371720314026, 'learning_rate': 1.8625848690883686e-05, 'epoch': 1.87}


 64%|██████▍   | 420/657 [18:29<10:22,  2.63s/it]

{'loss': 0.1894, 'grad_norm': 0.8038066625595093, 'learning_rate': 1.735044760910251e-05, 'epoch': 1.92}


 65%|██████▌   | 430/657 [18:56<09:58,  2.64s/it]

{'loss': 0.1818, 'grad_norm': 0.6552383303642273, 'learning_rate': 1.609665671503987e-05, 'epoch': 1.96}


 67%|██████▋   | 440/657 [19:22<09:31,  2.63s/it]

{'loss': 0.1746, 'grad_norm': 0.5469232201576233, 'learning_rate': 1.4868018001115166e-05, 'epoch': 2.01}


 68%|██████▊   | 450/657 [19:48<09:06,  2.64s/it]

{'loss': 0.1246, 'grad_norm': 0.4772845208644867, 'learning_rate': 1.3668002404174047e-05, 'epoch': 2.05}


 70%|███████   | 460/657 [20:15<08:38,  2.63s/it]

{'loss': 0.1173, 'grad_norm': 0.6368279457092285, 'learning_rate': 1.2500000000000006e-05, 'epoch': 2.1}


 72%|███████▏  | 470/657 [20:41<08:13,  2.64s/it]

{'loss': 0.0953, 'grad_norm': 0.7478751540184021, 'learning_rate': 1.136731042626073e-05, 'epoch': 2.14}


 73%|███████▎  | 480/657 [21:07<07:48,  2.64s/it]

{'loss': 0.0988, 'grad_norm': 0.8873948454856873, 'learning_rate': 1.027313356094443e-05, 'epoch': 2.19}


 75%|███████▍  | 490/657 [21:34<07:19,  2.63s/it]

{'loss': 0.1025, 'grad_norm': 0.7643134593963623, 'learning_rate': 9.220560482619956e-06, 'epoch': 2.23}


 76%|███████▌  | 500/657 [22:00<06:53,  2.63s/it]

{'loss': 0.0899, 'grad_norm': 0.7233508229255676, 'learning_rate': 8.21256473805811e-06, 'epoch': 2.28}


 78%|███████▊  | 510/657 [22:26<06:26,  2.63s/it]

{'loss': 0.1002, 'grad_norm': 0.9916441440582275, 'learning_rate': 7.251993941883428e-06, 'epoch': 2.33}


 79%|███████▉  | 520/657 [22:53<06:00,  2.63s/it]

{'loss': 0.1052, 'grad_norm': 1.2063912153244019, 'learning_rate': 6.3415617319875716e-06, 'epoch': 2.37}


 81%|████████  | 530/657 [23:19<05:34,  2.63s/it]

{'loss': 0.1063, 'grad_norm': 0.6956425905227661, 'learning_rate': 5.483840103430599e-06, 'epoch': 2.42}


 82%|████████▏ | 540/657 [23:46<05:10,  2.66s/it]

{'loss': 0.1091, 'grad_norm': 0.8245587944984436, 'learning_rate': 4.681252142486841e-06, 'epoch': 2.46}


 84%|████████▎ | 550/657 [24:12<04:40,  2.62s/it]

{'loss': 0.1132, 'grad_norm': 0.853794276714325, 'learning_rate': 3.936065181362211e-06, 'epoch': 2.51}


 85%|████████▌ | 560/657 [24:38<04:14,  2.62s/it]

{'loss': 0.1084, 'grad_norm': 0.8456055521965027, 'learning_rate': 3.2503843929207413e-06, 'epoch': 2.55}


 87%|████████▋ | 570/657 [25:05<03:49,  2.64s/it]

{'loss': 0.1011, 'grad_norm': 0.648411214351654, 'learning_rate': 2.6261468435155978e-06, 'epoch': 2.6}


 88%|████████▊ | 580/657 [25:31<03:22,  2.62s/it]

{'loss': 0.1002, 'grad_norm': 0.6906623244285583, 'learning_rate': 2.065116020725433e-06, 'epoch': 2.65}


 90%|████████▉ | 590/657 [25:58<02:56,  2.64s/it]

{'loss': 0.1123, 'grad_norm': 1.2719172239303589, 'learning_rate': 1.5688768514553587e-06, 'epoch': 2.69}


 91%|█████████▏| 600/657 [26:24<02:31,  2.65s/it]

{'loss': 0.1025, 'grad_norm': 0.9647970199584961, 'learning_rate': 1.138831224476533e-06, 'epoch': 2.74}


 93%|█████████▎| 610/657 [26:51<02:04,  2.64s/it]

{'loss': 0.1134, 'grad_norm': 0.6448526978492737, 'learning_rate': 7.761940300533399e-07, 'epoch': 2.78}


 94%|█████████▍| 620/657 [27:17<01:37,  2.64s/it]

{'loss': 0.0991, 'grad_norm': 0.8042908310890198, 'learning_rate': 4.819897278462521e-07, 'epoch': 2.83}


 96%|█████████▌| 630/657 [27:43<01:11,  2.64s/it]

{'loss': 0.1026, 'grad_norm': 0.809659481048584, 'learning_rate': 2.5704945278623436e-07, 'epoch': 2.87}


 97%|█████████▋| 640/657 [28:10<00:44,  2.61s/it]

{'loss': 0.1019, 'grad_norm': 0.8860611915588379, 'learning_rate': 1.0200866709657864e-07, 'epoch': 2.92}


 99%|█████████▉| 650/657 [28:36<00:18,  2.62s/it]

{'loss': 0.101, 'grad_norm': 0.8921215534210205, 'learning_rate': 1.730536509532421e-08, 'epoch': 2.96}


100%|██████████| 657/657 [28:54<00:00,  2.63s/it][INFO|trainer.py:3503] 2024-09-17 21:06:20,289 >> Saving model checkpoint to /Utilisateurs/umushtaq/er_work/finetuned_models/comics_er_Qwen2-7B-Instruct-bnb-4bit/checkpoint-657
[INFO|configuration_utils.py:733] 2024-09-17 21:06:20,681 >> loading configuration file config.json from cache at /Utilisateurs/umushtaq/.cache/huggingface/hub/models--unsloth--Qwen2-7B-Instruct-bnb-4bit/snapshots/26ac6deb2f1f4929ecc7ece49178be31aa1e0755/config.json
[INFO|configuration_utils.py:800] 2024-09-17 21:06:20,682 >> Model config Qwen2Config {
  "_name_or_path": "unsloth/Qwen2-7B-Instruct",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 28,
  "num_hidden_layer

{'train_runtime': 1738.738, 'train_samples_per_second': 6.049, 'train_steps_per_second': 0.378, 'train_loss': 0.1841895292701605, 'epoch': 3.0}


[INFO|configuration_utils.py:733] 2024-09-17 21:06:24,304 >> loading configuration file config.json from cache at /Utilisateurs/umushtaq/.cache/huggingface/hub/models--unsloth--Qwen2-7B-Instruct-bnb-4bit/snapshots/26ac6deb2f1f4929ecc7ece49178be31aa1e0755/config.json
[INFO|configuration_utils.py:800] 2024-09-17 21:06:24,304 >> Model config Qwen2Config {
  "_name_or_path": "unsloth/Qwen2-7B-Instruct",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 28,
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "pad_token_id": 151643,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
 

***** train metrics *****
  epoch                    =      2.9966
  total_flos               = 101340307GF
  train_loss               =      0.1842
  train_runtime            =  0:28:58.73
  train_samples_per_second =       6.049
  train_steps_per_second   =       0.378


[INFO|modelcard.py:449] 2024-09-17 21:06:25,641 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}


0

## Inference on the fine-tuned model

In [29]:
OUTPUT_DIR

'/Utilisateurs/umushtaq/er_work/finetuned_models/comics_er_Qwen2-7B-Instruct-bnb-4bit'

In [30]:
os.listdir(OUTPUT_DIR)

['trainer_log.jsonl',
 'checkpoint-657',
 'README.md',
 'adapter_model.safetensors',
 'adapter_config.json',
 'tokenizer_config.json',
 'special_tokens_map.json',
 'added_tokens.json',
 'vocab.json',
 'merges.txt',
 'tokenizer.json',
 'training_args.bin',
 'train_results.json',
 'all_results.json',
 'trainer_state.json']

In [31]:
# Inference arguments passed to ChatModel; note this rebinds the name `args`
# that previously held the training arguments.
args = dict(
  model_name_or_path=BASE_MODEL, # same base model as in training (BASE_MODEL)
  adapter_name_or_path=OUTPUT_DIR,            # load the saved LoRA adapters
  template="qwen",                     # same as the one in training
  finetuning_type="lora",                  # same as the one in training
  quantization_bit=4,                    # load 4-bit quantized model
)


In [32]:
model = ChatModel(args)

[INFO|tokenization_utils_base.py:2269] 2024-09-17 21:06:28,952 >> loading file vocab.json from cache at /Utilisateurs/umushtaq/.cache/huggingface/hub/models--unsloth--Qwen2-7B-Instruct-bnb-4bit/snapshots/26ac6deb2f1f4929ecc7ece49178be31aa1e0755/vocab.json
[INFO|tokenization_utils_base.py:2269] 2024-09-17 21:06:28,954 >> loading file merges.txt from cache at /Utilisateurs/umushtaq/.cache/huggingface/hub/models--unsloth--Qwen2-7B-Instruct-bnb-4bit/snapshots/26ac6deb2f1f4929ecc7ece49178be31aa1e0755/merges.txt
[INFO|tokenization_utils_base.py:2269] 2024-09-17 21:06:28,956 >> loading file tokenizer.json from cache at /Utilisateurs/umushtaq/.cache/huggingface/hub/models--unsloth--Qwen2-7B-Instruct-bnb-4bit/snapshots/26ac6deb2f1f4929ecc7ece49178be31aa1e0755/tokenizer.json
[INFO|tokenization_utils_base.py:2269] 2024-09-17 21:06:28,958 >> loading file added_tokens.json from cache at /Utilisateurs/umushtaq/.cache/huggingface/hub/models--unsloth--Qwen2-7B-Instruct-bnb-4bit/snapshots/26ac6deb2f1f4

09/17/2024 21:06:29 - INFO - llamafactory.data.template - Replace eos token: <|im_end|>


[INFO|configuration_utils.py:733] 2024-09-17 21:06:29,256 >> loading configuration file config.json from cache at /Utilisateurs/umushtaq/.cache/huggingface/hub/models--unsloth--Qwen2-7B-Instruct-bnb-4bit/snapshots/26ac6deb2f1f4929ecc7ece49178be31aa1e0755/config.json
[INFO|configuration_utils.py:800] 2024-09-17 21:06:29,260 >> Model config Qwen2Config {
  "_name_or_path": "unsloth/Qwen2-7B-Instruct-bnb-4bit",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 3584,
  "initializer_range": 0.02,
  "intermediate_size": 18944,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 28,
  "num_hidden_layers": 28,
  "num_key_value_heads": 4,
  "pad_token_id": 151643,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "

09/17/2024 21:06:29 - INFO - llamafactory.model.model_utils.quantization - Loading ?-bit BITSANDBYTES-quantized model.
09/17/2024 21:06:29 - INFO - llamafactory.model.patcher - Using KV cache for faster generation.


[INFO|modeling_utils.py:3678] 2024-09-17 21:06:29,527 >> loading weights file model.safetensors from cache at /Utilisateurs/umushtaq/.cache/huggingface/hub/models--unsloth--Qwen2-7B-Instruct-bnb-4bit/snapshots/26ac6deb2f1f4929ecc7ece49178be31aa1e0755/model.safetensors
[INFO|modeling_utils.py:1606] 2024-09-17 21:06:29,560 >> Instantiating Qwen2ForCausalLM model under default dtype torch.bfloat16.
[INFO|configuration_utils.py:1038] 2024-09-17 21:06:29,564 >> Generate config GenerationConfig {
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "pad_token_id": 151643
}

[INFO|quantizer_bnb_4bit.py:106] 2024-09-17 21:06:30,712 >> target_dtype {target_dtype} is replaced by `CustomDtype.INT4` for 4-bit BnB quantization
[INFO|modeling_utils.py:4507] 2024-09-17 21:06:32,872 >> All model checkpoint weights were used when initializing Qwen2ForCausalLM.

[INFO|modeling_utils.py:4515] 2024-09-17 21:06:32,874 >> All the weights of Qwen2ForCausalLM were initialized from the model checkpoint at uns

09/17/2024 21:06:33 - INFO - llamafactory.model.model_utils.attention - Using torch SDPA for faster training and inference.
09/17/2024 21:06:33 - INFO - llamafactory.model.adapter - Loaded adapter(s): /Utilisateurs/umushtaq/er_work/finetuned_models/comics_er_Qwen2-7B-Instruct-bnb-4bit
09/17/2024 21:06:33 - INFO - llamafactory.model.loader - all params: 7,635,801,600


In [33]:
# Load the test split. The file is opened read-only: nothing is written back,
# so write permission on the dataset must not be required. JSON is decoded as
# UTF-8 explicitly rather than with the locale default.
with open(test_dataset_file, "r", encoding="utf-8") as fh:
    test_dataset = json.load(fh)

In [34]:
# One prompt per test sample: "\nUser:" + instruction + input (no separator),
# with the reference answers collected in the same order.
# NOTE(review): the training set goes through LLaMA-Factory's dataset loader,
# which may combine instruction and input differently — confirm the inference
# prompt matches the training format.
test_prompts = ["\nUser:" + sample["instruction"] + sample["input"] for sample in test_dataset]
test_grounds = [sample["output"] for sample in test_dataset]

In [None]:
# Generate one answer per test prompt with the fine-tuned model.
test_predictions = []

for prompt in tqdm(test_prompts):
    # single-turn conversation containing only the user message
    conversation = [{"role": "user", "content": prompt}]

    # stream_chat yields text fragments; glue them into the complete reply
    reply = "".join(model.stream_chat(conversation))
    test_predictions.append({"role": "assistant", "content": reply})

    # LLaMA-Factory helper called after every sample (presumably frees cached GPU memory)
    torch_gc()

  0%|          | 0/1776 [00:00<?, ?it/s]

In [None]:
# Store the reference answers together with the raw model outputs so that
# post-processing can be repeated without generating again.
results_d = dict(ground_truths=test_grounds, predictions=test_predictions)

with open(os.path.join(OUTPUT_DIR, f"""comics_{TASK}_results_{NB_EPOCHS}.pickle"""), "wb") as fh:
    pickle.dump(results_d, fh)

In [None]:
os.listdir(OUTPUT_DIR)

## Post-processing

In [None]:
NB_EPOCHS

In [None]:
# Read back the pickle written by the inference section (a file produced by
# this notebook, so unpickling it is acceptable here).
with open(os.path.join(OUTPUT_DIR, f"""comics_{TASK}_results_{NB_EPOCHS}.pickle"""), mode="rb") as fh:
    results = pickle.load(fh)

In [None]:
grounds = results["ground_truths"]
preds = results["predictions"]

In [None]:
len(grounds), len(preds)

In [None]:
# Each reference answer is a JSON string; keep only its list of emotion labels.
grounds_l = [json.loads(raw_ground)['list_emotion_classes'] for raw_ground in grounds]

In [None]:
len(grounds_l)

In [None]:
grounds_l

In [None]:
# Keep only the generated text of every prediction message.
# The list is derived from `results` rather than rebinding `preds` from itself,
# so the cell is idempotent: running it twice no longer fails on strings that
# were already extracted.
preds = [message["content"] for message in results["predictions"]]

In [None]:
# Decode every generated answer into its list of emotion labels.
# Generated text is not guaranteed to be valid JSON with the expected key, so a
# malformed answer is reported and replaced by an empty list instead of
# aborting the whole evaluation with an exception.
preds_l = []

for i, raw_pred in enumerate(preds):
    try:
        labels = json.loads(raw_pred)['list_emotion_classes']
    except (json.JSONDecodeError, KeyError, TypeError) as err:
        print(f"prediction {i}: could not parse model output ({err!r}); using an empty label list")
        labels = []
    preds_l.append(labels)

In [None]:
len(preds_l)

In [None]:
# Print the index of every sample whose number of predicted labels differs
# from the number of reference labels.
for idx, (gold_labels, pred_labels) in enumerate(zip(grounds_l, preds_l)):
    if len(gold_labels) == len(pred_labels):
        continue
    print(idx)

In [None]:
# def opposite_acc(component_type):

#     if component_type == "Premise":
#         return "Claim"
#     elif component_type == "Claim":
#         return "Premise"
#     elif component_type == "MajorClaim":
#         return "Claim"

# def harmonize_preds_acc(grounds, preds):

#     l1, l2 = len(preds), len(grounds)
#     if l1 < l2:
#         diff = l2 - l1
#         preds = preds + [opposite_acc(x) for x in grounds[l1:]]
#     else:
#         preds = preds[:l2]
        
#     return preds 

# def post_process_acc(results):

#     grounds = results["ground_truths"]
#     preds = results["predictions"]
    
#     grounds = [json.loads(x)["component_types"] for x in grounds]  
    
#     preds = [x["content"] for x in preds]    
#     preds = [json.loads(x)["component_types"] for x in preds]
    
#     for i,(x,y) in enumerate(zip(grounds, preds)):
    
#         if len(x) != len(y):
            
#             preds[i] = harmonize_preds_acc(x, y)
            
#     task_preds = [item for row in preds for item in row]
#     task_grounds = [item for row in grounds for item in row]

#     return task_grounds, task_preds

In [None]:
# "Anger" (AN), "Disgust" (DI), "Fear" (FE), "Sadness" (SA), "Surprise" (SU) or "Joy" (JO)

In [None]:
def opposite_acc(component_type):
    """Map an emotion label to a fixed, different label.

    Mapping: AN -> SU, DI -> JO, FE -> SA, SA -> AN, SU -> DI, JO -> FE,
    Neutral -> SA. Any other value (including non-string input) gives ``None``.
    """
    substitutions = (
        ("AN", "SU"),
        ("DI", "JO"),
        ("FE", "SA"),
        ("SA", "AN"),
        ("SU", "DI"),
        ("JO", "FE"),
        ("Neutral", "SA"),
    )
    # compared with == (not hashed) so unhashable inputs fall through to None
    for label, substitute in substitutions:
        if component_type == label:
            return substitute
    return None

def harmonize_preds_acc(grounds, preds):
    """Make ``preds`` exactly as long as ``grounds``.

    Args:
        grounds: reference labels of one sample.
        preds: predicted labels of the same sample.

    Returns:
        ``preds`` cut down to ``len(grounds)`` items when it is at least as
        long, otherwise ``preds`` followed by one filler label per missing
        position. The filler for position ``k`` is ``opposite_acc(grounds[k])``,
        i.e. a label other than the reference one, so each missing prediction
        is scored as an error.
    """
    n_preds, n_grounds = len(preds), len(grounds)

    if n_preds < n_grounds:
        # too few predictions: pad from the reference labels that have no counterpart
        return preds + [opposite_acc(label) for label in grounds[n_preds:]]

    # enough (or too many) predictions: drop the surplus
    return preds[:n_grounds]

In [None]:
# Bring every sample whose prediction count differs from its reference count
# to the reference length (in place, inside preds_l).
for idx in range(min(len(grounds_l), len(preds_l))):
    if len(grounds_l[idx]) != len(preds_l[idx]):
        preds_l[idx] = harmonize_preds_acc(grounds_l[idx], preds_l[idx])

In [None]:
# Flatten the per-sample label lists into one flat label sequence each.
task_preds = []
for sample_preds in preds_l:
    task_preds.extend(sample_preds)

task_grounds = []
for sample_grounds in grounds_l:
    task_grounds.extend(sample_grounds)

In [None]:
task_preds

In [None]:
task_grounds

In [None]:
# Replace any label that is the one-element list ['Neutral'] by the plain
# string 'Neutral'; every other label is kept unchanged.
task_grounds, task_preds = (
    ['Neutral' if label == ['Neutral'] else label for label in labels]
    for labels in (task_grounds, task_preds)
)

In [None]:
# sanity check: 
len(task_preds) == len(task_grounds)

In [None]:
len(task_preds)

In [None]:
len(task_grounds)

In [None]:
#task_grounds, task_preds = post_process_acc(results)

In [None]:
# sanity check: 
#len(task_preds) == len(task_grounds)

## Results

In [None]:
print(classification_report(task_grounds, task_preds, digits=3))

In [None]:
# Save the per-class metrics as a dict next to the model outputs.
# The file name is built from TASK and NB_EPOCHS, like the results pickle, so
# it describes the run that produced it (a fixed "llama3.1_5" suffix would
# mislabel this model and epoch count).
# The report is computed before the file is opened so that a failure cannot
# leave an empty pickle behind.
report_dict = classification_report(task_grounds, task_preds, output_dict=True)

with open(os.path.join(OUTPUT_DIR, f"comics_{TASK}_classification_report_{NB_EPOCHS}.pickle"), "wb") as fh:
    pickle.dump(report_dict, fh)