In [1]:
# Show the GPU attached to this runtime (device name, driver/CUDA version, memory in use).
!nvidia-smi

Mon Jan 19 22:21:39 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# Mount the user's Google Drive at /content/drive; the project folders used
# later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os

# Root folder of the project on the mounted Drive.
PROJECT_ROOT = "/content/drive/MyDrive/moodmate"

# Create the working sub-folders; exist_ok=True makes re-running this cell harmless.
for _subdir in ("models", "data", "logs"):
    os.makedirs(f"{PROJECT_ROOT}/{_subdir}", exist_ok=True)

In [None]:
!pip install transformers accelerate peft datasets bitsandbytes sentencepiece

In [5]:
## Importing necessary libraries
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from transformers.trainer_utils import get_last_checkpoint

In [6]:
# Load the final, cleaned dataset.
# It is read straight from the dataset's GitHub repository so the notebook
# can be reproduced without any local copy of the file.
GOEMOTIONS_CSV_URL = "https://raw.githubusercontent.com/nyarderr/moodmate-data/refs/heads/main/goemotions_final.csv"

df = pd.read_csv(GOEMOTIONS_CSV_URL)
df.head()

Unnamed: 0,text,final_emotion
0,"""If you don't wear BROWN AND ORANGE...YOU DON...",anger
1,"""What do Scottish people look like?"" How I wo...",neutral
2,"### A surprise, to be sure, but a welcome one",surprise
3,"'*Pray*, v. To ask that the laws of the unive...",neutral
4,">it'll get invaded by tankie, unfortunately. ...",neutral


In [7]:
## Convert the dataset to Hugging Face Dataset format
def to_hf_dataset(row):
    """Reduce one DataFrame row to the fields used for training.

    Args:
        row: Mapping-like row exposing the "text" and "final_emotion" keys.

    Returns:
        dict with "text" (copied as-is) and "labels" (the value of "final_emotion"),
        in that key order.
    """
    text, emotion = row["text"], row["final_emotion"]
    return dict(text=text, labels=emotion)

# Select and rename the two needed columns in one vectorized step. This yields the same
# "text"/"labels" frame as applying to_hf_dataset to every row, without a Python call per row.
dataset = df[["text", "final_emotion"]].rename(columns={"final_emotion": "labels"})

# preserve_index=False keeps the pandas index out of the dataset features.
hf_dataset = Dataset.from_pandas(dataset, preserve_index=False)

# Hold out 10% of the rows as a test split; the fixed seed makes the split repeatable.
hf_dataset = hf_dataset.train_test_split(test_size=0.1, seed=42)
hf_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 49529
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 5504
    })
})

In [None]:
## Load Qwen tokenizer and model
#model_name = "Qwen/Qwen1.5-1.8B"
model_name = "Qwen/Qwen1.5-0.5B"

## Loading the tokenizer
# NOTE(review): trust_remote_code=True allows code shipped in the model repo to run locally.
# It is presumably unnecessary for Qwen1.5 on recent transformers releases -- confirm and drop it.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# The tokenize step pads every example to a fixed length, which requires a pad token.
# Fall back to EOS when the checkpoint does not define one (no-op otherwise).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

## 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # store the base weights in 4-bit
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
    bnb_4bit_quant_type="nf4",             # NormalFloat4 quantization
    bnb_4bit_compute_dtype=torch.float16,  # run the matmuls in fp16
)

## Loading the model with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map="auto",
)

## Preparing the model for k-bit training
model = prepare_model_for_kbit_training(model)

In [None]:
### define tokenizer function
def tokenize(batch, max_length=256):
    """Turn one dataset row into a fixed-length causal-LM training example.

    The row is rendered as "<prompt> <emotion>", tokenized and padded/truncated
    to ``max_length``. Only the answer tokens (the emotion, plus a trailing EOS
    when the tokenizer defines one) are kept as labels; prompt tokens and
    padding are set to -100 so they do not contribute to the loss.

    Args:
        batch: A single example (the function is mapped with batched=False)
            exposing "text" and "labels" (the emotion name as a string).
        max_length: Sequence length every example is padded/truncated to.

    Returns:
        The tokenizer output for the full text, with "labels" replaced by the
        masked token-id list described above.

    Note:
        If the prompt alone fills ``max_length`` the answer is truncated away
        and every label of that example is -100.
    """
    ## full training string
    # NOTE(review): "Instrcution" is a typo, but it is kept byte-for-byte because existing
    # checkpoints and any inference code must use exactly the same prompt. Change both together.
    prompt = (f"Instrcution: Identify the emotion of the following text.\n"
              f"Text:{batch['text']}\n"
              f"Emotion:")

    full_text = f"{prompt} {batch['labels']}"

    ## terminate the answer with EOS (when available) so the model is still taught
    ## where the answer ends once padding is excluded from the loss
    eos_token = getattr(tokenizer, "eos_token", None)
    if eos_token:
        full_text += eos_token

    ## tokenizing full text
    tokenized = tokenizer(full_text, padding='max_length', truncation=True, max_length=max_length)

    ## number of real (unpadded) prompt tokens
    prompt_len = len(tokenizer(prompt, truncation=True, max_length=max_length)["input_ids"])

    ## masking labels: walk the real tokens via the attention mask, so the result does not
    ## depend on the padding side and padding is never confused with a real token
    ## (even when the pad token and the EOS token share the same id)
    labels = []
    real_seen = 0
    for token_id, is_real in zip(tokenized["input_ids"], tokenized["attention_mask"]):
        if is_real and real_seen >= prompt_len:
            labels.append(token_id)   # answer token -> contributes to the loss
        else:
            labels.append(-100)       # prompt token or padding -> ignored
        if is_real:
            real_seen += 1

    tokenized["labels"] = labels
    return tokenized

## Run the tokenization function over every split, one example at a time
tokenized_dataset = hf_dataset.map(
    function=tokenize,
    batched=False,
)
tokenized_dataset

In [None]:
## Wrapping the model with LoRA adapters
lora_config = LoraConfig(
    task_type="CAUSAL_LM",                # causal language modeling task
    target_modules=["q_proj", "v_proj"],  # modules that receive adapters
    r=8,                                  # rank of the low-rank update matrices
    lora_alpha=16,                        # LoRA scaling numerator
    lora_dropout=0.1,                     # dropout on the adapter path (regularization)
    bias="none",                          # bias terms are not trained
)

# NOTE: this rebinds `model`, so re-running the cell wraps an already-wrapped model.
model = get_peft_model(model, lora_config)

# Print how many parameters are trainable
model.print_trainable_parameters()

In [None]:
## Hyper-parameters for the fine-tuning run
training_args = TrainingArguments(
    # where checkpoints are written
    output_dir=f"{PROJECT_ROOT}/models/qwen-lora-goemotions",
    save_steps=200,
    save_total_limit=2,  # keep only the two most recent checkpoints

    # batch: 2 sequences x 8 accumulation steps = 16 sequences per optimizer step (per device)
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,

    # schedule
    max_steps=2500,
    warmup_steps=50,
    learning_rate=2e-4,

    # half precision training
    fp16=True,

    # logging
    logging_steps=20,
    report_to="wandb",
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
)

# Resume from the newest checkpoint in output_dir when one exists, otherwise start from scratch.
# A hard-coded checkpoint path fails on a first run and duplicates PROJECT_ROOT/output_dir.
# With save_total_limit=2, an older checkpoint such as checkpoint-1200 is also expected to be
# deleted as newer ones are written.
checkpoint_root = training_args.output_dir
last_checkpoint = get_last_checkpoint(checkpoint_root) if os.path.isdir(checkpoint_root) else None

trainer.train(resume_from_checkpoint=last_checkpoint)

In [None]:
## Write the fine-tuned model to the project folder on Drive
final_model_dir = f"{PROJECT_ROOT}/models/qwen-lora-goemotions"
trainer.save_model(final_model_dir)