In [1]:
!nvidia-smi


Fri Dec 12 02:53:18 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   58C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [51]:
!pip install transformers accelerate peft datasets bitsandbytes sentencepiece




In [25]:
## Importing necessary libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
from datasets import load_dataset, Dataset
import pandas as pd
import torch

In [7]:
# loading the final dataset
df = pd.read_csv(
    "https://raw.githubusercontent.com/nyarderr/moodmate/refs/heads/main/data/processed/goemotions_final.csv?token=GHSAT0AAAAAADKMR3EPFNSSUDO4UTI2755A2J3SJRA"
)
df.head()

Unnamed: 0,text,final_emotion
0,"""If you don't wear BROWN AND ORANGE...YOU DON...",anger
1,"""What do Scottish people look like?"" How I wo...",neutral
2,"### A surprise, to be sure, but a welcome one",surprise
3,"'*Pray*, v. To ask that the laws of the unive...",neutral
4,">it'll get invaded by tankie, unfortunately. ...",neutral


In [60]:
## Convert the dataset to Hugging Face Dataset format

def to_hf_dataset(row):
    return {
        "text": f"Instruction: Identify the emotion of the following text.\nText:{row['text']}\nEmotion:",
        "labels": row['final_emotion']
    }

dataset = df.apply(to_hf_dataset, axis=1, result_type='expand')
#dataset = dataset.to_frame().reset_index(drop=True)

## Convert to Hugging Face Dataset
hf_dataset = Dataset.from_pandas(dataset)


## Splitting the dataset into train and test sets
hf_dataset = hf_dataset.train_test_split(test_size=0.1, seed=42)
hf_dataset


DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 49529
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 5504
    })
})

In [52]:
### It is important to note that since we are using Hugging Face's Trainer API, we need to login to Hugging Face Hub to access tokens and models.SInce in vs code notebook this is not possible, we will have to login via terminal using the command:
### huggingface-cli login


In [54]:
## Load Qwen tokenizer and model
model_name = "Qwen/Qwen1.5-1.8B"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    load_in_8bit=True,
    device_map="auto"
)

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

In [49]:
#AutoTokenizer.from_pretrained?

In [61]:
## Batch Tokenization function
def tokenize(batch):
    inputs = tokenizer(batch['text'], padding='max_length', truncation=True, max_length=512)
    labels = tokenizer(batch['labels'], padding='max_length', truncation=True, max_length=32)
    inputs['labels'] = labels['input_ids']
    return inputs

## Tokenizing the dataset
tokenized_dataset = hf_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/49529 [00:00<?, ? examples/s]

Map:   0%|          | 0/5504 [00:00<?, ? examples/s]

In [62]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 49529
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 5504
    })
})

In [65]:
## Applying LoRA to the model
#LoraConfig?
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,572,864 || all params: 1,838,401,536 || trainable%: 0.0856


In [68]:
## Training arguments
training_args = TrainingArguments(
    output_dir="../models/qwen-lora-goemotions",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=50,
    max_steps=500,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=20,
    save_steps=200)