In [3]:
!nvidia-smi


/bin/bash: nvidia-smi: command not found


In [1]:
import torch

print(torch.backends.mps.is_available())
print(torch.backends.mps.is_built())


True
True


In [2]:
!pip install transformers accelerate peft datasets bitsandbytes sentencepiece

Collecting accelerate
  Downloading accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting peft
  Downloading peft-0.18.0-py3-none-any.whl.metadata (14 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.0-py3-none-macosx_14_0_arm64.whl.metadata (10 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (10 kB)
Downloading accelerate-1.12.0-py3-none-any.whl (380 kB)
Downloading peft-0.18.0-py3-none-any.whl (556 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m2.5 MB/s[0m  [33m0:00:00[0m
[?25hDownloading bitsandbytes-0.49.0-py3-none-macosx_14_0_arm64.whl (129 kB)
Downloading sentencepiece-0.2.1-cp311-cp311-macosx_11_0_arm64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m3.0 MB/s[0m  [33m0:00:00[0mm eta [36m0:00:01[0m
[?25hInstalling collected packages: sentencepiece, bitsandbytes, accelerate, peft
[2K   [90m━━━━━━━━━━━━━━

In [3]:
## Importing necessary libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from datasets import load_dataset, Dataset
import pandas as pd
import torch

In [4]:
# loading the final dataset
## for reproduction purposes, we are using the final cleaned dataset directly from the github repo created for the dataset
df = pd.read_csv(
    "https://raw.githubusercontent.com/nyarderr/moodmate-data/refs/heads/main/goemotions_final.csv"
)
df.head()

Unnamed: 0,text,final_emotion
0,"""If you don't wear BROWN AND ORANGE...YOU DON...",anger
1,"""What do Scottish people look like?"" How I wo...",neutral
2,"### A surprise, to be sure, but a welcome one",surprise
3,"'*Pray*, v. To ask that the laws of the unive...",neutral
4,">it'll get invaded by tankie, unfortunately. ...",neutral


In [5]:
## Convert the dataset to Hugging Face Dataset format

def to_hf_dataset(row):
    return {
        "text": f"Instruction: Identify the emotion of the following text.\nText:{row['text']}\nEmotion:",
        "labels": row['final_emotion']
    }

dataset = df.apply(to_hf_dataset, axis=1, result_type='expand')
#dataset = dataset.to_frame().reset_index(drop=True)

## Convert to Hugging Face Dataset
hf_dataset = Dataset.from_pandas(dataset)


## Splitting the dataset into train and test sets
hf_dataset = hf_dataset.train_test_split(test_size=0.1, seed=42)
hf_dataset


DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 49529
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 5504
    })
})

In [6]:
### It is important to note that since we are using Hugging Face's Trainer API, we need to login to Hugging Face Hub to access tokens and models.SInce in vs code notebook this is not possible, we will have to login via terminal using the command:
### huggingface-cli login

In [7]:
## Load Qwen tokenizer and model
model_name = "Qwen/Qwen1.5-1.8B"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    load_in_8bit=True,
    device_map="auto"
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

Cancellation requested; stopping current tasks.


KeyboardInterrupt: 

In [8]:
#AutoTokenizer.from_pretrained?

In [9]:
## Batch Tokenization function
def tokenize(batch):
    inputs = tokenizer(batch['text'], padding='max_length', truncation=True, max_length=512)
    labels = tokenizer(batch['labels'], padding='max_length', truncation=True, max_length=32)
    inputs['labels'] = labels['input_ids']
    return inputs

## Tokenizing the dataset
tokenized_dataset = hf_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/49529 [00:00<?, ? examples/s]

Map:   0%|          | 0/5504 [00:00<?, ? examples/s]

In [10]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 49529
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 5504
    })
})

In [11]:
## Applying LoRA to the model
#LoraConfig?
lora_config = LoraConfig(
    r=8, # rank(number of wheels)
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1, # regularization
    bias="none",
    task_type="CAUSAL_LM" # casual language modeling task
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,572,864 || all params: 1,838,401,536 || trainable%: 0.0856


In [12]:
### choose max steps to epochs
batch_size = 4*4  # per_device_train_batch_size * gradient_accumulation_steps
num_samples = len(tokenized_dataset['train'])
steps_per_epoch = num_samples // batch_size
steps_per_epoch # 3095 steps per epoch

3095

In [13]:
## Training arguments
training_args = TrainingArguments(
    output_dir="../models/qwen-lora-goemotions",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=50,
    max_steps=3095,
    learning_rate=2e-4,
    fp16=True,  #half precision training
    logging_steps=20,
    report_to="wandb",
    save_steps=200)

: 