In [1]:
!pip install "numpy>=2.0.0" --quiet
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" --quiet
!pip install "transformers>=4.41.0" --quiet
!pip install "peft>=0.11.1" --quiet
!pip install "accelerate>=0.30.1" --quiet
!pip install "bitsandbytes>=0.43.1" --quiet
!pip install "trl>=0.8.6" --quiet
!pip install "datasets>=2.18.0" --quiet
!pip install "scikit-learn>=1.4.2" --quiet
!pip install "sentence-transformers>=2.7.0" --quiet
!pip install faiss-cpu --quiet

[0m  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[0m

In [1]:
from unsloth import FastLanguageModel
import os
import torch
import uuid
import pickle
import zipfile
import shutil
import numpy as np
from datasets import load_dataset, concatenate_datasets
from transformers import TrainingArguments
from trl import SFTTrainer
from peft import LoraConfig, PeftModel
import faiss
from sentence_transformers import SentenceTransformer
import traceback

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
try:
    ed_dataset = load_dataset("facebook/empathetic_dialogues", trust_remote_code=True)
    goemotions_dataset = load_dataset("go_emotions")

    EOS_TOKEN = "<|eot_id|>"
    BOS_TOKEN = "<|begin_of_text|>"

    def format_llama3_instruct(system_prompt, conversation_history):
        if not conversation_history: return None
        formatted_string = BOS_TOKEN
        formatted_string += f"<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}{EOS_TOKEN}"
        for turn in conversation_history:
            role = turn.get("role")
            content = turn.get("content")
            if role and content:
                formatted_string += f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}{EOS_TOKEN}"
        return formatted_string

    def format_ed_for_sft(example):
        system_prompt = f"You are an empathetic assistant responding to someone feeling {example['context']}. Be understanding and supportive."
        history = [{'role': 'user', 'content': example['prompt']}]
        target_response = example['utterance']
        if not example['prompt'] or not target_response: return {"text": None}
        history_with_response = history + [{'role': 'assistant', 'content': target_response}]
        return {"text": format_llama3_instruct(system_prompt, history_with_response)}

    formatted_ed = ed_dataset['train'].map(format_ed_for_sft, num_proc=os.cpu_count(), remove_columns=ed_dataset['train'].column_names)
    formatted_ed = formatted_ed.filter(lambda example: example['text'] is not None and len(example['text']) > len(BOS_TOKEN) + 50)

    emotion_label_names = goemotions_dataset['train'].features['labels'].feature.names
    def format_goemotions_for_sft(example):
        user_text = example['text']
        emotion_indices = example['labels']
        if not user_text: return {"text": None}
        detected_emotions = [emotion_label_names[i] for i in emotion_indices if i < len(emotion_label_names)]
        emotion_string = ", ".join(detected_emotions) if detected_emotions else "a mix of feelings"
        system_prompt = f"You are an empathetic assistant. The user seems to be expressing feelings related to: {emotion_string}. Respond kindly."
        history = [{'role': 'user', 'content': user_text}]
        assistant_response = "I understand. Could you tell me a bit more about that?"
        history_with_response = history + [{'role': 'assistant', 'content': assistant_response}]
        return {"text": format_llama3_instruct(system_prompt, history_with_response)}

    formatted_goemotions = goemotions_dataset['train'].map(format_goemotions_for_sft, num_proc=os.cpu_count(), remove_columns=goemotions_dataset['train'].column_names)
    formatted_goemotions = formatted_goemotions.filter(lambda example: example['text'] is not None and len(example['text']) > len(BOS_TOKEN) + 50)

    combined_dataset_full = concatenate_datasets([formatted_ed, formatted_goemotions]).shuffle(seed=42)

    num_samples_to_train = 4000
    if len(combined_dataset_full) < num_samples_to_train:
        combined_dataset_subset = combined_dataset_full
    else:
        combined_dataset_subset = combined_dataset_full.select(range(num_samples_to_train))

    personal_narratives = [
        "I grew up in a small town near the mountains and loved hiking.", "My favorite pet was a golden retriever named Max.",
        "I studied computer science in college and enjoyed coding.", "One memorable trip was visiting the Grand Canyon.",
        "I volunteer at the local animal shelter on weekends.", "My favorite food is pizza, especially with mushrooms.",
        "I learned to play the guitar when I was a teenager.", "I broke my arm falling off a bike when I was 10 years old.",
        "My family has a tradition of baking cookies every holiday season.", "I am passionate about environmental conservation."
    ]

except Exception as e:
    traceback.print_exc(); raise

In [3]:
faiss_index_path = "/content/personal_narratives.faiss"
narratives_path = "/content/personal_narratives.pkl"

In [4]:
embedding_model_name = "all-MiniLM-L6-v2"
try:
    embedding_model = SentenceTransformer(embedding_model_name)
    embedding_dim = embedding_model.get_sentence_embedding_dimension()
    narrative_embeddings = embedding_model.encode(personal_narratives, convert_to_numpy=True, show_progress_bar=True)
    if narrative_embeddings.dtype != np.float32:
        narrative_embeddings = narrative_embeddings.astype(np.float32)
    index = faiss.IndexFlatL2(embedding_dim)
    index.add(narrative_embeddings)
    os.makedirs(os.path.dirname(faiss_index_path), exist_ok=True)
    faiss.write_index(index, faiss_index_path)
    with open(narratives_path, 'wb') as f: pickle.dump(personal_narratives, f)
except Exception as e:
    traceback.print_exc(); raise

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
model_name_to_load = "unsloth/llama-3-8b-Instruct-bnb-4bit"
max_seq_length = 512
lora_r = 16
lora_alpha = 16
lora_dropout = 0.05
output_dir_checkpoints = "/content/llama3_aac_finetuned_checkpoints"
warmup_steps = 5
num_train_epochs = 1
learning_rate = 2e-4
per_device_train_batch_size_debug = 1
gradient_accumulation_steps_debug = 1
logging_steps_debug = 10

In [6]:
try:
    if not torch.cuda.is_available(): raise RuntimeError("CUDA (GPU) is not available.")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_name_to_load,
        max_seq_length = max_seq_length,
        dtype = None,
        load_in_4bit = True,
        use_exact_model_name = True,
    )
    tokenizer.pad_token = tokenizer.eos_token
    model = FastLanguageModel.get_peft_model(
        model,
        r = lora_r,
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj","gate_proj", "up_proj", "down_proj"],
        lora_alpha = lora_alpha,
        lora_dropout = lora_dropout,
        bias = "none",
        use_gradient_checkpointing = "unsloth",
        random_state = 42,
    )
    use_bf16 = torch.cuda.is_bf16_supported()
    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = combined_dataset_subset,
        dataset_text_field = "text",
        max_seq_length = max_seq_length,
        args = TrainingArguments(
            output_dir=output_dir_checkpoints,
            per_device_train_batch_size = per_device_train_batch_size_debug,
            gradient_accumulation_steps = gradient_accumulation_steps_debug,
            logging_steps = logging_steps_debug,
            warmup_steps = warmup_steps,
            num_train_epochs = num_train_epochs,
            learning_rate = learning_rate,
            fp16 = not use_bf16,
            bf16 = use_bf16,
            optim = "adamw_8bit",
            weight_decay = 0.01,
            lr_scheduler_type = "linear",
            seed = 42,
            save_strategy = "epoch",
            report_to = "none",
        ),
    )
    torch.cuda.empty_cache()
    trainer_stats = trainer.train()
    final_adapter_path = "/content/final_lora_adapters_8k"
    os.makedirs(final_adapter_path, exist_ok=True)
    model.save_pretrained(final_adapter_path)
    tokenizer.save_pretrained(final_adapter_path)

except Exception as e:
    traceback.print_exc(); raise

==((====))==  Unsloth 2025.4.8: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.4.8 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/4000 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,000 | Num Epochs = 1 | Total steps = 4,000
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 1 x 1) = 1
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Step,Training Loss
10,0.784
20,0.1839
30,0.0407
40,0.0275
50,0.0228
60,0.0144
70,0.0087
80,0.0036
90,0.0012
100,0.0003


Step,Training Loss
10,0.784
20,0.1839
30,0.0407
40,0.0275
50,0.0228
60,0.0144
70,0.0087
80,0.0036
90,0.0012
100,0.0003


Unsloth: Will smartly offload gradients to save VRAM!


In [7]:
final_adapter_path = "/content/final_lora_adapters_8k"
merged_model_dir = "/content/merged_llama3_aac_finetuned_8k"
zip_filename = "/content/merged_llama3_aac_finetuned_model_8k.zip"
base_model_name_for_merge = "unsloth/llama-3-8b-Instruct-bnb-4bit"

In [9]:
!zip -r /content/merged_llama3_aac_finetuned_model_8k.zip /content/final_lora_adapters_8k

  adding: content/final_lora_adapters_8k/ (stored 0%)
  adding: content/final_lora_adapters_8k/adapter_model.safetensors (deflated 7%)
  adding: content/final_lora_adapters_8k/special_tokens_map.json (deflated 63%)
  adding: content/final_lora_adapters_8k/tokenizer.json (deflated 85%)
  adding: content/final_lora_adapters_8k/tokenizer_config.json (deflated 96%)
  adding: content/final_lora_adapters_8k/adapter_config.json (deflated 56%)
  adding: content/final_lora_adapters_8k/README.md (deflated 66%)
