In [1]:
#######################################################
# 1) Install Dependencies (Unsloth, Transformers, TRL)
#######################################################
!pip install unsloth --upgrade          # If not installed yet
!pip install trl transformers datasets  # If needed

Collecting unsloth
  Downloading unsloth-2024.12.12-py3-none-any.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting unsloth_zoo>=2024.12.7 (from unsloth)
  Downloading unsloth_zoo-2024.12.7-py3-none-any.whl.metadata (16 kB)
Collecting torch>=2.4.0 (from unsloth)
  Downloading torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl.metadata (28 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.5-py3-none-any.whl.metadata (9.4 kB)
Collecting transformers!=4.47.0,>

In [2]:
#################################################
# 2) Import Libraries and Load Base Model (FP16, 4-bit)
#################################################
import torch
from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import Dataset
import pandas as pd
import random

# Base checkpoint: the 70B Llama 3.3 Instruct model published under the Unsloth org.
base_model_name = "unsloth/Llama-3.3-70B-Instruct"

# Memory / precision settings for loading.
max_seq_length = 2048      # Unsloth handles RoPE scaling internally
dtype = torch.float16      # FP16; the training cell's fp16/bf16 flags are set to match
load_in_4bit = True        # Load the weights 4-bit quantized to reduce memory usage

# Gather the loader arguments in one place, then load model and tokenizer together.
# Device offloading could be configured here as well if needed, e.g. by adding
#   offload_folder = "offload", offload_state_dict = True
loader_kwargs = dict(
    model_name     = base_model_name,
    max_seq_length = max_seq_length,
    dtype          = dtype,
    load_in_4bit   = load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**loader_kwargs)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.12: Fast Llama patching. Transformers: 4.47.1.
   \\   /|    GPU: NVIDIA H100 80GB HBM3. Max memory: 79.205 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/331k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/4.75G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [3]:
#################################
# 3) Apply LoRA (PEFT) Adapters
#################################
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# LoRA hyperparameters, kept together so they are easy to review and tune.
lora_settings = dict(
    r                          = 16,
    target_modules             = lora_target_modules,
    lora_alpha                 = 16,
    # Dropout for regularization. NOTE: this cell's output shows Unsloth warning
    # that only dropout = 0 gets its fast patching, so 0.1 costs some speed.
    lora_dropout               = 0.1,
    bias                       = "none",
    use_gradient_checkpointing = True,   # Enable gradient checkpointing to save memory
    random_state               = 3407,
    use_rslora                 = False,
    loftq_config               = None,
)

# Wrap the loaded base model with the adapters; `model` now refers to the PEFT model.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.12.12 patched 80 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [4]:
########################################
# 4) Define the Training Prompt Templates
########################################
# Two prompt layouts are used for training. Both share the same
# diagnosis / user input / response body; the instruction variant only
# prepends a banner-delimited instruction section to that body.

# Template without instructions (also the shared body of the other template)
no_instruction_template = (
    "Diagnosis: {diagnose}\n"
    "\n"
    "User's Input:\n"
    "{user_input}\n"
    "\n"
    "Your Response:\n"
    "{ideal_answer}\n"
    "{eos_token}\n"
)

# Template with instructions: banner, instruction text, banner, blank line, body
_instruction_banner = "#######################################################"
instruction_template = (
    _instruction_banner + "\n"
    "# Instruction:\n"
    "{instruction}\n"
    + _instruction_banner + "\n"
    "\n"
    + no_instruction_template
)

# End-of-sequence token string taken from the loaded tokenizer; it is substituted
# for the {eos_token} placeholder when the training texts are built.
EOS_TOKEN = tokenizer.eos_token

In [6]:
############################################
# 5) Prepare Augmented Dataset
############################################
def formatting_prompts_func(examples):
    """Build the training text for a batch of dataset rows.

    For every row one of the two module-level templates is filled in. The
    choice is random (50/50) so the model sees both instruction-prefixed and
    instruction-free prompts. A row whose instruction is missing or blank
    always uses the instruction-free template, so the instruction banner is
    never rendered around "None" or empty text.

    Args:
        examples: Batched mapping of column name -> list of values. Must
            contain the keys "instruction", "input", "diagnose_output" and
            "output", all of equal length.

    Returns:
        dict with a single key "text": one formatted string per input row,
        in input order.
    """
    instructions  = examples["instruction"]      # may be missing/blank per row
    user_inputs   = examples["input"]            # user's mental health text
    diagnoses     = examples["diagnose_output"]  # diagnosis label
    outputs       = examples["output"]           # final response

    texts = []
    for instr, inp, diag, outp in zip(instructions, user_inputs, diagnoses, outputs):
        # Always draw exactly once per row so the random stream (and therefore
        # the template chosen for every other row) does not depend on whether
        # this particular row has an instruction.
        include_instruction = random.choice([True, False])

        # Treat None, NaN and whitespace-only strings as "no instruction".
        instruction_missing = (
            instr is None
            or (isinstance(instr, float) and instr != instr)
            or not str(instr).strip()
        )

        if include_instruction and not instruction_missing:
            # Fill the instruction template
            text = instruction_template.format(
                instruction   = instr,
                user_input    = inp,
                diagnose      = diag,
                ideal_answer  = outp,
                eos_token     = EOS_TOKEN
            )
        else:
            # Fill the no-instruction template
            text = no_instruction_template.format(
                user_input    = inp,
                diagnose      = diag,
                ideal_answer  = outp,
                eos_token     = EOS_TOKEN
            )
        texts.append(text)
    return {"text": texts}

# Example CSV with columns: instruction, input, diagnose_output, output
csv_path = "./Updated_LLM_new_dataset.csv"  # Edit your path
df = pd.read_csv(csv_path)

# Select only necessary columns
df = df[['instruction', 'input', 'diagnose_output', 'output']]

# Shuffle the rows with a fixed seed so the row order is reproducible
df = df.sample(frac=1, random_state=3407).reset_index(drop=True)

# formatting_prompts_func picks a template per row with random.choice. Seed
# Python's RNG (same seed as the rest of the notebook) so the generated
# training texts are identical on every Restart & Run All.
random.seed(3407)

dataset = Dataset.from_pandas(df)
dataset = dataset.map(formatting_prompts_func, batched=True)

Map:   0%|          | 0/8580 [00:00<?, ? examples/s]

In [None]:
##############################################
# 6) Fine-Tune Using TRL's SFTTrainer (FP16 with 4-bit Quantization)
##############################################
# Derive the mixed-precision flags from the `dtype` the model was loaded with
# instead of hard-coding them, so that changing `dtype` in the loading cell
# cannot leave the trainer configured for a different precision.
if dtype is None:
    # NOTE(review): dtype=None presumably lets Unsloth pick the precision from
    # the hardware; mirror that with its own capability check - confirm.
    use_bf16 = is_bfloat16_supported()
else:
    use_bf16 = dtype == torch.bfloat16

training_args = TrainingArguments(
    per_device_train_batch_size = 1,    # Reduce batch size to fit memory constraints
    gradient_accumulation_steps = 4,    # Accumulate gradients to simulate a larger batch size (1 x 4 = 4)
    max_steps = 1000,                   # Increase steps for better training; adjust as needed
    learning_rate = 2e-4,
    fp16 = not use_bf16,                # FP16 mixed precision unless the model was loaded in BF16
    bf16 = use_bf16,                    # BF16 only when the model was loaded in BF16
    logging_steps = 10,                 # Adjust logging frequency
    optim = "adamw_8bit",               # Use 8-bit Adam optimizer to save memory
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    save_steps = 500,                   # Save checkpoints periodically
    save_total_limit = 3,               # Limit the number of saved checkpoints
)

# Supervised fine-tuning on the pre-formatted "text" column, one example per
# sequence (no packing).
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,               # Adjust based on CPU cores; reduce if necessary
    packing = False,
    args = training_args,
)

# Show GPU info (total memory reported in decimal gigabytes)
gpu_stats = torch.cuda.get_device_properties(0)
print(f"GPU = {gpu_stats.name}, Total VRAM = {round(gpu_stats.total_memory/1e9,2)} GB")

trainer_stats = trainer.train()
print("Training completed. Stats:", trainer_stats)

Map (num_proc=2):   0%|          | 0/8580 [00:00<?, ? examples/s]

GPU = NVIDIA H100 80GB HBM3, Total VRAM = 85.05 GB


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 8,580 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 4
\        /    Total batch size = 4 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 207,093,760


Step,Training Loss
10,1.1731
20,0.7725
30,0.7985
40,0.7267
50,0.8423
60,0.6291
70,0.7867
80,0.6853
90,0.7037
100,0.7314


In [None]:
######################################
# 7) Inference: Provide user & label
######################################
# Switch the fine-tuned model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)  # 2x faster inference with Unsloth

def mental_health_inference(user_message, diagnosis_label):
    """Generate the model's reply for one user message and diagnosis label.

    The prompt uses the instruction-free training layout, ending right after
    "Your Response:" so the model continues with the answer.

    Args:
        user_message: The user's free-text message.
        diagnosis_label: Diagnosis label inserted on the "Diagnosis:" line.

    Returns:
        str: Only the newly generated text, with special tokens removed and
        surrounding whitespace stripped. The prompt itself is not included.
    """
    # During inference, do not include instructions
    prompt = f"""Diagnosis: {diagnosis_label}

User's Input:
{user_message}

Your Response:
"""
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=512)

    # generate() returns the prompt tokens followed by the new tokens; drop the
    # prompt so the caller receives just the response, not an echo of its input.
    prompt_len = inputs["input_ids"].shape[1]
    response_ids = outputs[0][prompt_len:]
    return tokenizer.decode(response_ids, skip_special_tokens=True).strip()

# Smoke test: run a single hand-written example through the fine-tuned model
diag_lbl = "Social Anxiety Disorder"
user_msg = "I feel constant anxiety around people and can't focus on my tasks."

model_response = mental_health_inference(user_msg, diag_lbl)
print("MODEL RESPONSE:\n", model_response)

In [None]:
##########################################
# 8) Save the LoRA-Finetuned Model
##########################################
# Persist the fine-tuned model and its tokenizer into the same directory so
# they can be reloaded together later.
save_dir = "LLAMA_3_3_70B_A100"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)