In [35]:
# 1. Install dependencies
!pip install -q --upgrade unsloth "transformers>=4.40.0" "accelerate" "bitsandbytes" "datasets" "peft" "scipy" "wandb"

# 2. Environment fixes for Triton/Flash-Attn issues
import os
os.environ["TRITON_DISABLE_LINE_INFO"] = "1"

# 3. Load Unsloth LLaMA 3 model with Flash Attention disabled
from unsloth import FastLanguageModel
import torch

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    max_seq_length=4096,
    dtype=None,
    load_in_4bit=True,
    use_flash_attention_2=False  # Set to False to avoid Triton-related issues
)

model = FastLanguageModel.get_peft_model(
    model,
    r=64,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=True,
    random_state=42,
    use_rslora=False,
    loftq_config=None,
)

# 4. Load medical CoT dataset
from datasets import load_dataset

# "en" selects the English configuration; the result is indexed by split name
# (the "train" split is used further down).
datasetdict = load_dataset(
    path="FreedomIntelligence/medical-o1-reasoning-SFT",
    name="en",
)

# 5. Preprocess data
def format_cot(example):
    """Render one dataset row as a single training string.

    Args:
        example: Mapping with the keys ``"Question"``, ``"Complex_CoT"`` and
            ``"Response"``.

    Returns:
        A dict with one key, ``"text"``, holding the question, reasoning and
        answer under ``###`` section headers separated by blank lines.

    Raises:
        KeyError: If any of the three expected keys is missing.
    """
    question = example["Question"]
    reasoning = example["Complex_CoT"]
    answer = example["Response"]
    prompt = (
        f"### Question:\n{question}\n\n"
        f"### Reasoning:\n{reasoning}\n\n"
        f"### Answer:\n{answer}"
    )
    return {"text": prompt}

# Add the "text" column to every training row, then hold out 5% for evaluation.
# The fixed seed makes the train/test membership identical across kernel
# restarts; without it every run evaluates on a different subset.
formatted_train = datasetdict["train"].map(format_cot)
dataset = formatted_train.train_test_split(test_size=0.05, seed=42)

print(dataset)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/362.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.1/362.1 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25h==((====))==  Unsloth 2025.5.7: Fast Llama patching. Transformers: 4.52.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
DatasetDict({
    train: Dataset({
        features: ['Question', 'Complex_CoT', 'Response', 'text'],
        num_rows: 18718
    })
    test: Dataset({
        features: ['Question', 'Complex_CoT', 'Response', 'text'],
        num_rows: 986
    })
})


In [36]:
# 6. Tokenize data
from unsloth import is_bfloat16_supported
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

MAX_TOKENS = 4096  # keep equal to the max_seq_length the model was loaded with

# Only fall back to EOS-as-padding when the tokenizer ships without a pad token.
# DataCollatorForLanguageModeling(mlm=False) replaces every label equal to the
# pad token id with -100, so forcing pad == EOS would remove all EOS tokens from
# the loss and the model would never learn to stop generating.
# NOTE(review): if this fallback does trigger, EOS is still masked — a dedicated
# pad token would then be needed.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_batch(batch):
    """Tokenize a batch of formatted examples for causal-LM training.

    Appends the tokenizer's EOS token to each text so the end of an answer is
    part of the training signal, and truncates to ``MAX_TOKENS`` so no example
    exceeds the sequence length the model was loaded with.

    Args:
        batch: Batched rows containing a ``"text"`` list of strings.

    Returns:
        The tokenizer output for the batch.
    """
    texts = [text + tokenizer.eos_token for text in batch["text"]]
    return tokenizer(texts, truncation=True, max_length=MAX_TOKENS)


# Write to a new name instead of rebinding `dataset`, so this cell can be re-run
# safely; the raw string columns are dropped because only token ids are needed.
tokenized_dataset = dataset.map(
    tokenize_batch,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# 7. Prepare model for training
FastLanguageModel.for_training(model)

# Pick mixed precision from the hardware instead of leaving both flags unset.
# NOTE(review): the recorded run on a Tesla T4 (no bfloat16) died with
# "RuntimeError: PassManager::run failed" inside Triton. Explicit fp16/bf16
# follows Unsloth's reference notebooks — confirm it resolves the crash; if not,
# the Torch/Triton versions pulled in by the unpinned install are the next suspect.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    output_dir = "medical-mistral-lora",  # NOTE(review): name says mistral, model is Llama
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    gradient_accumulation_steps = 4,  # effective batch size 2 x 4 = 8 per device
    num_train_epochs = 3,
    learning_rate = 2e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_dir = "./logs",
    logging_steps = 10,
    save_steps = 500,
    save_total_limit = 2,
)

# mlm=False -> causal-LM objective: labels are the input ids, padding ignored.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

# NOTE(review): eval_dataset is supplied but no evaluation strategy is set in
# training_args, so evaluation only happens if trainer.evaluate() is called.
trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    args=training_args,
    data_collator=data_collator,
)

# 8. Train the model
trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 18,718 | Num Epochs = 3 | Total steps = 7,020
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 97,255,424/3,000,000,000 (3.24% trained)


RuntimeError: PassManager::run failed