In [1]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [10]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048
dtype = None  # Auto-detects the best dtype
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b-v0.3",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)


==((====))==  Unsloth 2024.12.8: Fast Mistral patching. Transformers: 4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [12]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,  # You can optimize this later
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)


Unsloth: Already have LoRA adapters! We shall skip this step.


In [13]:
from datasets import load_dataset

dataset = load_dataset("imdb", split="train")
dataset = dataset.map(lambda x: {"text": x["text"], "label": x["label"]})  # Map to expected format

# Optionally, truncate sequences that exceed max_seq_length
dataset = dataset.map(lambda x: {'text': tokenizer(x['text'], padding='max_length', truncation=True, max_length=max_seq_length)['input_ids']})


Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [14]:
def formatting_prompts_func(examples):
    texts = examples["text"]
    labels = examples["label"]
    formatted_texts = []
    for text, label in zip(texts, labels):
        text = f"Sentiment: {text} Answer with a sentiment label: {label}"
        formatted_texts.append(text)
    return {"text": formatted_texts}

dataset = dataset.map(formatting_prompts_func, batched=True)


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [16]:
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=1,  # Changed to 1 to disable multiprocessing for the dataset.map operation
    packing=False,  # This can be turned on if sequences are short
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,  # Adjust based on the size of the dataset
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",  # Use this for WandB etc
    ),
)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [17]:
#@title Show current memory stats
import torch

# Get the GPU properties
gpu_stats = torch.cuda.get_device_properties(0)

# Get the current GPU memory usage
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)

# Print the stats
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")


GPU = Tesla T4. Max memory = 14.748 GB.
13.322 GB of memory reserved.


In [18]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 25,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,0.8907
2,1.0136
3,0.8862
4,0.897
5,0.754
6,0.6355
7,1.009
8,0.9764
9,0.9686
10,0.7909


In [19]:
# After training is done, track the final memory stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Calculate the percentage of memory used during training
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Print the stats
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime'] / 60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")


2437.9495 seconds used for training.
40.63 minutes used for training.
Peak reserved memory = 13.322 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 90.331 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [20]:
model.save_pretrained("finetuned_mistral_imdb")
tokenizer.save_pretrained("finetuned_mistral_imdb")

('finetuned_mistral_imdb/tokenizer_config.json',
 'finetuned_mistral_imdb/special_tokens_map.json',
 'finetuned_mistral_imdb/tokenizer.model',
 'finetuned_mistral_imdb/added_tokens.json',
 'finetuned_mistral_imdb/tokenizer.json')

In [21]:
FastLanguageModel.for_inference(model)  # Enable faster inference

inputs = tokenizer(
    ["The movie was fantastic!"],  # Example text input for sentiment analysis
    return_tensors="pt",
    truncation=True,
    max_length=max_seq_length,
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64)
tokenizer.batch_decode(outputs)


['<s> The movie was fantastic! I loved it! I loved the story, the characters, the acting, the music, the cinematography, the directing, the editing, the sound, the costumes, the sets, the props, the special effects, the makeup, the hair, the lighting, the sound, the sound, the sound']