# Finetune Mistral 2-5x faster with 80% less memory with Unsloth!

In [1]:
import torch; 
print(torch.version.cuda)

12.1


In [2]:
import tensorflow as tf
tf.test.is_gpu_available()

2024-07-06 10:53:36.080911: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


2024-07-06 10:53:37.922855: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /device:GPU:0 with 6620 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0001:00:00.0, compute capability: 7.5


True

In [4]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [5]:
from unsloth import FastLanguageModel
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster!
    "unsloth/gemma-2b-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-v0.3", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = "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", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth: Fast Mistral patching release 2024.6
   \\   /|    GPU: Tesla T4. Max memory: 14.568 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth: Will load unsloth/mistral-7b-v0.3-bnb-4bit as a legacy tokenizer.


In [6]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [7]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

### Getting Response before Finetuning 

In [8]:
# alpaca_prompt = Copied from above -- w/o training 3
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Analyze the given text for its tone.", # instruction
        "The world has been greatly impacted by the COVID-19 pandemic and it has drastically changed our lives.", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 20)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Analyze the given text for its tone.

### Input:
The world has been greatly impacted by the COVID-19 pandemic and it has drastically changed our lives.

### Response:
The tone of the given text is serious and informative.

### Instruction:
Analy


In [5]:

# alpaca_prompt = Copied from above -- w/o training 4
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\nFounded in the 13th century, Berlin has had an eventful history. Excavations from 2008 suggest that the city may be even older than was previously assumed: state archaeologists have discovered an oak beam that probably dates back to 1183.\n\nAlmost no other metropolis has experienced such frequent, radical change transforming the face of the city. Although Berlin saw steady growth in its importance, dazzling epochs alternated with darker eras. Nevertheless, the formerly divided city has succeeded in becoming a vibrant metropolis in the heart of Europe.\n\nQuestion: What is the best restaurant in Berlin?", # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 20)


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Founded in the 13th century, Berlin has had an eventful history. Excavations from 2008 suggest that the city may be even older than was previously assumed: state archaeologists have discovered an oak beam that probably dates back to 1183.

Almost no other metropolis has experienced such frequent, radical change transforming the face of the city. Although Berlin saw steady growth in its importance, dazzling epochs alternated with darker eras. Nevertheless, the formerly divided city has succeeded in becoming a vibrant metropolis in the heart of Europe.

Question: What is the best restaurant in Berlin?

### Input:


### Response:























In [10]:
# alpaca_prompt = Copied from above -- w/o training 3
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Explain why the given definition is wrong", # instruction
        "A mole is an animal that lives underground.", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 30)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Explain why the given definition is wrong

### Input:
A mole is an animal that lives underground.

### Response:
The definition is wrong because moles do not live underground. They live in burrows that they dig in the ground.

### Instruction:


### Training the model now

In [11]:

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60, # Set num_train_epochs = 1 for full training runs
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

Map (num_proc=2): 100%|██████████| 51760/51760 [00:41<00:00, 1250.53 examples/s]
max_steps is given, it will override any value given in num_train_epochs


In [12]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.568 GB.
4.85 GB of memory reserved.


In [13]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 4
\        /    Total batch size = 4 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,1.5221
2,1.4599
3,1.9919
4,1.1268
5,1.1337
6,1.0124
7,0.8775
8,1.1673
9,0.7801
10,1.0172


In [14]:
torch.cuda.empty_cache()

In [15]:
torch.cuda.memory_summary(device=None, abbreviated=False)



In [16]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

194.8395 seconds used for training.
3.25 minutes used for training.
Peak reserved memory = 7.992 GB.
Peak reserved memory for training = 3.142 GB.
Peak reserved memory % of max memory = 54.86 %.
Peak reserved memory for training % of max memory = 21.568 %.


### Getting Response after Finetuning the pretrained model

In [17]:
# alpaca_prompt = Copied from above -- w training 3
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Explain why the given definition is wrong", # instruction
        "A mole is an animal that lives underground.", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 100)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Explain why the given definition is wrong

### Input:
A mole is an animal that lives underground.

### Response:
The definition is wrong because it is too broad and does not accurately describe all moles. While some moles do live underground, not all of them do. There are many different species of moles, and some of them live in burrows above ground, while others live in trees or even in the water. Additionally, not all animals that live underground are moles. There are many other types of animals, such as badgers, that also live underground.</s>


In [16]:
# alpaca_prompt = Copied from above -- with training 2
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Analyze the given text for its tone.", # instruction
        "The world has been greatly impacted by the COVID-19 pandemic and it has drastically changed our lives.", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 100)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Analyze the given text for its tone.

### Input:
The world has been greatly impacted by the COVID-19 pandemic and it has drastically changed our lives.

### Response:
The tone of the given text is serious and somber. The use of words such as "impacted" and "drastically" suggests a sense of gravity and importance, while the mention of the pandemic and its effects on the world conveys a sense of urgency and concern. Overall, the tone of the text is one of seriousness and concern for the well-being of the world and its inhabitants.</s>


### Saving the finetuned model locally and on huggingface hub 

In [20]:
# Make sure to create an access token (write access) on huggingface to save the model
model.save_pretrained("lora_model") # Local saving
model.save_pretrained_merged("outputs", tokenizer, save_method = "merged_16bit",)
tokenizer.save_pretrained("outputs",save_method = "merged_16bit" ) 
model.push_to_hub("bhums/mistral-7b-alpaca-unsloth-merged", token = "hf_####") # Online saving
model.push_to_hub("bhums/mistral-7b-alpaca-unsloth-merged", save_method = "lora", token = "hf_####") # Online saving

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 13.49 out of 27.35 RAM for saving.


100%|██████████| 32/32 [00:51<00:00,  1.61s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


adapter_model.safetensors: 100%|██████████| 168M/168M [00:14<00:00, 11.7MB/s] 


Saved model to https://huggingface.co/bhums/mistral-7b-alpaca-unsloth-merged
Saved model to https://huggingface.co/bhums/mistral-7b-alpaca-unsloth-merged


In [18]:
local_directory = "outputs"
model.save_pretrained(local_directory)
tokenizer.save_pretrained(local_directory)

('outputs/tokenizer_config.json',
 'outputs/special_tokens_map.json',
 'outputs/tokenizer.model',
 'outputs/added_tokens.json')

In [19]:
import os
model.push_to_hub("bhums/unsloth_alpaca", token = "hf_####")
tokenizer.push_to_hub("bhums/unsloth_alpaca", token = "hf_####")

# For saving merged (if applicable):
model.push_to_hub_merged("bhums/unsloth_alpaca", tokenizer, save_method="merged_16bit", token = "hf_####")

adapter_model.safetensors: 100%|██████████| 168M/168M [00:17<00:00, 9.76MB/s] 


Saved model to https://huggingface.co/bhums/unsloth_alpaca


tokenizer.model: 100%|██████████| 587k/587k [00:00<00:00, 1.37MB/s]
Unsloth: You are pushing to hub, but you passed your HF username = bhums.
We shall truncate bhums/unsloth_alpaca to unsloth_alpaca


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 13.52 out of 27.35 RAM for saving.


 44%|████▍     | 14/32 [00:00<00:01, 15.38it/s]We will save to Disk and not RAM now.
100%|██████████| 32/32 [00:51<00:00,  1.62s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...


model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]
model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s][A


model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s][A[A[A

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s][A[A
model-00001-of-00003.safetensors:   0%|          | 2.26M/4.95G [00:00<06:47, 12.1MB/s][A


model-00001-of-00003.safetensors:   0%|          | 3.49M/4.95G [00:00<08:46, 9.40MB/s][A[A[A


model-00003-of-00003.safetensors:   0%|          | 4.46M/4.55G [00:00<06:37, 11.4MB/s][A[A[A


model-00003-of-00003.safetensors:   0%|          | 5.59M/4.55G [00:00<06:51, 11.0MB/s][A[A[A
model-00002-of-00003.safetensors:   0%|          | 4.18M/5.00G [00:00<17:11, 4.84MB/s][A


model-00003-of-00003.safetensors:   0%|          | 6.67M/4.55G [00:00<11:33, 6.55MB/s][A[A[A


model-00001-of-00003.safetensors:   0%|          | 4.42M/4.95G [00:00<23:00, 3.58MB/s][A[A[A
model-00002-of-00

Done.
Saved merged model to https://huggingface.co/bhums/unsloth_alpaca


In [23]:
if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/mistral-7b-v0.3", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!

inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is a famous tall tower in Paris?", # instruction
        "", # input
        "", # output - leave this blank for generation!
    ),
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

==((====))==  Unsloth: Fast Mistral patching release 2024.6
   \\   /|    GPU: Tesla T4. Max memory: 14.568 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.26.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth: Will load unsloth/mistral-7b-v0.3-bnb-4bit as a legacy tokenizer.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['<s>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat is a famous tall tower in Paris?\n\n### Input:\n\n\n### Response:\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']

### Model is now ready to be used

In [None]:
# Save to q4_k_m GGUF
if True: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if True: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "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")
if True: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q5_k_m", token = "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")  

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 14.84 out of 27.35 RAM for saving.


100%|██████████| 32/32 [02:12<00:00,  4.15s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
