In [1]:
%%capture
# Install the released Unsloth package first; presumably this is what pulls in
# its dependencies, since the nightly reinstall below passes --no-deps.
!pip install unsloth
# Also get the latest nightly Unsloth!
# NOTE(review): neither install is version-pinned, so a rerun may pick up different code.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [2]:
import torch
# Sanity check before loading any model: the later cells move tensors to "cuda".
print(torch.cuda.is_available())  # Should return True if CUDA is available


False


## Load base models using unsloth

In [4]:
# Load the 4-bit base model and its tokenizer through Unsloth.
from unsloth import FastLanguageModel
import torch
# These three settings are module-level on purpose: later cells reuse them.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: nothing in this cell reads it, and the model loaded
# below (SmolLM2-135M) is not one of its entries.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# `model` and `tokenizer` are the notebook-wide handles used by every later cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/SmolLM2-135M-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

NotImplementedError: Unsloth: No NVIDIA GPU found? Unsloth currently only supports GPUs!

Note: you may need to restart the kernel to use updated packages.


ERROR: No matching distribution found for triton


## Model Output generation

In [41]:

def generate_output(
    model,
    input_text,
    instruction="answer the following question",
    max_new_tokens=64,
):
    """Generate a response to one question using the Alpaca prompt template.

    Relies on the notebook-level names ``FastLanguageModel``, ``tokenizer`` and
    ``alpaca_prompt``; the cells defining them must have been run first.

    Args:
        model: Model to query. It must expose ``generate`` and ``device``.
        input_text: Text placed in the "### Input:" slot of the prompt.
        instruction: Text placed in the "### Instruction:" slot. Defaults to
            the instruction used throughout this notebook.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text that follows the last "### Response:" marker, with
        surrounding whitespace stripped.
    """
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

    # The response slot is left empty so the model continues from "### Response:".
    prompt = alpaca_prompt.format(instruction, input_text, "")

    # Put the inputs on whichever device the model lives on instead of
    # assuming the literal "cuda" device.
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)
    decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # The decoded sequence still contains the prompt; keep only the answer part.
    return decoded_output.split("### Response:")[-1].strip()



## Create dataframe for random test examples

In [42]:
import pandas as pd

# Questions used to probe the model; the fine-tuned model is asked the same
# ones later so the two sets of answers can be compared row by row.
inputs = [
    "What are the causes of hypertension?",
    "How is asthma diagnosed?",
    "Can you provide a brief summary of Hashimoto's thyroiditis?",
    "What are the symptoms of anemia?",
    "What is the treatment for migraine?",
    "What is the difference between Type 1 and Type 2 diabetes?",
    "How can osteoporosis be prevented?",
    "What are the early signs of Alzheimer's disease?",
]

# Generate one answer per question with the base model.
outputs = [generate_output(model, inp) for inp in inputs]

# Build the results table. The frame must be built from `data_original`:
# the previous `pd.DataFrame(data)` referenced a name that is never defined
# and raised NameError on a fresh kernel.
data_original = {"Input": inputs, "original model Output": outputs}
df_original = pd.DataFrame(data_original)

# Save the DataFrame to a CSV file (optional)
# df_original.to_csv("outputs.csv", index=False)

In [43]:
# Display the base-model answers as the cell output.
df_original

Unnamed: 0,Input,original model Output
0,What are the causes of hypertension?,### Input:\nWhat are the causes of hypertensio...
1,How is asthma diagnosed?,### Input:
2,Can you provide a brief summary of Hashimoto's...,### Input:\nCan you provide a brief summary of...
3,What are the symptoms of anemia?,
4,What is the treatment for migraine?,
5,What is the difference between Type 1 and Type...,Type 1 diabetes is characterized by hyperglyce...
6,How can osteoporosis be prevented?,
7,What are the early signs of Alzheimer's disease?,### Input:\n\n### Input:\n\n### Input:\n\n### ...


## Add LoRA adapters

In [12]:
# Attach LoRA adapters to the attention and MLP projection modules of the base
# model; `new_model` is the object that is trained and queried in later cells.
# NOTE(review): get_peft_model may patch `model` in place -- confirm the base
# model is still unmodified if it is queried again after this cell.
new_model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.1.5 patched 30 layers with 30 QKV layers, 30 O layers and 30 MLP layers.


# Data Preparation
dataset link - https://huggingface.co/datasets/Shekswess/medical_llama3_instruct_dataset_short


In [13]:
# Prompt template shared by training and inference. The three `{}` slots are
# filled positionally, in order: instruction, input, response. Inference cells
# pass "" for the response so the model continues after "### Response:".
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence marker appended to every training example.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of examples into full Alpaca-style training strings.

    Args:
        examples: Batch mapping with parallel "instruction", "input" and
            "output" sequences.

    Returns:
        A dict with a single "text" key holding one rendered prompt per
        example, each terminated with EOS_TOKEN.
    """
    columns = (examples["instruction"], examples["input"], examples["output"])
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    rendered = [alpaca_prompt.format(*row) + EOS_TOKEN for row in zip(*columns)]
    return {"text": rendered}

from datasets import load_dataset
# Load the train split of the medical instruction dataset from the Hugging Face Hub.
dataset = load_dataset("Shekswess/medical_llama3_instruct_dataset_short", split = "train")
# Run the prompt formatter over the dataset in batches; it returns a "text"
# field, which is the field the trainer is later pointed at.
dataset = dataset.map(formatting_prompts_func, batched = True,)

README.md:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

<a name="Train"></a>
## Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). This run trains for 2 full epochs (`num_train_epochs=2`, which is 500 optimizer steps on this dataset). To speed things up you can instead cap training with `max_steps=60`, or set `num_train_epochs=1` for a single full pass. We also support TRL's `DPOTrainer`!

In [19]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning of the LoRA-wrapped model on the rendered "text" column.
trainer = SFTTrainer(
    model = new_model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Setting True can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Effective batch size = 2 * 4 = 8
        warmup_steps = 5,
        num_train_epochs = 2, # Two full passes over the dataset (use 1 for a single full run).
        # max_steps = 60, # Uncomment to cap training at a fixed number of steps instead
        learning_rate = 2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on hardware support.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 20,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # "none" disables experiment trackers; set e.g. "wandb" to enable one
    ),
)

Map (num_proc=2):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
#@title Show current memory stats
# Snapshot GPU 0 before training; `start_gpu_memory` and `max_memory` are read
# again by the post-training stats cell to report the training-only delta.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)  # bytes -> GB
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)  # bytes -> GB
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
5.984 GB of memory reserved.


In [20]:
# Run fine-tuning; the returned stats object is kept for the timing report below.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,000 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 500
 "-____-"     Number of trainable parameters = 4,884,480


Step,Training Loss
1,1.8274
2,2.5079
3,2.241
4,1.7809
5,1.7995
6,2.0089
7,1.9813
8,2.2901
9,1.8687
10,1.6928


Step,Training Loss
1,1.8274
2,2.5079
3,2.241
4,1.7809
5,1.7995
6,2.0089
7,1.9813
8,2.2901
9,1.8687
10,1.6928


In [None]:
#@title Show final memory and time stats
# Requires `start_gpu_memory`, `max_memory` and `trainer_stats` from the
# earlier memory-stats and training cells.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
# Memory attributed to training = peak reserved now minus what was reserved before training.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

462.7198 seconds used for training.
7.71 minutes used for training.
Peak reserved memory = 7.922 GB.
Peak reserved memory for training = 1.938 GB.
Peak reserved memory % of max memory = 53.716 %.
Peak reserved memory for training % of max memory = 13.141 %.



## Inference


In [16]:
# alpaca_prompt = Copied from above
# Single-prompt smoke test of the fine-tuned model. The cell output is the full
# decoded sequence, i.e. the prompt followed by the generated completion.
FastLanguageModel.for_inference(new_model) # Enable native 2x faster inference
# NOTE: this rebinds the notebook-level `inputs` name to the tokenizer output.
inputs = tokenizer(
[
    alpaca_prompt.format(
        "answer the following question", # instruction
        "Can you provide a brief summary of Hashimoto's thyroiditis?", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = new_model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

["Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nanswer the following question\n\n### Input:\nCan you provide a brief summary of Hashimoto's thyroiditis?\n\n### Response:\nHashimoto's thyroiditis is an autoimmune disorder that affects the thyroid gland. It is characterized by an overactive thyroid that produces excessive amounts of thyroid hormones. The overactive thyroid is called an autoimmune thyroiditis. Hashimoto's thyroiditis is a chronic autoimmune disorder that affects the thyroid gland. It is estimated that 1"]

In [47]:
import pandas as pd

# Define multiple inputs
# Same questions as the base-model run so both tables line up row by row. The
# list is rebuilt here because the single-prompt inference cell rebinds `inputs`.
inputs = [
    "What are the causes of hypertension?",
    "How is asthma diagnosed?",
    "Can you provide a brief summary of Hashimoto's thyroiditis?",
    "What are the symptoms of anemia?",
    "What is the treatment for migraine?",
    "What is the difference between Type 1 and Type 2 diabetes?",
    "How can osteoporosis be prevented?",
    "What are the early signs of Alzheimer's disease?",
]

# Generate outputs for all inputs (this time with the fine-tuned model)
outputs = [generate_output(new_model, inp) for inp in inputs]

# Create a DataFrame
data_finetuned = {"Input": inputs,"fineTuned model Output": outputs}
df_finetuned = pd.DataFrame(data_finetuned)


In [48]:
# Display the fine-tuned model's answers as the cell output.
df_finetuned

Unnamed: 0,Input,fineTuned model Output
0,What are the causes of hypertension?,Hypertension is a common condition that affect...
1,How is asthma diagnosed?,The diagnosis of asthma is made by a thorough ...
2,Can you provide a brief summary of Hashimoto's...,Hashimoto's thyroiditis is an autoimmune disor...
3,What are the symptoms of anemia?,"Symptoms of anemia include fatigue, weakness, ..."
4,What is the treatment for migraine?,The treatment of migraine is aimed at relievin...
5,What is the difference between Type 1 and Type...,Type 1 diabetes is an autoimmune disorder that...
6,How can osteoporosis be prevented?,Osteoporosis is a disease that affects the bon...
7,What are the early signs of Alzheimer's disease?,Early signs of Alzheimer's disease include: 1....


## Compare results of original and fine tuned model

In [50]:
import pandas as pd

# Keep only the fine-tuned answers (the last column of df_finetuned) so the
# shared "Input" column is not repeated in the side-by-side table.
last_col_df_finetuned = df_finetuned.iloc[:, -1]

# Requires df_original and df_finetuned from the earlier cells; the answers are
# placed next to the base-model table, aligned on the row index.
combined_df = pd.concat(
    [df_original, last_col_df_finetuned],
    axis="columns",
)

# Show the side-by-side comparison
combined_df


Unnamed: 0,Input,original model Output,fineTuned model Output
0,What are the causes of hypertension?,### Input:\nWhat are the causes of hypertensio...,Hypertension is a common condition that affect...
1,How is asthma diagnosed?,### Input:,The diagnosis of asthma is made by a thorough ...
2,Can you provide a brief summary of Hashimoto's...,### Input:\nCan you provide a brief summary of...,Hashimoto's thyroiditis is an autoimmune disor...
3,What are the symptoms of anemia?,,"Symptoms of anemia include fatigue, weakness, ..."
4,What is the treatment for migraine?,,The treatment of migraine is aimed at relievin...
5,What is the difference between Type 1 and Type...,Type 1 diabetes is characterized by hyperglyce...,Type 1 diabetes is an autoimmune disorder that...
6,How can osteoporosis be prevented?,,Osteoporosis is a disease that affects the bon...
7,What are the early signs of Alzheimer's disease?,### Input:\n\n### Input:\n\n### Input:\n\n### ...,Early signs of Alzheimer's disease include: 1....


## You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [39]:
# alpaca_prompt = Copied from above
# Same single-prompt flow as before, but tokens are printed as they are
# generated instead of being decoded once generation has finished.
FastLanguageModel.for_inference(new_model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Answer the following question.", # instruction
        "How is asthma diagnosed", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
# The return value is discarded; the streamer prints the text while generating.
_ = new_model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Answer the following question.

### Input:
How is asthma diagnosed

### Response:
These are the most common causes of asthma:
* Asthma is a chronic disease that affects the airways of the lungs. It is a disorder that causes the airways to narrow and become inflamed. This inflammation can cause the airways to narrow and narrow even further. This can lead to a narrowing of the airways, which can cause the airways to close. This can cause the airways to become blocked, which can cause the airways to close again. This can cause the airways to become inflamed again, which can cause the airways to close again. This can cause the airways to become inflamed again, which can cause the airways to close again. This can



## Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.


In [26]:
import os
# Write the LoRA adapters and the tokenizer to the local "lora_model" directory.
new_model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")
# new_model.push_to_hub("mayura25/SmolLM2-135M-medical-dataset-finetuned", token = os.environ.get("HF_TOKEN")) # Online saving
# tokenizer.push_to_hub("mayura25/SmolLM2-135M-medical-dataset-tokenizer", token = os.environ.get("HF_TOKEN")) # Online saving


# NOTE(review): the commented-out calls below use `model` and a different
# (mistral-7b) repository name; they look like leftovers from another notebook.
# model.save_pretrained("lora_model")
# model.save_pretrained_merged("outputs", tokenizer, save_method = "merged_16bit",)
# model.push_to_hub_merged("praison/mistral-7b-oig-unsloth-merged", tokenizer, save_method = "merged_16bit", token = os.environ.get("HF_TOKEN"))
# model.push_to_hub("praison/mistral-7b-oig-unsloth", tokenizer, save_method = "lora", token = os.environ.get("HF_TOKEN"))

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/vocab.json',
 'lora_model/merges.txt',
 'lora_model/added_tokens.json',
 'lora_model/tokenizer.json')

In [29]:
import os
# The Hugging Face token is read from the HF_TOKEN environment variable, so no
# secret is stored in the notebook; os.environ.get returns None when it is unset.
# First upload: LoRA weights merged into the base model, saved as 16-bit.
new_model.push_to_hub_merged("mayura25/SmolLM2-135M-medical-dataset-finetuned-merged", tokenizer, save_method = "merged_16bit", token = os.environ.get("HF_TOKEN"))
# Second upload: the LoRA adapters only.
new_model.push_to_hub("mayura25/SmolLM2-135M-medical-dataset-finetuned", tokenizer, save_method = "lora", token = os.environ.get("HF_TOKEN"))


Unsloth: You are pushing to hub, but you passed your HF username = mayura25.
We shall truncate mayura25/SmolLM2-135M-medical-dataset-finetuned-merged to SmolLM2-135M-medical-dataset-finetuned-merged
Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 111.9M


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.74 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 30/30 [00:00<00:00, 124.91it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving SmolLM2-135M-medical-dataset-finetuned-merged/pytorch_model.bin...


README.md:   0%|          | 0.00/579 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/269M [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/mayura25/SmolLM2-135M-medical-dataset-finetuned-merged


README.md:   0%|          | 0.00/579 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Saved model to https://huggingface.co/mayura25/SmolLM2-135M-medical-dataset-finetuned


## To load the LoRA adapters we just saved for inference, change `False` to `True`:

In [None]:
# Optional reload of the adapters saved in "lora_model". While the guard is
# False, `model` and `tokenizer` keep whatever values they already have.
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!

# Everything below sits outside the `if False:` guard, so it always runs and
# uses the current `model` / `tokenizer`, whether or not they were reloaded.
inputs = tokenizer(
[
    alpaca_prompt.format(
        "What is a famous tall tower in Paris?", # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
What is a famous tall tower in Paris?

### Input:


### Response:
One of the most famous and iconic tall towers in Paris is the Eiffel Tower. Standing at 324 meters (1,063 feet) tall, this wrought iron tower is a symbol of the city and a must-see attraction for tourists from all over the world.<|end_of_text|>


You can also use PEFT's `AutoPeftModelForCausalLM` from Hugging Face. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [None]:
# Disabled fallback: loads the saved adapters with plain PEFT / Transformers
# instead of Unsloth. Change False to True to use it.
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
# Every line below is disabled with `if False:`; flip one to True to run it.
# "hf/model" and token = "" are placeholders to be replaced with a real
# repository name and access token.
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")