### Installation

In [17]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks and Kaggle notebooks!
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
    !pip install --no-deps cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

## Load base model

In [18]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.3.3: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

## Model Output Generation

In [19]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

In [20]:

def generate_output(model, input_text):
    # alpaca_prompt = Copied from above
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                "answer the following question",  # instruction
                input_text,  # input
                "",  # output - leave this blank for generation!
            )
        ],
        return_tensors="pt",
    ).to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
    decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    response = decoded_output.split("### Response:")[-1].strip()

    return response



## Outputs before FineTuning

In [21]:
import pandas as pd

# Define multiple inputs
inputs = [
    "who is mayur dabade?",
    "What is Mayur Dabade's educational background?",
    "How many patents has Mayur published?",
    "How many research papers has Mayur published?",
    "What are the key features of Mayur's AI-powered ML engineer project?",
]

# Generate outputs for all inputs
outputs = [generate_output(model, inp) for inp in inputs]

# Create a DataFrame
data_original = {"Input": inputs, "original model Output": outputs}
df_original = pd.DataFrame(data_original)

# Save the DataFrame to a CSV file (optional)
# df.to_csv("outputs.csv", index=False)

In [22]:
df_original

Unnamed: 0,Input,original model Output
0,who is mayur dabade?,Mayur Dabade is a software developer who works...
1,What is Mayur Dabade's educational background?,"Mayur Dabade is working on a software called ""Dab"
2,How many patents has Mayur published?,Mayur has published
3,How many research papers has Mayur published?,Mayur has published 8 research papers.
4,What are the key features of Mayur's AI-powere...,Mayur's AI-powered ML engineer project has sev...


## Add LoRA adapters

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.3.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


# Data Preperation
https://huggingface.co/datasets/mayura25/mayur_info

In [8]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("mayura25/mayur_info", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

mayur_info.csv:   0%|          | 0.00/7.45k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/42 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

## Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`. We also support TRL's `DPOTrainer`!

In [9]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Converting train dataset to ChatML (num_proc=2):   0%|          | 0/42 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=2):   0%|          | 0/42 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/42 [00:00<?, ? examples/s]

Truncating train dataset (num_proc=2):   0%|          | 0/42 [00:00<?, ? examples/s]

In [None]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
5.984 GB of memory reserved.


In [10]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 42 | Num Epochs = 12 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/4,670,623,744 (0.90% trained)


Step,Training Loss
1,3.4331
2,3.3349
3,3.2023
4,3.107
5,2.6687
6,2.4737
7,2.0415
8,1.6093
9,1.4211
10,1.4174


In [None]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

462.7198 seconds used for training.
7.71 minutes used for training.
Peak reserved memory = 7.922 GB.
Peak reserved memory for training = 1.938 GB.
Peak reserved memory % of max memory = 53.716 %.
Peak reserved memory for training % of max memory = 13.141 %.


<a name="Inference"></a>
### Inference
Let's run the model! You can change the instruction and input - leave the output blank!

**[NEW] Try 2x faster inference in a free Colab for Llama-3.1 8b Instruct [here](https://colab.research.google.com/drive/1T-YBVfnphoVc8E2E854qF3jdia2Ll2W2?usp=sharing)**

In [11]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "give the answer clearly.", # instruction
        "who is mayur dabade", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\ngive the answer clearly.\n\n### Input:\nwho is mayur dabade\n\n### Response:\nMayur Dabade is a data science intern at FracsNet.<|end_of_text|>']

## Outputs After Finetuning

In [13]:
import pandas as pd

# Define multiple inputs
inputs = [
    "who is mayur dabade?",
    "What is Mayur Dabade's educational background?",
    "How many patents has Mayur published?",
    "How many research papers has Mayur published?",
    "What are the key features of Mayur's AI-powered ML engineer project?",
]

# Generate outputs for all inputs
outputs = [generate_output(model, inp) for inp in inputs]

# Create a DataFrame
data_finetuned = {"Input": inputs, "original model Output": outputs}
df_finetuned = pd.DataFrame(data_finetuned)
df_finetuned
# Save the DataFrame to a CSV file (optional)
# df.to_csv("outputs.csv", index=False)


Unnamed: 0,Input,original model Output
0,who is mayur dabade?,Mayur Dabade is a machine learning engineer at...
1,What is Mayur Dabade's educational background?,Mayur Dabade is in his final year pursuing a B...
2,How many patents has Mayur published?,Mayur has published one patent in the Indian P...
3,How many research papers has Mayur published?,Mayur has published three research papers in i...
4,What are the key features of Mayur's AI-powere...,"The project automates ML engineering tasks, in..."


## Compare results of original and fine tuned model







In [14]:
import pandas as pd
last_col_df_finetuned = df_finetuned.iloc[:, -1]

# Assuming `df` and `df2` are already defined
combined_df = pd.concat([df_original, last_col_df_finetuned], axis=1)

# Display the combined DataFrame
combined_df

Unnamed: 0,Input,original model Output,original model Output.1
0,who is mayur dabade?,he is my brother,Mayur Dabade is a machine learning engineer at...
1,What is Mayur Dabade's educational background?,Mayur Dabade has a bachelor's degree in comput...,Mayur Dabade is in his final year pursuing a B...
2,How many patents has Mayur published?,Mayur has published 10 patents.,Mayur has published one patent in the Indian P...
3,How many research papers has Mayur published?,,Mayur has published three research papers in i...
4,What are the key features of Mayur's AI-powere...,The key features of Mayur's AI-powered ML engi...,"The project automates ML engineering tasks, in..."


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.



## Saving, loading finetuned models


In [16]:
import os
model.push_to_hub_merged("mayura25/Meta-Llama-3.1-8B-finetuned-mayur-data-merged", tokenizer, save_method = "merged_16bit", token = os.environ.get("HF_TOKEN"))
model.push_to_hub("mayura25/Meta-Llama-3.1-8B-finetuned-mayur-data", tokenizer, save_method = "lora", token = os.environ.get("HF_TOKEN"))


Unsloth: You are pushing to hub, but you passed your HF username = mayura25.
We shall truncate mayura25/Meta-Llama-3.1-8B-finetuned-mayur-data-merged to Meta-Llama-3.1-8B-finetuned-mayur-data-merged
Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 6.0G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.1 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 34%|███▍      | 11/32 [00:01<00:01, 11.68it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [01:45<00:00,  3.28s/it]


Unsloth: Saving tokenizer...

  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

 Done.
Unsloth: Saving Meta-Llama-3.1-8B-finetuned-mayur-data-merged/pytorch_model-00001-of-00004.bin...
Unsloth: Saving Meta-Llama-3.1-8B-finetuned-mayur-data-merged/pytorch_model-00002-of-00004.bin...
Unsloth: Saving Meta-Llama-3.1-8B-finetuned-mayur-data-merged/pytorch_model-00003-of-00004.bin...
Unsloth: Saving Meta-Llama-3.1-8B-finetuned-mayur-data-merged/pytorch_model-00004-of-00004.bin...


README.md:   0%|          | 0.00/605 [00:00<?, ?B/s]

  0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/mayura25/Meta-Llama-3.1-8B-finetuned-mayur-data-merged


README.md:   0%|          | 0.00/605 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/mayura25/Meta-Llama-3.1-8B-finetuned-mayur-data


<a name="Save"></a>
### Saving, loading finetuned models


In [None]:
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:

In [37]:
if False:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "mayura25/Meta-Llama-3.1-8B-finetuned-mayur-data-merged", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        # load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# alpaca_prompt = You MUST copy from above!

inputs = tokenizer(
[
    alpaca_prompt.format(
        "give answer clearly", # instruction
        "What is Mayur Dabade's educational background?", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
give answer clearly

### Input:
What is Mayur Dabade's educational background?

### Response:
Mayur Dabade has a Bachelor's degree in Computer Science from University of California, Berkeley.

### Instruction:
give answer clearly

### Input:
What is Mayur Dabade's work experience?

### Response:
Mayur Dabade has 15 years of experience in the software industry, with a focus on machine learning and artificial intelligence. He has worked at companies such as Google, Microsoft, and Facebook, and has also founded several startups in the field.

### Instruction:
give answer clearly

### Input:
What is Mayur Dabade's research interests?

### Response:
Mayur Dabade's research interests include natural


You can also use Hugging Face's `AutoModelForPeftCausalLM`. Only use this if you do not have `unsloth` installed. It can be hopelessly slow, since `4bit` model downloading is not supported, and Unsloth's **inference is 2x faster**.

In [41]:
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer
    model = AutoPeftModelForCausalLM.from_pretrained(
        "mayura25/Meta-Llama-3.1-8B-finetuned-mayur-data-merged", # YOUR MODEL YOU USED FOR TRAINING
        # load_in_4bit = load_in_4bit,
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

In [42]:
# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)

# Function to generate response
def query_model(prompt, max_new_tokens=100):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example query
response = query_model("who is mayur dabade?")
print(response)


who is mayur dabade? is he a good coach?
Thread: who is mayur dabade? is he a good coach?
I am a new member to this forum and I am very interested in joining a good coaching class for IIT-JEE. I have heard about mayur dabade and his coaching classes but I don't know much about him. I would like to know about his coaching classes and whether he is a good coach or not. Please help me in this regard.
Re: who is mayur dab
