install deps

In [1]:
# Install unsloth library
# Please restart session when asked to restart session.
# Rerun this code to make sure that all libraries are installed.

#core
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton


#unsloth and training
!pip install --no-deps cut_cross_entropy unsloth_zoo
!pip install unsloth

#for datasets and tokenizers
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install pandas

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting xformers==0.0.29
  Downloading xformers-0.0.29-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl
  Downloading trl-0.16.0-py3-none-any.whl.metadata (12 kB)
Downloading xformers-0.0.29-cp311-cp311-manylinux_2_28_x86_64.whl (15.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m71.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl (76.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trl-0.16.0-py3-none-any.whl (335 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m335.7/335.7 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xformers, trl, bitsandbytes
Successfully installed bitsandbytes-0.45.4 trl-0.16.0 xformers-



load the qwen model

In [2]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)
chat_model = model
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.50.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

Unsloth 2025.3.19 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


write a test csv trainingdata file to the colab dir


In [3]:
%%writefile training_data.csv
text
"User: Hello! Assistant: Hi How can i help?"
"User: What is first aid and why is it so important? Assistant: First aid is the immediate care provided to a person who is injured or suddenly ill. It can save lives, reduce pain, and prevent a condition from worsening before professional medical services arrive."
"User: What steps should I take if I believe someone is having a heart attack? Assistant: Call Emergency services immediately, have the person sit or lie down, loosen any tight clothing and if they are conscious and not allergic, consider giving them aspirin."
"User: A family member fell and has an open wound that won’t stop bleeding. What’s the best first aid? Assistant: Apply direct pressure to the wound using a clean cloth or gauze to help control bleeding. Keep the injured area elevated if possible, and seek professional medical attention if the bleeding is heavy or doesn’t stop."
"User: How do I assist a person who is unresponsive and not breathing? Assistant: Immediately call emergency services. Begin cardiopulmonary resuscitation (CPR) by pushing hard and fast on the center of their chest at around 100–120 compressions per minute until help arrives or they regain consciousness. If you’re trained and it’s appropriate, provide rescue breaths as well."
"User: My hiking partner broke their arm out on the trail. What’s the immediate first aid? Assistant: Stabilize the injured limb and immobilize it using a splint or sling to prevent movement. Place padding around the injury if possible, and seek medical help. Avoid straightening or moving the limb if you suspect a fracture."
"User: What should I do if someone is showing signs of shock after an injury? Assistant: Have them lie down on their back and, if there’s no head, neck, or spinal injury, elevate their legs slightly. Keep them warm with a blanket but don’t overheat them. Call for emergency services and monitor their condition until help arrives."
"User: How do I respond to a person who’s having a severe allergic reaction? Assistant: Call emergency services right away, and if the person carries an epinephrine auto‐injector (EpiPen), help them use it promptly. Keep them calm and monitor their breathing until professional help arrives."


Writing training_data.csv


load and parse csv to write sample training data

In [4]:
import pandas as pd

#TRAINING_DATA = "training_data.csv"
TRAINING_DATA = "training-data-cot-gptgenerated.csv" #Make sure to add this file to the dir if you wanna use it!




train_prompt_style = """
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful and knowledgeable first aid assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
Let's think step by step. {} Therefore, {}<|eot_id|>
"""


df = pd.read_csv(TRAINING_DATA) #load csv
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    inputs = examples["Question"]
    cots = examples["Cot"]
    outputs = examples["Answer"]
    texts = []
    for input, cot, output in zip(inputs, cots, outputs):
        text = train_prompt_style.format(input, cot, output) + EOS_TOKEN
        texts.append(text)
    return {
        "text": texts,
    }

print out data just so we can see

In [5]:
from datasets import Dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.map(formatting_prompts_func, batched = True,)
print(TRAINING_DATA)  #works

print(f"\n{dataset[0]['text']}")
print(f"\n{dataset[2]['text']}")
print(f"\n{dataset[4]['text']}")



Map:   0%|          | 0/14 [00:00<?, ? examples/s]

training-data-cot-gptgenerated.csv


<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful and knowledgeable first aid assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
What should I do if someone has a deep cut that is bleeding heavily?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
Let's think step by step. Heavy bleeding from a deep cut can be life-threatening. The goal is to stop the bleeding quickly. Direct pressure helps compress blood vessels. Elevating the injury above the heart level slows blood flow. Therefore, Apply firm pressure to the wound with a clean cloth or bandage to stop the bleeding. If possible, elevate the injured area above heart level. Seek emergency medical help if bleeding continues after 10 minutes of pressure.<|eot_id|>
<|eot_id|>


<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful and knowledgeable first aid assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
What are the steps

trainer

In [6]:

from trl import SFTConfig, SFTTrainer

training_args = SFTConfig(output_dir="./output",
                          max_seq_length=max_seq_length,
                          max_steps = 50,
                          learning_rate = 2e-4)


#tokenize the data before giving it to the trainer
tokenized_training_data = dataset.map(lambda e: tokenizer(e["text"]), batched=True, remove_columns=dataset.column_names)



trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_training_data,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=training_args,
)


Map:   0%|          | 0/14 [00:00<?, ? examples/s]

In [7]:
# Disable all WANDB setting.
import os
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_DISABLED"] = "true"

import wandb
wandb.init(mode="disabled")

# Start to fine-tune the LLM
trainer_stats = trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 14 | Num Epochs = 25 | Total steps = 50
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 11,272,192/1,000,000,000 (1.13% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.6868
2,3.7291
3,3.6621
4,3.3312
5,3.0395
6,3.0557
7,2.5337
8,2.2796
9,2.0298
10,1.7778




Ask it a question here

In [8]:

question = "What should I do if someone gets a deep cut across their heel that is bleeding alot?"

prompt_style = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful and knowledgeable first aid assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>
{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

FastLanguageModel.for_inference(model)  # Unsloth has 2x faster inference!
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)
response = tokenizer.batch_decode(outputs)
print(response)
answer_sep = "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
print(f"Prompt:\n{question}")
print(f"\n\nResponse:")
print(response[0].split(answer_sep)[1].replace("<|eot_id|>",""))

["<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are a helpful and knowledgeable first aid assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\nWhat should I do if someone gets a deep cut across their heel that is bleeding alot?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\nLet's think step by step. Deep cuts can cause significant blood loss. Severe bleeding needs medical attention. Therefore, Apply pressure to stop the bleed. Seek emergency help if the person continues to bleed heavily.<|eot_id|>"]
Prompt:
What should I do if someone gets a deep cut across their heel that is bleeding alot?


Response:

Let's think step by step. Deep cuts can cause significant blood loss. Severe bleeding needs medical attention. Therefore, Apply pressure to stop the bleed. Seek emergency help if the person continues to bleed heavily.


Save Model

In [79]:
model.save_pretrained_gguf("dir", tokenizer, quantization_method = "q8_0")

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 1.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 4.2 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 25.77it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving dir/pytorch_model.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at dir into f16 GGUF format.
The output location will be /content/dir/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: dir
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {32}
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model.bin'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> F16, shape = {2048, 128256}
INFO:hf-to-

100%|██████████| 16/16 [00:00<00:00, 43.78it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving dir/pytorch_model.bin...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at dir into q8_0 GGUF format.
The output location will be /content/dir/unsloth.Q8_0.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: dir
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {32}
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model.bin'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> Q8_0, shape = {2048, 1

100%|██████████| 16/16 [00:00<00:00, 43.43it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving dir/pytorch_model.bin...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['f16'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at dir into f16 GGUF format.
The output location will be /content/dir/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: dir
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {32}
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model.bin'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> F16, shape = {2048, 12825

In [82]:
!pip install -qU langchain-huggingface
!pip install -qU langchain-community
!pip install faiss-cpu
!pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.8.tar.gz (67.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.3.8-cp311-cp311-linux_x86_64.whl size=5959604 sha256=c7996f1657d5ba405827

Load dataset and LLM

In [97]:
from langchain_community.document_loaders import TextLoader
from langchain_community.llms import LlamaCpp

data = '/content/rag_doc.txt'
desired_context_length = 4096

loader = TextLoader(data)

docs = loader.load()
llm = LlamaCpp(model_path="/content/dir/unsloth.Q8_0.gguf", n_ctx=desired_context_length)

llama_model_loader: loaded meta data with 28 key-value pairs and 147 tensors from /content/dir/unsloth.Q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Dir
llama_model_loader: - kv   3:                         general.size_label str              = 1.2B
llama_model_loader: - kv   4:                          llama.block_count u32              = 16
llama_model_loader: - kv   5:                       llama.context_length u32              = 131072
llama_model_loader: - kv   6:                     llama.embedding_length u32              = 2048
llama_model_loader: - kv   7:                  llama.feed_forward_length u32      

In [108]:
print(docs[0].metadata)
print(docs[0].page_content[0:500])

{'source': '/content/rag_doc.txt'}
New Patient:
Here's a summary of the discharge information, focusing on injuries and treatment plan:

**Injuries:**

*   **Chronic Unstable Infected Right Ankle Joint:** The discharge diagnosis mentions a chronic, unstable, and infected right ankle joint. The summary does not specify the cause of the infection.
*   **Aortic Stenosis:** Diagnosed via echocardiogram. No additional details provided.
*   **History of PVD (Peripheral Vascular Disease)**
*   **History of HTN (Hypertension)**
*   **His


Embedding Model

In [99]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings



text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200,separators=['New Patient:'])
splits = text_splitter.split_documents(docs)

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")


Vector Store

In [100]:
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
import faiss

embedding_dim = len(embedding_model.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

vector_store = FAISS(
    embedding_function=embedding_model,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)


vector_store.add_texts([d.page_content for d in splits])
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 5})

Create Chain

In [110]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know."
    "\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

NameError: name 'context' is not defined

Ask Questions Here

In [109]:
results = rag_chain.invoke({"input": "Treatment plan for degloving injury."})
print(results['answer'])

Llama.generate: 45 prefix-match hit, remaining 1831 prompt tokens to eval
llama_perf_context_print:        load time =   68118.11 ms
llama_perf_context_print: prompt eval time =  130124.81 ms /  1831 tokens (   71.07 ms per token,    14.07 tokens per second)
llama_perf_context_print:        eval time =    9637.59 ms /    50 runs   (  192.75 ms per token,     5.19 tokens per second)
llama_perf_context_print:       total time =  139927.55 ms /  1881 tokens


 Physical therapy and wound care are important components of the treatment plan. The patient's safety is ensured through close monitoring of vital signs during physical therapy. Wound care instructions are provided to ensure proper healing. Provisions for post-treatment care are discussed.
