# **Fine-Tuning DeepSeek-R1 - 8B Model**

In [None]:
!pip install unsloth
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [None]:
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from unsloth import is_bfloat16_supported
from huggingface_hub import login
from transformers import TrainingArguments
from datasets import load_dataset
import wandb

In [None]:
from unsloth import FastLanguageModel
import torch
# Shared configuration consumed by the model-loading and trainer cells.
max_seq_length = 2048  # passed as max_seq_length to from_pretrained and to SFTTrainer
dtype = None  # forwarded unchanged to from_pretrained; presumably None lets Unsloth choose the dtype — TODO confirm
load_in_4bit = True  # forwarded to from_pretrained as its load_in_4bit flag

In [None]:
# Read the Hugging Face token stored under the Colab user-data key 'HF_TOKEN'
# and log in with it. hf_token is also passed to from_pretrained when the model is loaded.
from google.colab import userdata
from huggingface_hub import login
hf_token = userdata.get('HF_TOKEN')
login(hf_token)

In [None]:
# Step 1: Install wandb (if not already installed)
!pip install -q wandb

# Step 2: Import wandb
import wandb

# Step 3: Login securely using wandb API key
# Use input or environment variable (safer in shared notebooks)
import os

# Prompt user to paste token if not already set
if "WB_API" not in os.environ:
    os.environ["WB_API"] = input("Paste your WandB API Key here: ")

wandb.login(key=os.environ["WB_API"])

# Step 4: Initialize the wandb run
run = wandb.init(
    project='Fine-tune-DeepSeek-R1-Distill-Llama-8B on Medical COT Dataset',
    job_type="training",
    anonymous="allow"
)


In [None]:
from unsloth import FastLanguageModel
# Load the "unsloth/DeepSeek-R1-Distill-Llama-8B" checkpoint and its tokenizer
# through Unsloth, using the max_seq_length / dtype / load_in_4bit settings
# defined earlier and the Hugging Face token obtained at login.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/DeepSeek-R1-Distill-Llama-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = hf_token,
)

In [None]:
# Inference prompt template with two positional placeholders: the first is
# filled with the question, the second follows the opening <think> tag (callers
# in this notebook pass "" for it so the model writes the reasoning itself).
prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.
Please answer the following medical question.

### Question:
{}

### Response:
<think>{}"""

In [None]:
# Baseline check: query the model before any LoRA adapters are attached or trained.
question = "A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings, what would cystometry most likely reveal about her residual volume and detrusor contractions?"


FastLanguageModel.for_inference(model)  # Unsloth has 2x faster inference!
# Tokenize the filled-in prompt as a batch of one and move the tensors to the GPU.
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)
response = tokenizer.batch_decode(outputs)
# The decoded text contains the prompt too; print only what follows the "### Response:" marker.
print(response[0].split("### Response:")[1])

In [None]:
# Wrap the loaded model with LoRA adapters (r=16, lora_alpha=16, no dropout,
# no bias terms) on the projection modules listed in target_modules.
# `model` is rebound to the adapter-wrapped model used by the trainer.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

In [None]:
# Training prompt template with three positional placeholders, filled by
# formatting_prompts_func in this order: question, chain of thought (placed
# between <think> and </think>), and final response.
train_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.
Please answer the following medical question.

### Question:
{}

### Response:
<think>
{}
</think>
{}"""

In [None]:
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN


def formatting_prompts_func(examples):
    """Render a batch of dataset rows into training prompt strings.

    Args:
        examples: A batch mapping with "Question", "Complex_CoT" and
            "Response" entries, each an equally ordered sequence of strings.

    Returns:
        A dict with a single "text" key holding one string per row:
        ``train_prompt_style`` filled with that row's question, chain of
        thought and response, followed by ``EOS_TOKEN``.
    """
    rows = zip(examples["Question"], examples["Complex_CoT"], examples["Response"])
    return {
        "text": [
            train_prompt_style.format(question, reasoning, answer) + EOS_TOKEN
            for question, reasoning, answer in rows
        ],
    }

In [None]:
from datasets import load_dataset
# Load rows 0-499 of the train split ("en" configuration), then add a "text"
# column of rendered training prompts via a batched map over formatting_prompts_func.
dataset = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT","en", split = "train[0:500]",trust_remote_code=True)
dataset = dataset.map(formatting_prompts_func, batched = True,)
# Display the first rendered prompt as a quick sanity check.
dataset["text"][0]

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Build the supervised fine-tuning trainer over the "text" column of `dataset`.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=TrainingArguments(
        # 2 samples per device x 4 accumulation steps = 8 samples per optimizer step per device.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        # Use num_train_epochs = 1, warmup_ratio for full training runs!
        warmup_steps=5,
        max_steps=60,
        learning_rate=2e-4,
        # Exactly one of fp16 / bf16 is enabled, depending on is_bfloat16_supported().
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)

In [None]:
import os
# Environment switches set immediately before training.
# NOTE(review): these are assigned after unsloth (and presumably triton) were
# already imported — confirm each variable is still read at this point.
os.environ["UNSLOTH_RETURN_LOGITS"]="1"
os.environ["TRITON_DISABLE_LINE_INFO"]="1"
# NOTE(review): TRITON_INTERPRET=1 presumably selects Triton's interpreter
# (debugging) mode, which would be far slower than compiled kernels — confirm
# this is intended for a training run.
os.environ["TRITON_INTERPRET"]="1"
# Run fine-tuning; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()


In [None]:
# Close the wandb run that was opened with wandb.init() before training.
wandb.finish()


In [None]:
import gradio as gr

# Reuse the model and tokenizer that were loaded and fine-tuned in the previous
# cells. for_inference() was last called before the LoRA adapters were attached
# and trained, so enable Unsloth's inference mode again before generating.
FastLanguageModel.for_inference(model)

# NOTE: `prompt_style` is deliberately NOT redefined here. The template defined
# earlier in the notebook (instruction + "### Question:" + "<think>") mirrors
# the format the model was fine-tuned on, and later cells still format it with
# two arguments; overwriting it with a shorter template would silently change
# the prompt used by every cell that runs afterwards.


def generate_response(question):
    """Answer one clinical question with the fine-tuned model.

    Args:
        question: Free-text question typed into the Gradio textbox.

    Returns:
        The text generated after the "### Response:" marker, stripped of
        surrounding whitespace. A short notice is returned for blank input,
        and a string starting with "❌ Error:" is returned if tokenization or
        generation raises, so the UI shows the failure instead of crashing.
    """
    # Do not spend a 1200-token generation on an empty prompt.
    if not question or not question.strip():
        return "Please enter a question."

    try:
        inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=1200,
            use_cache=True,
        )

        response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        # The decoded text includes the prompt; keep only what follows the last marker.
        final_answer = response.split("### Response:")[-1].strip()
        return final_answer

    except Exception as e:
        # Best-effort UI: report the failure in the output box rather than raising.
        return f"❌ Error: {str(e)}"

# Gradio interface
demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(
        label="Enter Clinical Question",
        lines=5,
        placeholder="Type your medical prompt here...",
    ),
    outputs=gr.Textbox(label="Model Response"),
    title="🧠 Clinical QA with Your Unsloth Model",
    theme="default"
)

# Launch Gradio app
demo.launch(share=True)  # set share=False if not needed


In [None]:
# Post-training check: ask the fine-tuned model a question and print its answer.
question = "A 59-year-old man presents with a fever, chills, night sweats, and generalized fatigue, and is found to have a 12 mm vegetation on the aortic valve. Blood cultures indicate gram-positive, catalase-negative, gamma-hemolytic cocci in chains that do not grow in a 6.5% NaCl medium. What is the most likely predisposing factor for this patient's condition?"

# The model has been through get_peft_model() and trainer.train() since
# for_inference() was last called, so enable Unsloth's inference mode again
# before calling generate().
FastLanguageModel.for_inference(model)
inputs = tokenizer([prompt_style.format(question, "")], return_tensors="pt").to("cuda")

outputs = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)
response = tokenizer.batch_decode(outputs)
# The decoded text contains the prompt too; print only what follows the "### Response:" marker.
print(response[0].split("### Response:")[1])

In [None]:
# Hub repo id and local directory name for the fine-tuned model.
# NOTE(review): new_model_online is not referenced by any code visible in this notebook.
new_model_online = "Manojkumar28/DeepSeek-R1-Medical-COT"
new_model_local = "DeepSeek-R1-Medical-COT"
model.save_pretrained(new_model_local) # Local saving
# Save the tokenizer into the same directory as the model.
tokenizer.save_pretrained(new_model_local)

In [None]:
from huggingface_hub import create_repo, upload_folder

# Reuse the names defined when the model was saved locally instead of repeating
# the literals, so the repo id and folder cannot drift apart:
#   new_model_online -> Hub repo id, new_model_local -> local save directory.

# Create the private model repo, or reuse it if it already exists.
create_repo(repo_id=new_model_online, private=True, exist_ok=True)

# Upload the contents of the locally saved model folder to that repo.
upload_folder(
    folder_path=f"./{new_model_local}",
    repo_id=new_model_online,
    repo_type="model"
)


In [None]:
# List the files in the saved model directory (absolute path under /content).
!ls /content/DeepSeek-R1-Medical-COT


In [None]:
# Zip the saved model directory and hand the archive to the browser for download via Colab.
!zip -r DeepSeek-R1-Medical-COT.zip DeepSeek-R1-Medical-COT
from google.colab import files
files.download("DeepSeek-R1-Medical-COT.zip")


In [None]:
!pip install bitsandbytes accelerate


In [None]:
# Interactive Hugging Face CLI login.
# NOTE(review): the notebook already logs in with login(hf_token) near the top — confirm this is still needed.
!huggingface-cli login


# **Basics of Fine-Tuning**




**What is a Transformer?**

    A Transformer is a special type of AI model that reads the entire sentence
    all at once, instead of word by word. This allows it to understand the
    relationships between words in the sentence, helping it grasp the true
    meaning and context much better.

**Example:**

    In the sentence,

    “A dog is eating cheese because it is hungry,”

    the word  “it” refers to the dog.

    Transformers can easily understand this connection by
    looking at the whole sentence at once. Older models
    like RNNs and LSTMs read word by word and often
    struggled to correctly understand such relationships.
    

**How Transformers Work:**

    1. See the whole sentence at once: Transformers look
       at every word in the sentence together, not just one
       at a time.

    2. Use attention to focus: They use a method called
       “self-attention” to figure out which words are
       important and how they relate to each other.

    3. Understand word relationships: By paying attention
       to these connections, transformers understand the
       meaning and context better.





**What is an LLM (Large Language Model)?**

    An LLM is an AI model trained on a huge amount of text data. It learns
    patterns, grammar, facts, and reasoning from this data. When you give it
    text as input, the model processes it and generates a meaningful text
    output—like answering questions, writing essays, or having a conversation.


**How Large Language Models (LLMs) Work — Simply:**

    They read lots of text to learn how language works.

    They remember how words connect to each other.

    They look at your whole sentence at once to understand what you mean.

    Then, they write back a smart and relevant answer based on what they learned.

**Training vs. Fine-Tuning**

    Training means building an AI model from scratch by teaching it with a
    large amount of data.

    Fine-tuning means taking an already trained model and modifying it using
    your own specific data to make it perform better for your particular needs.

**PEFT (Parameter-Efficient Fine-Tuning):**
    

    PEFT is a family of fine-tuning methods in which:
      You don’t fine-tune the full model
      You only update a small portion of its parameters

    Methods:
      1. LoRA 🔥
      2. Adapter Tuning
      3. Prefix Tuning
      4. Prompt Tuning
      5. BitFit


**Unsloth Library**

    Unsloth is a library used to improve training speed and reduce GPU
    memory pressure while fine-tuning large language models.

    In short: Unsloth speeds up LLM fine-tuning and lowers GPU memory usage.

**Steps**

    1. Installing the packages
    2. Initializing model and tokenizer
    3. Adding LoRA Adapters
    4. Data Preparation
    5. Training the Model
    6. Starting the training run with the configured trainer
    7. Interfacing with the fine-tuned model

