In [1]:
import unsloth


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
import os

# Keep the unsloth imports ahead of trl/transformers so Unsloth's patches are applied first.
from unsloth import FastLanguageModel, to_sharegpt
from unsloth import is_bfloat16_supported

import pandas as pd
import torch
from datasets import load_dataset, Dataset
from transformers.training_args import TrainingArguments
from trl import SFTTrainer

In [None]:
# Define parameters
# All of these are forwarded to FastLanguageModel.from_pretrained in the next cell.
model_name = "meta-llama/Llama-3.2-3B-Instruct"
max_seq_length = 2048
dtype = None  # NOTE(review): None presumably lets Unsloth choose the dtype - confirm
load_in_4bit = True
# Read the Hugging Face access token from the HF_TOKEN environment variable instead of
# pasting a secret into the notebook. When the variable is unset this is None.
token = os.environ.get("HF_TOKEN")


In [5]:
# Declare model
# Load the checkpoint and its tokenizer through Unsloth, forwarding the settings
# from the parameter cell (sequence length, dtype, 4-bit flag and HF token).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=token,
)


In [None]:
from huggingface_hub import HfFolder
from datasets import config

# Cache directory reported by the `datasets` configuration (config.HF_DATASETS_CACHE).
# NOTE(review): the previous comment described this as the cache for Hugging Face Hub
# model/tokenizer files; judging by the constant's name it is the datasets cache - confirm.
cache_dir = config.HF_DATASETS_CACHE

print(cache_dir)


In [6]:
# Load dataset
# English medical QA sets (both expose instruction / input / output columns).
dataset_eng_qa = load_dataset(
    "lavita/medical-qa-datasets",
    "all-processed",
    split="train",
)
dataset_eng_qa_2 = load_dataset("eashuu/medical_qa", split="train")

# Vietnamese sets: patient QA (question / answer) and symptom -> disease (Question / Disease).
dataset_viet_qa = load_dataset("hungnm/vietnamese-medical-qa", split="train")
dataset_viet_diagnosis = load_dataset("PB3002/ViMedical_Disease", split="train")

# Parallel English / Vietnamese text (en / vi columns), "pubmed22" split.
dataset_vipubmed = load_dataset("VietAI/vi_pubmed", split="pubmed22")


Resolving data files:   0%|          | 0/89 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/87 [00:00<?, ?it/s]

In [7]:
# Print the column names of every loaded dataset, one line each, in load order.
for _loaded in (
    dataset_eng_qa,
    dataset_eng_qa_2,
    dataset_viet_qa,
    dataset_viet_diagnosis,
    dataset_vipubmed,
):
    print(_loaded.column_names)


['instruction', 'input', 'output', '__index_level_0__']
['instruction', 'input', 'output']
['answer', 'question']
['Disease', 'Question']
['en', 'vi']


In [8]:
# Data preprocessing
def format_eng_qa(example):
    """Rebrand an English QA row so the answer no longer mentions a chatbot.

    Args:
        example: Row with 'instruction', 'input' and 'output' entries.

    Returns:
        dict with the same three keys; in 'output', "chatbot" is replaced by
        "telemedicine agent system" and "Chatbot" by "Telemedicine agent system".
    """
    rebranded = (
        example['output']
        .replace("chatbot", "telemedicine agent system")
        .replace("Chatbot", "Telemedicine agent system")
    )
    return dict(
        instruction=example['instruction'],
        input=example['input'],
        output=rebranded,
    )
# Run the formatter over every row; the mapped dataset replaces the raw one under the same name.
dataset_eng_qa = dataset_eng_qa.map(format_eng_qa)


In [9]:
def format_viet_qa(example):
    """Map a Vietnamese QA row onto the instruction / input / output schema.

    Args:
        example: Row with "question" and "answer" entries.

    Returns:
        A single dict (not a list of dicts) holding a fixed Vietnamese instruction,
        the question as "input" and the answer as "output".
    """
    patient_question = example["question"]
    doctor_answer = example["answer"]
    return dict(
        instruction="đọc thông tin sau và giải đáp câu hỏi của bệnh nhân",
        input=patient_question,
        output=doctor_answer,
    )
# Run the formatter over every row; the mapped dataset replaces the raw one under the same name.
dataset_viet_qa = dataset_viet_qa.map(format_viet_qa)


In [10]:
def format_diagnosis(example):
    """Turn a symptom -> disease row into an instruction / input / output record.

    Args:
        example: Row with "Disease" and "Question" entries.

    Returns:
        dict whose "input" is the row's "Question" and whose "output" is a fixed
        Vietnamese sentence naming the disease, followed by a fixed caveat sentence.
    """
    suspected = example["Disease"]
    reply = "".join((
        f"Dựa trên triệu chứng bạn mô tả, có thể bạn đang mắc bệnh {suspected}. ",
        "Tuy nhiên, đây chỉ là đánh giá sơ bộ. Bạn nên đi khám bác sĩ để được chẩn đoán chính xác.",
    ))
    record = {
        "instruction": "Dựa vào những triệu chứng mà bệnh nhân mô tả, hãy đưa ra chẩn đoán bệnh.",
    }
    record["input"] = example["Question"]
    record["output"] = reply
    return record

def format_vipubmed(example):
    """Turn an English / Vietnamese text pair into a translation training record.

    Args:
        example: Row with 'en' (English text) and 'vi' (Vietnamese text) entries.

    Returns:
        dict with a fixed Vietnamese instruction, an 'input' that embeds the English
        text in a translation request, and an 'output' that embeds the Vietnamese text.
    """
    source_text = example['en']
    translated_text = example['vi']
    # Local names avoid shadowing the builtin `input`.
    return {
        'instruction': "Hãy đọc văn bản tiếng anh do người dùng cung cấp và dịch sang tiếng việt.",
        'input': f"Bạn hãy giúp tôi dịch văn bản sau đây sang tiếng Việt: {source_text}",
        'output': f"Văn bản sau khi được dịch là:\n\t{translated_text}",
    }

# dataset_eng_qa and dataset_viet_qa were already formatted in the cells where their
# formatters are defined, so only the two remaining datasets are mapped here.
# NOTE(review): only the first 10,000 vi_pubmed rows are used when the training frame is
# assembled later; consider `.select(range(10_000))` before mapping to cut the work.
dataset_viet_diagnosis = dataset_viet_diagnosis.map(format_diagnosis)
dataset_vipubmed = dataset_vipubmed.map(format_vipubmed)

# The training frame (`dataset`) is assembled from sub-sampled pieces in the cells that
# follow. The all-rows pd.concat that used to sit here copied every dataset (including
# the whole vi_pubmed split) into pandas and was then overwritten without being used.


In [12]:
# Start the training frame from the English QA set, the first 5,000 rows of the
# second English QA set, and the Vietnamese QA set.
qa_frames = [
    pd.DataFrame(dataset_eng_qa),
    pd.DataFrame(dataset_eng_qa_2[:5000]),
    pd.DataFrame(dataset_viet_qa),
]
dataset = pd.concat(qa_frames, ignore_index=True)


In [13]:
# Append the Vietnamese diagnosis examples to the training frame.
# NOTE: this extends `dataset` in place of itself, so re-running the cell appends the rows again.
dataset = pd.concat(
    [dataset, pd.DataFrame(dataset_viet_diagnosis)],
    ignore_index=True,
)


In [14]:
# Append the first 10,000 translation examples to the training frame.
# NOTE: this extends `dataset` in place of itself, so re-running the cell appends the rows again.
dataset = pd.concat(
    [dataset, pd.DataFrame(dataset_vipubmed[:10000])],
    ignore_index=True,
)


In [1]:
# Convert the merged pandas frame into a ShareGPT-style dataset; the formatting
# function used for training reads its "conversations" column.
# NOTE(review): the [[...]] part of the prompt is presumably dropped when `input` is
# empty, and conversation_extension=3 presumably merges several rows into one
# multi-turn conversation - confirm against the Unsloth documentation.
sharegpt_source = Dataset.from_pandas(dataset)
dataset = to_sharegpt(
    sharegpt_source,
    merged_prompt="{instruction}[[\nYour input is:\n{input}]]",
    output_column_name="output",
    conversation_extension=3,
)


NameError: name 'to_sharegpt' is not defined

In [None]:
# Define chat template
# Llama 3 style prompt with {SYSTEM}, {INPUT} and {OUTPUT} placeholders. Each role
# header is <|start_header_id|>role<|end_header_id|> followed by a blank line, and each
# turn is closed with <|eot_id|>. The user header previously lacked the "<|" opener.
chat_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{SYSTEM}<|eot_id|><|start_header_id|>user<|end_header_id|>

{INPUT}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{OUTPUT}<|eot_id|>
"""


In [None]:
# Create fine-tune model
# Wrap the loaded model with rank-16 LoRA adapters on the listed projection layers.
# NOTE(review): "o_proj" is not in the target list - confirm that is intentional.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj",
    "gate_proj", "up_proj", "down_proj",
]
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)


In [None]:
def formatting_func(batch):
    """Render ShareGPT-style conversations as Llama 3 chat-formatted training text.

    Args:
        batch: Mapping whose "conversations" entry holds one conversation per
            example; each conversation is a list of
            ``{"from": "human" | "gpt", "value": str}`` messages.

    Returns:
        list[str]: One string per conversation, each starting with
        ``<|begin_of_text|>`` followed by the rendered turns.

    Rendering rules:
        * Every turn is ``<|start_header_id|>ROLE<|end_header_id|>``, a blank
          line, the stripped message text, then ``<|eot_id|>``.
        * A human message containing both "If you are a doctor" and
          "Your input is:" becomes the fixed system prompt plus a user turn that
          keeps only the text after "Your input is:\\n" (the whole message when
          that exact marker is absent).
        * Entries that are not dicts, lack "from"/"value", or come from any other
          role contribute nothing. Consequently, when a single un-batched example
          is passed (a flat list of message dicts), the result is one bare
          ``<|begin_of_text|>`` per message, still as a list.
    """
    system_prompt = "You are a helpful medical assistant. Answer based on the patient's description."
    input_marker = "Your input is:\n"

    def render_turn(role, text):
        # The Llama 3 prompt format puts a blank line between the role header and the
        # content, which is also what the tokenizer's chat template emits at inference.
        return f"<|start_header_id|>{role}<|end_header_id|>\n\n{text}<|eot_id|>"

    formatted_texts = []
    for conversation in batch["conversations"]:
        pieces = ["<|begin_of_text|>"]

        for msg in conversation:
            if not isinstance(msg, dict) or "from" not in msg or "value" not in msg:
                continue

            role = msg["from"]
            content = msg["value"].strip()

            if role == "gpt":
                pieces.append(render_turn("assistant", content))
            elif role == "human":
                if "If you are a doctor" in content and "Your input is:" in content:
                    # partition() leaves the separator empty when the exact marker
                    # (including its newline) is missing; fall back to the full text.
                    _, found, remainder = content.partition(input_marker)
                    user_input = remainder if found else content
                    # NOTE(review): in a multi-turn conversation this can place a system
                    # block after the first turn - confirm that is intended.
                    pieces.append(render_turn("system", system_prompt))
                    pieces.append(render_turn("user", user_input))
                else:
                    pieces.append(render_turn("user", content))

        formatted_texts.append("".join(pieces))

    return formatted_texts  # List of strings, one per example

# Fine-tune with TRL's SFTTrainer. `model` already carries its LoRA adapters (attached
# by FastLanguageModel.get_peft_model), so no `peft_config` is passed: that argument
# expects a single PeftConfig, whereas `model.peft_config` is a dict of adapter configs.
# NOTE(review): max_seq_length is not forwarded to the trainer here - confirm that the
# trainer's default sequence length is acceptable.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=dataset,
    formatting_func=formatting_func,
    args=TrainingArguments(
        output_dir="./output",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=60,  # a positive max_steps takes precedence over num_train_epochs
        num_train_epochs=1,
        learning_rate=2e-4,
        # Use bf16 when the hardware supports it, otherwise fall back to fp16.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
    ),
)
trainer.train()


In [None]:
# Save model
# Write the fine-tuned model and its tokenizer into the same directory.
save_dir = "../models/fine_tuned_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
