In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import json
from datasets import Dataset


In [None]:
import json

with open("drive/MyDrive/biology_finetune/double_questions_dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print("Total questions:", len(data))


Total questions: 2772


# Preparing the dataset

In [None]:
import json

input_file = "drive/MyDrive/biology_finetune/double_questions_dataset.json"
output_file = "double_questions_dataset_no_COT.json"

# Load the dataset
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# Remove answer3 from every entry if it exists
for item in data:
    if "answer3" in item:
        del item["answer3"]

# Save cleaned dataset
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print("Done! Saved cleaned file as:", output_file)


Done! Saved cleaned file as: double_questions_dataset_no_COT.json


In [None]:
import json
import os

# === INPUT AND OUTPUT FILES ===
input_file = "drive/MyDrive/biology_finetune/double_questions_dataset_no_COT.json"
output_file = "double_questions_dataset_no_COT_final.json"

# === LOAD DATASET ===
try:
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    print(f"Loaded: {len(data)} items from {input_file}")
except FileNotFoundError:
    print(f"Error: Could not find {input_file}. Please check the file path.")
    data = []

rewritten = []

for item in data:
    q = item.get("question", "").strip()
    if not q:
        continue

    a1 = item.get("answer1", "").strip()
    a2 = item.get("answer2", "").strip()
    a3 = item.get("answer3", "").strip()

    # SHORT ANSWER 1
    if a1:
        rewritten.append({
            "messages": [
                {"role": "user", "content": q},
                {"role": "assistant", "content": a1}
            ]
        })

    # SHORT ANSWER 2
    if a2:
        rewritten.append({
            "messages": [
                {"role": "user", "content": q},
                {"role": "assistant", "content": a2}
            ]
        })

    # COT ANSWER 3
    if a3:
        rewritten.append({
            "messages": [
                {"role": "user", "content": q},
                {"role": "assistant", "content": a3}
            ]
        })

print(f"Final dataset rows: {len(rewritten)}")

# === SAVE OUTPUT ===
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(rewritten, f, ensure_ascii=False, indent=2)

print(f"Saved to: {output_file}")


Loaded: 2772 items from drive/MyDrive/biology_finetune/double_questions_dataset_no_COT.json
Final dataset rows: 5544
Saved to: double_questions_dataset_no_COT_final.json


In [None]:
import json
import os

# === INPUT AND OUTPUT FILES ===
input_file = "drive/MyDrive/biology_finetune/double_questions_dataset.json"
output_file = "double_questions_dataset_with_system_prompt.json"

# === SYSTEM PROMPTS ===
# Prompt for Short/Standard Answers (Answer 1 & 2)
system_short = (
    "Sen bir biyoloji asistanısın. Soruyu açık, anlaşılır ve orta uzunlukta bir açıklamayla cevapla. "
    "Gereksiz ayrıntılara girme, mantık adımlarını veya çözüm sürecini açıkça göstermeden "
    "sonuç odaklı ve öğretici bir yanıt üret. "
    "Yanıtın doğal, akıcı ve bilgilendirici olmasına dikkat et."
)

# Prompt for Chain of Thought Answers (Answer 3)
system_cot = (
    "Sen uzman bir biyoloji öğretmenisin. Soruyu adım adım düşünerek cevapla. "
    "Önce problemi analiz et, ardından mantıklı bir akış içinde gerekçeleri sırala ve "
    "her adımda neden-sonuç ilişkisini açıkla. "
    "En sonunda ise net ve güçlü bir sonuç paragrafı yaz."
)

# === LOAD DATASET ===
try:
    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    print(f"Loaded: {len(data)} items from {input_file}")
except FileNotFoundError:
    print(f"Error: Could not find {input_file}. Please check the file path.")
    data = []

rewritten = []

for item in data:
    q = item.get("question", "").strip()

    # Skip if question is missing
    if not q:
        continue

    # Extract answers safely
    a1 = item.get("answer1", "").strip()
    a2 = item.get("answer2", "").strip()
    a3 = item.get("answer3", "").strip()

    # --- SHORT ANSWER 1 (Only add if answer exists) ---
    if a1:
        rewritten.append({
            "messages": [
                {"role": "system", "content": system_short},
                {"role": "user", "content": q},
                {"role": "assistant", "content": a1}
            ]
        })

    # --- SHORT ANSWER 2 (Only add if answer exists) ---
    if a2:
        rewritten.append({
            "messages": [
                {"role": "system", "content": system_short},
                {"role": "user", "content": q},
                {"role": "assistant", "content": a2}
            ]
        })

    # --- COT ANSWER 3 (Only add if answer exists) ---
    if a3:
        rewritten.append({
            "messages": [
                {"role": "system", "content": system_cot},
                {"role": "user", "content": q},
                {"role": "assistant", "content": a3}
            ]
        })

print(f"Final dataset rows: {len(rewritten)}")

# === SAVE OUTPUT ===
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(rewritten, f, ensure_ascii=False, indent=2)

print(f"Saved to: {output_file}")

Loaded: 2772 items from drive/MyDrive/biology_finetune/double_questions_dataset.json
Final dataset rows: 8316
Saved to: double_questions_dataset_with_system_prompt.json


In [None]:
pip install transformers accelerate datasets peft trl bitsandbytes




# Training

In [None]:
import json
from datasets import load_dataset

# === INPUT AND OUTPUT PATHS ===
input_path = "/content/drive/MyDrive/biology_finetune/double_questions_dataset_no_COT_final.json"
output_path = "/content/drive/MyDrive/biology_finetune/double_questions_dataset_no_COT_final_shuffled.json"

# === LOAD DATASET ===
ds = load_dataset(
    "json",
    data_files=input_path,
)["train"]

print("Loaded samples:", len(ds))

# === SHUFFLE ===
ds = ds.shuffle(seed=42)

print("Shuffled samples:", len(ds))

# === SAVE AS HUMAN-READABLE JSONL (UTF-8, NO ESCAPED CHARACTERS) ===
with open(output_path, "w", encoding="utf-8") as f:
    for item in ds:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print("Saved human-readable shuffled file to:", output_path)


Loaded samples: 5544
Shuffled samples: 5544
Saved human-readable shuffled file to: /content/drive/MyDrive/biology_finetune/double_questions_dataset_no_COT_final_shuffled.json


In [None]:
from datasets import load_dataset
data_file = "/content/drive/MyDrive/biology_finetune/double_questions_dataset_no_COT_final_shuffled.json"
# Load your full dataset
dataset = load_dataset("json", data_files=data_file)

full_ds = dataset["train"]

# Create train / validation / test splits
split_ds = full_ds.train_test_split(test_size=0.10, seed=42)  # 10% test
temp_train = split_ds["train"]

split_val = temp_train.train_test_split(test_size=0.055, seed=42)
# ~5% validation (5.5% of 90% ≈ 5%)

train_ds = split_val["train"]
val_ds   = split_val["test"]
test_ds  = split_ds["test"]

print("Train samples:", len(train_ds))
print("Validation samples:", len(val_ds))
print("Test samples:", len(test_ds))


Generating train split: 0 examples [00:00, ? examples/s]

Train samples: 4714
Validation samples: 275
Test samples: 555


In [None]:
train_ds.to_json("train_ds.json")
print("Saved as train_ds.json")


Creating json from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Saved as train_ds.json


In [None]:
trainer.train()


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
30,3.6654,2.33457,2.427356,31708.0,0.564244
60,2.4841,2.2055,2.377972,63368.0,0.580514
90,2.4841,2.126905,2.276145,95405.0,0.589871
120,2.1497,2.056637,2.231077,127258.0,0.598177
150,2.0137,2.003849,2.15521,159715.0,0.608715
180,2.0137,1.946282,2.078686,191382.0,0.617679
210,1.9581,1.903612,2.055263,222168.0,0.6248
240,1.9581,1.872038,2.029203,253614.0,0.628125
270,1.8683,1.847081,2.007316,285549.0,0.63302
300,1.8189,1.820567,1.986217,317437.0,0.636923


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using

TrainOutput(global_step=2950, training_loss=1.072493334624727, metrics={'train_runtime': 2571.0235, 'train_samples_per_second': 9.168, 'train_steps_per_second': 1.147, 'total_flos': 3.0445902372980736e+16, 'train_loss': 1.072493334624727, 'epoch': 5.0})

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig
from trl import SFTConfig, SFTTrainer
import inspect

# 1. Disable external logging tools
os.environ["WANDB_DISABLED"] = "true"

model_name = "Qwen/Qwen2.5-1.5B"

# 2. Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.padding_side = "right"

# 3. Load Model (A100 Optimized)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,        # bfloat16 is best for A100
)

# 4. Define Config with REPORT_TO FIX
sft_config = SFTConfig(
    output_dir="/content/drive/MyDrive/biology_finetune/fourth_model_double_questions_no_prompt_no_COT",
    max_length=2048,
    packing=False,
    logging_strategy="steps",
    logging_first_step=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,  # Increased to 4 (A100 handles this easily)
    gradient_accumulation_steps=2,  # Adjusted to keep effective batch size = 8
    learning_rate=2e-4,
    bf16=True,
    logging_steps=50,
    save_steps=500,
    save_total_limit=50,
    eval_strategy="steps",      # <--- REQUIRED: Turn on validation
    eval_steps=30,              # <--- REQUIRED: Test model every 30 steps
    do_eval=True,

    report_to="none",

)

# 5. Define LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

# 6. Create Trainer
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tokenizer,
    peft_config=lora_config,
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!


Tokenizing train dataset:   0%|          | 0/4714 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/4714 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/275 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/275 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


### Comparison between baseline and final checkpoin

In [None]:
EVAL_QUESTION = "Soru: Asit-baz kimyası biyokimyada neden önemlidir?\nCevap:"


In [None]:
model_name = "Qwen/Qwen2.5-1.5B"

In [None]:
from transformers import pipeline

baseline_pipe = pipeline(
    "text-generation",
    model=model_name,        # your original Qwen model
    tokenizer=tokenizer,
    max_new_tokens=200
)

print("===== BASELINE MODEL =====")
print(baseline_pipe(EVAL_QUESTION)[0]["generated_text"])


Device set to use cuda:0


===== BASELINE MODEL =====
Soru: Asit-baz kimyası biyokimyada neden önemlidir?
Cevap: Asit-baz kimyası biyokimyada önemli bir rol oynamaktadır. Bu kimyasılarda, inşaat ve construction projelerindeki biyokimya ürünlerinin üretiminde kayda değer yer alır. Ayrıca, farklı inşaat projelerini desteklemek için kullanılan kimyasılarda da asit-baz kimiyasların kullanılması önemlidir.


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel

base_model_name = "Qwen/Qwen2.5-1.5B"

# 1. Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# 2. Load BASE model (required before PEFT model)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype="auto",
    device_map="auto"
)

# 3. Path to your LoRA checkpoint
mid_ckpt = "/content/drive/MyDrive/biology_finetune/fourth_model_double_questions_no_prompt_no_COT/checkpoint-2950"

# 4. Load LoRA weights on top of the base model
mid_model = PeftModel.from_pretrained(
    base_model,
    mid_ckpt
)
mid_pipe = pipeline(
    "text-generation",
    model=mid_model,
    tokenizer=tokenizer
)
prompt = "Bilimsel yöntemin temel kavramları nelerdir?"

print(mid_pipe(prompt)[0]["generated_text"])


Device set to use cuda:0


Bilimsel yöntemin temel kavramları nelerdir? Bilimsel yöntem, doğal dünyaya yönelik soruların cevaplarını bulmak için deney ve kontrol uygulamalarına dayanık bir şekilde yaklaşır. Bu yöntem, gözlemlerden başlayarak, hipotezleri oluşturarak, deneyler yaparak ve analiz edilmiş sonuçlarla yeni bilgiyi desteklemesini sağlamaktır. Yani, gözlemle hipotezi oluşturmaya, deneyler yapmaya ve sonuçları analiz etmeyi içerir.


In [None]:
# ================================
# 1. Load Model + Tokenizer + LoRA
# ================================
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch
import json
from tqdm import tqdm

base_model_name = "Qwen/Qwen2.5-1.5B"

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype="auto",
    device_map="auto"
)

lora_ckpt = "/content/drive/MyDrive/biology_finetune/fourth_model_double_questions_no_prompt_no_COT/checkpoint-2950"

model = PeftModel.from_pretrained(base_model, lora_ckpt)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
    temperature=0.3
)


# ================================
# 2. Load Test Dataset (NDJSON FIX)
# ================================
test_path = "/content/test_ds.json"

test_data = []
with open(test_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:
            test_data.append(json.loads(line))

print("Total samples in test set:", len(test_data))

# Take only the first 50 samples
test_data = test_data[:50]


# Extract questions
questions = [item["messages"][0]["content"] for item in test_data]


# ================================
# 3. Run Model on First 50 Samples
# ================================
results = []

for q in tqdm(questions, desc="Running model on first 50 samples"):
    output = pipe(q)[0]["generated_text"]
    results.append({
        "question": q,
        "model_answer": output
    })


# ================================
# 4. Save Output
# ================================
output_file = "/content/test_predictions_50.json"

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

print("Saved predictions to:", output_file)


Device set to use cuda:0


Total samples in test set: 555


Running model on first 50 samples:  20%|██        | 10/50 [01:47<06:47, 10.18s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Running model on first 50 samples: 100%|██████████| 50/50 [08:22<00:00, 10.05s/it]

Saved predictions to: /content/test_predictions_50.json





In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel

base_model_name = "Qwen/Qwen2.5-1.5B"

# 1. Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# 2. Load BASE model (required before PEFT model)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype="auto",
    device_map="auto"
)

# 3. Path to your LoRA checkpoint
mid_ckpt = "/content/drive/MyDrive/biology_finetune/biology_lora/checkpoint-4000"

# 4. Load LoRA weights on top of the base model
mid_model = PeftModel.from_pretrained(
    base_model,
    mid_ckpt
)
mid_pipe = pipeline(
    "text-generation",
    model=mid_model,
    tokenizer=tokenizer)

prompt = "Asit-baz dengesi biyokimyada neden önemlidir? Soruya acik ve anlasilir bir sekilde, ogretici ve sonuc odakli bir cevap ver. Cevap orta uzunlukta, akici ve bilgilendirici olsun. Gereksiz ayrintilara girme."

print(mid_pipe(prompt)[0]["generated_text"])


Device set to use cuda:0
The model 'PeftModel' is not supported for text-generation. Supported models are ['PeftModelForCausalLM', 'ApertusForCausalLM', 'ArceeForCausalLM', 'AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BitNetForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'BltForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV2ForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'DogeForCausalLM', 'Dots1ForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'Ernie4_5ForCausalLM', 'Ernie4_5_MoeForCausalLM', 'Exaone4ForCausalLM', 'FalconForCausalLM', 'FalconH1ForCausalLM', 'FalconMambaForCausalLM', 'FlexOlmoForCausalLM', '

Asit-baz dengesi biyokimyada neden önemlidir? Soruya acik ve anlasilir bir sekilde, ogretici ve sonuc odakli bir cevap ver. Cevap orta uzunlukta, akici ve bilgilendirici olsun. Gereksiz ayrintilara girme. Asit-baz dengesinin biyokimyada önemini açıklarken, biyoloji bilgilerini uygun bir şekilde kullanabilir misiniz?

Asit-baz dengesinin biyokimyada önemini anlamak için, biyoteknoloji ve biyokimyada önemli olan biyoteknik alanlarında neden önemlidir hakkında daha ayrıntı alabiliriz.

1. **İnsan Genotipi**: Asit-baz dengesini anlamak için, insan üreme deneyleri için kullanılan genetik testleri yapmak önemlidir. Bu testler, insan üreme gücünün hangi genlerle başlangıçta birbirine bağlı olduğunu belirlemek için kullanılır. Bu, insanın üreme potansiyelini değerlendirmek için gerekli bilgiye sahiptir.

2. **Genetik Çeşitlilik**: İnsanlar tek bir birim olarak varırlar ve genetik çeşitlilik inanılmaz derecede yüksek. Bu, insan için oldukça kolay bir biyokimye kavuşturmak için gereksinim


In [None]:
pip install transformers accelerate sentencepiece
