# 📦 Install Dependencies

The following command installs all necessary libraries for training an instruction-tuned transformer using Unsloth, Hugging Face's PEFT, and TRL.

In [None]:
# Install dependencies
!pip install -q unsloth trl peft accelerate bitsandbytes datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.1/47.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m277.0/277.0 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m366.3/366.3 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.4/147.4 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!pip install -U transformers unsloth




# 🧠 Load Model (Unsloth + Mistral 7B 4-bit)

This snippet loads a memory-efficient 4-bit quantized version of Mistral-7B and prepares it for PEFT using LoRA.



In [None]:
# Load model
from unsloth import FastLanguageModel
from transformers import AutoTokenizer

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True
)

model = FastLanguageModel.get_peft_model(
    model,
    r=8,
    target_modules=["q_proj", "v_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=True
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.1: Fast Mistral patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/4.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.13k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Not an error, but Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Not an error, but Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.6.1 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


# Upload DS File

In [None]:
# Upload dataset file
from google.colab import files
uploaded = files.upload()

# Split the Data to Train/Validtion/Test

In [None]:
# Load and split dataset
from datasets import load_dataset
dataset = load_dataset("json", data_files="essay_grading_data_500_high_quality.jsonl", split="train")

dataset = dataset.train_test_split(test_size=0.2, seed=42)
val_test = dataset["test"].train_test_split(test_size=0.5, seed=42)
dataset = {
    "train": dataset["train"],
    "validation": val_test["train"],
    "test": val_test["test"]
}



# 🧾 Format for Instruction Tuning

This function extracts components from a unified text field (like a prompt) and formats them into an instruction-style input/output pair for supervised fine-tuning.



In [None]:
import re

def format_example(example):

    text = example['text'] if isinstance(example, dict) and 'text' in example else example

    question_match = re.search(r"Question:\s*(.*?)\n", text)
    ref_answer_match = re.search(r"Reference Answer:\s*(.*?)\n", text)
    mark_scheme_match = re.search(r"Mark Scheme:\s*(.*?)\n", text)
    student_answer_match = re.search(r"Student Answer:\s*(.*)", text)


    question = question_match.group(1).strip() if question_match else ""
    reference_answer = ref_answer_match.group(1).strip() if ref_answer_match else ""
    mark_scheme = mark_scheme_match.group(1).strip() if mark_scheme_match else ""
    student_answer = student_answer_match.group(1).strip() if student_answer_match else ""

    instruction = "Grade the following student essay and explain the rationale."
    input_text = f"""Question: {question}
    Reference Answer: {reference_answer}
    Mark Scheme: {mark_scheme}
    Student Answer: {student_answer}"""

    output_text = "Score: \nRationale: "

    return {"input": f"{instruction}\n\n{input_text}", "output": output_text}


In [None]:
def formatting_func(example):
    return [f"{example['input']}\n{example['output']}"]


In [None]:
def prepare(example):
    return {"text": formatting_func(example)[0]}

dataset["train"] = dataset["train"].map(prepare)
dataset["test"] = dataset["test"].map(prepare)
dataset["validation"] = dataset["validation"].map(prepare)



# 🏋️‍♂️ Train the Essay Grading Model

This section sets up the `SFTTrainer` from the `trl` library to fine-tune the model on the formatted essay dataset. It disables Weights & Biases tracking and uses memory-efficient optimization with 8-bit AdamW.

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    dataset_text_field="text",
    max_seq_length=2048,
    formatting_func=None,
    args=TrainingArguments(
        output_dir="./essay-grader-model",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        learning_rate=2e-4,
        logging_steps=10,
        save_strategy="epoch",
        save_total_limit=1,
        fp16=True,
        optim="paged_adamw_8bit"
    )
)
trainer.train()





# 🧪 Test Data

This script evaluates the model on a test dataset by generating a response, extracting the predicted score, and comparing it to the ground truth score.


In [None]:
from tqdm import tqdm

correct = 0
total = len(dataset["test"])

for ex in tqdm(dataset["test"]):
    prompt = ex['input'] + "\nResponse:"

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=200)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    predicted_score = None
    for line in response.split("\n"):
        if "Score:" in line:
            try:
                predicted_score = int(line.split("Score:")[1].strip())
                break
            except ValueError:
                continue

    actual_score = None
    for line in ex['output'].split("\n"):
        if "Score:" in line:
            try:
                actual_score = int(line.split("Score:")[1].strip())
                break
            except ValueError:
                continue

    if predicted_score is not None and actual_score is not None and predicted_score == actual_score:
        correct += 1

print(f"\nTest Accuracy: {correct}/{total} = {correct / total:.2%}")


# Modify the data so that we do not use regular expressions...and we also use metrics so that the evaluation is correct.

# Upload dataset file


In [None]:
# Upload dataset file
from google.colab import files
uploaded = files.upload()

Saving synthetic_essay_data_1000.jsonl to synthetic_essay_data_1000.jsonl


# Split the Data to Train/Validtion/Test

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

df = pd.read_json("synthetic_essay_data_1000.jsonl", lines=True)

if "__index_level_0__" in df.columns:
    df = df.drop(columns=["__index_level_0__"])

dataset = Dataset.from_pandas(df)

train_test = dataset.train_test_split(test_size=0.2, seed=42)
val_test = train_test['test'].train_test_split(test_size=0.5, seed=42)

dataset = DatasetDict({
    "train": train_test['train'],
    "validation": val_test['train'],
    "test": val_test['test']
})


In [None]:
!pip install -q trl


# 🛠️ Format Dataset for Supervised Fine-Tuning

This function takes a structured example (with fields like `question`, `reference_answer`, `mark_scheme`, etc.) and formats it into an instruction-style prompt/response pair. The dataset is then transformed using `.map()`.

In [None]:
def format_example(example):
    instruction = "Grade the following student essay and explain the rationale."
    input_text = (
        f"Question: {example['question']}\n"
        f"Reference Answer: {example['reference_answer']}\n"
        f"Mark Scheme: {example['mark_scheme']}\n"
        f"Student Answer: {example['student_answer']}"
    )
    output_text = f"Score: {example['score']}\nRationale: {example['rationale']}"
    return {"input": f"{instruction}\n\n{input_text}", "output": output_text}
dataset = dataset.map(format_example)


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

# 🧹 Prepare Dataset for SFTTrainer

This step converts each example into a unified text field (`text`) by concatenating the formatted instruction and response. It prepares the dataset for use with `SFTTrainer` by ensuring each example has a single `text` field.


In [None]:
def formatting_func(example):
    return [f"{example['input']}\n{example['output']}"]


In [None]:
def prepare(example):
    return {"text": formatting_func(example)[0]}

dataset["train"] = dataset["train"].map(prepare)
dataset["test"] = dataset["test"].map(prepare)
dataset["validation"] = dataset["validation"].map(prepare)


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

# 🏋️‍♂️ Train the Essay Grading Model

This section sets up the `SFTTrainer` from the `trl` library to fine-tune the model on the formatted essay dataset. It disables Weights & Biases tracking and uses memory-efficient optimization with 8-bit AdamW.

In [None]:
import os

os.environ["WANDB_DISABLED"] = "true"

from trl import SFTTrainer
from transformers import TrainingArguments


trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    dataset_text_field="input",
    max_seq_length=2048,
    formatting_func=None,
    args=TrainingArguments(
        output_dir="./essay-grader-model",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        num_train_epochs=1,
        learning_rate=2e-4,
        logging_steps=10,
        save_strategy="epoch",
        save_total_limit=1,
        fp16=False,
        optim="paged_adamw_8bit"
    )
)
trainer.train()






Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Unsloth: Tokenizing ["text"]:   0%|          | 0/800 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"]:   0%|          | 0/100 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 800 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 3,407,872/7,000,000,000 (0.05% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,1.5368
20,0.5178
30,0.2653
40,0.1919
50,0.1504
60,0.1203
70,0.1184
80,0.1049
90,0.1036
100,0.1032


TrainOutput(global_step=100, training_loss=0.3212700629234314, metrics={'train_runtime': 591.6698, 'train_samples_per_second': 1.352, 'train_steps_per_second': 0.169, 'total_flos': 5692392625274880.0, 'train_loss': 0.3212700629234314})

# 📊 Evaluate Model Performance on Test Set

This code runs the model on each test example, extracts predicted and actual scores, and calculates accuracy and mean absolute error (MAE).


In [None]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, mean_absolute_error

predicted_scores = []
actual_scores = []

for ex in tqdm(dataset["test"]):
    prompt = (
        "Grade the following student essay and explain the rationale.\n\n"
        f"Question: {ex['question']}\n"
        f"Reference Answer: {ex['reference_answer']}\n"
        f"Mark Scheme: {ex['mark_scheme']}\n"
        f"Student Answer: {ex['student_answer']}\n"
        "Response:"
    )

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=200)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    predicted_score = None
    for line in response.split("\n"):
        if "Score:" in line:
            try:
                predicted_score = int(line.split("Score:")[1].strip())
                break
            except ValueError:
                continue

    actual_score = None
    for line in ex["output"].split("\n"):
        if "Score:" in line:
            try:
                actual_score = int(line.split("Score:")[1].strip())
                break
            except ValueError:
                continue

    if predicted_score is not None and actual_score is not None:
        predicted_scores.append(predicted_score)
        actual_scores.append(actual_score)

accuracy = accuracy_score(actual_scores, predicted_scores)
mae = mean_absolute_error(actual_scores, predicted_scores)

print(f"\nAccuracy: {accuracy:.2%}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")


100%|██████████| 100/100 [19:18<00:00, 11.58s/it]


Accuracy: 56.18%
Mean Absolute Error (MAE): 0.73





In [None]:
# Upload dataset file
from google.colab import files
uploaded = files.upload()

Saving science_qa_dataset_test.csv to science_qa_dataset_test.csv


In [None]:
import pandas as pd
import re

def extract_first_complete_score_and_rationale(text):
    matches = re.findall(r"Score:\s*(\d+)\s*Rationale:(.*?)(?:\n|$)", text, re.DOTALL)
    for score, rationale in matches:
        rationale = rationale.strip()
        if rationale and rationale.lower() != "none":
            return score, rationale
    if matches:
        return matches[0][0], matches[0][1].strip()
    return "Not found", "Not found"

# قراءة الملف بعد رفعه
df = pd.read_csv('science_qa_dataset_test.csv')

# ضُم نقاط المارك سكيما لنص واحد (مع فورمات جيسون أو قريب منه)
def build_mark_scheme(row):
    marks = []
    for i in range(1, 5):  # عدّل حسب عدد الأعمدة
        col = f"mark_scheme_{i}"
        if col in row and pd.notnull(row[col]):
            marks.append(f'"{i}": "{row[col]}"')
    return "{ " + ", ".join(marks) + " }"

df['mark_scheme_combined'] = df.apply(build_mark_scheme, axis=1)

# حضّر أعمدة النتائج
model_scores = []
model_rationales = []
model_raw_outputs = []

for idx, row in df.iterrows():
    question = str(row['question'])
    reference_answer = str(row['reference_answer'])
    mark_scheme = str(row['mark_scheme_combined'])
    student_answer = str(row['student_answer'])

    prompt = (
        "Grade the following student essay. ONLY respond in this format:\n"
        "Score: <number>\n"
        "Rationale: <reason>. Please always provide a rationale, do not leave it blank.\n\n"
        f"Question: {question}\n"
        f"Reference Answer: {reference_answer}\n"
        f"Mark Scheme: {mark_scheme}\n"
        f"Student Answer: {student_answer}\n"
    )

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.3,
        top_p=0.9,
        top_k=5
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    model_raw_outputs.append(response)

    score, rationale = extract_first_complete_score_and_rationale(response)
    model_scores.append(score)
    model_rationales.append(rationale)

df['model_score'] = model_scores
df['model_rationale'] = model_rationales
df['model_raw_output'] = model_raw_outputs

# احفظ النتائج في ملف جديد
df.to_csv("science_qa_dataset_test_with_predictions.csv", index=False)
print("✅ تم حفظ النتائج في science_qa_dataset_test_with_predictions.csv")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'

✅ تم حفظ النتائج في science_qa_dataset_test_with_predictions.csv


In [None]:
from google.colab import files
files.download('science_qa_dataset_test_with_predictions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>