In [None]:
! pip install transformers[torch]==4.55.0
! pip install accelerate
! pip install datasets
! pip install trl==0.22.0
! pip install tf-keras
! pip install numpy==1.26.0
! pip install huggingface-hub==0.35.0
! pip install jinja2==3.1.0
! pip install peft
# SSH tunnel to view TensorBoard from a local browser
# ssh -L 6006:localhost:6006 ubuntu@129.159.45.31

# Start the TensorBoard UI (point --logdir at the trainer's logging_dir)
# tensorboard --logdir ./logs/gemma_1b_lora/test --host 127.0.0.1 --port 6006

# Docs: https://huggingface.co/docs/trl/en/sft_trainer (written against v0.19.0; this notebook pins trl==0.22.0, so check the matching docs version)
# https://www.datacamp.com/tutorial/fine-tuning-qwen3

In [None]:
# System message used for every example: states the task and the 1-6 rating
# rubric (each level is labelled A1 through C2).
SYSTEM_PROMPT = '''You are a language learning evaluator assessing the complexity of an English sentence given its context.

Rubric:
1 (A1) – Very basic words and phrases; simple self-introduction; minimal grammar.
2 (A2) – Simple sentences; familiar everyday expressions; limited range.
3 (B1) – Can write or speak in connected sentences about familiar topics; some errors.
4 (B2) – Generally fluent; can discuss abstract topics; good grammar control.
5 (C1) – Flexible, natural use of language; few errors; advanced vocabulary.
6 (C2) – Near-native mastery; precise, nuanced expression; fully natural flow.

Please give a rating between 1-6 following the rubric above.
'''

# Jinja2 source for the user turn, rendered with `context` and `sentence`.
# The leading blank lines and the trailing newline are part of the prompt text.
PROMPT_TEMPLATE = '''

Context: {{ context }}
Sentence: {{ sentence }}
Rating (1-6):
'''



In [None]:
import json
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd

# Columns every example needs: the label plus the two fields rendered into the prompt.
REQUIRED_COLUMNS = ["Rating", "Sentence", "Paragraph"]


def _load_split(csv_path):
    """Read one split CSV and drop rows missing any of REQUIRED_COLUMNS."""
    return pd.read_csv(csv_path).dropna(subset=REQUIRED_COLUMNS)


train_df = _load_split("readme/readme_en_train.csv")
dev_df = _load_split("readme/readme_en_val.csv")
test_df = _load_split("readme/readme_en_test.csv")

# preserve_index=False: once dropna() has removed rows the pandas index is no
# longer a plain range, and from_pandas would otherwise carry it into the
# dataset as an extra `__index_level_0__` column.
train = Dataset.from_pandas(train_df, preserve_index=False)
dev = Dataset.from_pandas(dev_df, preserve_index=False)
test = Dataset.from_pandas(test_df, preserve_index=False)

dataset = DatasetDict({
    "train": train,
    "validation": dev,
    "test": test
})
print(dataset)


In [None]:
import transformers
from jinja2 import Template

# Hugging Face Hub id of the checkpoint whose tokenizer is loaded here.
model_id = "google/gemma-3-1b-it"

# Tokenizer belonging to `model_id`.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Compile the user-turn template once so it can be rendered for each example.
JINJA_PROMPT_TEMPLATE = Template(PROMPT_TEMPLATE)

def preprocess(example):
    """Turn one dataset row into a three-turn chat example.

    Args:
        example: Mapping with 'Paragraph', 'Sentence' and 'Rating' entries.

    Returns:
        dict: ``{"messages": [...]}`` holding the system turn (SYSTEM_PROMPT),
        the user turn (PROMPT_TEMPLATE rendered with the row's paragraph and
        sentence) and the assistant turn ``"The score is <Rating>"``.
    """
    user_turn = JINJA_PROMPT_TEMPLATE.render(
        context=example['Paragraph'],
        sentence=example['Sentence']
    )
    # Only the raw messages are returned; no chat template is applied here, so
    # the tokenizer is not invoked once per row for a result nobody reads.
    # NOTE(review): Rating is interpolated as-is, so a float column yields
    # "The score is 3.0" — confirm that is the intended target text.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_turn},
        {"role": "assistant", "content": f"The score is {example['Rating']}"}
    ]
    return {"messages": messages}


# Add the "messages" column to every split.
dataset = dataset.map(preprocess)

# Sanity check: show the conversation built for the first training row.
first_train_rows = iter(dataset["train"])
item = next(first_train_rows)
print(item['messages'])

In [None]:
from accelerate import notebook_launcher
from trl import SFTTrainer, SFTConfig

import transformers
import torch
import os, shutil

from peft import LoraConfig

# LoRA settings: rank 16, alpha 32, dropout 0.05, no bias training,
# target_modules="all-linear". This object has no effect unless it is passed
# to the trainer.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    # Save lm_head and embed_tokens too, because the special tokens are trained.
    modules_to_save=["lm_head", "embed_tokens"],
)

from trl import SFTTrainer, SFTConfig
from transformers import AutoModelForCausalLM, AutoTokenizer


# Load the model to be trained: bfloat16 weights, automatic device placement,
# "eager" attention implementation.
model_load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="eager",
)
model = transformers.AutoModelForCausalLM.from_pretrained(model_id, **model_load_kwargs)


# Start from a clean output directory. The existence check keeps a first run
# (when the directory has not been created yet) from raising FileNotFoundError,
# while real deletion failures still surface.
if os.path.isdir("models/readme/gemma_1b_lora/test"):
    shutil.rmtree("models/readme/gemma_1b_lora/test")

def train(lora_config=None):
    """Build the SFTTrainer for this run; the caller starts training.

    Reads the module-level `model`, `tokenizer` and `dataset`
    (its "train" and "validation" splits).

    Args:
        lora_config: Optional PEFT configuration forwarded to SFTTrainer as
            `peft_config`. With the default None no PEFT config is supplied,
            which is what calling `train()` has always done.

    Returns:
        SFTTrainer: the configured trainer; call `.train()` on it to run.
    """
    torch_dtype = model.dtype

    args = SFTConfig(
        output_dir="models/readme/gemma_1b_lora/test",  # checkpoints are written here
        max_length=512,                        # max sequence length
        packing=False,                         # do not group several samples into one sequence
        num_train_epochs=10,                   # number of training epochs
        per_device_train_batch_size=16,        # batch size per device during training
        per_device_eval_batch_size=16,         # batch size per device during evaluation
        gradient_checkpointing=False,          # original note: caching is incompatible with gradient checkpointing
        optim="adamw_torch_fused",             # fused AdamW optimizer
        logging_steps=10,                      # log every 10 steps
        save_strategy="epoch",                 # save a checkpoint every epoch
        eval_strategy="epoch",                 # evaluate every epoch
        learning_rate=1e-05,                   # learning rate
        # Mixed-precision flags follow the dtype the model was loaded in.
        fp16=(torch_dtype == torch.float16),
        bf16=(torch_dtype == torch.bfloat16),
        warmup_ratio=0.05,                     # warm-up fraction of training
        lr_scheduler_type="linear",            # linear learning-rate schedule
        push_to_hub=False,                     # keep the model local
        report_to="tensorboard",               # or "wandb", "comet_ml", etc.
        logging_dir="./logs/gemma_1b_lora/test",
        # dataset_text_field="text",
        load_best_model_at_end=True,
        save_total_limit=1,
        # NOTE(review): confirm the pinned TRL version still honours these keys.
        dataset_kwargs={
            "add_special_tokens": False,  # Template with special tokens
            "append_concat_token": True,  # Add EOS token as separator token between examples
        },
    )

    trainer = SFTTrainer(
        model,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        args=args,
        processing_class=tokenizer,
        peft_config=lora_config,
        # compute_metrics=compute_metrics
    )

    return trainer


# Multi-GPU alternative (disabled): spawn the training processes directly from Jupyter.
# NOTE(review): train() only builds and returns the trainer, so a launched
# function would also have to call .train() itself.
# notebook_launcher(train, num_processes=2)
# Single-process run: build the trainer, then start training.
trainer = train()
trainer.train()

In [None]:
from peft import PeftModel
import transformers

model_id = "google/gemma-3-1b-it"

# Adapter checkpoint to merge, and the directory that receives the merged model.
# NOTE(review): this adapter path differs from the trainer's output_dir
# ("models/readme/gemma_1b_lora/test") — confirm it points at the intended run.
adapter_dir = "models/readme/gemma/lora/checkpoint-141"
merged_dir = "models/readme/gemma/lora/merged"

# Load the base model.
base_model = transformers.AutoModelForCausalLM.from_pretrained(model_id)

# Attach the LoRA adapter, fold it into the base weights, and save the result.
peft_model = PeftModel.from_pretrained(base_model, adapter_dir)
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained(merged_dir)

# Save the tokenizer into the same directory as the merged weights (it used to
# go to a nested "merged/merged" folder), so the directory can be loaded as one
# unit. AutoTokenizer is reached through `transformers` so this cell does not
# depend on an import made in another cell.
processor = transformers.AutoTokenizer.from_pretrained(model_id)
processor.save_pretrained(merged_dir)

model = merged_model


In [None]:
! export CUDA_LAUNCH_BLOCKING=1
import torch
from tqdm import tqdm
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer


# Checkpoint under evaluation; the model and tokenizer are read from the same directory.
eval_checkpoint = "models/readme/gemma/test/checkpoint-141"

model = AutoModelForCausalLM.from_pretrained(
    eval_checkpoint,
    attn_implementation="flash_attention_2",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(eval_checkpoint)

# Text-generation pipeline wrapping the loaded model and tokenizer.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

def build_prompt(messages):
    """Render every turn except the last as an untokenized prompt string.

    The last message is dropped and the generation prompt is appended.
    """
    turns_without_answer = messages[:-1]
    chat_tokenizer = pipe.tokenizer
    return chat_tokenizer.apply_chat_template(
        turns_without_answer,
        add_generation_prompt=True,
        tokenize=False,
    )

def extract_completion(s: str) -> str:
    """Return the text after the first "The score is" marker, stripped.

    When the marker is absent the whole input is returned, stripped.
    """
    _, marker, tail = s.partition("The score is")
    remainder = tail if marker else s
    return remainder.strip()

test_ds = dataset["test"]

# Precompute, in test-set order, each example's prompt (conversation without
# the final turn) and its gold label (extracted from the final turn).
prompt_gold_pairs = [
    (build_prompt(turns), extract_completion(turns[-1]["content"]))
    for turns in (row["messages"] for row in test_ds)
]
prompts = [prompt for prompt, _ in prompt_gold_pairs]
golds = [gold for _, gold in prompt_gold_pairs]

# Choose a batch size based on GPU RAM
BATCH_SIZE = 1

# Generation stops on the tokenizer's EOS token or on "<end_of_turn>".
# Built once: it does not change between batches.
stop_token_ids = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<end_of_turn>"),
]

preds = []
for i in tqdm(range(0, len(prompts), BATCH_SIZE), desc="Evaluating"):
    batch_prompts = prompts[i:i+BATCH_SIZE]

    outputs = pipe(
        batch_prompts,
        max_new_tokens=64,
        do_sample=True,         # set False for deterministic accuracy
        temperature=0.1,
        top_k=10,
        top_p=0.2,
        eos_token_id=stop_token_ids,
        disable_compile=True,
        batch_size=BATCH_SIZE,
        # The prompts were already rendered by the chat template, which inserts
        # the special tokens itself (the trainer config notes "Template with
        # special tokens"); do not let the pipeline add them a second time.
        add_special_tokens=False,
    )

    for prompt, out in zip(batch_prompts, outputs):
        # generated_text contains the prompt followed by the continuation;
        # keep only the continuation before extracting the score.
        gen = out[0]["generated_text"]
        pred = extract_completion(gen[len(prompt):].strip())
        preds.append(pred)

# Compute accuracy: exact string match between predicted and gold completions.
acc = sum(p == g for p, g in zip(preds, golds))
if golds:
    print("Accuracy:", acc * 100.0 / len(golds))
else:
    # An empty test split would otherwise end in a ZeroDivisionError.
    print("Accuracy: n/a (no test examples)")

In [None]:
'''
Test accuracy (%) per run:
Gemma 270m (full finetune - lr 1e-05) - 54.1
Gemma 270m (full finetune - lr 1e-05 - max grad norm 1.0) - 54.1
Gemma 270m (full finetune - lr 5e-05 - max grad norm 1.0) - 48.3
Gemma 270m (full finetune - lr 5e-06 - max grad norm 1.0) - Stopped the run because learning was too slow.
Gemma 270m (lora - lr 1e-04 - rank and alpha 16) - 51.35
Gemma 270m (lora - lr 5e-04 - rank and alpha 16) - 49.3
Gemma 270m (lora - lr 5e-05 - rank and alpha 16) - 45.6

Gemma 1b (lora - rank 16, alpha 32, lr 1e-04, constant lr) - 50.68
Gemma 1b (lora - rank 32, alpha 64, lr 1e-04, warmup linear) - 51.35
Gemma 1b (lora - rank 16, alpha 32, lr 1e-04, warmup linear) - 52.03
Gemma 1b (full finetune - lr 1e-05, warmup linear) - 54.39

Open question: why did gradient checkpointing have to be set to False?
'''

In [None]:
# import numpy as np

# def compute_metrics(eval_pred):

#     def extract_completion(s: str) -> str:
#         if "The score is" in s:
#             return s.split("The score is", 1)[1].strip()
#         return s.strip()

#     # eval_pred is usually an EvalPrediction
#     logits = eval_pred.predictions
#     labels = eval_pred.label_ids

#     # Handle logits shape flexibly
#     logits = np.array(logits)
#     labels = np.array(labels)

#     # If logits is (B, T, V) use argmax along -1:
#     if logits.ndim == 3:
#         preds = np.argmax(logits, axis=-1)

#     # Number of examples in the batch
#     b = labels.shape[0]

#     accuracy = []
#     for i in range(b):
#         gold_ids = labels[i]
#         pred_ids = preds[i]

#         # ignore masked positions (-100)
#         gold_ids = gold_ids[gold_ids != -100]
#         pred_ids = pred_ids[pred_ids != -100]

#         gold_str = tokenizer.decode(gold_ids, skip_special_tokens=True)
#         pred_str = tokenizer.decode(pred_ids, skip_special_tokens=True)

#         gold_completion = int(float(extract_completion(gold_str)))
#         pred_completion = 0.0
#         try:
#             pred_completion = int(float(extract_completion(pred_str)))
#         except:
#             pred_completion = 0.0

#         accuracy.append(gold_completion == pred_completion)

#     # IMPORTANT: must return a dict of numbers
#     return {"accuracy": sum(accuracy) * 1.0 / len(accuracy)}
