# **Project Setup**

In [None]:
!pip install -q evaluate rouge-score sacrebleu

In [None]:
# !pip uninstall -y unsloth
# !pip install --no-cache-dir "unsloth>=2024.5.0"

In [None]:
%%capture
import os, importlib.util
!pip install --upgrade -qqq uv
if importlib.util.find_spec("torch") is None or "COLAB_" in "".join(os.environ.keys()):
    try: import numpy, PIL; get_numpy = f"numpy=={numpy.__version__}"; get_pil = f"pillow=={PIL.__version__}"
    except: get_numpy = "numpy"; get_pil = "pillow"
    !uv pip install -qqq \
        "torch>=2.8.0" "triton>=3.4.0" {get_numpy} {get_pil} torchvision bitsandbytes "transformers==4.56.2" \
        "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" \
        "unsloth[base] @ git+https://github.com/unslothai/unsloth" \
        git+https://github.com/triton-lang/triton.git@05b2c186c1b6c9a08375389d5efe9cb4c401c075#subdirectory=python/triton_kernels
elif importlib.util.find_spec("unsloth") is None:
    !uv pip install -qqq unsloth
!uv pip install --upgrade --no-deps transformers==4.56.2 tokenizers trl==0.22.2 unsloth unsloth_zoo

In [None]:
import unsloth
import os
import math
import json
import torch
import shutil
import random
import evaluate
import numpy as np
import pandas as pd

from peft import PeftModel
from collections import Counter
from sklearn.cluster import KMeans
from typing import List, Dict, Tuple
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from unsloth import is_bfloat16_supported
from transformers import EarlyStoppingCallback
from safetensors.torch import load_file, save_file
from sentence_transformers import SentenceTransformer
from datasets import load_dataset, DatasetDict, Dataset, load_from_disk

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True             # auto-tuning conv/cuDNN
try:
    torch.set_float32_matmul_precision("high")    # PyTorch >= 2.0
except AttributeError:
    pass

# **Data Loader**

## Indonesian Dataset

In [None]:
ds_id = load_dataset("indonlp/cendol_collection_v2")

In [None]:
print(ds_id)

## English Dataset

In [None]:
ds_en = load_dataset("open-orca/OpenOrca")

In [None]:
print(ds_en)

# **Data Preprocess**

## Data Clustering and Sampling

**1. Indonesian Dataset**

In [None]:
dataset_id = load_dataset("lumicero/indonesian_60k")
print(dataset_id)

In [None]:
datasets_id = dataset_id.filter(lambda ex: (ex["subset_name"] not in ["indo_puisi", "dolly"] and ex["output"] is not None))

In [None]:
texts = [text for text in datasets_id['train']['output'] if text is not None]
print(texts[0])

In [None]:
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
embeddings_id = model.encode(texts, show_progress_bar=True, batch_size=32)

display(embeddings_id.shape)

In [None]:
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels_id = kmeans.fit_predict(embeddings_id)

cluster_distribution = Counter(cluster_labels_id)
print("Cluster size:")
display(cluster_distribution)

In [None]:
# Partition the Indonesian sentence embeddings into `n_clusters` groups with
# K-Means (fixed random_state) and show how many texts land in each cluster.
# NOTE(review): this cell repeats the clustering cell directly above it
# line for line; it refits K-Means on the same embeddings with the same
# random_state — confirm whether one of the two cells can be dropped.
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels_id = kmeans.fit_predict(embeddings_id)

cluster_distribution = Counter(cluster_labels_id)
print("Cluster size:")
display(cluster_distribution)

In [None]:
import numpy as np

# Seeded generator: with the unseeded global RNG every kernel restart drew a
# different train/test split, so downstream numbers could not be reproduced.
SAMPLING_SEED_ID = 42
rng_id = np.random.default_rng(SAMPLING_SEED_ID)

# --- 0. Row mapping ---
# The embeddings and `cluster_labels_id` were computed on texts taken from the
# FILTERED dataset `datasets_id` (subsets "indo_puisi"/"dolly" and None outputs
# removed). Position k of `cluster_labels_id` is therefore the k-th non-None
# row of `datasets_id['train']`. Mapping through the unfiltered
# `dataset_id['train']` would pick rows that do not belong to the sampled
# clusters and would re-introduce the excluded subsets.
source_split_id = datasets_id['train']
original_indices = [
    i for i, text in enumerate(source_split_id['output'])
    if text is not None
]
assert len(original_indices) == len(cluster_labels_id), (
    "cluster_labels_id is out of sync with datasets_id['train']; "
    "re-run the embedding and clustering cells."
)
n_points = len(cluster_labels_id)

# --- 1. Sample TEST rows per cluster (up to 300 per cluster) ---
test_per_cluster = 300
test_indices = []

for cluster_id in range(n_clusters):
    # All positions (into `cluster_labels_id`) that belong to this cluster.
    cluster_indices = np.where(cluster_labels_id == cluster_id)[0]

    # Take at most 300, or everything available when the cluster is smaller.
    n_test = min(test_per_cluster, len(cluster_indices))

    if n_test > 0:
        test_cluster_indices = rng_id.choice(
            cluster_indices,
            size=n_test,
            replace=False
        )
        test_indices.extend(test_cluster_indices.tolist())

# De-duplicate and sort for a stable order.
test_indices = sorted(set(test_indices))

print(f"Total test indices: {len(test_indices)}")

# --- 2. Sample TRAIN rows per cluster, never touching the test rows ---

total_desired_train = 10000
samples_per_cluster_train = total_desired_train // n_clusters
train_indices = []

# Explicit integer dtype so an empty test set does not turn the set
# difference below into a float array.
test_index_array = np.array(test_indices, dtype=np.int64)

for cluster_id in range(n_clusters):
    cluster_indices = np.where(cluster_labels_id == cluster_id)[0]

    # Drop the positions already reserved for the test split.
    remaining_cluster_indices = np.setdiff1d(
        cluster_indices,
        test_index_array,
        assume_unique=False
    )

    n_train = min(len(remaining_cluster_indices), samples_per_cluster_train)

    if n_train > 0:
        sampled_cluster_indices = rng_id.choice(
            remaining_cluster_indices,
            size=n_train,
            replace=False
        )
        train_indices.extend(sampled_cluster_indices.tolist())

# If small clusters left us short of the target, top up from whatever is unused.
if len(train_indices) < total_desired_train:
    remaining = total_desired_train - len(train_indices)
    print(f"Train short by {remaining} samples, attempting to fill from larger clusters...")

    # Positions already used by train or test must not be drawn again.
    used_indices = set(train_indices) | set(test_indices)
    available_indices = [i for i in range(n_points) if i not in used_indices]

    if available_indices:
        additional_samples = rng_id.choice(
            available_indices,
            size=min(remaining, len(available_indices)),
            replace=False
        )
        train_indices.extend(additional_samples.tolist())

# Make sure the two splits are disjoint.
train_indices = sorted(set(train_indices))
test_indices = sorted(set(test_indices))

intersection = set(train_indices) & set(test_indices)
assert len(intersection) == 0, "Ada overlap antara train dan test!"

print(f"Final number of train samples: {len(train_indices)}")
print(f"Final number of test samples : {len(test_indices)}")

# --- 3. Map cluster positions back to row indices of the filtered dataset ---

train_final_indices = [original_indices[i] for i in train_indices]
test_final_indices  = [original_indices[i] for i in test_indices]

# --- 4. Build the final, disjoint train and test datasets ---

sampled_train_dataset_id = source_split_id.select(train_final_indices)
sampled_test_dataset_id  = source_split_id.select(test_final_indices)

print(f"\nFinal train dataset size: {len(sampled_train_dataset_id)}")
print(f"Final test dataset size : {len(sampled_test_dataset_id)}")

In [None]:
# # Adjust sampling to get closer to 20k
# total_desired = 10000
# samples_per_cluster = total_desired // n_clusters
# sampled_indices = []

# for cluster_id in range(n_clusters):
#     cluster_indices = np.where(cluster_labels_id == cluster_id)[0]
#     n_samples = min(len(cluster_indices), samples_per_cluster)

#     if n_samples > 0:
#         sampled_cluster_indices = np.random.choice(
#             cluster_indices,
#             size=n_samples,
#             replace=False
#         )
#         sampled_indices.extend(sampled_cluster_indices)

# # If we're still short, sample more from larger clusters
# if len(sampled_indices) < total_desired:
#     remaining = total_desired - len(sampled_indices)
#     print(f"Short by {remaining} samples, attempting to fill from larger clusters...")

#     # Get indices not yet sampled from all clusters
#     used_indices = set(sampled_indices)
#     available_indices = [i for i in range(len(texts)) if i not in used_indices]

#     if available_indices:
#         additional_samples = np.random.choice(
#             available_indices,
#             size=min(remaining, len(available_indices)),
#             replace=False
#         )
#         sampled_indices.extend(additional_samples)

# # Map back to original dataset indices
# original_indices = [i for i, text in enumerate(dataset_id['train']['output']) if text is not None]
# final_indices = [original_indices[i] for i in sampled_indices]

# # Create final sampled dataset
# sampled_dataset_id = dataset_id['train'].select(final_indices)

# print(f"\nFinal number of samples: {len(sampled_dataset_id)}")

**2. English Dataset**

In [None]:
dataset_en = load_dataset("lumicero/english_60k")
print(dataset_en)

In [None]:
datasets_en = dataset_en.filter(lambda ex: ex["response"] is not None)

In [None]:
texts = [text for text in datasets_en['train']['response'] if text is not None]
print(texts[0])

In [None]:
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
embeddings_en = model.encode(texts, show_progress_bar=True, batch_size=32)

display(embeddings_en.shape)

In [None]:
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels_en = kmeans.fit_predict(embeddings_en)

cluster_distribution = Counter(cluster_labels_en)
print("Cluster size:")
display(cluster_distribution)

In [None]:
# Draw a cluster-balanced sample of `total_desired` English rows.
# A seeded generator keeps the sample reproducible across kernel restarts
# (the unseeded global RNG produced a different subset on every run).
SAMPLING_SEED_EN = 42
rng_en = np.random.default_rng(SAMPLING_SEED_EN)

total_desired = 10000
samples_per_cluster = total_desired // n_clusters
sampled_indices = []

for cluster_id in range(n_clusters):
    # Positions (into `cluster_labels_en`) that belong to this cluster.
    cluster_indices = np.where(cluster_labels_en == cluster_id)[0]
    n_samples = min(len(cluster_indices), samples_per_cluster)

    if n_samples > 0:
        sampled_cluster_indices = rng_en.choice(
            cluster_indices,
            size=n_samples,
            replace=False
        )
        sampled_indices.extend(sampled_cluster_indices.tolist())

# If small clusters left us short of the target, top up from the unused rows.
if len(sampled_indices) < total_desired:
    remaining = total_desired - len(sampled_indices)
    print(f"Short by {remaining} samples, attempting to fill from larger clusters...")

    # Size the candidate pool from the label array itself rather than the
    # reused global `texts`, which other cells overwrite.
    used_indices = set(sampled_indices)
    available_indices = [
        i for i in range(len(cluster_labels_en)) if i not in used_indices
    ]

    if available_indices:
        additional_samples = rng_en.choice(
            available_indices,
            size=min(remaining, len(available_indices)),
            replace=False
        )
        sampled_indices.extend(additional_samples.tolist())

# Map cluster positions back to row indices of the original dataset
# (positions count only rows whose response is not None).
original_indices = [i for i, text in enumerate(dataset_en['train']['response']) if text is not None]
final_indices = [original_indices[i] for i in sampled_indices]

# Create final sampled dataset
sampled_dataset_en = dataset_en['train'].select(final_indices)

print(f"\nFinal number of samples: {len(sampled_dataset_en)}")

**3. Combine all sampled data**

In [None]:
train_datasets_id = DatasetDict({
    "text": sampled_train_dataset_id
})

data_test_id = DatasetDict({
    "test": sampled_test_dataset_id
})

train_datasets_en = DatasetDict({
    "text": sampled_dataset_en
})

print(train_datasets_id)
print(data_test_id)
print(train_datasets_en)

# **Data Formatting**

In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct-unsloth-bnb-4bit",
    max_seq_length = 2048,    # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory
    load_in_8bit = False,     # A bit more accurate, uses 2x memory
    full_finetuning = False,  # We have full finetuning now!
)

**1. Indonesian Dataset**

In [None]:
print(train_datasets_id)

In [None]:
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_id(examples):
    """Render a batch of Indonesian prompt/answer pairs as chat-template text.

    Args:
        examples: Batched mapping with parallel "input" and "output" sequences.

    Returns:
        dict: ``{"text": [...]}`` holding one rendered conversation per pair,
        each with ``EOS_TOKEN`` appended.
    """

    def render(user_text, assistant_text):
        # One system / user / assistant exchange per training example.
        conversation = [
            {"role": "system",    "content": "Kamu adalah asisten yang ahli dalam percakapan bahasa Indonesia."},
            {"role": "user",      "content": user_text},
            {"role": "assistant", "content": assistant_text},
        ]
        rendered = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )
        return rendered + EOS_TOKEN

    prompts = examples["input"]
    answers = examples["output"]
    return {"text": [render(q, a) for q, a in zip(prompts, answers)]}

In [None]:
conversations_id = train_datasets_id.map(formatting_prompts_id, batched=True)

display(conversations_id)

In [None]:
# Preview only the first few formatted conversations; printing the whole
# column floods the cell output with every training example.
print(conversations_id["text"][:3]["text"])

In [None]:
train_id = Dataset.from_dict({
    "text": conversations_id["text"]["text"]
})

print(train_id)

In [None]:
display(train_id["text"][0])
print(len(train_id["text"]))

**2. English Dataset**

In [None]:
print(train_datasets_en)

In [None]:
EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_en(examples):
    """Render a batch of English question/response pairs as chat-template text.

    Args:
        examples: Batched mapping with parallel "question" and "response"
            sequences.

    Returns:
        dict: ``{"text": [...]}`` holding one rendered conversation per pair,
        each with ``EOS_TOKEN`` appended.
    """

    def render(user_text, assistant_text):
        # One system / user / assistant exchange per training example.
        conversation = [
            {"role": "system",    "content": "You are an assistant who is skilled in English conversation."},
            {"role": "user",      "content": user_text},
            {"role": "assistant", "content": assistant_text},
        ]
        rendered = tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=False,
        )
        return rendered + EOS_TOKEN

    questions = examples["question"]
    responses = examples["response"]
    return {"text": [render(q, a) for q, a in zip(questions, responses)]}

In [None]:
conversations_en = train_datasets_en.map(formatting_prompts_en, batched=True)

display(conversations_en)

In [None]:
train_en = Dataset.from_dict({
    "text": conversations_en["text"]["text"]
})

print(train_en)

In [None]:
display(train_en["text"][0])
print(len(train_en["text"]))

# **Training**

In [None]:
# Wrap the loaded base model with LoRA adapters (rank 16) on the listed
# projection layers; `model` is rebound to the adapter-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

In [None]:
!pip install -q wandb

import wandb
wandb.login()

In [None]:
os.environ["WANDB_PROJECT"] = "fp-kk"

In [None]:
split = train_en.train_test_split(test_size=0.01, seed=3407)

train_dataset = split["train"]
eval_dataset  = split["test"]

print("Train and Eval Size:", len(train_dataset), len(eval_dataset))
print(train_dataset["text"][0])
print(eval_dataset["text"][0])

In [None]:
FastLanguageModel.for_training(model)

In [None]:
# Training configuration for the English LoRA run ("lora-en").
# Per-device effective batch = per_device_train_batch_size (16)
# x gradient_accumulation_steps (4) = 64 sequences per optimizer step.
cfg = SFTConfig(
    dataset_text_field          = "text",
    max_seq_length              = 2048,
    packing                     = False,
    dataset_num_proc            = 2,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size  = 2,
    gradient_accumulation_steps = 4,
    warmup_steps                = 10,
    num_train_epochs            = 1.5,
    learning_rate               = 2e-4,
    optim                       = "adamw_8bit",
    weight_decay                = 0.01,
    lr_scheduler_type           = "cosine",
    # Exactly one of fp16 / bf16 is enabled, chosen by hardware support.
    fp16                        = not is_bfloat16_supported(),
    bf16                        = is_bfloat16_supported(),
    logging_steps               = 1,
    # Evaluate on `eval_dataset` every 10 optimizer steps.
    eval_strategy               = "steps",
    eval_steps                  = 10,
    # save_strategy "no": the adapter is saved explicitly in a later cell.
    save_strategy               = "no",
    output_dir                  = "./outputs",
    report_to                   = "wandb",
    run_name                    = "lora-en",
    seed                        = 3407,
)

# Supervised fine-tuning trainer over the train/eval split of the English data.
trainer = SFTTrainer(
    model         = model,
    tokenizer     = tokenizer,
    train_dataset = train_dataset,
    eval_dataset  = eval_dataset,
    args          = cfg,
)

In [None]:
# @markdown Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
trainer_stats = trainer.train()

In [None]:
#@markdown Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

In [None]:
from pathlib import Path

final_save_dir = "/content/drive/MyDrive/College/KK/Final Project/model/lora-en"
Path(final_save_dir).mkdir(parents=True, exist_ok=True)

model.save_pretrained(final_save_dir)
tokenizer.save_pretrained(final_save_dir)

print("Model has been saved in:", final_save_dir)

# **Scenario and Evaluation**

In [None]:
# eval_id = load_dataset("lumicero/indonesian_60k")
eval_id = data_test_id
eval_en = load_dataset("lumicero/english_60k")
print(eval_id)
print(eval_en)

In [None]:
eval_id_clean = eval_id.filter(lambda ex: ex["output"] is not None)
eval_en_clean = eval_en.filter(lambda ex: ex["response"] is not None)

In [None]:
from collections import defaultdict
from datasets import Dataset, DatasetDict

def count_missing_values_any(ds, cols, name="dataset"):
    """Count and report missing values per column.

    A value counts as missing when it is ``None`` or a string that is empty
    after stripping whitespace. A per-column report is printed for every
    dataset that is inspected.

    Args:
        ds: A ``Dataset`` or a ``DatasetDict``.
        cols: Column names to inspect.
        name: Label used in the printed report headers.

    Returns:
        For a ``Dataset``: dict mapping each column in ``cols`` to its number
        of missing values. For a ``DatasetDict``: dict mapping each split name
        to such a per-column dict.

    Raises:
        TypeError: If ``ds`` is neither a ``Dataset`` nor a ``DatasetDict``.
    """

    def _count_on_dataset(dset, tag):
        # Count missing cells per column for one dataset and print the report.
        missing_counts = {col: 0 for col in cols}
        total = dset.num_rows

        for ex in dset:   # each `ex` is one row as a dict
            for col in cols:
                v = ex.get(col, None)
                # Blank / whitespace-only strings are treated as missing too.
                if v is None or (isinstance(v, str) and v.strip() == ""):
                    missing_counts[col] += 1

        print(f"\n=== {tag} ===")
        print(f"Total rows: {total}")
        for col in cols:
            n = missing_counts[col]
            # An empty split (e.g. after filtering) must not divide by zero.
            share = n / total if total else 0.0
            print(f"- {col}: {n} missing ({share:.4%})")

        return dict(missing_counts)

    # DatasetDict: report each split separately.
    if isinstance(ds, DatasetDict):
        results = {}
        for split_name, dset in ds.items():
            tag = f"{name}[{split_name}]"
            results[split_name] = _count_on_dataset(dset, tag)
        return results

    # Plain Dataset.
    elif isinstance(ds, Dataset):
        return _count_on_dataset(ds, name)

    else:
        raise TypeError(f"Object type {type(ds)} tidak didukung. Harus Dataset atau DatasetDict.")

# Dataset Indonesia (eval_id_clean) – kolom: input, output
cols_id = ["input", "output"]
missing_id = count_missing_values_any(eval_id_clean, cols_id, name="eval_id_clean")

# Dataset Inggris (eval_en_clean) – kolom: question, response
cols_en = ["question", "response"]
missing_en = count_missing_values_any(eval_en_clean, cols_en, name="eval_en_clean")

In [None]:
# Print Indonesian test samples for manual inspection. The loop is capped at
# 3000 rows but also bounded by the split length, so a smaller test split no
# longer raises IndexError.
preview_split_id = eval_id_clean["test"]
for i in range(min(3000, len(preview_split_id))):
    sample = preview_split_id[i]  # fetch the row once instead of once per field
    print("=== SAMPLE", i, "===")
    print("Q_ID:", sample["input"])
    print("GT  :", sample["output"])
    # print("PRED:", results_detail["predictions"][i])
    print()

## Test Model

In [None]:
from sacrebleu.metrics import BLEU
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
import random
import transformers
import os
from unsloth import FastLanguageModel
from rouge_score import rouge_scorer   # <-- NEW

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def set_seed(seed):
    """Seed Python, NumPy, PyTorch (CPU and, when present, CUDA) and transformers.

    Args:
        seed: Integer seed handed to every random number generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    transformers.set_seed(seed)

# System prompt per bahasa
SYSTEM_PROMPT_ID = (
    "Anda adalah asisten AI yang membantu pengguna dalam bahasa Indonesia. "
)

SYSTEM_PROMPT_EN = (
    "You are a helpful AI assistant that communicates in English. "
)

MAX_SEQ_LEN      = 1024   # sama dengan di FastLanguageModel.from_pretrained
MAX_NEW_TOKENS   = 128
MAX_INPUT_LENGTH = MAX_SEQ_LEN - MAX_NEW_TOKENS  # supaya total <= 1024

def make_collate_fn(tokenizer, system_prompt, schema):
    """Build a DataLoader collate function producing left-padded prompt batches.

    Args:
        tokenizer: Tokenizer providing ``apply_chat_template`` and
            ``pad_token_id``.
        system_prompt: Text of the system message placed before every prompt.
        schema: ``"id"`` (columns input/output) or ``"en"`` (columns
            question/response).

    Returns:
        A ``collate_fn(batch)`` returning a dict with ``input_ids`` and
        ``attention_mask`` (both ``(batch, max_len)`` long tensors, padded on
        the left) and ``references`` (list of reference answers).
    """
    # (prompt column, reference column) for each supported dataset schema.
    columns_by_schema = {
        "id": ("input", "output"),
        "en": ("question", "response"),
    }

    def collate_fn(batch):
        prompt_tensors = []
        references = []

        for item in batch:
            if schema not in columns_by_schema:
                raise ValueError(f"Unknown schema: {schema}")
            prompt_col, reference_col = columns_by_schema[schema]
            input_text = item[prompt_col]
            reference = item[reference_col]

            # Tokenize each sample on its own, without batch padding.
            # NOTE(review): truncation is applied to the already-rendered
            # prompt, so an over-long input may lose its trailing generation
            # prompt — confirm the tokenizer's truncation side.
            ids = tokenizer.apply_chat_template(
                [
                    {"role": "system", "content": system_prompt},
                    {"role": "user",   "content": input_text},
                ],
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt",
                truncation=True,
                max_length=MAX_INPUT_LENGTH,
            )[0]  # shape: (seq_len,)

            prompt_tensors.append(ids)
            references.append(reference)

        # Longest prompt in the batch decides the padded width.
        widest = max(t.shape[0] for t in prompt_tensors)
        pad_id = tokenizer.pad_token_id

        # Manual LEFT padding: pad tokens first, real tokens flush right.
        padded_rows = []
        mask_rows = []
        for ids in prompt_tensors:
            n_tokens = ids.shape[0]
            n_pad = widest - n_tokens
            padded_rows.append(torch.cat([
                torch.full((n_pad,), pad_id, dtype=torch.long),
                ids.to(torch.long),
            ]))
            mask_rows.append(torch.cat([
                torch.zeros(n_pad, dtype=torch.long),
                torch.ones(n_tokens, dtype=torch.long),
            ]))

        return {
            "input_ids": torch.stack(padded_rows),
            "attention_mask": torch.stack(mask_rows),
            "references": references,
        }

    return collate_fn

def evaluate_model(model, tokenizer, eval_dataset, system_prompt,
                   batch_size=64, max_allowed_length=24, schema="en"):
    """Generate greedy answers for a dataset and score them with BLEU and ROUGE.

    Args:
        model: Causal LM exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer matching ``model``; its ``pad_token_id`` is used
            for padding during generation.
        eval_dataset: Rows to evaluate; columns must match ``schema``.
        system_prompt: System message placed before every user prompt.
        batch_size: Number of prompts generated per batch.
        max_allowed_length: Maximum number of new tokens generated per prompt.
        schema: ``"id"`` or ``"en"``, forwarded to ``make_collate_fn``.

    Returns:
        dict with ``bleu`` (corpus BLEU score), ``rouge1`` / ``rouge2`` /
        ``rougeL`` (mean F-measure x 100), ``predictions`` (list of decoded
        strings) and ``references`` (one single-item list per sample).
    """
    model.eval()
    all_predictions = []
    all_references = []        # per-sample nested lists, returned to callers
    all_references_text = []   # flat list of reference strings, used for scoring

    collate_fn = make_collate_fn(tokenizer, system_prompt, schema)

    dataloader = DataLoader(
        eval_dataset,
        num_workers=8,              # set to 0 while debugging so collate errors surface clearly
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=False,
        pin_memory=True,
    )

    bleu = BLEU()

    for batch in tqdm(dataloader):
        input_ids = batch["input_ids"].to(DEVICE)          # shape: (B, L)
        attention_mask = batch["attention_mask"].to(DEVICE)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_allowed_length,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=False,
                use_cache=True,
            )

        # Prompts are left-padded to a common length L, so everything after
        # position L in each output row is newly generated text.
        prompt_len = input_ids.shape[1]
        predictions = [
            tokenizer.decode(output[prompt_len:], skip_special_tokens=True)
            for output in outputs
        ]

        all_predictions.extend(predictions)
        all_references.extend([[ref] for ref in batch["references"]])
        all_references_text.extend(batch["references"])

    # ====== BLEU ======
    # sacreBLEU expects references as a list of reference STREAMS, each stream
    # aligned with the hypotheses (shape: n_refs x n_hyps). Passing one list
    # per sample instead makes it score only the first hypothesis against all
    # references, so wrap the flat reference list as a single stream.
    final_bleu = bleu.corpus_score(all_predictions, [all_references_text])

    # ====== ROUGE-1, ROUGE-2, ROUGE-L ======
    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"],
        use_stemmer=True,
    )

    rouge1_list, rouge2_list, rougeL_list = [], [], []

    for pred, ref in zip(all_predictions, all_references_text):
        scores = scorer.score(ref, pred)
        rouge1_list.append(scores["rouge1"].fmeasure)
        rouge2_list.append(scores["rouge2"].fmeasure)
        rougeL_list.append(scores["rougeL"].fmeasure)

    # Scale by 100 so ROUGE is reported on the same 0-100 scale as BLEU.
    rouge1 = float(np.mean(rouge1_list) * 100)
    rouge2 = float(np.mean(rouge2_list) * 100)
    rougeL = float(np.mean(rougeL_list) * 100)

    return {
        "bleu": final_bleu.score,
        "rouge1": rouge1,
        "rouge2": rouge2,
        "rougeL": rougeL,
        "predictions": all_predictions,
        "references": all_references,
    }


def evaluate_model_multiple_runs(
    model,
    tokenizer,
    eval_dataset,
    system_prompt,
    n_runs=3,
    batch_size=128,
    max_allowed_length=24,
    schema="en",
):
    """Run ``evaluate_model`` several times and aggregate the metrics.

    Each run is seeded with ``42 + run index``. After every run the running
    mean of each metric (and the running BLEU standard deviation) is printed.

    Args:
        model, tokenizer, eval_dataset, system_prompt: Forwarded to
            ``evaluate_model``.
        n_runs: Number of evaluation runs.
        batch_size, max_allowed_length, schema: Forwarded to
            ``evaluate_model``.

    Returns:
        dict with ``<metric>_mean`` and ``<metric>_std`` for bleu, rouge1,
        rouge2 and rougeL.
    """
    metric_keys = ("bleu", "rouge1", "rouge2", "rougeL")
    history = {key: [] for key in metric_keys}

    for run_idx in range(n_runs):
        set_seed(42 + run_idx)
        run_result = evaluate_model(
            model,
            tokenizer,
            eval_dataset,
            system_prompt=system_prompt,
            batch_size=batch_size,
            max_allowed_length=max_allowed_length,
            schema=schema,
        )
        for key in metric_keys:
            history[key].append(run_result[key])

        # Progress line: statistics over all runs completed so far.
        running_mean = {key: np.mean(values) for key, values in history.items()}
        print(
            f"Run {run_idx + 1}/{n_runs} - "
            f"BLEU: {running_mean['bleu']:.4f} ± {np.std(history['bleu']):.4f} | "
            f"R1: {running_mean['rouge1']:.4f} | "
            f"R2: {running_mean['rouge2']:.4f} | "
            f"RL: {running_mean['rougeL']:.4f}"
        )

    summary = {}
    for key in metric_keys:
        summary[f"{key}_mean"] = float(np.mean(history[key]))
        summary[f"{key}_std"] = float(np.std(history[key]))
    return summary

# Load the model under test, evaluate it on the English and Indonesian test
# splits, and print BLEU / ROUGE for both.
# NOTE(review): this points at the base Qwen2.5-3B-Instruct checkpoint, not at
# the LoRA adapter saved earlier in the notebook (which was trained on top of
# Llama-3.2-1B) — confirm this is the model intended for this section.
MODEL_DIR  = "unsloth/Qwen2.5-3B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_DIR,
    max_seq_length=MAX_SEQ_LEN,
    dtype=None,
    load_in_4bit=True,
)

# The collate function pads with pad_token_id, so fall back to EOS when the
# tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

tokenizer.padding_side = "left"

model.eval()
FastLanguageModel.for_inference(model)

# NOTE(review): both calls read a "test" split; eval_en_clean comes from
# load_dataset("lumicero/english_60k") — confirm that dataset ships one.
results_en_detail = evaluate_model(
    model,
    tokenizer,
    eval_en_clean["test"],
    system_prompt=SYSTEM_PROMPT_EN,
    batch_size=90,
    max_allowed_length=MAX_NEW_TOKENS,
    schema="en",
)

results_id_detail = evaluate_model(
    model,
    tokenizer,
    eval_id_clean["test"],
    system_prompt=SYSTEM_PROMPT_ID,
    batch_size=90,
    max_allowed_length=MAX_NEW_TOKENS,
    schema="id",
)

print("\nFinal Results (test split):")
print(
    f"English (eval_en_clean['test']): "
    f"BLEU = {results_en_detail['bleu']:.4f}, "
    f"ROUGE-1 = {results_en_detail['rouge1']:.4f}, "
    f"ROUGE-2 = {results_en_detail['rouge2']:.4f}, "
    f"ROUGE-L = {results_en_detail['rougeL']:.4f}"
)
print(
    f"Indonesian (eval_id_clean['test']): "
    f"BLEU = {results_id_detail['bleu']:.4f}, "
    f"ROUGE-1 = {results_id_detail['rouge1']:.4f}, "
    f"ROUGE-2 = {results_id_detail['rouge2']:.4f}, "
    f"ROUGE-L = {results_id_detail['rougeL']:.4f}"
)

In [None]:
for i in range(3):
    print("=== SAMPLE", i, "===")
    print("Q_ID:", eval_en_clean["test"][i]["question"])
    print("GT  :", eval_en_clean["test"][i]["response"])
    print("PRED:", results_en_detail["predictions"][i])
    print()

In [None]:
for i in range(3):
    print("=== SAMPLE", i, "===")
    print("Q_ID:", eval_id_clean["test"][i]["input"])
    print("GT  :", eval_id_clean["test"][i]["output"])
    print("PRED:", results_id_detail["predictions"][i])
    print()

## Base Model

In [None]:
from sacrebleu.metrics import BLEU
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
import random
import transformers
import os
from unsloth import FastLanguageModel
from rouge_score import rouge_scorer

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def set_seed(seed):
    """Seed Python, NumPy, PyTorch (CPU and, when present, CUDA) and transformers.

    Args:
        seed: Integer seed handed to every random number generator.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    transformers.set_seed(seed)

# System prompt per bahasa
SYSTEM_PROMPT_ID = (
    "Anda adalah asisten AI yang membantu pengguna dalam bahasa Indonesia. "
)

SYSTEM_PROMPT_EN = (
    "You are a helpful AI assistant that communicates in English. "
)

MAX_SEQ_LEN      = 1024   # sama dengan di FastLanguageModel.from_pretrained
MAX_NEW_TOKENS   = 128
MAX_INPUT_LENGTH = MAX_SEQ_LEN - MAX_NEW_TOKENS  # supaya total <= 1024

def make_collate_fn(tokenizer, system_prompt, schema):
    """Build a DataLoader collate function producing left-padded prompt batches.

    Args:
        tokenizer: Tokenizer providing ``apply_chat_template`` and
            ``pad_token_id``.
        system_prompt: Text of the system message placed before every prompt.
        schema: ``"id"`` (columns input/output) or ``"en"`` (columns
            question/response).

    Returns:
        A ``collate_fn(batch)`` returning a dict with ``input_ids`` and
        ``attention_mask`` (both ``(batch, max_len)`` long tensors, padded on
        the left) and ``references`` (list of reference answers).
    """
    # (prompt column, reference column) for each supported dataset schema.
    columns_by_schema = {
        "id": ("input", "output"),
        "en": ("question", "response"),
    }

    def collate_fn(batch):
        prompt_tensors = []
        references = []

        for item in batch:
            if schema not in columns_by_schema:
                raise ValueError(f"Unknown schema: {schema}")
            prompt_col, reference_col = columns_by_schema[schema]
            input_text = item[prompt_col]
            reference = item[reference_col]

            # Tokenize each sample on its own, without batch padding.
            # NOTE(review): truncation is applied to the already-rendered
            # prompt, so an over-long input may lose its trailing generation
            # prompt — confirm the tokenizer's truncation side.
            ids = tokenizer.apply_chat_template(
                [
                    {"role": "system", "content": system_prompt},
                    {"role": "user",   "content": input_text},
                ],
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt",
                truncation=True,
                max_length=MAX_INPUT_LENGTH,
            )[0]  # shape: (seq_len,)

            prompt_tensors.append(ids)
            references.append(reference)

        # Longest prompt in the batch decides the padded width.
        widest = max(t.shape[0] for t in prompt_tensors)
        pad_id = tokenizer.pad_token_id

        # Manual LEFT padding: pad tokens first, real tokens flush right.
        padded_rows = []
        mask_rows = []
        for ids in prompt_tensors:
            n_tokens = ids.shape[0]
            n_pad = widest - n_tokens
            padded_rows.append(torch.cat([
                torch.full((n_pad,), pad_id, dtype=torch.long),
                ids.to(torch.long),
            ]))
            mask_rows.append(torch.cat([
                torch.zeros(n_pad, dtype=torch.long),
                torch.ones(n_tokens, dtype=torch.long),
            ]))

        return {
            "input_ids": torch.stack(padded_rows),
            "attention_mask": torch.stack(mask_rows),
            "references": references,
        }

    return collate_fn

def evaluate_model(model, tokenizer, eval_dataset, system_prompt,
                   batch_size=64, max_allowed_length=24, schema="en"):
    """Generate greedy completions for ``eval_dataset`` and score them with BLEU and ROUGE.

    Args:
        model: Causal LM exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used to build the prompts and decode the generations.
        eval_dataset: Dataset whose rows carry the columns selected by ``schema``.
        system_prompt: System message prepended to every prompt.
        batch_size: Number of prompts generated together.
        max_allowed_length: Upper bound on newly generated tokens per prompt.
        schema: ``"en"`` or ``"id"``; forwarded to ``make_collate_fn``.

    Returns:
        dict with ``bleu`` (corpus-level BLEU score), ``rouge1``/``rouge2``/
        ``rougeL`` (mean F-measure scaled by 100), ``predictions`` (decoded
        generations) and ``references`` (one single-item list per sample).
    """
    model.eval()
    all_predictions = []
    all_references_text = []   # flat list of reference strings, aligned with predictions

    collate_fn = make_collate_fn(tokenizer, system_prompt, schema)

    dataloader = DataLoader(
        eval_dataset,
        num_workers=8,              # set to 0 while debugging so collate errors surface directly
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=False,
        pin_memory=True,
    )

    bleu = BLEU()
    pbar = tqdm(dataloader)

    for batch in pbar:
        input_ids = batch["input_ids"].to(DEVICE)          # shape: (B, L)
        attention_mask = batch["attention_mask"].to(DEVICE)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_allowed_length,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=False,
                use_cache=True,
            )

        # Prompts are left-padded to a common length L, so everything after
        # position L in each output row is newly generated text.
        prompt_len = input_ids.shape[1]
        all_predictions.extend(
            tokenizer.decode(output[prompt_len:], skip_special_tokens=True)
            for output in outputs
        )
        all_references_text.extend(batch["references"])

    # ====== BLEU ======
    # sacreBLEU expects a list of reference *streams*, each aligned one-to-one
    # with the hypotheses. With a single reference per sample that is exactly
    # one stream; the per-sample nested layout ([[ref0], [ref1], ...]) has the
    # axes the wrong way round and does not score every prediction against
    # its own reference.
    final_bleu = bleu.corpus_score(all_predictions, [all_references_text])

    # ====== ROUGE-1, ROUGE-2, ROUGE-L ======
    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"],
        use_stemmer=True,
    )

    rouge1_list, rouge2_list, rougeL_list = [], [], []

    for pred, ref in zip(all_predictions, all_references_text):
        scores = scorer.score(ref, pred)
        rouge1_list.append(scores["rouge1"].fmeasure)
        rouge2_list.append(scores["rouge2"].fmeasure)
        rougeL_list.append(scores["rougeL"].fmeasure)

    # Scale by 100 so ROUGE is reported on the same percentage scale as BLEU.
    rouge1 = float(np.mean(rouge1_list) * 100)
    rouge2 = float(np.mean(rouge2_list) * 100)
    rougeL = float(np.mean(rougeL_list) * 100)

    return {
        "bleu": final_bleu.score,
        "rouge1": rouge1,
        "rouge2": rouge2,
        "rougeL": rougeL,
        "predictions": all_predictions,
        # Kept in the per-sample nested layout that callers already receive.
        "references": [[ref] for ref in all_references_text],
    }


def evaluate_model_multiple_runs(
    model,
    tokenizer,
    eval_dataset,
    system_prompt,
    n_runs=3,
    batch_size=128,
    max_allowed_length=24,
    schema="en",
):
    """Run ``evaluate_model`` ``n_runs`` times and summarise the scores.

    Run ``k`` (0-based) is seeded with ``42 + k``. After every run a line with
    the running BLEU mean/std and the running ROUGE means is printed.

    NOTE(review): ``evaluate_model`` generates with ``do_sample=False``, so the
    different seeds presumably do not change the generations -- confirm that
    the reported standard deviations are meaningful.

    Returns:
        dict with ``<metric>_mean`` and ``<metric>_std`` (population standard
        deviation, NumPy's default) for ``bleu``, ``rouge1``, ``rouge2`` and
        ``rougeL``.
    """
    metric_names = ("bleu", "rouge1", "rouge2", "rougeL")
    # Per-metric score history, one entry per completed run.
    history = {name: [] for name in metric_names}

    for run_idx in range(n_runs):
        set_seed(42 + run_idx)
        run_result = evaluate_model(
            model,
            tokenizer,
            eval_dataset,
            system_prompt=system_prompt,
            batch_size=batch_size,
            max_allowed_length=max_allowed_length,
            schema=schema,
        )
        for name in metric_names:
            history[name].append(run_result[name])

        # Progress line over all runs finished so far.
        print(
            f"Run {run_idx + 1}/{n_runs} - "
            f"BLEU: {np.mean(history['bleu']):.4f} ± {np.std(history['bleu']):.4f} | "
            f"R1: {np.mean(history['rouge1']):.4f} | "
            f"R2: {np.mean(history['rouge2']):.4f} | "
            f"RL: {np.mean(history['rougeL']):.4f}"
        )

    summary = {}
    for name in metric_names:
        summary[f"{name}_mean"] = float(np.mean(history[name]))
        summary[f"{name}_std"] = float(np.std(history[name]))
    return summary

# Checkpoint evaluated in this cell (the same id the LoRA-Soups config names BASE_MODEL).
MODEL_DIR  = "unsloth/Llama-3.2-1B-Instruct-unsloth-bnb-4bit"
print(MODEL_DIR)

# Load model and tokenizer in 4-bit; dtype=None leaves the dtype choice to Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_DIR,
    max_seq_length=MAX_SEQ_LEN,
    dtype=None,
    load_in_4bit=True,
)

# The collate function fills batches with pad_token_id, so a pad token must
# exist; fall back to EOS when the tokenizer defines none.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Keep the tokenizer's own padding side in line with the manual left padding
# done in make_collate_fn.
tokenizer.padding_side = "left"

model.eval()
# Unsloth's inference switch (presumably enables its faster generation path -- confirm).
FastLanguageModel.for_inference(model)

# Score the English test split.
results_en_detail = evaluate_model(
    model,
    tokenizer,
    eval_en_clean["test"],
    system_prompt=SYSTEM_PROMPT_EN,
    batch_size=90,
    max_allowed_length=MAX_NEW_TOKENS,
    schema="en",
)

# Score the Indonesian test split with the same model.
results_id_detail = evaluate_model(
    model,
    tokenizer,
    eval_id_clean["test"],
    system_prompt=SYSTEM_PROMPT_ID,
    batch_size=90,
    max_allowed_length=MAX_NEW_TOKENS,
    schema="id",
)

# Side-by-side report; the ROUGE values are already scaled by 100 in evaluate_model.
print("\nFinal Results (test split):")
print(
    f"English (eval_en_clean['test']): "
    f"BLEU = {results_en_detail['bleu']:.4f}, "
    f"ROUGE-1 = {results_en_detail['rouge1']:.4f}, "
    f"ROUGE-2 = {results_en_detail['rouge2']:.4f}, "
    f"ROUGE-L = {results_en_detail['rougeL']:.4f}"
)
print(
    f"Indonesian (eval_id_clean['test']): "
    f"BLEU = {results_id_detail['bleu']:.4f}, "
    f"ROUGE-1 = {results_id_detail['rouge1']:.4f}, "
    f"ROUGE-2 = {results_id_detail['rouge2']:.4f}, "
    f"ROUGE-L = {results_id_detail['rougeL']:.4f}"
)

## LoRA Baseline

### LoRA-ID

In [None]:
import torch, gc

# Drop unreachable Python objects first, then release PyTorch's unused cached
# CUDA memory before the next model is loaded.
gc.collect()
torch.cuda.empty_cache()

In [None]:
from sacrebleu.metrics import BLEU
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
import random
import transformers
import os
from unsloth import FastLanguageModel
from rouge_score import rouge_scorer

# Device that evaluation batches are moved to; falls back to CPU when CUDA is unavailable.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def set_seed(seed):
    """Seed every random number generator used in this notebook with ``seed``.

    Covers Python's ``random``, NumPy, PyTorch (CPU, plus all CUDA devices when
    CUDA is available) and finally ``transformers.set_seed``.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    transformers.set_seed(seed)

# Per-language system prompts (one for Indonesian, one for English)
SYSTEM_PROMPT_ID = (
    "Anda adalah asisten AI yang membantu pengguna dalam bahasa Indonesia. "
)

SYSTEM_PROMPT_EN = (
    "You are a helpful AI assistant that communicates in English. "
)

# Token budget: the prompt is truncated to MAX_INPUT_LENGTH in make_collate_fn
# and MAX_NEW_TOKENS is passed as the generation cap by the evaluation calls.
MAX_SEQ_LEN      = 1024
MAX_NEW_TOKENS   = 128
MAX_INPUT_LENGTH = MAX_SEQ_LEN - MAX_NEW_TOKENS  # keeps prompt + generation within MAX_SEQ_LEN (<= 1024)

def make_collate_fn(tokenizer, system_prompt, schema):
    """Build a DataLoader ``collate_fn`` that turns raw rows into left-padded chat prompts.

    Args:
        tokenizer: Object providing ``apply_chat_template`` and ``pad_token_id``.
        system_prompt: Text used as the system message of every prompt.
        schema: ``"id"`` reads the ``input``/``output`` columns of a row,
            ``"en"`` reads ``question``/``response``.

    Returns:
        A function mapping a list of rows to a dict holding ``input_ids`` and
        ``attention_mask`` (long tensors of shape ``(batch, longest_prompt)``)
        plus ``references`` (the reference answers, in batch order).

    Raises:
        ValueError: Raised by the returned function (not by this factory) when
            ``schema`` is neither ``"id"`` nor ``"en"``.
    """

    def _split_example(example):
        # The two datasets name their prompt/answer columns differently.
        if schema == "id":
            return example["input"], example["output"]
        if schema == "en":
            return example["question"], example["response"]
        raise ValueError(f"Unknown schema: {schema}")

    def _encode_prompt(user_text):
        chat = [
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": user_text},
        ]
        # Each sample is tokenized on its own, without any batch padding.
        # NOTE(review): truncation presumably cuts from the right, which could
        # drop the generation prompt of an over-long input -- confirm.
        encoded = tokenizer.apply_chat_template(
            chat,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_INPUT_LENGTH,
        )
        return encoded[0]  # drop the leading batch axis -> (seq_len,)

    def collate_fn(batch):
        prompts, references = [], []
        for example in batch:
            user_text, target = _split_example(example)
            prompts.append(_encode_prompt(user_text))
            references.append(target)

        # The longest prompt decides the padded width of the whole batch.
        widest = max(prompt.shape[0] for prompt in prompts)
        fill_id = tokenizer.pad_token_id
        n_rows = len(prompts)

        # Start from an all-pad / all-masked canvas ...
        input_ids = torch.full((n_rows, widest), fill_id, dtype=torch.long)
        attention_mask = torch.zeros((n_rows, widest), dtype=torch.long)

        # ... and right-align every prompt, which leaves the padding on the left.
        for row, prompt in enumerate(prompts):
            length = prompt.shape[0]
            input_ids[row, -length:] = prompt
            attention_mask[row, -length:] = 1

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "references": references,
        }

    return collate_fn

def evaluate_model(model, tokenizer, eval_dataset, system_prompt,
                   batch_size=64, max_allowed_length=24, schema="en"):
    """Generate greedy completions for ``eval_dataset`` and score them with BLEU and ROUGE.

    Args:
        model: Causal LM exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used to build the prompts and decode the generations.
        eval_dataset: Dataset whose rows carry the columns selected by ``schema``.
        system_prompt: System message prepended to every prompt.
        batch_size: Number of prompts generated together.
        max_allowed_length: Upper bound on newly generated tokens per prompt.
        schema: ``"en"`` or ``"id"``; forwarded to ``make_collate_fn``.

    Returns:
        dict with ``bleu`` (sacreBLEU corpus score), ``rouge1``/``rouge2``/
        ``rougeL`` (mean F-measure scaled by 100), ``predictions`` (decoded
        generations) and ``references`` (one single-item list per sample).
    """
    model.eval()
    all_predictions = []
    all_references_text = []   # flat list of reference strings, aligned with predictions

    collate_fn = make_collate_fn(tokenizer, system_prompt, schema)

    dataloader = DataLoader(
        eval_dataset,
        num_workers=8,              # set to 0 while debugging so collate errors surface directly
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=False,
        pin_memory=True,
    )

    bleu = BLEU()
    pbar = tqdm(dataloader)

    for batch in pbar:
        input_ids = batch["input_ids"].to(DEVICE)          # shape: (B, L)
        attention_mask = batch["attention_mask"].to(DEVICE)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_allowed_length,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=False,
                use_cache=True,
            )

        # Prompts are left-padded to a common length L, so everything after
        # position L in each output row is newly generated text.
        prompt_len = input_ids.shape[1]
        all_predictions.extend(
            tokenizer.decode(output[prompt_len:], skip_special_tokens=True)
            for output in outputs
        )
        all_references_text.extend(batch["references"])

    # ====== BLEU ======
    # sacreBLEU expects a list of reference *streams*, each aligned one-to-one
    # with the hypotheses. With a single reference per sample that is exactly
    # one stream; the per-sample nested layout ([[ref0], [ref1], ...]) has the
    # axes the wrong way round and does not score every prediction against
    # its own reference.
    final_bleu = bleu.corpus_score(all_predictions, [all_references_text])

    # ====== ROUGE-1, ROUGE-2, ROUGE-L ======
    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"],
        use_stemmer=True,
    )

    rouge1_list, rouge2_list, rougeL_list = [], [], []

    for pred, ref in zip(all_predictions, all_references_text):
        scores = scorer.score(ref, pred)
        rouge1_list.append(scores["rouge1"].fmeasure)
        rouge2_list.append(scores["rouge2"].fmeasure)
        rougeL_list.append(scores["rougeL"].fmeasure)

    # Scale by 100 so ROUGE is reported on the same percentage scale as BLEU.
    rouge1 = float(np.mean(rouge1_list) * 100)
    rouge2 = float(np.mean(rouge2_list) * 100)
    rougeL = float(np.mean(rougeL_list) * 100)

    return {
        "bleu": final_bleu.score,
        "rouge1": rouge1,
        "rouge2": rouge2,
        "rougeL": rougeL,
        "predictions": all_predictions,
        # Kept in the per-sample nested layout that callers already receive.
        "references": [[ref] for ref in all_references_text],
    }


def evaluate_model_multiple_runs(
    model,
    tokenizer,
    eval_dataset,
    system_prompt,
    n_runs=3,
    batch_size=128,
    max_allowed_length=24,
    schema="en",
):
    """Run ``evaluate_model`` ``n_runs`` times and summarise the scores.

    Run ``k`` (0-based) is seeded with ``42 + k``. After every run a line with
    the running BLEU mean/std and the running ROUGE means is printed.

    NOTE(review): ``evaluate_model`` generates with ``do_sample=False``, so the
    different seeds presumably do not change the generations -- confirm that
    the reported standard deviations are meaningful.

    Returns:
        dict with ``<metric>_mean`` and ``<metric>_std`` (population standard
        deviation, NumPy's default) for ``bleu``, ``rouge1``, ``rouge2`` and
        ``rougeL``.
    """
    metric_names = ("bleu", "rouge1", "rouge2", "rougeL")
    # Per-metric score history, one entry per completed run.
    history = {name: [] for name in metric_names}

    for run_idx in range(n_runs):
        set_seed(42 + run_idx)
        run_result = evaluate_model(
            model,
            tokenizer,
            eval_dataset,
            system_prompt=system_prompt,
            batch_size=batch_size,
            max_allowed_length=max_allowed_length,
            schema=schema,
        )
        for name in metric_names:
            history[name].append(run_result[name])

        # Progress line over all runs finished so far.
        print(
            f"Run {run_idx + 1}/{n_runs} - "
            f"BLEU: {np.mean(history['bleu']):.4f} ± {np.std(history['bleu']):.4f} | "
            f"R1: {np.mean(history['rouge1']):.4f} | "
            f"R2: {np.mean(history['rouge2']):.4f} | "
            f"RL: {np.mean(history['rougeL']):.4f}"
        )

    summary = {}
    for name in metric_names:
        summary[f"{name}_mean"] = float(np.mean(history[name]))
        summary[f"{name}_std"] = float(np.std(history[name]))
    return summary

# Adapter directory evaluated in this cell (the same path the LoRA-Soups config names LORA1_DIR).
# NOTE(review): loading an adapter folder presumably makes Unsloth resolve the
# base model from the adapter's config -- confirm.
MODEL_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora-id"
print(MODEL_DIR)

# Load model and tokenizer in 4-bit; dtype=None leaves the dtype choice to Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_DIR,
    max_seq_length=MAX_SEQ_LEN,
    dtype=None,
    load_in_4bit=True,
)

# The collate function fills batches with pad_token_id, so a pad token must
# exist; fall back to EOS when the tokenizer defines none.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Keep the tokenizer's own padding side in line with the manual left padding
# done in make_collate_fn.
tokenizer.padding_side = "left"

model.eval()
# Unsloth's inference switch (presumably enables its faster generation path -- confirm).
FastLanguageModel.for_inference(model)

# Score the English test split.
results_en_detail = evaluate_model(
    model,
    tokenizer,
    eval_en_clean["test"],
    system_prompt=SYSTEM_PROMPT_EN,
    batch_size=90,
    max_allowed_length=MAX_NEW_TOKENS,
    schema="en",
)

# Score the Indonesian test split with the same model.
results_id_detail = evaluate_model(
    model,
    tokenizer,
    eval_id_clean["test"],
    system_prompt=SYSTEM_PROMPT_ID,
    batch_size=90,
    max_allowed_length=MAX_NEW_TOKENS,
    schema="id",
)

# Side-by-side report; the ROUGE values are already scaled by 100 in evaluate_model.
print("\nFinal Results (test split):")
print(
    f"English (eval_en_clean['test']): "
    f"BLEU = {results_en_detail['bleu']:.4f}, "
    f"ROUGE-1 = {results_en_detail['rouge1']:.4f}, "
    f"ROUGE-2 = {results_en_detail['rouge2']:.4f}, "
    f"ROUGE-L = {results_en_detail['rougeL']:.4f}"
)
print(
    f"Indonesian (eval_id_clean['test']): "
    f"BLEU = {results_id_detail['bleu']:.4f}, "
    f"ROUGE-1 = {results_id_detail['rouge1']:.4f}, "
    f"ROUGE-2 = {results_id_detail['rouge2']:.4f}, "
    f"ROUGE-L = {results_id_detail['rougeL']:.4f}"
)

### LoRA-EN

In [None]:
import torch, gc

# Drop unreachable Python objects first, then release PyTorch's unused cached
# CUDA memory before the next model is loaded.
gc.collect()
torch.cuda.empty_cache()

In [None]:
from sacrebleu.metrics import BLEU
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
import random
import transformers
import os
from unsloth import FastLanguageModel
from rouge_score import rouge_scorer

# Device that evaluation batches are moved to; falls back to CPU when CUDA is unavailable.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def set_seed(seed):
    """Seed every random number generator used in this notebook with ``seed``.

    Covers Python's ``random``, NumPy, PyTorch (CPU, plus all CUDA devices when
    CUDA is available) and finally ``transformers.set_seed``.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    transformers.set_seed(seed)

# Per-language system prompts (one for Indonesian, one for English)
SYSTEM_PROMPT_ID = (
    "Anda adalah asisten AI yang membantu pengguna dalam bahasa Indonesia. "
)

SYSTEM_PROMPT_EN = (
    "You are a helpful AI assistant that communicates in English. "
)

# Token budget: the prompt is truncated to MAX_INPUT_LENGTH in make_collate_fn
# and MAX_NEW_TOKENS is passed as the generation cap by the evaluation calls.
MAX_SEQ_LEN      = 1024   # same value that is passed to FastLanguageModel.from_pretrained
MAX_NEW_TOKENS   = 128
MAX_INPUT_LENGTH = MAX_SEQ_LEN - MAX_NEW_TOKENS  # keeps prompt + generation within MAX_SEQ_LEN (<= 1024)

def make_collate_fn(tokenizer, system_prompt, schema):
    """Build a DataLoader ``collate_fn`` that turns raw rows into left-padded chat prompts.

    Args:
        tokenizer: Object providing ``apply_chat_template`` and ``pad_token_id``.
        system_prompt: Text used as the system message of every prompt.
        schema: ``"id"`` reads the ``input``/``output`` columns of a row,
            ``"en"`` reads ``question``/``response``.

    Returns:
        A function mapping a list of rows to a dict holding ``input_ids`` and
        ``attention_mask`` (long tensors of shape ``(batch, longest_prompt)``)
        plus ``references`` (the reference answers, in batch order).

    Raises:
        ValueError: Raised by the returned function (not by this factory) when
            ``schema`` is neither ``"id"`` nor ``"en"``.
    """

    def _split_example(example):
        # The two datasets name their prompt/answer columns differently.
        if schema == "id":
            return example["input"], example["output"]
        if schema == "en":
            return example["question"], example["response"]
        raise ValueError(f"Unknown schema: {schema}")

    def _encode_prompt(user_text):
        chat = [
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": user_text},
        ]
        # Each sample is tokenized on its own, without any batch padding.
        # NOTE(review): truncation presumably cuts from the right, which could
        # drop the generation prompt of an over-long input -- confirm.
        encoded = tokenizer.apply_chat_template(
            chat,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_INPUT_LENGTH,
        )
        return encoded[0]  # drop the leading batch axis -> (seq_len,)

    def collate_fn(batch):
        prompts, references = [], []
        for example in batch:
            user_text, target = _split_example(example)
            prompts.append(_encode_prompt(user_text))
            references.append(target)

        # The longest prompt decides the padded width of the whole batch.
        widest = max(prompt.shape[0] for prompt in prompts)
        fill_id = tokenizer.pad_token_id
        n_rows = len(prompts)

        # Start from an all-pad / all-masked canvas ...
        input_ids = torch.full((n_rows, widest), fill_id, dtype=torch.long)
        attention_mask = torch.zeros((n_rows, widest), dtype=torch.long)

        # ... and right-align every prompt, which leaves the padding on the left.
        for row, prompt in enumerate(prompts):
            length = prompt.shape[0]
            input_ids[row, -length:] = prompt
            attention_mask[row, -length:] = 1

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "references": references,
        }

    return collate_fn

def evaluate_model(model, tokenizer, eval_dataset, system_prompt,
                   batch_size=64, max_allowed_length=24, schema="en"):
    """Generate greedy completions for ``eval_dataset`` and score them with BLEU and ROUGE.

    Args:
        model: Causal LM exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used to build the prompts and decode the generations.
        eval_dataset: Dataset whose rows carry the columns selected by ``schema``.
        system_prompt: System message prepended to every prompt.
        batch_size: Number of prompts generated together.
        max_allowed_length: Upper bound on newly generated tokens per prompt.
        schema: ``"en"`` or ``"id"``; forwarded to ``make_collate_fn``.

    Returns:
        dict with ``bleu`` (sacreBLEU corpus score), ``rouge1``/``rouge2``/
        ``rougeL`` (mean F-measure scaled by 100), ``predictions`` (decoded
        generations) and ``references`` (one single-item list per sample).
    """
    model.eval()
    all_predictions = []
    all_references_text = []   # flat list of reference strings, aligned with predictions

    collate_fn = make_collate_fn(tokenizer, system_prompt, schema)

    dataloader = DataLoader(
        eval_dataset,
        num_workers=8,              # set to 0 while debugging so collate errors surface directly
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=False,
        pin_memory=True,
    )

    bleu = BLEU()
    pbar = tqdm(dataloader)

    for batch in pbar:
        input_ids = batch["input_ids"].to(DEVICE)          # shape: (B, L)
        attention_mask = batch["attention_mask"].to(DEVICE)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_allowed_length,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=False,
                use_cache=True,
            )

        # Prompts are left-padded to a common length L, so everything after
        # position L in each output row is newly generated text.
        prompt_len = input_ids.shape[1]
        all_predictions.extend(
            tokenizer.decode(output[prompt_len:], skip_special_tokens=True)
            for output in outputs
        )
        all_references_text.extend(batch["references"])

    # ====== BLEU ======
    # sacreBLEU expects a list of reference *streams*, each aligned one-to-one
    # with the hypotheses. With a single reference per sample that is exactly
    # one stream; the per-sample nested layout ([[ref0], [ref1], ...]) has the
    # axes the wrong way round and does not score every prediction against
    # its own reference.
    final_bleu = bleu.corpus_score(all_predictions, [all_references_text])

    # ====== ROUGE-1, ROUGE-2, ROUGE-L ======
    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"],
        use_stemmer=True,
    )

    rouge1_list, rouge2_list, rougeL_list = [], [], []

    for pred, ref in zip(all_predictions, all_references_text):
        scores = scorer.score(ref, pred)
        rouge1_list.append(scores["rouge1"].fmeasure)
        rouge2_list.append(scores["rouge2"].fmeasure)
        rougeL_list.append(scores["rougeL"].fmeasure)

    # Scale by 100 so ROUGE is reported on the same percentage scale as BLEU.
    rouge1 = float(np.mean(rouge1_list) * 100)
    rouge2 = float(np.mean(rouge2_list) * 100)
    rougeL = float(np.mean(rougeL_list) * 100)

    return {
        "bleu": final_bleu.score,
        "rouge1": rouge1,
        "rouge2": rouge2,
        "rougeL": rougeL,
        "predictions": all_predictions,
        # Kept in the per-sample nested layout that callers already receive.
        "references": [[ref] for ref in all_references_text],
    }


def evaluate_model_multiple_runs(
    model,
    tokenizer,
    eval_dataset,
    system_prompt,
    n_runs=3,
    batch_size=128,
    max_allowed_length=24,
    schema="en",
):
    """Run ``evaluate_model`` ``n_runs`` times and summarise the scores.

    Run ``k`` (0-based) is seeded with ``42 + k``. After every run a line with
    the running BLEU mean/std and the running ROUGE means is printed.

    NOTE(review): ``evaluate_model`` generates with ``do_sample=False``, so the
    different seeds presumably do not change the generations -- confirm that
    the reported standard deviations are meaningful.

    Returns:
        dict with ``<metric>_mean`` and ``<metric>_std`` (population standard
        deviation, NumPy's default) for ``bleu``, ``rouge1``, ``rouge2`` and
        ``rougeL``.
    """
    metric_names = ("bleu", "rouge1", "rouge2", "rougeL")
    # Per-metric score history, one entry per completed run.
    history = {name: [] for name in metric_names}

    for run_idx in range(n_runs):
        set_seed(42 + run_idx)
        run_result = evaluate_model(
            model,
            tokenizer,
            eval_dataset,
            system_prompt=system_prompt,
            batch_size=batch_size,
            max_allowed_length=max_allowed_length,
            schema=schema,
        )
        for name in metric_names:
            history[name].append(run_result[name])

        # Progress line over all runs finished so far.
        print(
            f"Run {run_idx + 1}/{n_runs} - "
            f"BLEU: {np.mean(history['bleu']):.4f} ± {np.std(history['bleu']):.4f} | "
            f"R1: {np.mean(history['rouge1']):.4f} | "
            f"R2: {np.mean(history['rouge2']):.4f} | "
            f"RL: {np.mean(history['rougeL']):.4f}"
        )

    summary = {}
    for name in metric_names:
        summary[f"{name}_mean"] = float(np.mean(history[name]))
        summary[f"{name}_std"] = float(np.std(history[name]))
    return summary

# Adapter directory evaluated in this cell (the same path the LoRA-Soups config names LORA2_DIR).
# NOTE(review): loading an adapter folder presumably makes Unsloth resolve the
# base model from the adapter's config -- confirm.
MODEL_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora-en"
print(MODEL_DIR)

# Load model and tokenizer in 4-bit; dtype=None leaves the dtype choice to Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_DIR,
    max_seq_length=MAX_SEQ_LEN,
    dtype=None,
    load_in_4bit=True,
)

# The collate function fills batches with pad_token_id, so a pad token must
# exist; fall back to EOS when the tokenizer defines none.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Keep the tokenizer's own padding side in line with the manual left padding
# done in make_collate_fn.
tokenizer.padding_side = "left"

model.eval()
# Unsloth's inference switch (presumably enables its faster generation path -- confirm).
FastLanguageModel.for_inference(model)

# Score the English test split.
results_en_detail = evaluate_model(
    model,
    tokenizer,
    eval_en_clean["test"],
    system_prompt=SYSTEM_PROMPT_EN,
    batch_size=90,
    max_allowed_length=MAX_NEW_TOKENS,
    schema="en",
)

# Score the Indonesian test split with the same model.
results_id_detail = evaluate_model(
    model,
    tokenizer,
    eval_id_clean["test"],
    system_prompt=SYSTEM_PROMPT_ID,
    batch_size=90,
    max_allowed_length=MAX_NEW_TOKENS,
    schema="id",
)

# Side-by-side report; the ROUGE values are already scaled by 100 in evaluate_model.
print("\nFinal Results (test split):")
print(
    f"English (eval_en_clean['test']): "
    f"BLEU = {results_en_detail['bleu']:.4f}, "
    f"ROUGE-1 = {results_en_detail['rouge1']:.4f}, "
    f"ROUGE-2 = {results_en_detail['rouge2']:.4f}, "
    f"ROUGE-L = {results_en_detail['rougeL']:.4f}"
)
print(
    f"Indonesian (eval_id_clean['test']): "
    f"BLEU = {results_id_detail['bleu']:.4f}, "
    f"ROUGE-1 = {results_id_detail['rouge1']:.4f}, "
    f"ROUGE-2 = {results_id_detail['rouge2']:.4f}, "
    f"ROUGE-L = {results_id_detail['rougeL']:.4f}"
)

## LoRA-Soups Baseline

In [None]:
# Configuration
# NOTE(review): BASE_MODEL and LOAD_IN_4BIT are not referenced in the cells
# visible here -- confirm they are still used further down.
BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct-unsloth-bnb-4bit"

# Source adapters to merge: the Indonesian and the English LoRA.
LORA1_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora-id"
LORA2_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora-en"

# Output folder for the merged adapter.
LORA_SOUP_DIR = "/content/drive/MyDrive/College/KK/Final Project/model/lora_soup_base_05_05"

# Mixing weights, one per source adapter in the order [LORA1_DIR, LORA2_DIR];
# build_lora_soup normalizes them so they sum to 1.
CAT_WEIGHTS = [0.5, 0.5]

# NOTE(review): the evaluation cell below reassigns MAX_SEQ_LEN to 1024 --
# confirm which value is intended.
MAX_SEQ_LEN  = 2048
LOAD_IN_4BIT = True

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

### Building LoRA-Soups (CAT)

In [None]:
def show_adapter_cfg(path):
    """Print the rank, alpha and target modules stored in ``<path>/adapter_config.json``.

    Raises:
        KeyError: If the config has no ``"r"`` entry (the other two fields are
            optional and print as ``None`` when missing).
    """
    cfg_file = os.path.join(path, "adapter_config.json")
    with open(cfg_file, "r") as handle:
        adapter_cfg = json.load(handle)

    print("===", path, "===")
    print("r           :", adapter_cfg["r"])
    # Optional fields share one code path.
    for label, field in (
        ("lora_alpha  :", "lora_alpha"),
        ("target_modules:", "target_modules"),
    ):
        print(label, adapter_cfg.get(field))

# Print both adapter configs so rank, alpha and target modules can be compared
# by eye before merging (build_lora_soup asserts that they match).
show_adapter_cfg(LORA1_DIR)
show_adapter_cfg(LORA2_DIR)

In [None]:
import os, json, shutil
from typing import List, Optional

import torch
from safetensors.torch import load_file, save_file

def set_seed(seed: int):
    """Seed Python's ``random``, NumPy and PyTorch (CPU, plus all CUDA devices when available)."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def build_lora_soup(
    lora_paths: List[str],
    weights: Optional[List[float]],
    out_path: str,
    copy_tokenizer: bool = True,
):
    """Merge several LoRA adapters into one adapter by weighted-averaging their tensors.

    Every floating-point tensor in the adapters' state dicts is replaced by the
    weighted average across adapters (a "model soup"); non-float tensors are
    taken from the first adapter. Rank, ``lora_alpha`` and target modules are
    left unchanged, so all adapters must agree on them.

    Args:
        lora_paths: Folders of the source adapters; each must contain
            ``adapter_config.json`` and ``adapter_model.safetensors``.
        weights: One mixing weight per adapter; normalized to sum to 1.
            ``None`` means uniform weights.
        out_path: Output folder for the merged adapter (created if missing).
        copy_tokenizer: When true, copy the tokenizer/template files that exist
            in the first adapter folder into ``out_path``.

    Raises:
        AssertionError: Fewer than two adapters, a weight/adapter count
            mismatch, or adapters that disagree on rank, alpha, target modules,
            state-dict keys or tensor shapes.
        ValueError: The weights sum to zero or to a non-finite value, which
            cannot be normalized.
    """
    assert len(lora_paths) >= 2, "Minimal 2 LoRA untuk LoRA-Soups"

    # No weights given -> uniform soup.
    if weights is None:
        w = torch.ones(len(lora_paths), dtype=torch.float32)
    else:
        assert len(weights) == len(lora_paths), "Panjang weights harus = jumlah LoRA"
        w = torch.tensor(weights, dtype=torch.float32)

    # Normalize the weights. A zero or non-finite sum would silently turn every
    # merged tensor into NaN/inf, so reject it before anything is written.
    weight_sum = w.sum()
    if weight_sum == 0 or not torch.isfinite(weight_sum):
        raise ValueError(
            f"LoRA-Soups weights must have a finite, non-zero sum; got {w.tolist()}"
        )
    w = w / weight_sum

    os.makedirs(out_path, exist_ok=True)

    # 1. Read the config of the first adapter; it is the reference for the rest.
    cfg_path0 = os.path.join(lora_paths[0], "adapter_config.json")
    with open(cfg_path0, "r") as f:
        cfg0 = json.load(f)

    base_r      = cfg0["r"]
    base_alpha  = cfg0.get("lora_alpha", None)
    base_target = set(cfg0.get("target_modules", []))

    # Check that every other adapter is structurally compatible.
    for p in lora_paths[1:]:
        with open(os.path.join(p, "adapter_config.json"), "r") as f:
            cfgi = json.load(f)
        assert cfgi["r"] == base_r, f"Rank LoRA beda di {p}"
        if "lora_alpha" in cfgi and base_alpha is not None:
            assert cfgi["lora_alpha"] == base_alpha, f"lora_alpha beda di {p}"
        if "target_modules" in cfgi and base_target:
            assert set(cfgi["target_modules"]) == base_target, f"target_modules beda di {p}"

    # 2. Config of the soup: rank and lora_alpha stay as they are, only the
    #    weights are averaged. target_modules is sorted for a stable file.
    cfg_soup = cfg0.copy()
    if "target_modules" in cfg_soup:
        cfg_soup["target_modules"] = sorted(cfg_soup["target_modules"])

    with open(os.path.join(out_path, "adapter_config.json"), "w") as f:
        json.dump(cfg_soup, f, indent=2)

    # 3. Load every adapter's state dict.
    state_dicts = [load_file(os.path.join(p, "adapter_model.safetensors")) for p in lora_paths]

    # All adapters must expose exactly the same tensor names.
    keys0 = set(state_dicts[0].keys())
    for i, sd in enumerate(state_dicts[1:], start=1):
        ki = set(sd.keys())
        assert keys0 == ki, f"Kumpulan key state_dict LoRA ke-{i} beda; cek adapter: {lora_paths[i]}"

    # 4. Build the merged state dict.
    new_state = {}
    float_dtypes = {
        torch.float16,
        torch.bfloat16,
        torch.float32,
        torch.float64,
    }

    for key in state_dicts[0].keys():
        tensor0 = state_dicts[0][key]

        if isinstance(tensor0, torch.Tensor) and tensor0.dtype in float_dtypes:
            # Accumulate in at least float32: multiplying a half-precision
            # tensor by a 0-dim float32 weight keeps the half dtype, so summing
            # in the storage dtype would round after every term.
            acc_dtype = torch.promote_types(tensor0.dtype, torch.float32)
            acc = torch.zeros_like(tensor0, dtype=acc_dtype)
            for wi, sd in zip(w, state_dicts):
                assert sd[key].shape == tensor0.shape, f"Shape mismatch for tensor '{key}'"
                acc = acc + wi * sd[key].to(acc_dtype)
            # Store the result in the adapter's original dtype.
            new_state[key] = acc.to(tensor0.dtype)
        else:
            # Non-float entries (e.g. integer buffers) come from the first adapter.
            new_state[key] = tensor0

    # 5. Save the merged weights.
    save_file(new_state, os.path.join(out_path, "adapter_model.safetensors"))

    # 6. Optionally copy tokenizer/template files from the first adapter.
    if copy_tokenizer:
        extra_files = [
            "tokenizer.json",
            "tokenizer_config.json",
            "special_tokens_map.json",
            "chat_template.jinja",
            "README.md",
        ]
        for fname in extra_files:
            src = os.path.join(lora_paths[0], fname)
            if os.path.exists(src):
                shutil.copy(src, os.path.join(out_path, fname))

    print(f"[OK] LoRA-Soups disimpan di: {out_path}")

In [None]:
# Merge the Indonesian and English adapters with the configured weights and
# write the result (plus tokenizer files from the first adapter) to LORA_SOUP_DIR.
build_lora_soup(
    lora_paths=[LORA1_DIR, LORA2_DIR],
    weights=CAT_WEIGHTS,
    out_path=LORA_SOUP_DIR,
    copy_tokenizer=True,
)

### Evaluation

In [None]:
from sacrebleu.metrics import BLEU
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
import random
import transformers
import os
from unsloth import FastLanguageModel
from rouge_score import rouge_scorer

# Device that evaluation batches are moved to; falls back to CPU when CUDA is unavailable.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def set_seed(seed):
    """Seed every random number generator used in this notebook with ``seed``.

    Covers Python's ``random``, NumPy, PyTorch (CPU, plus all CUDA devices when
    CUDA is available) and finally ``transformers.set_seed``.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    transformers.set_seed(seed)

# Per-language system prompts (one for Indonesian, one for English)
SYSTEM_PROMPT_ID = (
    "Anda adalah asisten AI yang membantu pengguna dalam bahasa Indonesia. "
)

SYSTEM_PROMPT_EN = (
    "You are a helpful AI assistant that communicates in English. "
)

# Token budget: the prompt is truncated to MAX_INPUT_LENGTH in make_collate_fn.
# NOTE(review): this overrides the MAX_SEQ_LEN = 2048 set in the LoRA-Soups
# configuration cell -- confirm which value is intended.
MAX_SEQ_LEN      = 1024   # same value that is passed to FastLanguageModel.from_pretrained
MAX_NEW_TOKENS   = 128
MAX_INPUT_LENGTH = MAX_SEQ_LEN - MAX_NEW_TOKENS  # keeps prompt + generation within MAX_SEQ_LEN (<= 1024)

def make_collate_fn(tokenizer, system_prompt, schema):
    """Build a DataLoader ``collate_fn`` that turns raw rows into left-padded chat prompts.

    Args:
        tokenizer: Object providing ``apply_chat_template`` and ``pad_token_id``.
        system_prompt: Text used as the system message of every prompt.
        schema: ``"id"`` reads the ``input``/``output`` columns of a row,
            ``"en"`` reads ``question``/``response``.

    Returns:
        A function mapping a list of rows to a dict holding ``input_ids`` and
        ``attention_mask`` (long tensors of shape ``(batch, longest_prompt)``)
        plus ``references`` (the reference answers, in batch order).

    Raises:
        ValueError: Raised by the returned function (not by this factory) when
            ``schema`` is neither ``"id"`` nor ``"en"``.
    """

    def _split_example(example):
        # The two datasets name their prompt/answer columns differently.
        if schema == "id":
            return example["input"], example["output"]
        if schema == "en":
            return example["question"], example["response"]
        raise ValueError(f"Unknown schema: {schema}")

    def _encode_prompt(user_text):
        chat = [
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": user_text},
        ]
        # Each sample is tokenized on its own, without any batch padding.
        # NOTE(review): truncation presumably cuts from the right, which could
        # drop the generation prompt of an over-long input -- confirm.
        encoded = tokenizer.apply_chat_template(
            chat,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_INPUT_LENGTH,
        )
        return encoded[0]  # drop the leading batch axis -> (seq_len,)

    def collate_fn(batch):
        prompts, references = [], []
        for example in batch:
            user_text, target = _split_example(example)
            prompts.append(_encode_prompt(user_text))
            references.append(target)

        # The longest prompt decides the padded width of the whole batch.
        widest = max(prompt.shape[0] for prompt in prompts)
        fill_id = tokenizer.pad_token_id
        n_rows = len(prompts)

        # Start from an all-pad / all-masked canvas ...
        input_ids = torch.full((n_rows, widest), fill_id, dtype=torch.long)
        attention_mask = torch.zeros((n_rows, widest), dtype=torch.long)

        # ... and right-align every prompt, which leaves the padding on the left.
        for row, prompt in enumerate(prompts):
            length = prompt.shape[0]
            input_ids[row, -length:] = prompt
            attention_mask[row, -length:] = 1

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "references": references,
        }

    return collate_fn

def evaluate_model(model, tokenizer, eval_dataset, system_prompt,
                   batch_size=64, max_allowed_length=24, schema="en"):
    """Generate greedy completions for ``eval_dataset`` and score them with BLEU and ROUGE.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        tokenizer: Tokenizer used to build prompts and decode generations.
        eval_dataset: Dataset whose rows match ``schema`` (see ``make_collate_fn``).
        system_prompt: System-turn text for every prompt.
        batch_size: Number of prompts generated per step.
        max_allowed_length: ``max_new_tokens`` passed to ``generate``.
        schema: ``"en"`` or ``"id"``; selects the row columns.

    Returns:
        dict with ``bleu`` (sacreBLEU corpus score), ``rouge1``/``rouge2``/``rougeL``
        (mean F-measure x 100), ``predictions`` (decoded strings) and ``references``
        (one single-item list per prediction).

    Raises:
        ValueError: If the dataset yields no samples.
    """
    model.eval()
    all_predictions = []
    all_references_text = []

    dataloader = DataLoader(
        eval_dataset,
        num_workers=8,  # drop to 0 while debugging so collate errors surface in the main process
        batch_size=batch_size,
        collate_fn=make_collate_fn(tokenizer, system_prompt, schema),
        shuffle=False,
        pin_memory=True,
    )

    for batch in tqdm(dataloader):
        input_ids = batch["input_ids"].to(DEVICE)          # shape: (B, L)
        attention_mask = batch["attention_mask"].to(DEVICE)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_allowed_length,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=False,
                use_cache=True,
            )

        # Prompts are left-padded to a common width L, so new tokens start at column L in every row.
        prompt_len = input_ids.shape[1]
        all_predictions.extend(
            tokenizer.decode(row[prompt_len:], skip_special_tokens=True)
            for row in outputs
        )
        all_references_text.extend(batch["references"])

    if not all_predictions:
        raise ValueError("eval_dataset produced no samples to evaluate")

    # ====== BLEU ======
    # sacreBLEU takes a list of reference *streams*, each aligned one-to-one with the
    # hypotheses. With a single reference per sample that is ONE stream holding all
    # references. Passing one single-item list per sample would be read as N streams of
    # length 1, i.e. only the first hypothesis scored against every reference.
    final_bleu = BLEU().corpus_score(all_predictions, [all_references_text])

    # ====== ROUGE-1, ROUGE-2, ROUGE-L ======
    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"],
        use_stemmer=True,
    )

    rouge1_list, rouge2_list, rougeL_list = [], [], []

    for pred, ref in zip(all_predictions, all_references_text):
        scores = scorer.score(ref, pred)
        rouge1_list.append(scores["rouge1"].fmeasure)
        rouge2_list.append(scores["rouge2"].fmeasure)
        rougeL_list.append(scores["rougeL"].fmeasure)

    # Scale by 100 so ROUGE is on the same percentage scale as BLEU.
    rouge1 = float(np.mean(rouge1_list) * 100)
    rouge2 = float(np.mean(rouge2_list) * 100)
    rougeL = float(np.mean(rougeL_list) * 100)

    return {
        "bleu": final_bleu.score,
        "rouge1": rouge1,
        "rouge2": rouge2,
        "rougeL": rougeL,
        "predictions": all_predictions,
        # Kept in the per-sample nested shape callers already receive.
        "references": [[ref] for ref in all_references_text],
    }


def evaluate_model_multiple_runs(
    model,
    tokenizer,
    eval_dataset,
    system_prompt,
    n_runs=3,
    batch_size=128,
    max_allowed_length=24,
    schema="en",
):
    """Run ``evaluate_model`` ``n_runs`` times and report mean/std of each metric.

    Run ``i`` (0-based) is seeded with ``set_seed(42 + i)``. After every run the running
    BLEU mean/std and ROUGE means are printed.

    Returns:
        dict with ``<metric>_mean`` and ``<metric>_std`` (population std via ``np.std``)
        for ``bleu``, ``rouge1``, ``rouge2`` and ``rougeL``.
    """
    metric_names = ("bleu", "rouge1", "rouge2", "rougeL")
    collected = {name: [] for name in metric_names}

    for run_idx in range(n_runs):
        set_seed(42 + run_idx)
        run_result = evaluate_model(
            model,
            tokenizer,
            eval_dataset,
            system_prompt=system_prompt,
            batch_size=batch_size,
            max_allowed_length=max_allowed_length,
            schema=schema,
        )
        for name in metric_names:
            collected[name].append(run_result[name])

        # Running statistics over the runs finished so far.
        print(
            f"Run {run_idx + 1}/{n_runs} - "
            f"BLEU: {np.mean(collected['bleu']):.4f} ± {np.std(collected['bleu']):.4f} | "
            f"R1: {np.mean(collected['rouge1']):.4f} | "
            f"R2: {np.mean(collected['rouge2']):.4f} | "
            f"RL: {np.mean(collected['rougeL']):.4f}"
        )

    summary = {}
    for name in metric_names:
        summary[f"{name}_mean"] = float(np.mean(collected[name]))
        summary[f"{name}_std"] = float(np.std(collected[name]))
    return summary

# Adapter directory evaluated in this cell (presumably the uniform 0.5/0.5 soup
# baseline, judging by the folder name — confirm).
MODEL_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora_soup_base_05_05"
print(MODEL_DIR)

# Load model + tokenizer from the adapter directory with 4-bit loading enabled.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_DIR,
    max_seq_length=MAX_SEQ_LEN,
    dtype=None,
    load_in_4bit=True,
)

# Fall back to EOS as the pad token when none is defined, and mirror the id on the model config.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

tokenizer.padding_side = "left"

model.eval()
FastLanguageModel.for_inference(model)

# Score the English test split.
results_en_detail = evaluate_model(
    model,
    tokenizer,
    eval_en_clean["test"],
    system_prompt=SYSTEM_PROMPT_EN,
    batch_size=90,
    max_allowed_length=MAX_NEW_TOKENS,
    schema="en",
)

# Score the Indonesian test split with the same batch size and generation budget.
results_id_detail = evaluate_model(
    model,
    tokenizer,
    eval_id_clean["test"],
    system_prompt=SYSTEM_PROMPT_ID,
    batch_size=90,
    max_allowed_length=MAX_NEW_TOKENS,
    schema="id",
)

# Report BLEU and ROUGE for both languages.
print("\nFinal Results (test split):")
print(
    f"English (eval_en_clean['test']): "
    f"BLEU = {results_en_detail['bleu']:.4f}, "
    f"ROUGE-1 = {results_en_detail['rouge1']:.4f}, "
    f"ROUGE-2 = {results_en_detail['rouge2']:.4f}, "
    f"ROUGE-L = {results_en_detail['rougeL']:.4f}"
)
print(
    f"Indonesian (eval_id_clean['test']): "
    f"BLEU = {results_id_detail['bleu']:.4f}, "
    f"ROUGE-1 = {results_id_detail['rouge1']:.4f}, "
    f"ROUGE-2 = {results_id_detail['rouge2']:.4f}, "
    f"ROUGE-L = {results_id_detail['rougeL']:.4f}"
)

## LoRA-Soups (GA)

### Search for the best weight

In [None]:
# Sizes of the three contiguous slices (start, middle, end) taken from each test split.
N_HEAD = 10
N_MID = 30
N_TAIL = 10
N = N_HEAD + N_MID + N_TAIL

# NOTE(review): these samples come from the *test* splits and are later used to search
# for alpha; confirm that tuning on test data is intended.
ds_id = eval_id_clean["test"]
ds_en = eval_en_clean["test"]

n_total = len(ds_id)
# Both splits must have the same length because the same indices are applied to each.
assert len(ds_en) == n_total, "Panjang split ID dan EN harus sama"
# NOTE(review): the message hard-codes 50 while N is derived from the constants above.
assert n_total >= N, "Dataset terlalu kecil untuk ambil 50 sampel"

head_indices = list(range(0, N_HEAD))

# Centre the middle window inside the split.
mid_start = (n_total - N_MID) // 2
mid_indices = list(range(mid_start, mid_start + N_MID))

tail_indices = list(range(n_total - N_TAIL, n_total))

indices = head_indices + mid_indices + tail_indices

test_id_samples = ds_id.select(indices)
test_en_samples = ds_en.select(indices)

display(test_id_samples)
display(test_en_samples)

In [None]:
from safetensors.torch import load_file, save_file
import shutil

# Source adapters: LORA1_DIR is the "lora-id" folder, LORA2_DIR the "lora-en" folder.
LORA1_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora-id"
LORA2_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora-en"

# Scratch adapter folder that the alpha search overwrites for every candidate.
LORA_SOUP_TMP_DIR = "/content/drive/MyDrive/College/KK/Final Project/model/lora_soup_ga_tmp"

In [None]:
# Tensor dtypes that are blended when building a soup; any other dtype is copied
# unchanged from the first adapter (see write_soup_state_dict).
FLOAT_DTYPES = {
    torch.float16,
    torch.bfloat16,
    torch.float32,
    torch.float64,
}


def load_two_lora_state_dicts(lora_dir_1, lora_dir_2):
    """Read ``adapter_model.safetensors`` from two LoRA adapter folders.

    Returns:
        Tuple ``(sd1, sd2)`` of the two loaded state dicts, in argument order.

    Raises:
        ValueError: If the two state dicts do not have exactly the same key set
            (the adapters are expected to share base model and LoRA config).
    """
    weights_file = "adapter_model.safetensors"
    first_sd, second_sd = (
        load_file(os.path.join(adapter_dir, weights_file))
        for adapter_dir in (lora_dir_1, lora_dir_2)
    )

    if set(first_sd.keys()) != set(second_sd.keys()):
        raise ValueError("Key di adapter LoRA 1 dan 2 tidak sama. "
                         "Pastikan keduanya dilatih dari base + config yang sama.")

    return first_sd, second_sd


def prepare_soup_dir(template_dir, soup_dir):
    """Create ``soup_dir`` as a fresh full copy of ``template_dir``.

    Any existing ``soup_dir`` is removed first. Later steps only overwrite
    ``adapter_model.safetensors`` inside the copy.
    """
    stale_copy_present = os.path.exists(soup_dir)
    if stale_copy_present:
        shutil.rmtree(soup_dir)

    shutil.copytree(src=template_dir, dst=soup_dir)

    message = f"[INFO] Folder soup dibuat dari template: {template_dir} -> {soup_dir}"
    print(message)


def write_soup_state_dict(sd1, sd2, alpha, out_dir):
    """Blend two adapter state dicts and save the result into ``out_dir``.

    Floating-point tensors are mixed as ``alpha * sd1[k] + (1 - alpha) * sd2[k]``;
    entries of any other dtype are taken from ``sd1`` unchanged. The output file is
    ``<out_dir>/adapter_model.safetensors``.

    Raises:
        ValueError: If ``alpha`` is not within ``[0, 1]``.
    """
    mix_first = float(alpha)
    if not (0.0 <= mix_first <= 1.0):
        raise ValueError("alpha harus di antara 0 dan 1")
    mix_second = 1.0 - mix_first

    def blend(name):
        # Both entries are looked up up front, exactly like the key-by-key walk over sd1.
        first, second = sd1[name], sd2[name]
        if isinstance(first, torch.Tensor) and first.dtype in FLOAT_DTYPES:
            return mix_first * first + mix_second * second
        return first

    soup_state = {name: blend(name) for name in sd1.keys()}

    destination = os.path.join(out_dir, "adapter_model.safetensors")
    save_file(soup_state, destination)
    print(f"[DEBUG] Soup weights disimpan ke {destination}")

In [None]:
# Sanity check: load both adapters (raises if their key sets differ) and create the
# scratch soup folder as a copy of the first adapter's folder.
lora_sd1, lora_sd2 = load_two_lora_state_dicts(LORA1_DIR, LORA2_DIR)
prepare_soup_dir(LORA1_DIR, LORA_SOUP_TMP_DIR)

In [None]:
from sacrebleu.metrics import BLEU
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
import random
import transformers
import os
from unsloth import FastLanguageModel
from rouge_score import rouge_scorer

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def set_seed(seed):
    """Seed Python ``random``, NumPy, PyTorch (plus all CUDA devices when available) and transformers."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.manual_seed_all(seed)

    transformers.set_seed(seed)

# System prompt used for the Indonesian evaluation set.
SYSTEM_PROMPT_ID = (
    "Anda adalah asisten AI yang membantu pengguna dalam bahasa Indonesia. "
)

# System prompt used for the English evaluation set.
SYSTEM_PROMPT_EN = (
    "You are a helpful AI assistant that communicates in English. "
)

# Token budget: the prompt is capped at MAX_SEQ_LEN - MAX_NEW_TOKENS so that
# prompt + generated tokens stay within MAX_SEQ_LEN.
MAX_SEQ_LEN      = 1024
MAX_NEW_TOKENS   = 128
MAX_INPUT_LENGTH = MAX_SEQ_LEN - MAX_NEW_TOKENS

def make_collate_fn(tokenizer, system_prompt, schema):
    """Build a DataLoader ``collate_fn`` that turns raw rows into a left-padded prompt batch.

    Args:
        tokenizer: Object exposing ``apply_chat_template`` and ``pad_token_id``.
        system_prompt: Text placed in the system turn of every prompt.
        schema: ``"id"`` (rows carry ``input``/``output``) or ``"en"`` (rows carry
            ``question``/``response``).

    Returns:
        A function mapping a list of rows to a dict with ``input_ids`` and
        ``attention_mask`` (``torch.long``, shape ``(batch, longest_prompt)``) plus
        ``references`` (reference strings in batch order).

    Raises:
        ValueError: Immediately if ``schema`` is unknown, or when the returned
            function runs while ``tokenizer.pad_token_id`` is ``None``.
    """
    # Resolve the column names once so a bad schema fails here, not inside a DataLoader worker.
    schema_columns = {
        "id": ("input", "output"),       # ID rows: input/output
        "en": ("question", "response"),  # EN rows: question/response
    }
    if schema not in schema_columns:
        raise ValueError(f"Unknown schema: {schema}")
    input_key, reference_key = schema_columns[schema]

    def collate_fn(batch):
        # Checked at call time so a pad token assigned after the factory call is still honoured.
        pad_id = tokenizer.pad_token_id
        if pad_id is None:
            raise ValueError(
                "tokenizer.pad_token_id is None; set a pad token "
                "(e.g. tokenizer.pad_token = tokenizer.eos_token) before batching."
            )

        input_id_list = []
        references = []

        for item in batch:
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user",   "content": item[input_key]},
            ]

            # 1) Tokenize each sample on its own, WITHOUT batch padding.
            # NOTE(review): truncation follows the tokenizer's configured side; if that is the
            # right side, over-long inputs would lose the trailing generation prompt — confirm.
            ids = tokenizer.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt",
                truncation=True,
                max_length=MAX_INPUT_LENGTH,
            )[0]  # shape: (seq_len,)

            input_id_list.append(ids)
            references.append(item[reference_key])

        # 2) Longest prompt in the batch decides the padded width.
        max_len = max(t.shape[0] for t in input_id_list)
        batch_size = len(input_id_list)

        # 3) Pre-fill with pad tokens / zeros, then right-align each prompt (manual LEFT padding).
        input_ids = torch.full(
            (batch_size, max_len),
            pad_id,
            dtype=torch.long,
        )
        attention_mask = torch.zeros(
            (batch_size, max_len),
            dtype=torch.long,
        )

        for i, ids in enumerate(input_id_list):
            seq_len = ids.shape[0]
            input_ids[i, -seq_len:] = ids
            attention_mask[i, -seq_len:] = 1

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "references": references,
        }

    return collate_fn

def evaluate_model(model, tokenizer, eval_dataset, system_prompt,
                   batch_size=64, max_allowed_length=24, schema="en"):
    """Generate greedy completions for ``eval_dataset`` and score them with BLEU and ROUGE.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        tokenizer: Tokenizer used to build prompts and decode generations.
        eval_dataset: Dataset whose rows match ``schema`` (see ``make_collate_fn``).
        system_prompt: System-turn text for every prompt.
        batch_size: Number of prompts generated per step.
        max_allowed_length: ``max_new_tokens`` passed to ``generate``.
        schema: ``"en"`` or ``"id"``; selects the row columns.

    Returns:
        dict with ``bleu`` (sacreBLEU corpus score), ``rouge1``/``rouge2``/``rougeL``
        (mean F-measure x 100), ``predictions`` (decoded strings) and ``references``
        (one single-item list per prediction).

    Raises:
        ValueError: If the dataset yields no samples.
    """
    model.eval()
    all_predictions = []
    all_references_text = []

    dataloader = DataLoader(
        eval_dataset,
        num_workers=8,  # drop to 0 while debugging so collate errors surface in the main process
        batch_size=batch_size,
        collate_fn=make_collate_fn(tokenizer, system_prompt, schema),
        shuffle=False,
        pin_memory=True,
    )

    for batch in tqdm(dataloader):
        input_ids = batch["input_ids"].to(DEVICE)          # shape: (B, L)
        attention_mask = batch["attention_mask"].to(DEVICE)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_allowed_length,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=False,
                use_cache=True,
            )

        # Prompts are left-padded to a common width L, so new tokens start at column L in every row.
        prompt_len = input_ids.shape[1]
        all_predictions.extend(
            tokenizer.decode(row[prompt_len:], skip_special_tokens=True)
            for row in outputs
        )
        all_references_text.extend(batch["references"])

    if not all_predictions:
        raise ValueError("eval_dataset produced no samples to evaluate")

    # ====== BLEU ======
    # sacreBLEU takes a list of reference *streams*, each aligned one-to-one with the
    # hypotheses. With a single reference per sample that is ONE stream holding all
    # references. Passing one single-item list per sample would be read as N streams of
    # length 1, i.e. only the first hypothesis scored against every reference.
    final_bleu = BLEU().corpus_score(all_predictions, [all_references_text])

    # ====== ROUGE-1, ROUGE-2, ROUGE-L ======
    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"],
        use_stemmer=True,
    )

    rouge1_list, rouge2_list, rougeL_list = [], [], []

    for pred, ref in zip(all_predictions, all_references_text):
        scores = scorer.score(ref, pred)
        rouge1_list.append(scores["rouge1"].fmeasure)
        rouge2_list.append(scores["rouge2"].fmeasure)
        rougeL_list.append(scores["rougeL"].fmeasure)

    # Scale by 100 so ROUGE is on the same percentage scale as BLEU.
    rouge1 = float(np.mean(rouge1_list) * 100)
    rouge2 = float(np.mean(rouge2_list) * 100)
    rougeL = float(np.mean(rougeL_list) * 100)

    return {
        "bleu": final_bleu.score,
        "rouge1": rouge1,
        "rouge2": rouge2,
        "rougeL": rougeL,
        "predictions": all_predictions,
        # Kept in the per-sample nested shape callers already receive.
        "references": [[ref] for ref in all_references_text],
    }


def evaluate_model_multiple_runs(
    model,
    tokenizer,
    eval_dataset,
    system_prompt,
    n_runs=3,
    batch_size=128,
    max_allowed_length=24,
    schema="en",
):
    """Run ``evaluate_model`` ``n_runs`` times and report mean/std of each metric.

    Run ``i`` (0-based) is seeded with ``set_seed(42 + i)``. After every run the running
    BLEU mean/std and ROUGE means are printed.

    Returns:
        dict with ``<metric>_mean`` and ``<metric>_std`` (population std via ``np.std``)
        for ``bleu``, ``rouge1``, ``rouge2`` and ``rougeL``.
    """
    metric_names = ("bleu", "rouge1", "rouge2", "rougeL")
    collected = {name: [] for name in metric_names}

    for run_idx in range(n_runs):
        set_seed(42 + run_idx)
        run_result = evaluate_model(
            model,
            tokenizer,
            eval_dataset,
            system_prompt=system_prompt,
            batch_size=batch_size,
            max_allowed_length=max_allowed_length,
            schema=schema,
        )
        for name in metric_names:
            collected[name].append(run_result[name])

        # Running statistics over the runs finished so far.
        print(
            f"Run {run_idx + 1}/{n_runs} - "
            f"BLEU: {np.mean(collected['bleu']):.4f} ± {np.std(collected['bleu']):.4f} | "
            f"R1: {np.mean(collected['rouge1']):.4f} | "
            f"R2: {np.mean(collected['rouge2']):.4f} | "
            f"RL: {np.mean(collected['rougeL']):.4f}"
        )

    summary = {}
    for name in metric_names:
        summary[f"{name}_mean"] = float(np.mean(collected[name]))
        summary[f"{name}_std"] = float(np.std(collected[name]))
    return summary

def evaluate_alpha(alpha, eval_en_dataset, eval_id_dataset, batch_size=75, verbose=True, weight_id=0.5):
    """Score one mixing coefficient by building, loading and evaluating the soup adapter.

    The soup is written with weights ``w1 = alpha`` (first adapter) and
    ``w2 = 1 - alpha`` (second adapter) into ``LORA_SOUP_TMP_DIR``, loaded in 4-bit,
    and evaluated on both datasets.

    Args:
        alpha: Mixing coefficient in ``[0, 1]``.
        eval_en_dataset: English evaluation dataset (``question``/``response`` rows).
        eval_id_dataset: Indonesian evaluation dataset (``input``/``output`` rows).
        batch_size: Generation batch size for both evaluations.
        verbose: When true, print the BLEU scores and fitness.
        weight_id: Weight of the Indonesian BLEU in the fitness.

    Returns:
        Tuple ``(fitness, bleu_en, bleu_id)`` with
        ``fitness = (1 - weight_id) * bleu_en + weight_id * bleu_id``.
    """
    import gc  # function-scope import: only the cleanup below needs it

    # 1. Write the blended adapter weights into LORA_SOUP_TMP_DIR.
    write_soup_state_dict(lora_sd1, lora_sd2, alpha, LORA_SOUP_TMP_DIR)

    # 2. Release cached GPU blocks before loading another model.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # 3. Load model + tokenizer from the soup adapter directory.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=LORA_SOUP_TMP_DIR,
        max_seq_length=MAX_SEQ_LEN,
        dtype=None,
        load_in_4bit=True,
    )

    try:
        # Same pad-token and padding-side setup as the stand-alone evaluation cells.
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.pad_token_id
        tokenizer.padding_side = "left"

        model.to(DEVICE)
        model.eval()
        FastLanguageModel.for_inference(model)

        results_en = evaluate_model(
            model,
            tokenizer,
            eval_en_dataset,
            system_prompt=SYSTEM_PROMPT_EN,
            batch_size=batch_size,
            max_allowed_length=MAX_NEW_TOKENS,
            schema="en",
        )

        results_id = evaluate_model(
            model,
            tokenizer,
            eval_id_dataset,
            system_prompt=SYSTEM_PROMPT_ID,
            batch_size=batch_size,
            max_allowed_length=MAX_NEW_TOKENS,
            schema="id",
        )

        bleu_en = results_en["bleu"]
        bleu_id = results_id["bleu"]
    finally:
        # Drop the references and collect cycles first so empty_cache() can actually hand
        # the memory back; this also runs when evaluation raised, so a failed candidate
        # does not leave its model on the GPU.
        del model, tokenizer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    fitness = (1.0 - weight_id) * bleu_en + weight_id * bleu_id

    if verbose:
        print(
            f"[alpha={alpha:.4f}] "
            f"BLEU_EN={bleu_en:.4f}, BLEU_ID={bleu_id:.4f}, "
            f"Fitness={fitness:.4f}"
        )

    return fitness, bleu_en, bleu_id

In [None]:
# Quick test: evaluate the midpoint alpha=0.5 on the sampled subsets.
fit_05, bleu_en_05, bleu_id_05 = evaluate_alpha(
    alpha=0.5,
    eval_en_dataset=test_en_samples,
    eval_id_dataset=test_id_samples,
)
print("Fitness alpha=0.5:", fit_05)

In [None]:
# GA Hyperparameters
POP_SIZE       = 8     # number of individuals per generation
N_GENERATIONS  = 6     # number of generations
ELITE_FRAC     = 0.25  # fraction of the population carried over unchanged as elites
MUTATION_STD   = 0.10  # standard deviation of the Gaussian mutation noise

CROSSOVER_RATE  = 0.9  # probability that a child is produced by crossover
MUTATION_RATE   = 0.1  # probability that a child is mutated
TOURNAMENT_SIZE = 3    # number of candidates drawn per tournament selection


def init_population(pop_size):
    """Return ``pop_size`` alphas drawn uniformly from NumPy's global RNG as a float32 array."""
    uniform_draws = np.random.rand(pop_size)
    return uniform_draws.astype(np.float32)


def mutate_alpha(alpha, sigma=MUTATION_STD):
    """Add zero-mean Gaussian noise (std ``sigma``) to ``alpha`` and clamp the result to [0, 1].

    Returns the mutated value as a Python float.
    """
    noise = np.random.normal(0.0, sigma)
    clamped = np.clip(alpha + noise, 0.0, 1.0)
    return float(clamped)


def crossover_alpha(alpha1, alpha2):
    """Arithmetic crossover: return a random convex combination of the two parents.

    A single mixing factor is drawn from NumPy's global RNG; the child is returned
    as a Python float.
    """
    mix = np.random.rand()
    return float(mix * alpha1 + (1.0 - mix) * alpha2)


def tournament_select(population, fitnesses, k=TOURNAMENT_SIZE):
    """Pick the fittest of ``k`` distinct, randomly drawn individuals.

    ``k`` is capped at the population size. ``fitnesses`` is indexed with an index
    array, so it is expected to be a NumPy array. Returns the winner's alpha as a float.
    """
    pool_size = len(population)
    contestants = np.random.choice(pool_size, size=min(k, pool_size), replace=False)
    winner = contestants[np.argmax(fitnesses[contestants])]
    return float(population[winner])


def reproduce_offspring(population, fitnesses):
    """Produce one child alpha.

    Steps:
      - select two parents by tournament selection
      - with probability CROSSOVER_RATE combine them; otherwise copy one parent at random
      - with probability MUTATION_RATE mutate the child
    """
    parent_a = tournament_select(population, fitnesses)
    parent_b = tournament_select(population, fitnesses)

    # Crossover, or a coin flip between the parents when crossover is skipped.
    skip_crossover = not (np.random.rand() < CROSSOVER_RATE)
    if skip_crossover:
        offspring = parent_a if np.random.rand() < 0.5 else parent_b
    else:
        offspring = crossover_alpha(parent_a, parent_b)

    # Mutation.
    mutate_now = np.random.rand() < MUTATION_RATE
    if mutate_now:
        offspring = mutate_alpha(offspring)

    return float(offspring)


def run_ga_for_lora_soup(
    eval_en_dataset,
    eval_id_dataset,
    pop_size=POP_SIZE,
    n_generations=N_GENERATIONS,
    elite_frac=ELITE_FRAC,
    batch_size=80,
    weight_id=0.5,
    seed=123,
):
    """Search for the best soup mixing coefficient alpha with a genetic algorithm.

    Args:
        eval_en_dataset: English dataset passed to ``evaluate_alpha``.
        eval_id_dataset: Indonesian dataset passed to ``evaluate_alpha``.
        pop_size: Individuals per generation (>= 1).
        n_generations: Number of generations (>= 1).
        elite_frac: Fraction of the population copied unchanged into the next
            generation (at least one individual). A value >= 1 copies the whole
            population, so no offspring are produced.
        batch_size: Generation batch size forwarded to ``evaluate_alpha``.
        weight_id: Weight of the Indonesian BLEU in the fitness (0.5 = EN and ID
            equally important).
        seed: Seed applied via ``set_seed`` before the initial population is drawn.

    Returns:
        ``(best_alpha, best_fitness, history)`` where ``history`` has one entry per
        generation with the generation's best alpha/fitness and per-individual stats.

    Raises:
        ValueError: If ``pop_size`` or ``n_generations`` is smaller than 1.
    """
    if pop_size < 1 or n_generations < 1:
        raise ValueError("pop_size and n_generations must both be >= 1")

    set_seed(seed)  # keeps the search reasonably reproducible

    population = init_population(pop_size)
    best_alpha = None
    best_fitness = -1e9
    history = []

    # alpha -> (fitness, bleu_en, bleu_id). Every evaluation reloads a model, and elites
    # are carried over unchanged, so re-scoring them each generation is pure overhead.
    evaluated = {}

    for gen in range(n_generations):
        print(f"\n=== Generasi {gen + 1}/{n_generations}")
        fitnesses = []
        per_individual_stats = []

        # 1. Evaluate every individual (previously seen alphas come from the cache).
        for idx, alpha in enumerate(population):
            print(f"  Individu {idx + 1}/{len(population)}: alpha={alpha:.4f}")
            alpha_key = float(alpha)
            if alpha_key in evaluated:
                print("    [cache] reusing earlier evaluation of this alpha")
            else:
                evaluated[alpha_key] = evaluate_alpha(
                    alpha=alpha,
                    eval_en_dataset=eval_en_dataset,
                    eval_id_dataset=eval_id_dataset,
                    batch_size=batch_size,
                    verbose=True,
                    weight_id=weight_id,
                )
            fitness, bleu_en, bleu_id = evaluated[alpha_key]
            fitnesses.append(fitness)
            per_individual_stats.append({
                "alpha": alpha_key,
                "fitness": float(fitness),
                "bleu_en": float(bleu_en),
                "bleu_id": float(bleu_id),
            })

        fitnesses = np.array(fitnesses, dtype=np.float32)

        # 2. Best individual of this generation, and the global best so far.
        gen_best_idx = int(fitnesses.argmax())
        gen_best_alpha = float(population[gen_best_idx])
        gen_best_fitness = float(fitnesses[gen_best_idx])

        if gen_best_fitness > best_fitness:
            best_fitness = gen_best_fitness
            best_alpha = gen_best_alpha

        print(
            f" >> Best generasi {gen + 1}: alpha={gen_best_alpha:.4f}, "
            f"fitness={gen_best_fitness:.4f}"
        )
        print(
            f" >> Global best sejauh ini: alpha={best_alpha:.4f}, "
            f"fitness={best_fitness:.4f}"
        )

        history.append({
            "generation": gen + 1,
            "gen_best_alpha": gen_best_alpha,
            "gen_best_fitness": gen_best_fitness,
            "individuals": per_individual_stats,
        })

        # 3. Elitism: the top individuals survive unchanged.
        n_elite = max(1, int(elite_frac * pop_size))
        elite_indices = fitnesses.argsort()[::-1][:n_elite]
        new_population = [float(population[i]) for i in elite_indices]

        # 4. Fill the remainder with offspring (selection + crossover + mutation).
        while len(new_population) < pop_size:
            child = reproduce_offspring(population, fitnesses)
            new_population.append(child)

        population = np.array(new_population, dtype=np.float32)

    print("\n=== GA selesai")
    print(f"Bobot terbaik (alpha) = {best_alpha:.4f}")
    print(f" -> w_LoRA1 = {best_alpha:.4f}")
    print(f" -> w_LoRA2 = {1.0 - best_alpha:.4f}")
    print(f"Fitness BLEU gabungan terbaik = {best_fitness:.4f}")

    return best_alpha, best_fitness, history

In [None]:
import psutil, os, time

# Handle to the current process, used for resident-memory (RSS) readings.
process = psutil.Process(os.getpid())

# Reset the peak-memory counter so the peak reported below covers only this run.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()

# Baselines taken right before the search starts.
# NOTE(review): the torch.cuda.* readings here and below are not guarded by is_available().
start_gpu_mem = torch.cuda.memory_allocated()
start_ram = process.memory_info().rss
start_time = time.perf_counter()

best_alpha, best_fitness, ga_history = run_ga_for_lora_soup(
    eval_en_dataset=test_en_samples,
    eval_id_dataset=test_id_samples,
    pop_size=8,
    n_generations=6,
    elite_frac=0.25,
)

end_time = time.perf_counter()
end_ram = process.memory_info().rss
end_gpu_mem = torch.cuda.memory_allocated()
peak_gpu_mem = torch.cuda.max_memory_allocated()

elapsed = end_time - start_time

# Final GA result: best alpha, the implied adapter weights, fitness and wall-clock time.
print("\n=== HASIL AKHIR GA ===")
print(f"alpha terbaik    : {best_alpha:.4f}")
print(f"w_LoRA1 (adapter1): {best_alpha:.4f}")
print(f"w_LoRA2 (adapter2): {1.0 - best_alpha:.4f}")
print(f"fitness (BLEU mix): {best_fitness:.4f}")
print(f"Total waktu GA     : {elapsed:.2f} detik")

# GPU memory (bytes converted to MB): before, after, and peak during the run.
print("\n=== MEMORI GPU ===")
print(f"Memori awal  : {start_gpu_mem / (1024**2):.2f} MB")
print(f"Memori akhir : {end_gpu_mem / (1024**2):.2f} MB")
print(f"Peak usage   : {peak_gpu_mem / (1024**2):.2f} MB")

# Process RSS (bytes converted to MB): before, after, and the difference.
print("\n=== MEMORI RAM ===")
print(f"RAM awal  : {start_ram / (1024**2):.2f} MB")
print(f"RAM akhir : {end_ram / (1024**2):.2f} MB")
print(f"Δ RAM     : {(end_ram - start_ram) / (1024**2):.2f} MB")

In [None]:
import psutil, os, time

# Second GA run with a smaller population and fewer generations. It rebinds
# best_alpha / best_fitness / ga_history produced by the previous run cell.
process = psutil.Process(os.getpid())

# Reset the peak-memory counter so the peak reported below covers only this run.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()

# Baselines taken right before the search starts.
# NOTE(review): the torch.cuda.* readings here and below are not guarded by is_available().
start_gpu_mem = torch.cuda.memory_allocated()
start_ram = process.memory_info().rss
start_time = time.perf_counter()

# NOTE(review): with elite_frac=1.0 the elite count equals pop_size, so
# run_ga_for_lora_soup never breeds offspring and every generation contains the
# initial six alphas — confirm this is the intended setup.
best_alpha, best_fitness, ga_history = run_ga_for_lora_soup(
    eval_en_dataset=test_en_samples,
    eval_id_dataset=test_id_samples,
    pop_size=6,
    n_generations=4,
    elite_frac=1.0,
)

end_time = time.perf_counter()
end_ram = process.memory_info().rss
end_gpu_mem = torch.cuda.memory_allocated()
peak_gpu_mem = torch.cuda.max_memory_allocated()

elapsed = end_time - start_time

# Final GA result: best alpha, the implied adapter weights, fitness and wall-clock time.
print("\n=== HASIL AKHIR GA ===")
print(f"alpha terbaik    : {best_alpha:.4f}")
print(f"w_LoRA1 (adapter1): {best_alpha:.4f}")
print(f"w_LoRA2 (adapter2): {1.0 - best_alpha:.4f}")
print(f"fitness (BLEU mix): {best_fitness:.4f}")
print(f"Total waktu GA     : {elapsed:.2f} detik")

# GPU memory (bytes converted to MB): before, after, and peak during the run.
print("\n=== MEMORI GPU ===")
print(f"Memori awal  : {start_gpu_mem / (1024**2):.2f} MB")
print(f"Memori akhir : {end_gpu_mem / (1024**2):.2f} MB")
print(f"Peak usage   : {peak_gpu_mem / (1024**2):.2f} MB")

# Process RSS (bytes converted to MB): before, after, and the difference.
print("\n=== MEMORI RAM ===")
print(f"RAM awal  : {start_ram / (1024**2):.2f} MB")
print(f"RAM akhir : {end_ram / (1024**2):.2f} MB")
print(f"Δ RAM     : {(end_ram - start_ram) / (1024**2):.2f} MB")

### Building the adapter

In [None]:
# Configuration
# NOTE(review): BASE_MODEL and LOAD_IN_4BIT are not referenced by the cells visible
# around this one — confirm where they are consumed.
BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct-unsloth-bnb-4bit"

# Source adapters, in the order their weights appear in CAT_WEIGHTS.
LORA1_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora-id"
LORA2_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora-en"

# Output folder for the final soup adapter.
LORA_SOUP_DIR = "/content/drive/MyDrive/College/KK/Final Project/model/lora_soup_ga_1"

# Soup weights for [LORA1_DIR, LORA2_DIR]; presumably alpha and 1 - alpha copied by
# hand from a GA run — confirm against the run output.
CAT_WEIGHTS = [0.5513, 0.4487]

LOAD_IN_4BIT = True

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

In [None]:
def show_adapter_cfg(path):
    """Print ``r``, ``lora_alpha`` and ``target_modules`` from ``<path>/adapter_config.json``.

    ``r`` must be present in the config (``KeyError`` otherwise); the other two
    fields print ``None`` when missing.
    """
    config_file = os.path.join(path, "adapter_config.json")
    with open(config_file, "r") as handle:
        adapter_cfg = json.load(handle)

    print("===", path, "===")
    print("r           :", adapter_cfg["r"])
    print("lora_alpha  :", adapter_cfg.get("lora_alpha"))
    print("target_modules:", adapter_cfg.get("target_modules"))

# Print the key LoRA settings of both source adapters so they can be compared by eye.
show_adapter_cfg(LORA1_DIR)
show_adapter_cfg(LORA2_DIR)

In [None]:
import os, json, shutil
from typing import List, Optional

import torch
from safetensors.torch import load_file, save_file

def set_seed(seed: int):
    """Seed Python ``random``, NumPy and PyTorch (plus all CUDA devices when available)."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.manual_seed_all(seed)


def build_lora_soup(
    lora_paths: List[str],
    weights: Optional[List[float]],
    out_path: str,
    copy_tokenizer: bool = True,
):
    """Merge several LoRA adapters into one "LoRA-Soups" adapter by weighted averaging.

    Every floating-point tensor in the adapters' state dicts is replaced by the
    weighted average across adapters; other tensors are taken from the first adapter.
    The first adapter's ``adapter_config.json`` is reused (rank and ``lora_alpha``
    unchanged).

    Args:
        lora_paths: Source adapter folders (at least two).
        weights: Soup weights, normalised to sum to 1; ``None`` means uniform.
        out_path: Output folder for the new adapter (created if missing).
        copy_tokenizer: Also copy tokenizer/template files from the first adapter.

    Raises:
        AssertionError: If fewer than two adapters are given, the number of weights
            does not match, or the adapters' configs / state-dict keys disagree.
        ValueError: If the weights contain NaN/inf or sum to zero.
    """
    assert len(lora_paths) >= 2, "Minimal 2 LoRA untuk LoRA-Soups"

    # No weights given -> uniform soup.
    if weights is None:
        w = torch.ones(len(lora_paths), dtype=torch.float32)
    else:
        assert len(weights) == len(lora_paths), "Panjang weights harus = jumlah LoRA"
        w = torch.tensor(weights, dtype=torch.float32)

    # Reject weights that cannot be normalised before touching the disk: NaN/inf entries
    # or a zero sum would otherwise silently write an adapter full of NaN/inf.
    total = w.sum()
    if not bool(torch.isfinite(w).all()) or float(total) == 0.0:
        raise ValueError("weights must be finite and must not sum to zero")
    w = w / total

    os.makedirs(out_path, exist_ok=True)

    # 1. Read the config of the first adapter; it is the reference for all others.
    cfg_path0 = os.path.join(lora_paths[0], "adapter_config.json")
    with open(cfg_path0, "r") as f:
        cfg0 = json.load(f)

    base_r      = cfg0["r"]
    base_alpha  = cfg0.get("lora_alpha", None)
    base_target = set(cfg0.get("target_modules", []))

    # Check that the remaining adapters are consistent with the first one.
    for p in lora_paths[1:]:
        with open(os.path.join(p, "adapter_config.json"), "r") as f:
            cfgi = json.load(f)
        assert cfgi["r"] == base_r, f"Rank LoRA beda di {p}"
        if "lora_alpha" in cfgi and base_alpha is not None:
            assert cfgi["lora_alpha"] == base_alpha, f"lora_alpha beda di {p}"
        if "target_modules" in cfgi and base_target:
            assert set(cfgi["target_modules"]) == base_target, f"target_modules beda di {p}"

    # 2. Config of the soup: rank and lora_alpha stay the same, only weights are averaged.
    cfg_soup = cfg0.copy()
    if "target_modules" in cfg_soup:
        cfg_soup["target_modules"] = sorted(cfg_soup["target_modules"])

    with open(os.path.join(out_path, "adapter_config.json"), "w") as f:
        json.dump(cfg_soup, f, indent=2)

    # 3. Load every adapter's state dict.
    state_dicts = [load_file(os.path.join(p, "adapter_model.safetensors")) for p in lora_paths]

    # All adapters must expose exactly the same keys.
    keys0 = set(state_dicts[0].keys())
    for i, sd in enumerate(state_dicts[1:], start=1):
        ki = set(sd.keys())
        assert keys0 == ki, f"Kumpulan key state_dict LoRA ke-{i} beda; cek adapter: {lora_paths[i]}"

    # 4. Build the new state dict as a weighted average.
    new_state = {}
    float_dtypes = {
        torch.float16,
        torch.bfloat16,
        torch.float32,
        torch.float64,
    }

    for key in state_dicts[0].keys():
        tensor0 = state_dicts[0][key]

        if isinstance(tensor0, torch.Tensor) and tensor0.dtype in float_dtypes:
            # Accumulate in at least float32 so half-precision adapters are rounded once
            # at the end instead of after every term, then restore the original dtype.
            acc_dtype = torch.promote_types(tensor0.dtype, torch.float32)
            acc = torch.zeros_like(tensor0, dtype=acc_dtype)
            for wi, sd in zip(w, state_dicts):
                acc = acc + wi.to(acc_dtype) * sd[key].to(acc_dtype)
            new_state[key] = acc.to(tensor0.dtype).contiguous()
        else:
            # Non-float entries (e.g. integer buffers) come from the first adapter only.
            new_state[key] = tensor0

    # 5. Save the merged weights.
    save_file(new_state, os.path.join(out_path, "adapter_model.safetensors"))

    # 6. Optionally copy tokenizer and template files from the first adapter.
    if copy_tokenizer:
        extra_files = [
            "tokenizer.json",
            "tokenizer_config.json",
            "special_tokens_map.json",
            "chat_template.jinja",
            "README.md",
        ]
        for fname in extra_files:
            src = os.path.join(lora_paths[0], fname)
            if os.path.exists(src):
                shutil.copy(src, os.path.join(out_path, fname))

    print(f"[OK] LoRA-Soups disimpan di: {out_path}")

In [None]:
# Build the final soup adapter from the two source adapters with the configured
# weights, copying tokenizer/template files from the first adapter.
build_lora_soup(
    lora_paths=[LORA1_DIR, LORA2_DIR],
    weights=CAT_WEIGHTS,
    out_path=LORA_SOUP_DIR,
    copy_tokenizer=True,
)

### Evaluation

In [None]:
from sacrebleu.metrics import BLEU
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
import random
import transformers
import os
from unsloth import FastLanguageModel
from rouge_score import rouge_scorer

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def set_seed(seed):
    """Seed Python ``random``, NumPy, PyTorch (plus all CUDA devices when available) and transformers."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.manual_seed_all(seed)

    transformers.set_seed(seed)

# One system prompt per language.
SYSTEM_PROMPT_ID = (
    "Anda adalah asisten AI yang membantu pengguna dalam bahasa Indonesia. "
)

SYSTEM_PROMPT_EN = (
    "You are a helpful AI assistant that communicates in English. "
)

MAX_SEQ_LEN      = 1024   # same value as passed to FastLanguageModel.from_pretrained
MAX_NEW_TOKENS   = 128
MAX_INPUT_LENGTH = MAX_SEQ_LEN - MAX_NEW_TOKENS  # keeps prompt + generation <= 1024

def make_collate_fn(tokenizer, system_prompt, schema):
    """Build a DataLoader ``collate_fn`` that turns raw samples into left-padded prompts.

    Args:
        tokenizer: tokenizer providing ``apply_chat_template`` and ``pad_token_id``.
        system_prompt: text used as the system message of every prompt.
        schema: ``"id"`` (samples carry ``input``/``output``) or ``"en"``
            (samples carry ``question``/``response``).

    Returns:
        A function mapping a list of samples to a dict with ``input_ids`` and
        ``attention_mask`` (``torch.long``, one row per sample, as wide as the
        longest prompt in the batch) and ``references`` (the reference answers).

    Raises:
        ValueError: raised by the returned function for an unknown ``schema``.
    """

    def _split_item(item):
        # Pick the (prompt, reference) columns that belong to the schema.
        if schema == "id":
            return item["input"], item["output"]
        if schema == "en":
            return item["question"], item["response"]
        raise ValueError(f"Unknown schema: {schema}")

    def collate_fn(batch):
        token_rows, references = [], []

        for item in batch:
            input_text, reference = _split_item(item)

            conversation = [
                {"role": "system", "content": system_prompt},
                {"role": "user",   "content": input_text},
            ]

            # Tokenize each sample on its own (no batch padding here);
            # the leading batch dimension is dropped with [0].
            row = tokenizer.apply_chat_template(
                conversation,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt",
                truncation=True,
                max_length=MAX_INPUT_LENGTH,
            )[0]

            token_rows.append(row)
            references.append(reference)

        # Width of the batch = longest prompt in it.
        widest = max(row.shape[0] for row in token_rows)
        n_rows = len(token_rows)

        # Start from an all-padding / all-masked batch ...
        input_ids = torch.full((n_rows, widest), tokenizer.pad_token_id, dtype=torch.long)
        attention_mask = torch.zeros((n_rows, widest), dtype=torch.long)

        # ... then write every prompt flush right, i.e. manual LEFT padding.
        for row_idx, row in enumerate(token_rows):
            length = row.shape[0]
            input_ids[row_idx, -length:] = row
            attention_mask[row_idx, -length:] = 1

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "references": references,
        }

    return collate_fn

def evaluate_model(model, tokenizer, eval_dataset, system_prompt,
                   batch_size=64, max_allowed_length=24, schema="en"):
    """Generate greedy answers for ``eval_dataset`` and score them with BLEU and ROUGE.

    Args:
        model: causal LM exposing ``generate``.
        tokenizer: tokenizer matching ``model``; its ``pad_token_id`` is used for
            padding and is passed to ``generate``.
        eval_dataset: dataset whose samples follow ``schema``.
        system_prompt: system message prepended to every prompt.
        batch_size: number of prompts generated per step.
        max_allowed_length: ``max_new_tokens`` passed to ``generate``.
        schema: ``"en"`` or ``"id"``; forwarded to ``make_collate_fn``.

    Returns:
        dict with ``bleu`` (sacreBLEU corpus score), ``rouge1`` / ``rouge2`` /
        ``rougeL`` (mean F-measure multiplied by 100), ``predictions`` (decoded
        generations) and ``references`` (one single-item list per sample).
    """
    model.eval()
    all_predictions = []
    all_references = []        # one single-item list per sample (returned to the caller)
    all_references_text = []   # flat list of reference strings (used for scoring)

    collate_fn = make_collate_fn(tokenizer, system_prompt, schema)

    dataloader = DataLoader(
        eval_dataset,
        num_workers=8,              # collation workers; use 0 while debugging so errors surface clearly
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=False,
        pin_memory=True,
    )

    bleu = BLEU()
    pbar = tqdm(dataloader)

    for batch in pbar:
        input_ids = batch["input_ids"].to(DEVICE)          # shape: (B, L)
        attention_mask = batch["attention_mask"].to(DEVICE)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_allowed_length,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=False,
                use_cache=True,
            )

        # Prompts are left-padded to a common width L, so everything after
        # position L in each output row is newly generated text.
        seq_len = input_ids.shape[1]

        predictions = [
            tokenizer.decode(output[seq_len:], skip_special_tokens=True)
            for output in outputs
        ]

        all_predictions.extend(predictions)
        all_references.extend([[ref] for ref in batch["references"]])
        all_references_text.extend(batch["references"])

    # ====== BLEU ======
    # sacreBLEU expects a list of reference *streams*, each stream holding one
    # reference per hypothesis. There is exactly one reference per sample, so
    # that is a single stream. Passing one list per sample (the previous
    # behaviour) is the transposed layout and does not score each prediction
    # against its own reference.
    final_bleu = bleu.corpus_score(all_predictions, [all_references_text])

    # ====== ROUGE-1, ROUGE-2, ROUGE-L ======
    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"],
        use_stemmer=True,
    )

    rouge1_list, rouge2_list, rougeL_list = [], [], []

    for pred, ref in zip(all_predictions, all_references_text):
        scores = scorer.score(ref, pred)
        rouge1_list.append(scores["rouge1"].fmeasure)
        rouge2_list.append(scores["rouge2"].fmeasure)
        rougeL_list.append(scores["rougeL"].fmeasure)

    # Multiply by 100 so ROUGE is on the same percentage scale as BLEU.
    rouge1 = float(np.mean(rouge1_list) * 100)
    rouge2 = float(np.mean(rouge2_list) * 100)
    rougeL = float(np.mean(rougeL_list) * 100)

    return {
        "bleu": final_bleu.score,
        "rouge1": rouge1,
        "rouge2": rouge2,
        "rougeL": rougeL,
        "predictions": all_predictions,
        "references": all_references,
    }


def evaluate_model_multiple_runs(
    model,
    tokenizer,
    eval_dataset,
    system_prompt,
    n_runs=3,
    batch_size=128,
    max_allowed_length=24,
    schema="en",
):
    """Run ``evaluate_model`` ``n_runs`` times and summarise the metrics.

    Run ``k`` (0-based) is seeded with ``42 + k``. After every run the running
    mean of each metric (and the running std of BLEU) is printed.

    Returns:
        dict with ``<metric>_mean`` and ``<metric>_std`` for ``bleu``,
        ``rouge1``, ``rouge2`` and ``rougeL``.
    """
    metric_names = ("bleu", "rouge1", "rouge2", "rougeL")
    per_run = {name: [] for name in metric_names}

    for run_idx in range(n_runs):
        set_seed(42 + run_idx)
        outcome = evaluate_model(
            model,
            tokenizer,
            eval_dataset,
            system_prompt=system_prompt,
            batch_size=batch_size,
            max_allowed_length=max_allowed_length,
            schema=schema,
        )
        for name in metric_names:
            per_run[name].append(outcome[name])

        # Progress line: statistics over the runs finished so far.
        print(
            f"Run {run_idx + 1}/{n_runs} - "
            f"BLEU: {np.mean(per_run['bleu']):.4f} ± {np.std(per_run['bleu']):.4f} | "
            f"R1: {np.mean(per_run['rouge1']):.4f} | "
            f"R2: {np.mean(per_run['rouge2']):.4f} | "
            f"RL: {np.mean(per_run['rougeL']):.4f}"
        )

    summary = {}
    for name in metric_names:
        summary[f"{name}_mean"] = float(np.mean(per_run[name]))
        summary[f"{name}_std"] = float(np.std(per_run[name]))
    return summary

# Adapter directory to evaluate.
# NOTE(review): confirm this is the directory the soup adapter was actually written to.
MODEL_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora_soup_ga_1"
print(MODEL_DIR)

# Load model + tokenizer from MODEL_DIR with 4-bit loading enabled.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_DIR,
    max_seq_length=MAX_SEQ_LEN,
    dtype=None,
    load_in_4bit=True,
)

# Fall back to the EOS token when the tokenizer defines no pad token,
# and keep the model config in sync with the tokenizer.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Matches the manual left padding done in make_collate_fn.
tokenizer.padding_side = "left"

model.eval()
FastLanguageModel.for_inference(model)

# Score the English test split.
results_en_detail = evaluate_model(
    model,
    tokenizer,
    eval_en_clean["test"],
    system_prompt=SYSTEM_PROMPT_EN,
    batch_size=90,
    max_allowed_length=MAX_NEW_TOKENS,
    schema="en",
)

# Score the Indonesian test split.
results_id_detail = evaluate_model(
    model,
    tokenizer,
    eval_id_clean["test"],
    system_prompt=SYSTEM_PROMPT_ID,
    batch_size=90,
    max_allowed_length=MAX_NEW_TOKENS,
    schema="id",
)

# Report BLEU and ROUGE for both languages.
print("\nFinal Results (test split):")
print(
    f"English (eval_en_clean['test']): "
    f"BLEU = {results_en_detail['bleu']:.4f}, "
    f"ROUGE-1 = {results_en_detail['rouge1']:.4f}, "
    f"ROUGE-2 = {results_en_detail['rouge2']:.4f}, "
    f"ROUGE-L = {results_en_detail['rougeL']:.4f}"
)
print(
    f"Indonesian (eval_id_clean['test']): "
    f"BLEU = {results_id_detail['bleu']:.4f}, "
    f"ROUGE-1 = {results_id_detail['rouge1']:.4f}, "
    f"ROUGE-2 = {results_id_detail['rouge2']:.4f}, "
    f"ROUGE-L = {results_id_detail['rougeL']:.4f}"
)

## LoRA-Soups (CMA-ES)

### Search for the best weight

In [None]:
# Size of the slices taken from the start, the middle and the end of the test split.
N_HEAD = 10
N_MID = 30
N_TAIL = 10
N = N_HEAD + N_MID + N_TAIL

# NOTE(review): the weight search below is run on samples drawn from the "test"
# splits; confirm that tuning on test data is intended.
ds_id = eval_id_clean["test"]
ds_en = eval_en_clean["test"]

n_total = len(ds_id)
assert len(ds_en) == n_total, "Panjang split ID dan EN harus sama"
# NOTE(review): the message hard-codes "50" while the check itself uses N.
assert n_total >= N, "Dataset terlalu kecil untuk ambil 50 sampel"

# First N_HEAD rows.
head_indices = list(range(0, N_HEAD))

# N_MID consecutive rows centred in the split.
mid_start = (n_total - N_MID) // 2
mid_indices = list(range(mid_start, mid_start + N_MID))

# Last N_TAIL rows.
tail_indices = list(range(n_total - N_TAIL, n_total))

indices = head_indices + mid_indices + tail_indices

# The same row positions are selected from both language splits.
test_id_samples = ds_id.select(indices)
test_en_samples = ds_en.select(indices)

display(test_id_samples)
display(test_en_samples)

In [None]:
from safetensors.torch import load_file, save_file
import shutil

# Source adapters that are blended (directory names suggest Indonesian and English adapters).
LORA1_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora-id"
LORA2_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora-en"

# Scratch adapter directory whose weights are overwritten for every evaluated alpha.
LORA_SOUP_TMP_DIR = "/content/drive/MyDrive/College/KK/Final Project/model/lora_soup_cma_tmp"

In [None]:
# Tensor dtypes that write_soup_state_dict interpolates; tensors of any other
# dtype are copied unchanged from the first adapter.
FLOAT_DTYPES = {
    torch.float16,
    torch.bfloat16,
    torch.float32,
    torch.float64,
}


def load_two_lora_state_dicts(lora_dir_1, lora_dir_2):
    """
    Load the ``adapter_model.safetensors`` state dicts of two LoRA adapters.

    Both adapters must expose exactly the same parameter names (i.e. they were
    trained from the same base model with the same LoRA config).

    Returns:
        ``(sd1, sd2)``: the state dicts of the first and second adapter.

    Raises:
        ValueError: if the two adapters do not have identical key sets.
    """
    weights_file = "adapter_model.safetensors"
    sd1, sd2 = (
        load_file(os.path.join(adapter_dir, weights_file))
        for adapter_dir in (lora_dir_1, lora_dir_2)
    )

    if set(sd1) != set(sd2):
        raise ValueError("Key di adapter LoRA 1 dan 2 tidak sama. "
                         "Pastikan keduanya dilatih dari base + config yang sama.")

    return sd1, sd2


def prepare_soup_dir(template_dir, soup_dir):
    """
    (Re)create ``soup_dir`` as a full copy of ``template_dir``.

    Any existing ``soup_dir`` is deleted first. Afterwards only
    ``adapter_model.safetensors`` inside ``soup_dir`` needs to be overwritten
    whenever new soup weights are produced.
    """
    soup_already_there = os.path.exists(soup_dir)
    if soup_already_there:
        # Drop the stale copy so no file from a previous run survives.
        shutil.rmtree(soup_dir)

    shutil.copytree(src=template_dir, dst=soup_dir)
    print(f"[INFO] Folder soup dibuat dari template: {template_dir} -> {soup_dir}")


def write_soup_state_dict(sd1, sd2, alpha, out_dir):
    """
    Blend two LoRA state dicts and save the result as a LoRA-Soup:

        W_soup = alpha * W1 + (1 - alpha) * W2

    Only tensors whose dtype is listed in ``FLOAT_DTYPES`` are blended; every
    other entry is taken unchanged from ``sd1``. The result is written to
    ``adapter_model.safetensors`` inside ``out_dir``.

    Raises:
        ValueError: if ``alpha`` is not within ``[0, 1]``.
    """
    alpha = float(alpha)
    if not (0.0 <= alpha <= 1.0):
        raise ValueError("alpha harus di antara 0 dan 1")

    weight_first, weight_second = alpha, 1.0 - alpha

    def _blend(name):
        # Interpolate float tensors; pass anything else through from sd1.
        first, second = sd1[name], sd2[name]
        if isinstance(first, torch.Tensor) and first.dtype in FLOAT_DTYPES:
            return weight_first * first + weight_second * second
        return first

    blended = {name: _blend(name) for name in sd1.keys()}

    target_file = os.path.join(out_dir, "adapter_model.safetensors")
    save_file(blended, target_file)
    print(f"[DEBUG] Soup weights disimpan ke {target_file}")

In [None]:
# Sanity check: load both adapters (raises if their key sets differ) and
# create the scratch soup directory as a copy of the first adapter.
lora_sd1, lora_sd2 = load_two_lora_state_dicts(LORA1_DIR, LORA2_DIR)
prepare_soup_dir(LORA1_DIR, LORA_SOUP_TMP_DIR)

In [None]:
from sacrebleu.metrics import BLEU
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
import random
import transformers
import os
from unsloth import FastLanguageModel
from rouge_score import rouge_scorer

# Device that evaluation batches are moved to: the GPU when CUDA is available, otherwise the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def set_seed(seed):
    """Seed every random number generator used by the evaluation code.

    Seeds Python's ``random``, NumPy and PyTorch (CPU, plus all CUDA devices
    when CUDA is available) and finally calls ``transformers.set_seed``.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    transformers.set_seed(seed)

# Per-language system prompts
SYSTEM_PROMPT_ID = (
    "Anda adalah asisten AI yang membantu pengguna dalam bahasa Indonesia. "
)

SYSTEM_PROMPT_EN = (
    "You are a helpful AI assistant that communicates in English. "
)

# Token budget: prompts are truncated to MAX_INPUT_LENGTH so that
# prompt + MAX_NEW_TOKENS stays within MAX_SEQ_LEN.
MAX_SEQ_LEN      = 1024
MAX_NEW_TOKENS   = 128
MAX_INPUT_LENGTH = MAX_SEQ_LEN - MAX_NEW_TOKENS

def make_collate_fn(tokenizer, system_prompt, schema):
    """Build a DataLoader ``collate_fn`` that turns raw samples into left-padded prompts.

    Args:
        tokenizer: tokenizer providing ``apply_chat_template`` and ``pad_token_id``.
        system_prompt: text used as the system message of every prompt.
        schema: ``"id"`` (samples carry ``input``/``output``) or ``"en"``
            (samples carry ``question``/``response``).

    Returns:
        A function mapping a list of samples to a dict with ``input_ids`` and
        ``attention_mask`` (``torch.long``, one row per sample, as wide as the
        longest prompt in the batch) and ``references`` (the reference answers).

    Raises:
        ValueError: raised by the returned function for an unknown ``schema``.
    """

    def _split_item(item):
        # Pick the (prompt, reference) columns that belong to the schema.
        if schema == "id":
            return item["input"], item["output"]
        if schema == "en":
            return item["question"], item["response"]
        raise ValueError(f"Unknown schema: {schema}")

    def collate_fn(batch):
        token_rows, references = [], []

        for item in batch:
            input_text, reference = _split_item(item)

            conversation = [
                {"role": "system", "content": system_prompt},
                {"role": "user",   "content": input_text},
            ]

            # Tokenize each sample on its own (no batch padding here);
            # the leading batch dimension is dropped with [0].
            row = tokenizer.apply_chat_template(
                conversation,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt",
                truncation=True,
                max_length=MAX_INPUT_LENGTH,
            )[0]

            token_rows.append(row)
            references.append(reference)

        # Width of the batch = longest prompt in it.
        widest = max(row.shape[0] for row in token_rows)
        n_rows = len(token_rows)

        # Start from an all-padding / all-masked batch ...
        input_ids = torch.full((n_rows, widest), tokenizer.pad_token_id, dtype=torch.long)
        attention_mask = torch.zeros((n_rows, widest), dtype=torch.long)

        # ... then write every prompt flush right, i.e. manual LEFT padding.
        for row_idx, row in enumerate(token_rows):
            length = row.shape[0]
            input_ids[row_idx, -length:] = row
            attention_mask[row_idx, -length:] = 1

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "references": references,
        }

    return collate_fn

def evaluate_model(model, tokenizer, eval_dataset, system_prompt,
                   batch_size=64, max_allowed_length=24, schema="en"):
    """Generate greedy answers for ``eval_dataset`` and score them with BLEU and ROUGE.

    Args:
        model: causal LM exposing ``generate``.
        tokenizer: tokenizer matching ``model``; its ``pad_token_id`` is used for
            padding and is passed to ``generate``.
        eval_dataset: dataset whose samples follow ``schema``.
        system_prompt: system message prepended to every prompt.
        batch_size: number of prompts generated per step.
        max_allowed_length: ``max_new_tokens`` passed to ``generate``.
        schema: ``"en"`` or ``"id"``; forwarded to ``make_collate_fn``.

    Returns:
        dict with ``bleu`` (sacreBLEU corpus score), ``rouge1`` / ``rouge2`` /
        ``rougeL`` (mean F-measure multiplied by 100), ``predictions`` (decoded
        generations) and ``references`` (one single-item list per sample).
    """
    model.eval()
    all_predictions = []
    all_references = []        # one single-item list per sample (returned to the caller)
    all_references_text = []   # flat list of reference strings (used for scoring)

    collate_fn = make_collate_fn(tokenizer, system_prompt, schema)

    dataloader = DataLoader(
        eval_dataset,
        num_workers=8,              # collation workers; use 0 while debugging so errors surface clearly
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=False,
        pin_memory=True,
    )

    bleu = BLEU()
    pbar = tqdm(dataloader)

    for batch in pbar:
        input_ids = batch["input_ids"].to(DEVICE)          # shape: (B, L)
        attention_mask = batch["attention_mask"].to(DEVICE)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_allowed_length,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=False,
                use_cache=True,
            )

        # Prompts are left-padded to a common width L, so everything after
        # position L in each output row is newly generated text.
        seq_len = input_ids.shape[1]

        predictions = [
            tokenizer.decode(output[seq_len:], skip_special_tokens=True)
            for output in outputs
        ]

        all_predictions.extend(predictions)
        all_references.extend([[ref] for ref in batch["references"]])
        all_references_text.extend(batch["references"])

    # ====== BLEU ======
    # sacreBLEU expects a list of reference *streams*, each stream holding one
    # reference per hypothesis. There is exactly one reference per sample, so
    # that is a single stream. Passing one list per sample (the previous
    # behaviour) is the transposed layout and does not score each prediction
    # against its own reference.
    final_bleu = bleu.corpus_score(all_predictions, [all_references_text])

    # ====== ROUGE-1, ROUGE-2, ROUGE-L ======
    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"],
        use_stemmer=True,
    )

    rouge1_list, rouge2_list, rougeL_list = [], [], []

    for pred, ref in zip(all_predictions, all_references_text):
        scores = scorer.score(ref, pred)
        rouge1_list.append(scores["rouge1"].fmeasure)
        rouge2_list.append(scores["rouge2"].fmeasure)
        rougeL_list.append(scores["rougeL"].fmeasure)

    # Multiply by 100 so ROUGE is on the same percentage scale as BLEU.
    rouge1 = float(np.mean(rouge1_list) * 100)
    rouge2 = float(np.mean(rouge2_list) * 100)
    rougeL = float(np.mean(rougeL_list) * 100)

    return {
        "bleu": final_bleu.score,
        "rouge1": rouge1,
        "rouge2": rouge2,
        "rougeL": rougeL,
        "predictions": all_predictions,
        "references": all_references,
    }


def evaluate_model_multiple_runs(
    model,
    tokenizer,
    eval_dataset,
    system_prompt,
    n_runs=3,
    batch_size=128,
    max_allowed_length=24,
    schema="en",
):
    """Run ``evaluate_model`` ``n_runs`` times and summarise the metrics.

    Run ``k`` (0-based) is seeded with ``42 + k``. After every run the running
    mean of each metric (and the running std of BLEU) is printed.

    Returns:
        dict with ``<metric>_mean`` and ``<metric>_std`` for ``bleu``,
        ``rouge1``, ``rouge2`` and ``rougeL``.
    """
    metric_names = ("bleu", "rouge1", "rouge2", "rougeL")
    per_run = {name: [] for name in metric_names}

    for run_idx in range(n_runs):
        set_seed(42 + run_idx)
        outcome = evaluate_model(
            model,
            tokenizer,
            eval_dataset,
            system_prompt=system_prompt,
            batch_size=batch_size,
            max_allowed_length=max_allowed_length,
            schema=schema,
        )
        for name in metric_names:
            per_run[name].append(outcome[name])

        # Progress line: statistics over the runs finished so far.
        print(
            f"Run {run_idx + 1}/{n_runs} - "
            f"BLEU: {np.mean(per_run['bleu']):.4f} ± {np.std(per_run['bleu']):.4f} | "
            f"R1: {np.mean(per_run['rouge1']):.4f} | "
            f"R2: {np.mean(per_run['rouge2']):.4f} | "
            f"RL: {np.mean(per_run['rougeL']):.4f}"
        )

    summary = {}
    for name in metric_names:
        summary[f"{name}_mean"] = float(np.mean(per_run[name]))
        summary[f"{name}_std"] = float(np.std(per_run[name]))
    return summary

def evaluate_alpha(alpha, eval_en_dataset, eval_id_dataset, batch_size=75, verbose=True, weight_id=0.5):
    """
    Evaluate one mixing coefficient ``alpha`` (w1 = alpha, w2 = 1 - alpha).

    The soup weights are written to ``LORA_SOUP_TMP_DIR``, the model is loaded
    from that directory and scored on both datasets.

        fitness = (1 - weight_id) * BLEU_EN + weight_id * BLEU_ID

    Args:
        alpha: weight of the first adapter; forwarded to ``write_soup_state_dict``.
        eval_en_dataset: English evaluation samples (schema ``"en"``).
        eval_id_dataset: Indonesian evaluation samples (schema ``"id"``).
        batch_size: generation batch size used for both datasets.
        verbose: print a one-line summary of the scores when true.
        weight_id: weight of the Indonesian BLEU in the fitness.

    Returns:
        ``(fitness, bleu_en, bleu_id)``.
    """
    import gc  # local import: only needed for the cleanup step below

    # 1. Write the LoRA-Soup weights to LORA_SOUP_TMP_DIR
    write_soup_state_dict(lora_sd1, lora_sd2, alpha, LORA_SOUP_TMP_DIR)

    # 2. Release cached GPU memory before loading a new model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # 3. Load model + tokenizer from the soup adapter
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=LORA_SOUP_TMP_DIR,
        max_seq_length=MAX_SEQ_LEN,
        dtype=None,
        load_in_4bit=True,
    )

    # This function is called once per candidate during the search, so the
    # model must be released even when evaluation fails; otherwise every
    # failed or finished call would leave a model behind on the GPU.
    try:
        # Same pad token / padding side setup as the standalone evaluation
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.pad_token_id
        tokenizer.padding_side = "left"

        model.to(DEVICE)
        model.eval()
        FastLanguageModel.for_inference(model)

        results_en = evaluate_model(
            model,
            tokenizer,
            eval_en_dataset,
            system_prompt=SYSTEM_PROMPT_EN,
            batch_size=batch_size,
            max_allowed_length=MAX_NEW_TOKENS,
            schema="en",
        )

        results_id = evaluate_model(
            model,
            tokenizer,
            eval_id_dataset,
            system_prompt=SYSTEM_PROMPT_ID,
            batch_size=batch_size,
            max_allowed_length=MAX_NEW_TOKENS,
            schema="id",
        )
    finally:
        # Drop our references, collect reference cycles so the tensors are
        # really freed, then hand the cached blocks back to the driver.
        del model, tokenizer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    bleu_en = results_en["bleu"]
    bleu_id = results_id["bleu"]

    fitness = (1.0 - weight_id) * bleu_en + weight_id * bleu_id

    if verbose:
        print(
            f"[alpha={alpha:.4f}] "
            f"BLEU_EN={bleu_en:.4f}, BLEU_ID={bleu_id:.4f}, "
            f"Fitness={fitness:.4f}"
        )

    return fitness, bleu_en, bleu_id

In [None]:
# Quick test: evaluate the even 50/50 blend on the sampled subsets to check
# that the whole write -> load -> evaluate pipeline runs.
fit_05, bleu_en_05, bleu_id_05 = evaluate_alpha(
    alpha=0.5,
    eval_en_dataset=test_en_samples,
    eval_id_dataset=test_id_samples,
)
print("Fitness alpha=0.5:", fit_05)

In [None]:
!pip install -q cma

In [None]:
import numpy as np
import cma


def cma_objective_alpha_2d(
    z_vec,
    eval_en_dataset,
    eval_id_dataset,
    weight_id=0.5,
    batch_size=75,
    eval_verbose=False,
):
    """
    Two-dimensional objective for CMA-ES.

    z_vec: array-like of length 2 -> [z1, z2].
    z1 is squashed through a sigmoid to obtain alpha in (0, 1); z2 is a dummy
    coordinate that is ignored (it exists only so the search space has n >= 2).

    Returns the negated fitness of ``evaluate_alpha``, so that minimising this
    objective maximises the fitness.
    """
    first_coord = float(np.asarray(z_vec, dtype=float)[0])

    # Sigmoid: map the unbounded search coordinate to alpha in (0, 1)
    alpha = 1.0 / (1.0 + np.exp(-first_coord))

    # Only the combined fitness is needed; the per-language BLEU values are dropped
    fitness, _bleu_en, _bleu_id = evaluate_alpha(
        alpha=alpha,
        eval_en_dataset=eval_en_dataset,
        eval_id_dataset=eval_id_dataset,
        batch_size=batch_size,
        verbose=eval_verbose,
        weight_id=weight_id,
    )

    # CMA-ES minimises, so flip the sign
    return -fitness

def run_cmaes_for_lora_soup(
    eval_en_dataset,
    eval_id_dataset,
    pop_size=8,
    max_iter=20,
    sigma0=1.0,
    z0=(0.0, 0.0),
    weight_id=0.5,
    batch_size=75,
    seed=42,
    eval_verbose=False,
):
    """
    Search for the best LoRA-Soup mixing weight with a 2-D CMA-ES run.

    - The optimiser works on z in R^2, each coordinate bounded to [-8, 8].
    - alpha = sigmoid(z1) in (0, 1); z2 does not influence the objective.
    - w_LoRA1 = alpha, w_LoRA2 = 1 - alpha.

    Returns:
        ``(best_alpha, best_fitness, history)`` where ``history`` holds one
        dict per evaluated candidate (``eval``, ``z1``, ``z2``, ``alpha``,
        ``fitness_neg``, ``fitness_pos``).
    """

    def _to_alpha(z_first):
        # Same sigmoid mapping the objective applies to z1.
        return 1.0 / (1.0 + np.exp(-z_first))

    # Seed the global RNGs
    set_seed(seed)

    cma_options = dict(
        bounds=[-8.0, 8.0],   # limits for z1/z2; after the sigmoid alpha covers almost [0, 1]
        popsize=pop_size,
        maxiter=max_iter,
        verb_disp=1,
        seed=seed,
    )

    es = cma.CMAEvolutionStrategy(list(z0), sigma0, cma_options)

    history = []  # one record per objective evaluation

    while not es.stop():
        # Ask for a population of candidate z vectors
        candidates = es.ask()

        # Evaluate every candidate and log it
        scores = []
        for candidate in candidates:
            neg_fitness = cma_objective_alpha_2d(
                candidate,
                eval_en_dataset=eval_en_dataset,
                eval_id_dataset=eval_id_dataset,
                weight_id=weight_id,
                batch_size=batch_size,
                eval_verbose=eval_verbose,
            )
            scores.append(neg_fitness)

            z1, z2 = float(candidate[0]), float(candidate[1])
            history.append(
                {
                    "eval": len(history),  # running evaluation index
                    "z1": z1,
                    "z2": z2,
                    "alpha": _to_alpha(z1),
                    "fitness_neg": float(neg_fitness),   # value CMA-ES minimises
                    "fitness_pos": float(-neg_fitness),  # original fitness (BLEU mix)
                }
            )

        # Feed the scores back to CMA-ES and print its short status line
        es.tell(candidates, scores)
        es.disp()

    # Best solution reported by CMA-ES
    outcome = es.result
    best_alpha = _to_alpha(float(outcome.xbest[0]))
    best_fitness = float(-outcome.fbest)

    print("\n=== HASIL AKHIR CMA-ES (2D) ===")
    print(f"alpha terbaik     : {best_alpha:.4f}")
    print(f"w_LoRA1 (adapter1): {best_alpha:.4f}")
    print(f"w_LoRA2 (adapter2): {1.0 - best_alpha:.4f}")
    print(f"fitness (BLEU mix): {best_fitness:.4f}")

    return best_alpha, best_fitness, history

In [None]:
import psutil, os, time

# Handle on the current process, used to read its resident memory (RSS).
process = psutil.Process(os.getpid())

# Reset the CUDA peak counter so the peak reported below covers this run only.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()

# Baseline measurements taken right before the search starts.
start_gpu_mem = torch.cuda.memory_allocated()
start_ram = process.memory_info().rss
start_time = time.perf_counter()

best_alpha, best_fitness, cma_history = run_cmaes_for_lora_soup(
    eval_en_dataset=test_en_samples,
    eval_id_dataset=test_id_samples,
    pop_size=8,
    max_iter=6,
    sigma0=0.25,          # can be tuned
    z0=(0.0, 0.0),       # initial alpha ~ 0.5
    weight_id=0.5,
    batch_size=80,
    seed=42,
    eval_verbose=True,
)


# Measurements taken right after the search finishes.
end_time = time.perf_counter()
end_ram = process.memory_info().rss
end_gpu_mem = torch.cuda.memory_allocated()
peak_gpu_mem = torch.cuda.max_memory_allocated()

elapsed = end_time - start_time

# Result summary plus elapsed time.
print("\n=== HASIL AKHIR CMA-ES ===")
print(f"alpha terbaik    : {best_alpha:.4f}")
print(f"w_LoRA1 (adapter1): {best_alpha:.4f}")
print(f"w_LoRA2 (adapter2): {1.0 - best_alpha:.4f}")
print(f"fitness (BLEU mix): {best_fitness:.4f}")
print(f"Total waktu CMA-ES     : {elapsed:.2f} detik")

# Byte counts are converted to MB for display.
print("\n=== MEMORI GPU ===")
print(f"Memori awal  : {start_gpu_mem / (1024**2):.2f} MB")
print(f"Memori akhir : {end_gpu_mem / (1024**2):.2f} MB")
print(f"Peak usage   : {peak_gpu_mem / (1024**2):.2f} MB")

print("\n=== MEMORI RAM ===")
print(f"RAM awal  : {start_ram / (1024**2):.2f} MB")
print(f"RAM akhir : {end_ram / (1024**2):.2f} MB")
print(f"Δ RAM     : {(end_ram - start_ram) / (1024**2):.2f} MB")

In [None]:
import psutil, os, time

# Handle on the current process, used to read its resident memory (RSS).
process = psutil.Process(os.getpid())

# Reset the CUDA peak counter so the peak reported below covers this run only.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()

# Baseline measurements taken right before the search starts.
start_gpu_mem = torch.cuda.memory_allocated()
start_ram = process.memory_info().rss
start_time = time.perf_counter()

# Configuration A: population 6, 4 iterations, sigma0 = 1.0.
best_alpha_A, best_fit_A, hist_A = run_cmaes_for_lora_soup(
    eval_en_dataset=test_en_samples,
    eval_id_dataset=test_id_samples,
    pop_size=6,
    max_iter=4,
    sigma0=1.0,
    z0=(0.0, 0.0),
    weight_id=0.5,
    batch_size=80,
    seed=42,
    eval_verbose=True,
)

# Measurements taken right after the search finishes.
end_time = time.perf_counter()
end_ram = process.memory_info().rss
end_gpu_mem = torch.cuda.memory_allocated()
peak_gpu_mem = torch.cuda.max_memory_allocated()

elapsed = end_time - start_time

# Report THIS run's result (best_alpha_A / best_fit_A). The summary used to
# print best_alpha / best_fitness, i.e. the values left over from a different run.
print("\n=== HASIL AKHIR CMA-ES ===")
print(f"alpha terbaik    : {best_alpha_A:.4f}")
print(f"w_LoRA1 (adapter1): {best_alpha_A:.4f}")
print(f"w_LoRA2 (adapter2): {1.0 - best_alpha_A:.4f}")
print(f"fitness (BLEU mix): {best_fit_A:.4f}")
print(f"Total waktu CMA-ES     : {elapsed:.2f} detik")

# Byte counts are converted to MB for display.
print("\n=== MEMORI GPU ===")
print(f"Memori awal  : {start_gpu_mem / (1024**2):.2f} MB")
print(f"Memori akhir : {end_gpu_mem / (1024**2):.2f} MB")
print(f"Peak usage   : {peak_gpu_mem / (1024**2):.2f} MB")

print("\n=== MEMORI RAM ===")
print(f"RAM awal  : {start_ram / (1024**2):.2f} MB")
print(f"RAM akhir : {end_ram / (1024**2):.2f} MB")
print(f"Δ RAM     : {(end_ram - start_ram) / (1024**2):.2f} MB")

In [None]:
import psutil, os, time

# Handle on the current process, used to read its resident memory (RSS).
process = psutil.Process(os.getpid())

# Reset the CUDA peak counter so the peak reported below covers this run only.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()

# Baseline measurements taken right before the search starts.
start_gpu_mem = torch.cuda.memory_allocated()
start_ram = process.memory_info().rss
start_time = time.perf_counter()

# Configuration C: population 12, 6 iterations, sigma0 = 1.5, seed 44.
best_alpha_C, best_fit_C, hist_C = run_cmaes_for_lora_soup(
    eval_en_dataset=test_en_samples,
    eval_id_dataset=test_id_samples,
    pop_size=12,
    max_iter=6,
    sigma0=1.5,
    z0=(0.0, 0.0),
    weight_id=0.5,
    batch_size=80,
    seed=44,
    eval_verbose=False,
)

# Measurements taken right after the search finishes.
end_time = time.perf_counter()
end_ram = process.memory_info().rss
end_gpu_mem = torch.cuda.memory_allocated()
peak_gpu_mem = torch.cuda.max_memory_allocated()

elapsed = end_time - start_time

# Report THIS run's result (best_alpha_C / best_fit_C). The summary used to
# print best_alpha / best_fitness, i.e. the values left over from a different run.
print("\n=== HASIL AKHIR CMA-ES ===")
print(f"alpha terbaik    : {best_alpha_C:.4f}")
print(f"w_LoRA1 (adapter1): {best_alpha_C:.4f}")
print(f"w_LoRA2 (adapter2): {1.0 - best_alpha_C:.4f}")
print(f"fitness (BLEU mix): {best_fit_C:.4f}")
print(f"Total waktu CMA-ES     : {elapsed:.2f} detik")

# Byte counts are converted to MB for display.
print("\n=== MEMORI GPU ===")
print(f"Memori awal  : {start_gpu_mem / (1024**2):.2f} MB")
print(f"Memori akhir : {end_gpu_mem / (1024**2):.2f} MB")
print(f"Peak usage   : {peak_gpu_mem / (1024**2):.2f} MB")

print("\n=== MEMORI RAM ===")
print(f"RAM awal  : {start_ram / (1024**2):.2f} MB")
print(f"RAM akhir : {end_ram / (1024**2):.2f} MB")
print(f"Δ RAM     : {(end_ram - start_ram) / (1024**2):.2f} MB")

### Building the adapter

In [None]:
# Configuration for building the final soup adapter
BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct-unsloth-bnb-4bit"

# Source adapters to blend
LORA1_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora-id"
LORA2_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora-en"

# Output directory for the blended adapter
LORA_SOUP_DIR = "/content/drive/MyDrive/College/KK/Final Project/model/lora_soup_cmaes_3"

# Mixing weights in the order [LORA1_DIR, LORA2_DIR].
# NOTE(review): presumably (alpha, 1 - alpha) copied by hand from a CMA-ES run above — confirm which run.
CAT_WEIGHTS = [0.4417, 0.5583]

LOAD_IN_4BIT = True

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

In [None]:
def show_adapter_cfg(path):
    """Print the rank, ``lora_alpha`` and target modules stored in
    ``adapter_config.json`` inside the adapter directory ``path``.

    ``r`` is required (``KeyError`` if missing); the other two fields are
    printed as ``None`` when absent.
    """
    config_file = os.path.join(path, "adapter_config.json")
    with open(config_file, "r") as handle:
        cfg = json.load(handle)

    print("===", path, "===")
    rank = cfg["r"]
    print("r           :", rank)
    for label, key in (
        ("lora_alpha  :", "lora_alpha"),
        ("target_modules:", "target_modules"),
    ):
        print(label, cfg.get(key))

# Print both adapter configs so rank / lora_alpha / target_modules can be compared by eye.
show_adapter_cfg(LORA1_DIR)
show_adapter_cfg(LORA2_DIR)

In [None]:
import os, json, shutil
from typing import List, Optional

import torch
from safetensors.torch import load_file, save_file

def set_seed(seed: int):
    """Seed Python's ``random``, NumPy and PyTorch (CPU, plus every CUDA
    device when CUDA is available) with ``seed``."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def build_lora_soup(
    lora_paths: List[str],
    weights: Optional[List[float]],
    out_path: str,
    copy_tokenizer: bool = True,
):
    """
    Merge several LoRA adapters into a single "LoRA-Soups" adapter by taking a
    weighted average (model soup) of every floating-point tensor in their
    state dicts.

    Args:
        lora_paths: folders of the source adapters; each must contain
            ``adapter_config.json`` and ``adapter_model.safetensors``.
        weights: one soup weight per adapter, normalised to sum to 1;
            ``None`` means a uniform average.
        out_path: output folder for the merged adapter (created if missing).
        copy_tokenizer: when True, copy the tokenizer/template files (and
            README.md) that exist in the first adapter folder into ``out_path``.

    Raises:
        AssertionError: fewer than two adapters, a weight-count mismatch,
            non-finite or zero-sum weights, inconsistent ``r`` / ``lora_alpha`` /
            ``target_modules``, or differing tensor keys or shapes.

    Notes:
        Tensors are averaged key by key. NOTE(review): if the state dicts store
        separate low-rank factors (e.g. ``lora_A`` / ``lora_B``), averaging the
        factors is not the same as averaging their products — confirm this is
        the intended merge.
    """
    assert len(lora_paths) >= 2, "Minimal 2 LoRA untuk LoRA-Soups"

    # No weights given -> uniform soup.
    if weights is None:
        w = torch.ones(len(lora_paths), dtype=torch.float32)
    else:
        assert len(weights) == len(lora_paths), "Panjang weights harus = jumlah LoRA"
        w = torch.tensor(weights, dtype=torch.float32)

    # Guard the normalisation: a zero or non-finite sum would silently write
    # NaN/Inf tensors into the merged adapter.
    assert bool(torch.isfinite(w).all()) and float(w.sum()) != 0.0, (
        "weights must be finite and must not sum to zero"
    )

    # Normalise the weights so they sum to 1.
    w = w / w.sum()

    os.makedirs(out_path, exist_ok=True)

    # 1. Read the config of the first adapter; it is the reference for the rest.
    cfg_path0 = os.path.join(lora_paths[0], "adapter_config.json")
    with open(cfg_path0, "r") as f:
        cfg0 = json.load(f)

    base_r      = cfg0["r"]
    base_alpha  = cfg0.get("lora_alpha", None)
    base_target = set(cfg0.get("target_modules", []))

    # Check that every other adapter agrees with the first one.
    for p in lora_paths[1:]:
        with open(os.path.join(p, "adapter_config.json"), "r") as f:
            cfgi = json.load(f)
        assert cfgi["r"] == base_r, f"Rank LoRA beda di {p}"
        if "lora_alpha" in cfgi and base_alpha is not None:
            assert cfgi["lora_alpha"] == base_alpha, f"lora_alpha beda di {p}"
        if "target_modules" in cfgi and base_target:
            assert set(cfgi["target_modules"]) == base_target, f"target_modules beda di {p}"

    # 2. Config of the soup: rank and lora_alpha are unchanged, only the tensor
    #    values are averaged. target_modules is sorted for a stable file.
    cfg_soup = cfg0.copy()
    if "target_modules" in cfg_soup:
        cfg_soup["target_modules"] = sorted(cfg_soup["target_modules"])

    with open(os.path.join(out_path, "adapter_config.json"), "w") as f:
        json.dump(cfg_soup, f, indent=2)

    # 3. Load every adapter's state dict.
    state_dicts = [load_file(os.path.join(p, "adapter_model.safetensors")) for p in lora_paths]

    # All adapters must expose exactly the same tensor names.
    keys0 = set(state_dicts[0].keys())
    for i, sd in enumerate(state_dicts[1:], start=1):
        ki = set(sd.keys())
        assert keys0 == ki, f"Kumpulan key state_dict LoRA ke-{i} beda; cek adapter: {lora_paths[i]}"

    # 4. Build the merged state dict.
    new_state = {}
    float_dtypes = {
        torch.float16,
        torch.bfloat16,
        torch.float32,
        torch.float64,
    }

    for key, tensor0 in state_dicts[0].items():
        if isinstance(tensor0, torch.Tensor) and tensor0.dtype in float_dtypes:
            # Accumulate in at least float32: a 0-dim float32 weight does not
            # promote a half-precision tensor, so summing in fp16/bf16 would
            # round after every step. float64 tensors stay in float64.
            acc_dtype = torch.promote_types(tensor0.dtype, torch.float32)
            acc = torch.zeros_like(tensor0, dtype=acc_dtype)
            for wi, sd in zip(w, state_dicts):
                # Reject shape mismatches instead of letting them broadcast.
                assert sd[key].shape == tensor0.shape, f"Tensor shape mismatch for key {key}"
                acc = acc + wi.to(acc_dtype) * sd[key].to(acc_dtype)
            # Store the result in the first adapter's dtype.
            new_state[key] = acc.to(tensor0.dtype)
        else:
            # Non-float entries (e.g. integer buffers) are taken from the first adapter.
            new_state[key] = tensor0

    # 5. Save the merged weights.
    save_file(new_state, os.path.join(out_path, "adapter_model.safetensors"))

    # 6. Optionally copy tokenizer and template files from the first adapter.
    if copy_tokenizer:
        extra_files = [
            "tokenizer.json",
            "tokenizer_config.json",
            "special_tokens_map.json",
            "chat_template.jinja",
            "README.md",
        ]
        for fname in extra_files:
            src = os.path.join(lora_paths[0], fname)
            if os.path.exists(src):
                shutil.copy(src, os.path.join(out_path, fname))

    print(f"[OK] LoRA-Soups disimpan di: {out_path}")

In [None]:
# Write the merged adapter to LORA_SOUP_DIR; CAT_WEIGHTS[i] is the weight of lora_paths[i].
build_lora_soup(
    lora_paths=[LORA1_DIR, LORA2_DIR],
    weights=CAT_WEIGHTS,
    out_path=LORA_SOUP_DIR,
    copy_tokenizer=True,
)

### Evaluation

In [None]:
from sacrebleu.metrics import BLEU
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
import random
import transformers
import os
from unsloth import FastLanguageModel
from rouge_score import rouge_scorer

# Device that evaluation batches are moved to: GPU when PyTorch sees one, else CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def set_seed(seed):
    """Seed every RNG used during evaluation with ``seed``.

    Covers Python's ``random``, NumPy's global RNG, PyTorch (plus all CUDA
    devices when available) and finally ``transformers.set_seed``.
    """
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    transformers.set_seed(seed)

# System prompt for each language (used as the "system" chat message).
SYSTEM_PROMPT_ID = (
    "Anda adalah asisten AI yang membantu pengguna dalam bahasa Indonesia. "
)

SYSTEM_PROMPT_EN = (
    "You are a helpful AI assistant that communicates in English. "
)

MAX_SEQ_LEN      = 1024   # same value that is passed to FastLanguageModel.from_pretrained
MAX_NEW_TOKENS   = 128    # generation budget per sample
MAX_INPUT_LENGTH = MAX_SEQ_LEN - MAX_NEW_TOKENS  # so that prompt + generation <= 1024 tokens

def make_collate_fn(tokenizer, system_prompt, schema):
    """
    Build a DataLoader ``collate_fn`` that turns raw dataset rows into a
    left-padded batch of chat-formatted prompts.

    Args:
        tokenizer: object providing ``apply_chat_template`` and ``pad_token_id``.
        system_prompt: text placed in the "system" message of every prompt.
        schema: "id" reads the ``input``/``output`` columns, "en" reads
            ``question``/``response``.

    Returns:
        A function mapping a list of rows to a dict with ``input_ids`` and
        ``attention_mask`` (both ``torch.long``, shape ``(batch, longest_prompt)``)
        and ``references`` (reference texts in batch order).

    Raises:
        ValueError: from the returned function, when ``schema`` is neither
            "id" nor "en".
    """
    def collate_fn(batch):
        prompt_tensors = []
        references = []

        for row in batch:
            # Pick the (prompt, reference) columns for this dataset layout.
            if schema == "id":
                input_text, reference = row["input"], row["output"]
            elif schema == "en":
                input_text, reference = row["question"], row["response"]
            else:
                raise ValueError(f"Unknown schema: {schema}")

            chat = [
                {"role": "system", "content": system_prompt},
                {"role": "user",   "content": input_text},
            ]

            # Tokenize one sample at a time, without batch padding.
            # NOTE(review): which end is cut when the prompt exceeds
            # MAX_INPUT_LENGTH depends on tokenizer settings not visible here —
            # confirm the generation prompt survives truncation.
            encoded = tokenizer.apply_chat_template(
                chat,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt",
                truncation=True,
                max_length=MAX_INPUT_LENGTH,
            )
            prompt_tensors.append(encoded[0])  # shape: (seq_len,)
            references.append(reference)

        # Every row is padded up to the longest prompt in the batch.
        longest = max(t.shape[0] for t in prompt_tensors)
        n_rows = len(prompt_tensors)

        # Start from all-pad / all-zero tensors, then fill from the right
        # (manual LEFT padding).
        input_ids = torch.full(
            (n_rows, longest),
            tokenizer.pad_token_id,
            dtype=torch.long,
        )
        attention_mask = torch.zeros((n_rows, longest), dtype=torch.long)

        for row_idx, token_ids in enumerate(prompt_tensors):
            n_tokens = token_ids.shape[0]
            input_ids[row_idx, -n_tokens:] = token_ids
            attention_mask[row_idx, -n_tokens:] = 1

        return dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            references=references,
        )

    return collate_fn

def evaluate_model(model, tokenizer, eval_dataset, system_prompt,
                   batch_size=64, max_allowed_length=24, schema="en"):
    """
    Generate answers for ``eval_dataset`` with greedy decoding and score them
    with corpus BLEU (sacreBLEU) and ROUGE-1/2/L.

    Args:
        model: model exposing ``eval()`` and ``generate()``.
        tokenizer: tokenizer used for prompting and decoding; its
            ``pad_token_id`` is passed to ``generate``.
        eval_dataset: dataset whose rows follow ``schema``.
        system_prompt: system message placed in front of every question.
        batch_size: DataLoader batch size.
        max_allowed_length: ``max_new_tokens`` for generation.
        schema: "en" or "id"; see ``make_collate_fn``.

    Returns:
        dict with ``bleu`` (sacreBLEU corpus score), ``rouge1`` / ``rouge2`` /
        ``rougeL`` (mean F-measure times 100), ``predictions`` (list of str) and
        ``references`` (one single-element list per sample).
    """
    model.eval()
    all_predictions = []
    all_references = []        # per-sample nested list, returned to the caller
    all_references_text = []   # flat list of reference strings (BLEU and ROUGE)

    collate_fn = make_collate_fn(tokenizer, system_prompt, schema)

    dataloader = DataLoader(
        eval_dataset,
        num_workers=8,              # set to 0 while debugging so worker errors surface directly
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=False,
        pin_memory=True,
    )

    bleu = BLEU()

    for batch in tqdm(dataloader):
        input_ids = batch["input_ids"].to(DEVICE)          # shape: (B, L)
        attention_mask = batch["attention_mask"].to(DEVICE)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_allowed_length,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=False,
                use_cache=True,
            )

        # Prompts are left-padded to a common length L, so everything after
        # position L is newly generated text.
        prompt_len = input_ids.shape[1]
        predictions = [
            tokenizer.decode(output[prompt_len:], skip_special_tokens=True)
            for output in outputs
        ]

        all_predictions.extend(predictions)
        all_references.extend([[ref] for ref in batch["references"]])
        all_references_text.extend(batch["references"])

    # ====== BLEU ======
    # sacreBLEU expects a list of reference *streams*, each aligned with the
    # hypotheses. With one reference per sample that is a single stream; the
    # per-sample nesting in `all_references` is the transposed layout and would
    # mis-pair hypotheses and references.
    final_bleu = bleu.corpus_score(all_predictions, [all_references_text])

    # ====== ROUGE-1, ROUGE-2, ROUGE-L ======
    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"],
        use_stemmer=True,
    )

    rouge1_list, rouge2_list, rougeL_list = [], [], []

    for pred, ref in zip(all_predictions, all_references_text):
        scores = scorer.score(ref, pred)
        rouge1_list.append(scores["rouge1"].fmeasure)
        rouge2_list.append(scores["rouge2"].fmeasure)
        rougeL_list.append(scores["rougeL"].fmeasure)

    # Multiply by 100 so ROUGE is on the same percentage scale as BLEU.
    rouge1 = float(np.mean(rouge1_list) * 100)
    rouge2 = float(np.mean(rouge2_list) * 100)
    rougeL = float(np.mean(rougeL_list) * 100)

    return {
        "bleu": final_bleu.score,
        "rouge1": rouge1,
        "rouge2": rouge2,
        "rougeL": rougeL,
        "predictions": all_predictions,
        "references": all_references,
    }


def evaluate_model_multiple_runs(
    model,
    tokenizer,
    eval_dataset,
    system_prompt,
    n_runs=3,
    batch_size=128,
    max_allowed_length=24,
    schema="en",
):
    """
    Run ``evaluate_model`` ``n_runs`` times (seeds 42, 43, ...) and aggregate
    BLEU / ROUGE over the runs.

    After each run a progress line with the running statistics is printed.

    Returns:
        dict with ``<metric>_mean`` and ``<metric>_std`` for ``bleu``,
        ``rouge1``, ``rouge2`` and ``rougeL``.

    NOTE(review): ``evaluate_model`` in this file decodes with
    ``do_sample=False``, so the runs may well be identical — confirm that
    repeated runs are needed.
    """
    tracked = ("bleu", "rouge1", "rouge2", "rougeL")
    per_metric = {name: [] for name in tracked}

    for run_idx in range(n_runs):
        set_seed(42 + run_idx)
        outcome = evaluate_model(
            model,
            tokenizer,
            eval_dataset,
            system_prompt=system_prompt,
            batch_size=batch_size,
            max_allowed_length=max_allowed_length,
            schema=schema,
        )
        for name in tracked:
            per_metric[name].append(outcome[name])

        # Running statistics over the runs completed so far.
        print(
            f"Run {run_idx + 1}/{n_runs} - "
            f"BLEU: {np.mean(per_metric['bleu']):.4f} ± {np.std(per_metric['bleu']):.4f} | "
            f"R1: {np.mean(per_metric['rouge1']):.4f} | "
            f"R2: {np.mean(per_metric['rouge2']):.4f} | "
            f"RL: {np.mean(per_metric['rougeL']):.4f}"
        )

    def avg(name):
        return float(np.mean(per_metric[name]))

    def spread(name):
        return float(np.std(per_metric[name]))

    return {
        "bleu_mean":   avg("bleu"),
        "bleu_std":    spread("bleu"),
        "rouge1_mean": avg("rouge1"),
        "rouge1_std":  spread("rouge1"),
        "rouge2_mean": avg("rouge2"),
        "rouge2_std":  spread("rouge2"),
        "rougeL_mean": avg("rougeL"),
        "rougeL_std":  spread("rougeL"),
    }

# Evaluate the merged CMA-ES soup adapter on the English and Indonesian test splits.
# MODEL_DIR is the same folder that the build cell wrote to (LORA_SOUP_DIR).
MODEL_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora_soup_cmaes_3"
print(MODEL_DIR)

# Load model + tokenizer from the adapter folder, 4-bit, dtype chosen by the loader.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_DIR,
    max_seq_length=MAX_SEQ_LEN,
    dtype=None,
    load_in_4bit=True,
)

# Batched generation needs a pad token; fall back to EOS when none is defined.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Left padding keeps the generated tokens directly after each prompt.
tokenizer.padding_side = "left"

model.eval()
FastLanguageModel.for_inference(model)

# eval_en_clean / eval_id_clean are defined in cells outside this section.
results_en_detail = evaluate_model(
    model,
    tokenizer,
    eval_en_clean["test"],
    system_prompt=SYSTEM_PROMPT_EN,
    batch_size=90,
    max_allowed_length=MAX_NEW_TOKENS,
    schema="en",
)

results_id_detail = evaluate_model(
    model,
    tokenizer,
    eval_id_clean["test"],
    system_prompt=SYSTEM_PROMPT_ID,
    batch_size=90,
    max_allowed_length=MAX_NEW_TOKENS,
    schema="id",
)

# Report BLEU and ROUGE per language.
print("\nFinal Results (test split):")
print(
    f"English (eval_en_clean['test']): "
    f"BLEU = {results_en_detail['bleu']:.4f}, "
    f"ROUGE-1 = {results_en_detail['rouge1']:.4f}, "
    f"ROUGE-2 = {results_en_detail['rouge2']:.4f}, "
    f"ROUGE-L = {results_en_detail['rougeL']:.4f}"
)
print(
    f"Indonesian (eval_id_clean['test']): "
    f"BLEU = {results_id_detail['bleu']:.4f}, "
    f"ROUGE-1 = {results_id_detail['rouge1']:.4f}, "
    f"ROUGE-2 = {results_id_detail['rouge2']:.4f}, "
    f"ROUGE-L = {results_id_detail['rougeL']:.4f}"
)

## LoRA-Soups (GWO)

### Search for the best weight

In [None]:
# Build a small, fixed subset used to score candidate soup weights:
# N_HEAD rows from the start, N_MID from the middle and N_TAIL from the end.
N_HEAD = 10
N_MID = 30
N_TAIL = 10
N = N_HEAD + N_MID + N_TAIL

# NOTE(review): the subset is drawn from the "test" split. If the final soup is
# also reported on that split, the weight search has seen test data — confirm
# this is intended or switch to a validation split.
ds_id = eval_id_clean["test"]
ds_en = eval_en_clean["test"]

# The same row indices are used for both languages, so the splits must be equally long.
n_total = len(ds_id)
assert len(ds_en) == n_total, "Panjang split ID dan EN harus sama"
assert n_total >= N, "Dataset terlalu kecil untuk ambil 50 sampel"

head_indices = list(range(0, N_HEAD))

# Centre the middle window in the dataset.
mid_start = (n_total - N_MID) // 2
mid_indices = list(range(mid_start, mid_start + N_MID))

tail_indices = list(range(n_total - N_TAIL, n_total))

indices = head_indices + mid_indices + tail_indices

test_id_samples = ds_id.select(indices)
test_en_samples = ds_en.select(indices)

display(test_id_samples)
display(test_en_samples)

In [None]:
from safetensors.torch import load_file, save_file
import shutil

# Source adapters blended during the GWO search.
LORA1_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora-id"
LORA2_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora-en"

# Scratch adapter folder; its weights file is overwritten for every candidate alpha.
LORA_SOUP_TMP_DIR = "/content/drive/MyDrive/College/KK/Final Project/model/lora_soup_gwo_tmp"

In [None]:
# Tensor dtypes that are blended by write_soup_state_dict; tensors of any other
# dtype are copied unchanged from the first adapter.
FLOAT_DTYPES = {
    torch.float16,
    torch.bfloat16,
    torch.float32,
    torch.float64,
}


def load_two_lora_state_dicts(lora_dir_1, lora_dir_2):
    """
    Load ``adapter_model.safetensors`` from two LoRA adapter folders.

    Returns:
        ``(sd1, sd2)``: the two state dicts, in argument order.

    Raises:
        ValueError: when the two adapters do not contain the same tensor names
            (they should come from the same base model and LoRA config).
    """
    first, second = [
        load_file(os.path.join(folder, "adapter_model.safetensors"))
        for folder in (lora_dir_1, lora_dir_2)
    ]

    same_keys = set(first.keys()) == set(second.keys())
    if not same_keys:
        raise ValueError("Key di adapter LoRA 1 dan 2 tidak sama. "
                         "Pastikan keduanya dilatih dari base + config yang sama.")

    return first, second


def prepare_soup_dir(template_dir, soup_dir):
    """
    (Re)create ``soup_dir`` as a full copy of ``template_dir`` (e.g. LORA1_DIR).

    An existing ``soup_dir`` is deleted first. After this one-time setup the
    weight search only has to overwrite ``adapter_model.safetensors`` inside
    ``soup_dir`` for each candidate.
    """
    stale_copy_present = os.path.exists(soup_dir)
    if stale_copy_present:
        shutil.rmtree(soup_dir)

    shutil.copytree(template_dir, soup_dir)

    message = f"[INFO] Folder soup dibuat dari template: {template_dir} -> {soup_dir}"
    print(message)


def write_soup_state_dict(sd1, sd2, alpha, out_dir):
    """
    Blend two LoRA state dicts and save the result into ``out_dir``:

        W_soup = alpha * W1 + (1 - alpha) * W2

    Only tensors whose dtype is in ``FLOAT_DTYPES`` are blended; any other entry
    is taken from ``sd1``. The output file is
    ``<out_dir>/adapter_model.safetensors``.

    Raises:
        ValueError: when ``alpha`` is not within [0, 1].
    """
    alpha = float(alpha)
    if not (0.0 <= alpha <= 1.0):
        raise ValueError("alpha harus di antara 0 dan 1")

    weight_first, weight_second = alpha, 1.0 - alpha

    def blend(key):
        # Both dicts are read for every key, so a key missing in sd2 fails loudly.
        from_first = sd1[key]
        from_second = sd2[key]
        if isinstance(from_first, torch.Tensor) and from_first.dtype in FLOAT_DTYPES:
            return weight_first * from_first + weight_second * from_second
        return from_first

    new_state = {key: blend(key) for key in sd1.keys()}

    target_file = os.path.join(out_dir, "adapter_model.safetensors")
    save_file(new_state, target_file)
    print(f"[DEBUG] Soup weights disimpan ke {target_file}")

In [None]:
# Sanity check: load both adapters (raises ValueError if their tensor names differ),
# then (re)create the scratch soup folder as a copy of LORA1_DIR.
lora_sd1, lora_sd2 = load_two_lora_state_dicts(LORA1_DIR, LORA2_DIR)
prepare_soup_dir(LORA1_DIR, LORA_SOUP_TMP_DIR)

In [None]:
from sacrebleu.metrics import BLEU
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
import random
import transformers
import os
from unsloth import FastLanguageModel
from rouge_score import rouge_scorer

# Device that evaluation batches are moved to: GPU when PyTorch sees one, else CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def set_seed(seed):
    """Seed every RNG used during evaluation with ``seed``.

    Covers Python's ``random``, NumPy's global RNG, PyTorch (plus all CUDA
    devices when available) and finally ``transformers.set_seed``.
    """
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    transformers.set_seed(seed)

# System prompt for each language (used as the "system" chat message).
SYSTEM_PROMPT_ID = (
    "Anda adalah asisten AI yang membantu pengguna dalam bahasa Indonesia. "
)

SYSTEM_PROMPT_EN = (
    "You are a helpful AI assistant that communicates in English. "
)

# Token budget: prompts are limited to MAX_INPUT_LENGTH so that prompt plus
# MAX_NEW_TOKENS generated tokens stays within MAX_SEQ_LEN.
MAX_SEQ_LEN      = 1024
MAX_NEW_TOKENS   = 128
MAX_INPUT_LENGTH = MAX_SEQ_LEN - MAX_NEW_TOKENS

def make_collate_fn(tokenizer, system_prompt, schema):
    """
    Build a DataLoader ``collate_fn`` that turns raw dataset rows into a
    left-padded batch of chat-formatted prompts.

    Args:
        tokenizer: object providing ``apply_chat_template`` and ``pad_token_id``.
        system_prompt: text placed in the "system" message of every prompt.
        schema: "id" reads the ``input``/``output`` columns, "en" reads
            ``question``/``response``.

    Returns:
        A function mapping a list of rows to a dict with ``input_ids`` and
        ``attention_mask`` (both ``torch.long``, shape ``(batch, longest_prompt)``)
        and ``references`` (reference texts in batch order).

    Raises:
        ValueError: from the returned function, when ``schema`` is neither
            "id" nor "en".
    """
    def collate_fn(batch):
        prompt_tensors = []
        references = []

        for row in batch:
            # Pick the (prompt, reference) columns for this dataset layout.
            if schema == "id":
                input_text, reference = row["input"], row["output"]
            elif schema == "en":
                input_text, reference = row["question"], row["response"]
            else:
                raise ValueError(f"Unknown schema: {schema}")

            chat = [
                {"role": "system", "content": system_prompt},
                {"role": "user",   "content": input_text},
            ]

            # Tokenize one sample at a time, without batch padding.
            # NOTE(review): which end is cut when the prompt exceeds
            # MAX_INPUT_LENGTH depends on tokenizer settings not visible here —
            # confirm the generation prompt survives truncation.
            encoded = tokenizer.apply_chat_template(
                chat,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt",
                truncation=True,
                max_length=MAX_INPUT_LENGTH,
            )
            prompt_tensors.append(encoded[0])  # shape: (seq_len,)
            references.append(reference)

        # Every row is padded up to the longest prompt in the batch.
        longest = max(t.shape[0] for t in prompt_tensors)
        n_rows = len(prompt_tensors)

        # Start from all-pad / all-zero tensors, then fill from the right
        # (manual LEFT padding).
        input_ids = torch.full(
            (n_rows, longest),
            tokenizer.pad_token_id,
            dtype=torch.long,
        )
        attention_mask = torch.zeros((n_rows, longest), dtype=torch.long)

        for row_idx, token_ids in enumerate(prompt_tensors):
            n_tokens = token_ids.shape[0]
            input_ids[row_idx, -n_tokens:] = token_ids
            attention_mask[row_idx, -n_tokens:] = 1

        return dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            references=references,
        )

    return collate_fn

def evaluate_model(model, tokenizer, eval_dataset, system_prompt,
                   batch_size=64, max_allowed_length=24, schema="en"):
    """
    Generate answers for ``eval_dataset`` with greedy decoding and score them
    with corpus BLEU (sacreBLEU) and ROUGE-1/2/L.

    Args:
        model: model exposing ``eval()`` and ``generate()``.
        tokenizer: tokenizer used for prompting and decoding; its
            ``pad_token_id`` is passed to ``generate``.
        eval_dataset: dataset whose rows follow ``schema``.
        system_prompt: system message placed in front of every question.
        batch_size: DataLoader batch size.
        max_allowed_length: ``max_new_tokens`` for generation.
        schema: "en" or "id"; see ``make_collate_fn``.

    Returns:
        dict with ``bleu`` (sacreBLEU corpus score), ``rouge1`` / ``rouge2`` /
        ``rougeL`` (mean F-measure times 100), ``predictions`` (list of str) and
        ``references`` (one single-element list per sample).
    """
    model.eval()
    all_predictions = []
    all_references = []        # per-sample nested list, returned to the caller
    all_references_text = []   # flat list of reference strings (BLEU and ROUGE)

    collate_fn = make_collate_fn(tokenizer, system_prompt, schema)

    dataloader = DataLoader(
        eval_dataset,
        num_workers=8,              # set to 0 while debugging so worker errors surface directly
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=False,
        pin_memory=True,
    )

    bleu = BLEU()

    for batch in tqdm(dataloader):
        input_ids = batch["input_ids"].to(DEVICE)          # shape: (B, L)
        attention_mask = batch["attention_mask"].to(DEVICE)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_allowed_length,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=False,
                use_cache=True,
            )

        # Prompts are left-padded to a common length L, so everything after
        # position L is newly generated text.
        prompt_len = input_ids.shape[1]
        predictions = [
            tokenizer.decode(output[prompt_len:], skip_special_tokens=True)
            for output in outputs
        ]

        all_predictions.extend(predictions)
        all_references.extend([[ref] for ref in batch["references"]])
        all_references_text.extend(batch["references"])

    # ====== BLEU ======
    # sacreBLEU expects a list of reference *streams*, each aligned with the
    # hypotheses. With one reference per sample that is a single stream; the
    # per-sample nesting in `all_references` is the transposed layout and would
    # mis-pair hypotheses and references.
    final_bleu = bleu.corpus_score(all_predictions, [all_references_text])

    # ====== ROUGE-1, ROUGE-2, ROUGE-L ======
    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"],
        use_stemmer=True,
    )

    rouge1_list, rouge2_list, rougeL_list = [], [], []

    for pred, ref in zip(all_predictions, all_references_text):
        scores = scorer.score(ref, pred)
        rouge1_list.append(scores["rouge1"].fmeasure)
        rouge2_list.append(scores["rouge2"].fmeasure)
        rougeL_list.append(scores["rougeL"].fmeasure)

    # Multiply by 100 so ROUGE is on the same percentage scale as BLEU.
    rouge1 = float(np.mean(rouge1_list) * 100)
    rouge2 = float(np.mean(rouge2_list) * 100)
    rougeL = float(np.mean(rougeL_list) * 100)

    return {
        "bleu": final_bleu.score,
        "rouge1": rouge1,
        "rouge2": rouge2,
        "rougeL": rougeL,
        "predictions": all_predictions,
        "references": all_references,
    }


def evaluate_model_multiple_runs(
    model,
    tokenizer,
    eval_dataset,
    system_prompt,
    n_runs=3,
    batch_size=128,
    max_allowed_length=24,
    schema="en",
):
    """
    Run ``evaluate_model`` ``n_runs`` times (seeds 42, 43, ...) and aggregate
    BLEU / ROUGE over the runs.

    After each run a progress line with the running statistics is printed.

    Returns:
        dict with ``<metric>_mean`` and ``<metric>_std`` for ``bleu``,
        ``rouge1``, ``rouge2`` and ``rougeL``.

    NOTE(review): ``evaluate_model`` in this file decodes with
    ``do_sample=False``, so the runs may well be identical — confirm that
    repeated runs are needed.
    """
    tracked = ("bleu", "rouge1", "rouge2", "rougeL")
    per_metric = {name: [] for name in tracked}

    for run_idx in range(n_runs):
        set_seed(42 + run_idx)
        outcome = evaluate_model(
            model,
            tokenizer,
            eval_dataset,
            system_prompt=system_prompt,
            batch_size=batch_size,
            max_allowed_length=max_allowed_length,
            schema=schema,
        )
        for name in tracked:
            per_metric[name].append(outcome[name])

        # Running statistics over the runs completed so far.
        print(
            f"Run {run_idx + 1}/{n_runs} - "
            f"BLEU: {np.mean(per_metric['bleu']):.4f} ± {np.std(per_metric['bleu']):.4f} | "
            f"R1: {np.mean(per_metric['rouge1']):.4f} | "
            f"R2: {np.mean(per_metric['rouge2']):.4f} | "
            f"RL: {np.mean(per_metric['rougeL']):.4f}"
        )

    def avg(name):
        return float(np.mean(per_metric[name]))

    def spread(name):
        return float(np.std(per_metric[name]))

    return {
        "bleu_mean":   avg("bleu"),
        "bleu_std":    spread("bleu"),
        "rouge1_mean": avg("rouge1"),
        "rouge1_std":  spread("rouge1"),
        "rouge2_mean": avg("rouge2"),
        "rouge2_std":  spread("rouge2"),
        "rougeL_mean": avg("rougeL"),
        "rougeL_std":  spread("rougeL"),
    }

def evaluate_alpha(alpha, eval_en_dataset, eval_id_dataset, batch_size=75, verbose=True, weight_id=0.5):
    """
    Score one soup weight ``alpha`` (w1 = alpha, w2 = 1 - alpha).

    The blended adapter is written to ``LORA_SOUP_TMP_DIR`` from the
    module-level ``lora_sd1`` / ``lora_sd2``, loaded in 4-bit, evaluated on both
    datasets and released again.

    Args:
        alpha: weight of the first adapter, in [0, 1].
        eval_en_dataset: English evaluation rows ("en" schema).
        eval_id_dataset: Indonesian evaluation rows ("id" schema).
        batch_size: generation batch size.
        verbose: print the per-alpha scores when True.
        weight_id: share of the Indonesian BLEU in the fitness.

    Returns:
        ``(fitness, bleu_en, bleu_id)`` where
        ``fitness = (1 - weight_id) * bleu_en + weight_id * bleu_id``.
    """
    import gc  # local import: only needed for the cleanup below

    # 1. Write the candidate LoRA-Soup into LORA_SOUP_TMP_DIR.
    write_soup_state_dict(lora_sd1, lora_sd2, alpha, LORA_SOUP_TMP_DIR)

    # 2. Free cached GPU memory before loading a new model.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    model = tokenizer = None
    try:
        # 3. Load model + tokenizer from the soup adapter folder.
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=LORA_SOUP_TMP_DIR,
            max_seq_length=MAX_SEQ_LEN,
            dtype=None,
            load_in_4bit=True,
        )

        # Same pad-token and padding-side setup as the standalone evaluation.
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = tokenizer.pad_token_id
        tokenizer.padding_side = "left"

        model.to(DEVICE)
        model.eval()
        FastLanguageModel.for_inference(model)

        results_en = evaluate_model(
            model,
            tokenizer,
            eval_en_dataset,
            system_prompt=SYSTEM_PROMPT_EN,
            batch_size=batch_size,
            max_allowed_length=MAX_NEW_TOKENS,
            schema="en",
        )

        results_id = evaluate_model(
            model,
            tokenizer,
            eval_id_dataset,
            system_prompt=SYSTEM_PROMPT_ID,
            batch_size=batch_size,
            max_allowed_length=MAX_NEW_TOKENS,
            schema="id",
        )
    finally:
        # Always release the model, including when loading or evaluation fails:
        # the search calls this function many times, so a leaked model would
        # pile up on the GPU. gc.collect() drops reference cycles before the
        # allocator cache is emptied.
        del model, tokenizer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    bleu_en = results_en["bleu"]
    bleu_id = results_id["bleu"]

    fitness = (1.0 - weight_id) * bleu_en + weight_id * bleu_id

    if verbose:
        print(
            f"[alpha={alpha:.4f}] "
            f"BLEU_EN={bleu_en:.4f}, BLEU_ID={bleu_id:.4f}, "
            f"Fitness={fitness:.4f}"
        )

    return fitness, bleu_en, bleu_id

In [None]:
# Quick test: score the uniform soup (alpha = 0.5) on the search subset to
# check the whole write -> load -> evaluate path before starting the search.
fit_05, bleu_en_05, bleu_id_05 = evaluate_alpha(
    alpha=0.5,
    eval_en_dataset=test_en_samples,
    eval_id_dataset=test_id_samples,
)
print("Fitness alpha=0.5:", fit_05)

In [None]:
import numpy as np

def gwo_optimize_alpha(
    eval_en_dataset,
    eval_id_dataset,
    n_wolves=8,        # number of wolves (search agents)
    max_iters=6,       # number of GWO iterations
    a_max=2.0,         # initial value of 'a' (decays linearly to 0)
    batch_size=80,
    eval_verbose=False,
    seed=42,
    weight_id=0.5,     # share of the Indonesian BLEU in evaluate_alpha
):
    """
    Grey Wolf Optimizer (GWO) searching the soup weight ``alpha`` in [0, 1]
    for a two-adapter LoRA-Soup. Fitness is maximised.

    The three leaders (alpha, beta, delta) are the best three candidates seen
    so far across all iterations, so a good early candidate is never lost and
    the returned alpha is the best one evaluated.

    Args:
        eval_en_dataset, eval_id_dataset: datasets passed to ``evaluate_alpha``.
        n_wolves: population size; must be at least 3.
        max_iters: number of position-update iterations after the initial evaluation.
        a_max: start value of the exploration parameter ``a``.
        batch_size, weight_id: forwarded to ``evaluate_alpha``.
        eval_verbose: print per-candidate and per-iteration progress.
        seed: seed of the NumPy generator driving the search.

    Returns:
        best_alpha   : best alpha found
        best_fitness : its fitness
        history      : list of dicts, one per evaluation
                       (iter, wolf, alpha, fitness, bleu_en, bleu_id)

    Raises:
        ValueError: when ``n_wolves`` is smaller than 3.
    """
    if n_wolves < 3:
        raise ValueError("n_wolves must be at least 3 (alpha, beta and delta leaders)")

    rng = np.random.default_rng(seed)

    # --- 1. Initialise the wolf positions (alpha_i) uniformly in [0, 1] ---
    positions = rng.uniform(low=0.0, high=1.0, size=(n_wolves,))
    fitness = np.zeros(n_wolves, dtype=float)

    history = []

    # Evaluate one position and log the result.
    def eval_position(alpha_value, iter_idx, wolf_idx):
        alpha_value = float(alpha_value)
        fitness_val, bleu_en, bleu_id = evaluate_alpha(
            alpha=alpha_value,
            eval_en_dataset=eval_en_dataset,
            eval_id_dataset=eval_id_dataset,
            batch_size=batch_size,
            verbose=eval_verbose,
            weight_id=weight_id,
        )
        history.append(
            {
                "iter": iter_idx,
                "wolf": wolf_idx,
                "alpha": alpha_value,
                "fitness": float(fitness_val),
                "bleu_en": float(bleu_en),
                "bleu_id": float(bleu_id),
            }
        )
        return fitness_val

    # Top three (position, fitness) pairs among the current population and,
    # when given, the previous leaders.
    def get_leaders(positions, fitness, prev_pos=None, prev_fit=None):
        if prev_pos is None:
            pool_pos, pool_fit = positions, fitness
        else:
            pool_pos = np.concatenate([prev_pos, positions])
            pool_fit = np.concatenate([prev_fit, fitness])
        # Maximisation -> sort descending; stable so ties keep the earlier entry.
        top3 = np.argsort(-pool_fit, kind="stable")[:3]
        return pool_pos[top3].copy(), pool_fit[top3].copy()

    # --- 2. Evaluate the initial population ---
    for i in range(n_wolves):
        fitness[i] = eval_position(positions[i], iter_idx=0, wolf_idx=i)

    # --- 3. Initial leaders ---
    leader_pos, leader_fit = get_leaders(positions, fitness)
    alpha_pos, beta_pos, delta_pos = leader_pos
    alpha_fit = leader_fit[0]

    if eval_verbose:
        print(f"[Iter 00] best_alpha={alpha_pos:.4f}, fitness={alpha_fit:.4f}")

    # --- 4. Main GWO loop ---
    for iter_idx in range(1, max_iters + 1):
        # 'a' decreases linearly from a_max to 0.
        a = a_max - iter_idx * (a_max / max_iters)

        # Move every wolf towards the three leaders.
        for i in range(n_wolves):
            X = positions[i]  # current position (scalar, 1-D search space)

            # Pull towards alpha (leader 1)
            r1, r2 = rng.random(), rng.random()
            A1 = 2 * a * r1 - a
            C1 = 2 * r2
            D_alpha = abs(C1 * alpha_pos - X)
            X1 = alpha_pos - A1 * D_alpha

            # Pull towards beta (leader 2)
            r1, r2 = rng.random(), rng.random()
            A2 = 2 * a * r1 - a
            C2 = 2 * r2
            D_beta = abs(C2 * beta_pos - X)
            X2 = beta_pos - A2 * D_beta

            # Pull towards delta (leader 3)
            r1, r2 = rng.random(), rng.random()
            A3 = 2 * a * r1 - a
            C3 = 2 * r2
            D_delta = abs(C3 * delta_pos - X)
            X3 = delta_pos - A3 * D_delta

            # New position = mean of the three pulls, clamped to [0, 1].
            new_X = (X1 + X2 + X3) / 3.0
            new_X = np.clip(new_X, 0.0, 1.0)

            positions[i] = new_X

        # Re-evaluate every wolf at its new position.
        for i in range(n_wolves):
            fitness[i] = eval_position(positions[i], iter_idx=iter_idx, wolf_idx=i)

        # Merge the new population with the previous leaders (best-so-far).
        leader_pos, leader_fit = get_leaders(positions, fitness, leader_pos, leader_fit)
        alpha_pos, beta_pos, delta_pos = leader_pos
        alpha_fit = leader_fit[0]

        if eval_verbose:
            print(
                f"[Iter {iter_idx:02d}] "
                f"best_alpha={alpha_pos:.4f}, fitness={alpha_fit:.4f}"
            )

    # --- 5. Final result ---
    best_alpha = float(alpha_pos)
    best_fitness = float(alpha_fit)

    return best_alpha, best_fitness, history

In [None]:
import psutil, os, time

# Run the GWO search while measuring wall time, process RAM (RSS) and the GPU
# memory tracked by PyTorch's allocator.
process = psutil.Process(os.getpid())

# Reset the peak counter so "Peak usage" below reflects this search only.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()

# NOTE(review): the memory_allocated / max_memory_allocated calls are outside the
# is_available() guard — confirm this cell is only meant to run with a GPU.
start_gpu_mem = torch.cuda.memory_allocated()
start_ram = process.memory_info().rss
start_time = time.perf_counter()

best_alpha, best_fitness, history = gwo_optimize_alpha(
    eval_en_dataset=test_en_samples,
    eval_id_dataset=test_id_samples,
    n_wolves=8,
    max_iters=6,
    batch_size=80,
    eval_verbose=True,
    weight_id=0.5,
)

end_time = time.perf_counter()
end_ram = process.memory_info().rss
end_gpu_mem = torch.cuda.memory_allocated()
peak_gpu_mem = torch.cuda.max_memory_allocated()

elapsed = end_time - start_time

# best_alpha is the weight of adapter 1; adapter 2 gets the remainder.
print("\n=== HASIL AKHIR GWO ===")
print(f"alpha terbaik     : {best_alpha:.4f}")
print(f"w_LoRA1 (adapter1): {best_alpha:.4f}")
print(f"w_LoRA2 (adapter2): {1.0 - best_alpha:.4f}")
print(f"fitness (BLEU mix): {best_fitness:.4f}")
print(f"Total waktu GWO   : {elapsed:.2f} detik")

# Byte counts are converted to MB (1024**2 bytes).
print("\n=== MEMORI GPU ===")
print(f"Memori awal  : {start_gpu_mem / (1024**2):.2f} MB")
print(f"Memori akhir : {end_gpu_mem / (1024**2):.2f} MB")
print(f"Peak usage   : {peak_gpu_mem / (1024**2):.2f} MB")

print("\n=== MEMORI RAM ===")
print(f"RAM awal  : {start_ram / (1024**2):.2f} MB")
print(f"RAM akhir : {end_ram / (1024**2):.2f} MB")
print(f"Δ RAM     : {(end_ram - start_ram) / (1024**2):.2f} MB")

### Building the adapter

In [None]:
# Configuration for building the final GWO LoRA-Soup adapter.
# NOTE(review): BASE_MODEL and LOAD_IN_4BIT are not referenced by the build cells
# visible in this section — confirm whether they are still needed.
BASE_MODEL = "unsloth/Llama-3.2-1B-Instruct-unsloth-bnb-4bit"

# Source adapters to merge (presumably the Indonesian and English LoRAs, judging
# by the folder names — verify).
LORA1_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora-id"
LORA2_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora-en"

# Output folder of the merged adapter.
LORA_SOUP_DIR = "/content/drive/MyDrive/College/KK/Final Project/model/lora_soup_gwo_1"

# Soup weights, positionally matched to [LORA1_DIR, LORA2_DIR].
# Presumably copied by hand from the GWO search output — TODO confirm.
CAT_WEIGHTS = [0.5632, 0.4368]

LOAD_IN_4BIT = True

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

In [None]:
def show_adapter_cfg(path):
    """
    Print the key LoRA settings stored in ``<path>/adapter_config.json``.

    The rank ``r`` is mandatory (a missing key raises ``KeyError`` after the
    header line has been printed); ``lora_alpha`` and ``target_modules`` are
    optional and print as ``None`` when absent.
    """
    config_file = os.path.join(path, "adapter_config.json")
    with open(config_file, "r") as handle:
        adapter_cfg = json.load(handle)

    print("===", path, "===")
    print("r           :", adapter_cfg["r"])

    # Optional fields share the same "label: value" layout.
    optional_rows = (
        ("lora_alpha  :", "lora_alpha"),
        ("target_modules:", "target_modules"),
    )
    for label, field in optional_rows:
        print(label, adapter_cfg.get(field))

# Print the settings of both source adapters so they can be compared by eye
# before merging.
show_adapter_cfg(LORA1_DIR)
show_adapter_cfg(LORA2_DIR)

In [None]:
import os, json, shutil
from typing import List, Optional

import torch
from safetensors.torch import load_file, save_file

def set_seed(seed: int):
    """
    Seed the Python, NumPy and PyTorch random generators with ``seed``.

    When CUDA is available, every CUDA device generator is seeded as well.
    """
    host_seeders = [random.seed, np.random.seed, torch.manual_seed]
    for seed_fn in host_seeders:
        seed_fn(seed)

    # Nothing more to do on a CPU-only runtime.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


def build_lora_soup(
    lora_paths: List[str],
    weights: Optional[List[float]],
    out_path: str,
    copy_tokenizer: bool = True,
):
    """
    Merge several LoRA adapters into a single "LoRA-Soups" adapter by taking a
    weighted average (model soup) of every floating-point tensor in their
    state dicts.

    Every input is validated before anything is written, so a failed call does
    not leave a half-built adapter behind in ``out_path``.

    Args:
        lora_paths: Directories of the source adapters (at least two). Each
            must contain ``adapter_config.json`` and
            ``adapter_model.safetensors``.
        weights: One mixing weight per adapter; they are normalised to sum to
            1. ``None`` means a uniform average.
        out_path: Output directory of the merged adapter (created if missing).
        copy_tokenizer: When true, copy the tokenizer/template/README files
            that exist in the first adapter directory into ``out_path``.

    Raises:
        AssertionError: Fewer than two adapters, a weights/paths length
            mismatch, differing ``r`` / ``lora_alpha`` / ``target_modules``,
            or differing state-dict key sets.
        ValueError: The weights sum to zero or to a non-finite value, or a
            tensor's shape differs between adapters.
    """
    assert len(lora_paths) >= 2, "Minimal 2 LoRA untuk LoRA-Soups"

    # No weights given -> uniform average.
    if weights is None:
        w = torch.ones(len(lora_paths), dtype=torch.float32)
    else:
        assert len(weights) == len(lora_paths), "Panjang weights harus = jumlah LoRA"
        w = torch.tensor(weights, dtype=torch.float32)

    # Normalise the weights; a zero or non-finite total would silently turn
    # every merged tensor into NaN/inf, so reject it up front.
    total = w.sum()
    if not bool(torch.isfinite(total)) or total.item() == 0.0:
        raise ValueError("Total weights tidak boleh 0 atau NaN/inf")
    w = w / total

    # 1. Read the config of the first adapter; it is the template for the soup.
    cfg_path0 = os.path.join(lora_paths[0], "adapter_config.json")
    with open(cfg_path0, "r") as f:
        cfg0 = json.load(f)

    base_r      = cfg0["r"]
    base_alpha  = cfg0.get("lora_alpha", None)
    base_target = set(cfg0.get("target_modules", []))

    # The remaining adapters must agree on rank, alpha and target modules.
    for p in lora_paths[1:]:
        with open(os.path.join(p, "adapter_config.json"), "r") as f:
            cfgi = json.load(f)
        assert cfgi["r"] == base_r, f"Rank LoRA beda di {p}"
        if "lora_alpha" in cfgi and base_alpha is not None:
            assert cfgi["lora_alpha"] == base_alpha, f"lora_alpha beda di {p}"
        if "target_modules" in cfgi and base_target:
            assert set(cfgi["target_modules"]) == base_target, f"target_modules beda di {p}"

    # 2. Config of the soup: rank r and lora_alpha are unchanged, only the
    #    tensors are averaged. target_modules is sorted for a stable file.
    cfg_soup = cfg0.copy()
    if "target_modules" in cfg_soup:
        cfg_soup["target_modules"] = sorted(cfg_soup["target_modules"])

    # 3. Load every adapter's state dict and require identical key sets.
    state_dicts = [load_file(os.path.join(p, "adapter_model.safetensors")) for p in lora_paths]

    keys0 = set(state_dicts[0].keys())
    for i, sd in enumerate(state_dicts[1:], start=1):
        ki = set(sd.keys())
        assert keys0 == ki, f"Kumpulan key state_dict LoRA ke-{i} beda; cek adapter: {lora_paths[i]}"

    # 4. Build the merged state dict.
    new_state = {}
    float_dtypes = {
        torch.float16,
        torch.bfloat16,
        torch.float32,
        torch.float64,
    }

    for key, tensor0 in state_dicts[0].items():
        if isinstance(tensor0, torch.Tensor) and tensor0.dtype in float_dtypes:
            # Accumulate in float32 (float64 for float64 inputs) and cast back
            # once at the end: a 0-dim float32 weight does not promote a
            # half-precision tensor, so summing in the storage dtype would
            # round after every term.
            acc_dtype = torch.float64 if tensor0.dtype == torch.float64 else torch.float32
            acc = torch.zeros(tensor0.shape, dtype=acc_dtype, device=tensor0.device)
            for wi, sd, path in zip(w, state_dicts, lora_paths):
                tensor_i = sd[key]
                if tensor_i.shape != tensor0.shape:
                    raise ValueError(f"Shape tensor '{key}' beda di {path}")
                acc += wi.to(acc_dtype) * tensor_i.to(acc_dtype)
            new_state[key] = acc.to(tensor0.dtype)
        else:
            # Non-float entries (e.g. integer buffers) are taken from the
            # first adapter unchanged.
            new_state[key] = tensor0

    # 5. Everything validated -> write config and weights.
    os.makedirs(out_path, exist_ok=True)

    with open(os.path.join(out_path, "adapter_config.json"), "w") as f:
        json.dump(cfg_soup, f, indent=2)

    save_file(new_state, os.path.join(out_path, "adapter_model.safetensors"))

    # 6. Optionally copy tokenizer & template files from the first adapter.
    if copy_tokenizer:
        extra_files = [
            "tokenizer.json",
            "tokenizer_config.json",
            "special_tokens_map.json",
            "chat_template.jinja",
            "README.md",
        ]
        for fname in extra_files:
            src = os.path.join(lora_paths[0], fname)
            if os.path.exists(src):
                shutil.copy(src, os.path.join(out_path, fname))

    print(f"[OK] LoRA-Soups disimpan di: {out_path}")

In [None]:
# Merge the two source adapters with the fixed CAT_WEIGHTS and write the
# result (plus the first adapter's tokenizer files) to LORA_SOUP_DIR.
build_lora_soup(
    lora_paths=[LORA1_DIR, LORA2_DIR],
    weights=CAT_WEIGHTS,
    out_path=LORA_SOUP_DIR,
    copy_tokenizer=True,
)

### Evaluation

In [None]:
from sacrebleu.metrics import BLEU
import numpy as np
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
import random
import transformers
import os
from unsloth import FastLanguageModel
from rouge_score import rouge_scorer

# Device used for the evaluation tensors; same expression as the DEVICE
# defined in the adapter-building configuration cell.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def set_seed(seed):
    """
    Seed every random generator used during evaluation.

    Covers Python ``random``, NumPy, PyTorch (CPU, plus all CUDA devices when
    CUDA is available) and finally ``transformers.set_seed``.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    cuda_present = torch.cuda.is_available()
    if cuda_present:
        torch.cuda.manual_seed_all(seed)

    transformers.set_seed(seed)

# System prompt per language
SYSTEM_PROMPT_ID = (
    "Anda adalah asisten AI yang membantu pengguna dalam bahasa Indonesia. "
)

SYSTEM_PROMPT_EN = (
    "You are a helpful AI assistant that communicates in English. "
)

# Token budget: prompt length + generated length must fit in MAX_SEQ_LEN.
MAX_SEQ_LEN      = 1024   # same value that is passed to FastLanguageModel.from_pretrained
MAX_NEW_TOKENS   = 128
MAX_INPUT_LENGTH = MAX_SEQ_LEN - MAX_NEW_TOKENS  # keeps the total <= 1024

def make_collate_fn(tokenizer, system_prompt, schema):
    """
    Build a DataLoader ``collate_fn`` that renders every sample as a chat
    prompt and left-pads the batch by hand.

    ``schema`` selects the dataset columns: ``"id"`` reads ``input`` /
    ``output`` and ``"en"`` reads ``question`` / ``response``. Any other value
    raises ``ValueError`` as soon as a sample is collated.

    The returned function produces a dict with ``input_ids`` and
    ``attention_mask`` (``torch.long``, one row per sample, padded on the left
    with ``tokenizer.pad_token_id``) and ``references`` (the target strings).
    """
    # (schema name, (prompt column, reference column))
    column_pairs = (
        ("id", ("input", "output")),
        ("en", ("question", "response")),
    )

    def collate_fn(batch):
        token_rows, references = [], []

        for sample in batch:
            columns = next(
                (cols for name, cols in column_pairs if schema == name),
                None,
            )
            if columns is None:
                raise ValueError(f"Unknown schema: {schema}")
            prompt_col, target_col = columns
            user_text = sample[prompt_col]
            target_text = sample[target_col]

            chat = [
                {"role": "system", "content": system_prompt},
                {"role": "user",   "content": user_text},
            ]

            # Tokenise one sample at a time, without any batch padding; the
            # template call returns a (1, seq_len) tensor, so keep row 0.
            encoded = tokenizer.apply_chat_template(
                chat,
                tokenize=True,
                add_generation_prompt=True,
                return_tensors="pt",
                truncation=True,
                max_length=MAX_INPUT_LENGTH,
            )
            token_rows.append(encoded[0])
            references.append(target_text)

        # Every row is padded up to the longest prompt in the batch.
        widest = max(row.shape[0] for row in token_rows)
        grid = (len(token_rows), widest)

        input_ids = torch.full(grid, tokenizer.pad_token_id, dtype=torch.long)
        attention_mask = torch.zeros(grid, dtype=torch.long)

        # Left padding: write each prompt into the right-hand end of its row.
        for row_idx, row in enumerate(token_rows):
            n_tokens = row.shape[0]
            tail = slice(-n_tokens, None)
            input_ids[row_idx, tail] = row
            attention_mask[row_idx, tail] = 1

        return dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            references=references,
        )

    return collate_fn

def evaluate_model(model, tokenizer, eval_dataset, system_prompt,
                   batch_size=64, max_allowed_length=24, schema="en",
                   num_workers=8):
    """
    Generate greedy answers for ``eval_dataset`` and score them with corpus
    BLEU and mean ROUGE-1/2/L F-measure.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        tokenizer: Tokenizer used to build the prompts and decode the outputs.
        eval_dataset: Dataset whose samples follow ``schema``.
        system_prompt: System message prepended to every prompt.
        batch_size: Number of samples per generation batch.
        max_allowed_length: Passed to ``generate`` as ``max_new_tokens``.
        schema: ``"id"`` or ``"en"``; see ``make_collate_fn``.
        num_workers: DataLoader worker processes; use 0 to collate in the main
            process (easier to debug).

    Returns:
        dict with ``bleu`` (sacreBLEU corpus score), ``rouge1`` / ``rouge2`` /
        ``rougeL`` (mean F-measure x 100), ``predictions`` (list of strings)
        and ``references`` (one single-item list per sample).
    """
    model.eval()
    all_predictions = []
    all_references = []        # returned to the caller: [[ref], [ref], ...]
    all_references_text = []   # flat list of strings, used for BLEU and ROUGE

    collate_fn = make_collate_fn(tokenizer, system_prompt, schema)

    dataloader = DataLoader(
        eval_dataset,
        num_workers=num_workers,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=False,
        # Pinned host memory is only useful when batches are copied to a GPU.
        pin_memory=torch.cuda.is_available(),
    )

    bleu = BLEU()
    pbar = tqdm(dataloader)

    for batch in pbar:
        input_ids = batch["input_ids"].to(DEVICE)          # shape: (B, L)
        attention_mask = batch["attention_mask"].to(DEVICE)

        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=max_allowed_length,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=False,
                use_cache=True,
            )

        # The batch is left-padded, so every prompt occupies the first L
        # positions and everything after them is newly generated text.
        seq_len = input_ids.shape[1]
        predictions = [
            tokenizer.decode(output[seq_len:], skip_special_tokens=True)
            for output in outputs
        ]

        all_predictions.extend(predictions)
        all_references.extend([[ref] for ref in batch["references"]])
        all_references_text.extend(batch["references"])

    # ====== BLEU ======
    # sacreBLEU expects one list per *reference stream*, each as long as the
    # hypothesis list. With a single reference per sample that is
    # [all_references_text]; the per-sample nesting kept in all_references
    # would be read as many streams of length one.
    final_bleu = bleu.corpus_score(all_predictions, [all_references_text])

    # ====== ROUGE-1, ROUGE-2, ROUGE-L ======
    scorer = rouge_scorer.RougeScorer(
        ["rouge1", "rouge2", "rougeL"],
        use_stemmer=True,
    )

    rouge1_list, rouge2_list, rougeL_list = [], [], []

    for pred, ref in zip(all_predictions, all_references_text):
        scores = scorer.score(ref, pred)
        rouge1_list.append(scores["rouge1"].fmeasure)
        rouge2_list.append(scores["rouge2"].fmeasure)
        rougeL_list.append(scores["rougeL"].fmeasure)

    # Multiply by 100 so ROUGE is on the same percentage scale as BLEU.
    rouge1 = float(np.mean(rouge1_list) * 100)
    rouge2 = float(np.mean(rouge2_list) * 100)
    rougeL = float(np.mean(rougeL_list) * 100)

    return {
        "bleu": final_bleu.score,
        "rouge1": rouge1,
        "rouge2": rouge2,
        "rougeL": rougeL,
        "predictions": all_predictions,
        "references": all_references,
    }


def evaluate_model_multiple_runs(
    model,
    tokenizer,
    eval_dataset,
    system_prompt,
    n_runs=3,
    batch_size=128,
    max_allowed_length=24,
    schema="en",
):
    """
    Run ``evaluate_model`` ``n_runs`` times (seeds 42, 43, ...) and summarise
    the scores.

    After each run the running mean of every metric (and the running standard
    deviation of BLEU) is printed. The return value is a dict holding
    ``<metric>_mean`` and ``<metric>_std`` for ``bleu``, ``rouge1``,
    ``rouge2`` and ``rougeL``.
    """
    metric_names = ("bleu", "rouge1", "rouge2", "rougeL")
    eval_kwargs = dict(
        system_prompt=system_prompt,
        batch_size=batch_size,
        max_allowed_length=max_allowed_length,
        schema=schema,
    )
    run_results = []

    def scores_so_far():
        # One list of per-run values for each metric, in metric order.
        return {
            name: [result[name] for result in run_results]
            for name in metric_names
        }

    for run_idx in range(n_runs):
        set_seed(42 + run_idx)
        run_results.append(
            evaluate_model(model, tokenizer, eval_dataset, **eval_kwargs)
        )

        running = scores_so_far()
        print(
            f"Run {run_idx + 1}/{n_runs} - "
            f"BLEU: {np.mean(running['bleu']):.4f} ± {np.std(running['bleu']):.4f} | "
            f"R1: {np.mean(running['rouge1']):.4f} | "
            f"R2: {np.mean(running['rouge2']):.4f} | "
            f"RL: {np.mean(running['rougeL']):.4f}"
        )

    final_scores = scores_so_far()
    summary = {}
    for name in metric_names:
        summary[f"{name}_mean"] = float(np.mean(final_scores[name]))
        summary[f"{name}_std"] = float(np.std(final_scores[name]))
    return summary

# Adapter to evaluate: the same path as LORA_SOUP_DIR, i.e. the merged
# adapter written by build_lora_soup.
MODEL_DIR  = "/content/drive/MyDrive/College/KK/Final Project/model/lora_soup_gwo_1"
print(MODEL_DIR)

# Load the model together with its tokenizer in 4-bit; dtype=None leaves the
# dtype choice to the loader.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_DIR,
    max_seq_length=MAX_SEQ_LEN,
    dtype=None,
    load_in_4bit=True,
)

# The collate function pads with tokenizer.pad_token_id, so make sure one
# exists; fall back to the EOS token when the tokenizer defines none.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# make_collate_fn pads on the left by hand; this keeps any tokenizer-driven
# padding consistent with it.
tokenizer.padding_side = "left"

# Switch to inference mode before generating.
model.eval()
FastLanguageModel.for_inference(model)

# Single evaluation pass on the English test split.
# NOTE(review): eval_en_clean / eval_id_clean are defined outside this
# section — confirm they are loaded before this cell runs on a fresh kernel.
results_en_detail = evaluate_model(
    model,
    tokenizer,
    eval_en_clean["test"],
    system_prompt=SYSTEM_PROMPT_EN,
    batch_size=90,
    max_allowed_length=MAX_NEW_TOKENS,
    schema="en",
)

# Single evaluation pass on the Indonesian test split.
results_id_detail = evaluate_model(
    model,
    tokenizer,
    eval_id_clean["test"],
    system_prompt=SYSTEM_PROMPT_ID,
    batch_size=90,
    max_allowed_length=MAX_NEW_TOKENS,
    schema="id",
)

# Report BLEU and ROUGE for both languages.
print("\nFinal Results (test split):")
print(
    f"English (eval_en_clean['test']): "
    f"BLEU = {results_en_detail['bleu']:.4f}, "
    f"ROUGE-1 = {results_en_detail['rouge1']:.4f}, "
    f"ROUGE-2 = {results_en_detail['rouge2']:.4f}, "
    f"ROUGE-L = {results_en_detail['rougeL']:.4f}"
)
print(
    f"Indonesian (eval_id_clean['test']): "
    f"BLEU = {results_id_detail['bleu']:.4f}, "
    f"ROUGE-1 = {results_id_detail['rouge1']:.4f}, "
    f"ROUGE-2 = {results_id_detail['rouge2']:.4f}, "
    f"ROUGE-L = {results_id_detail['rougeL']:.4f}"
)