# Initialization

In [1]:
# Block 0: Configuration - EDIT THESE BEFORE RUNNING
# Put your real paths here (exact names & folders in your Drive).

# Path to training CSV (collaborative dataset). Example filename you gave:
TRAIN_PATH = "/content/drive/MyDrive/Datasets - Training.csv"

# Path to validation CSV (human-labeled validation set, 52 examples)
VALIDATION_PATH = "/content/drive/MyDrive/Datasets - Human-Labeled Validation Set.csv"

# Optional few-shot JSONL (one JSON per line) with curated exemplars:
# Each line example:
# {"english":"...", "filipino":"...", "criteria":{"accuracy":1,...}, "explanation":"..."}
FEW_SHOT_JSONL = "/content/drive/MyDrive/few_shot_examples.jsonl"

# Output CSV to write results to:
OUTPUT_CSV = "/content/drive/MyDrive/llm_judge_results.csv"

# Model to use (Hugging Face). Qwen2.5-7B-Instruct recommended for Colab.
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"

# Generation params
MAX_NEW_TOKENS = 512
TEMPERATURE = 0.0   # deterministic; use >0 for robustness checks
CONSISTENCY_RUNS = 0  # number of repeated runs per example for consistency test

# If your training CSV uses different column names, set these accordingly:
COL_ENG = "English"
COL_CORRECT = "Filipino-Correct"
COL_FLAWED = "Filipino-Flawed"
COL_REMARKS = "Remarks"
COL_CONTRIB = "Contributor"

# End of Block 0


In [2]:
!pip install -q torch
!pip install -q transformers accelerate datasets scipy pandas regex



from google.colab import drive
import os, json, re, time, random
import pandas as pd
import numpy as np
from scipy.stats import spearmanr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Mount Drive
drive.mount('/content/drive', force_remount=True)

# Quick path checks
for p in [TRAIN_PATH, VALIDATION_PATH]:
    if not os.path.exists(p):
        print(f"WARNING: path not found: {p}")
    else:
        print(f"Found: {p}")

Mounted at /content/drive
Found: /content/drive/MyDrive/Datasets - Training.csv
Found: /content/drive/MyDrive/Datasets - Human-Labeled Validation Set.csv


In [3]:
# JSON extraction + evaluate single pair
import re

def extract_json_from_text(text):
    import json
    import re

    s = text.strip()
    try:
        return json.loads(s)
    except Exception:
        pass

    # Find first {...} block (greedy matching to nearest balanced braces)
    brace_stack = []
    start_idx = None
    for i, ch in enumerate(text):
        if ch == '{':
            if start_idx is None:
                start_idx = i
            brace_stack.append(ch)
        elif ch == '}' and brace_stack:
            brace_stack.pop()
            if not brace_stack:
                candidate = text[start_idx:i+1]
                try:
                    return json.loads(candidate)
                except Exception:
                    pass
                start_idx = None  # reset and continue searching

    return None

def normalize_sum_to_label(sum_points):
    if sum_points >= 5:
        return 5, "excellent"
    elif sum_points >= 3:
        return 3, "good"
    else:
        return 1, "poor"

REQUIRED_CRITERIA = ["accuracy","fluency","coherence","cultural_appropriateness","guideline_adherence","completeness"]

def explanation_mentions_most_criteria(explanation_text, threshold=0.8):
    if not isinstance(explanation_text, str):
        return False
    explanation_text = explanation_text.lower()
    count = 0
    for crit in REQUIRED_CRITERIA:
        short = crit.split("_")[0]
        if short in explanation_text or crit.replace("_", " ") in explanation_text:
            count += 1
    return (count / len(REQUIRED_CRITERIA)) >= threshold

def evaluate_pair(
    english, filipino, reference=None, tries=2, temp=TEMPERATURE, seed=None, debug=False
):
    prompt = build_prompt(english, filipino, reference)
    last_raw = None

    for attempt in range(tries):
        if debug:
            print(f"[evaluate_pair] Attempt {attempt + 1}/{tries}, seed={seed}, temp={temp}")

        raw = generate_text(prompt, temperature=temp, seed=seed)
        last_raw = raw
        parsed = extract_json_from_text(raw)

        # If parsing failed, try only once more with temp=0.0 and no retries after that
        if parsed is None:
            if debug:
                print("[evaluate_pair] Parsing failed, retrying once with temp=0.0")
            raw = generate_text(prompt, temperature=0.0, seed=seed)
            last_raw = raw
            parsed = extract_json_from_text(raw)
            if parsed is None:
                if debug:
                    print("[evaluate_pair] Parsing failed again, skipping to next attempt")
                # Optional: consider a very short sleep or just continue immediately
                time.sleep(0.2)
                continue

        crit_in = parsed.get("criteria", {})
        crit_clean = {}
        for k in REQUIRED_CRITERIA:
            v = crit_in.get(k, 0)
            if isinstance(v, bool):
                v = int(v)
            try:
                vi = int(v)
                vi = 1 if vi >= 1 else 0
            except Exception:
                vi = 0
            crit_clean[k] = vi

        sum_points = sum(crit_clean.values())
        norm_score, label = normalize_sum_to_label(sum_points)

        explanation = parsed.get("explanation", "")
        explanation_ok = explanation_mentions_most_criteria(explanation)

        return {
            "criteria": crit_clean,
            "sum_points": sum_points,
            "normalized_score": norm_score,
            "label": label,
            "explanation": explanation,
            "explanation_ok": explanation_ok,
            "raw_output": raw,
        }

    # failed all tries
    if debug:
        print("[evaluate_pair] Failed all attempts to parse valid JSON.")
    return {"error": "no valid JSON parsed", "raw": last_raw}

In [4]:
# config
MAX_NEW_TOKENS = 256
TEMPERATURE = 0.7

print("Loading model:", MODEL_NAME)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# Load model
try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",              # Automatically use GPU if available
        torch_dtype=torch.float16,      # Use half precision to save some VRAM
        trust_remote_code=True
    )
    print("Loaded model in FP16 mode.")
except Exception as e:
    print("FP16 load failed; falling back to default. Error:", e)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        trust_remote_code=True
    )

# Set to evaluation mode
model.eval()
DEVICE = next(model.parameters()).device
print("Model loaded on", DEVICE)

# Text generation function
def generate_text(prompt, max_new_tokens=MAX_NEW_TOKENS, temperature=TEMPERATURE, seed=None):
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    gen_kwargs = dict(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_new_tokens,
        do_sample=(temperature > 0),
        temperature=temperature,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )

    if seed is not None:
        torch.manual_seed(seed)
        random.seed(seed)
        np.random.seed(seed)

    with torch.inference_mode():
        out = model.generate(**gen_kwargs)

    return tokenizer.decode(out[0], skip_special_tokens=True)

Loading model: Qwen/Qwen2.5-3B-Instruct


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded model in FP16 mode.
Model loaded on cuda:0


# Evaluate Dataset

In [5]:
# Load the collaborative training dataset and validation dataset
assert os.path.exists(TRAIN_PATH), f"Training dataset not found at {TRAIN_PATH}"
df_train = pd.read_csv(TRAIN_PATH)
print("Training dataset shape:", df_train.shape)
print("Training columns:", df_train.columns.tolist())
display(df_train.head())

if os.path.exists(VALIDATION_PATH):
    df_val = pd.read_csv(VALIDATION_PATH)
    print("Validation dataset shape:", df_val.shape)
    print("Validation columns:", df_val.columns.tolist())
    display(df_val.head())
else:
    df_val = None
    print("Validation CSV not found - please upload and set VALIDATION_PATH.")

Training dataset shape: (564, 5)
Training columns: ['English', 'Filipino-Correct', 'Filipino-Flawed', 'Remarks', 'Contributor']


Unnamed: 0,English,Filipino-Correct,Filipino-Flawed,Remarks,Contributor
0,Ang gnda na mura pa,It so beautiful and it's even affordable.,beautiful and cheap,flawed translation failed to express the 'na' ...,Charibeth Cheng
1,The Philippines is an archipelago made up of o...,"Ang Pilipinas ay isang kapulaang binubuo ng 7,...",Ang Pilipinas ay isang puno na binubuo ng mahi...,,Geena Tibule/Charlyne Arajoy Carabeo
2,Philippines is the world's second-largest arch...,Ang Pilipinas ang pangalawa sa pinakamalaking ...,Ang Pilipinas ay ang pangalawang malaking isla...,,Geena Tibule/Charlyne Arajoy Carabeo
3,Filipino and English are the two official lang...,Filipino at Ingles ang dalawang opisyal na lin...,Tagalog at Ingles ang dalawa opisyal lingwahe ...,,Geena Tibule/Charlyne Arajoy Carabeo
4,Tagalog is the most widely spoken native langu...,Tagalog ang pinakamalawak at ginagamit na katu...,Tagalog ay ang pinaka malawak sinasabi katutub...,,Geena Tibule/Charlyne Arajoy Carabeo


Validation dataset shape: (64, 6)
Validation columns: ['Source Text (English)', 'Target Text (Filipino)', 'Final Score                          (1 - lowest, 5 - highest)', 'Rater 1 Explanation', 'Rater 2 Explanation', 'Contributor']


Unnamed: 0,Source Text (English),Target Text (Filipino),"Final Score (1 - lowest, 5 - highest)",Rater 1 Explanation,Rater 2 Explanation,Contributor
0,The children laughed and played under the afte...,Ang mga bata ay nagtawanan at naglaro sa ilali...,4.0,"Accurate, fluent, and natural translation. Cap...",Just slight error due to the literal translati...,Paul Ivan Enclonar/Alonzo Rimando
1,She took a break to gather her thoughts.,Nagpahinga siya para mag-isip-isip.,4.0,The translation is accurate. It was able to ca...,The translation would have been better if the ...,Paul Ivan Enclonar/Alonzo Rimando
2,The algorithm efficiently identifies patterns ...,Mabisang kinikilala ng algoritmo ang mga patte...,3.0,"The translation of ""identifies"" as ""kinikilala...",The translation would have been better if the ...,Paul Ivan Enclonar/Alonzo Rimando
3,Data normalization helps improve model perform...,Tumutulong sa pagpabuti ng model ang normalisa...,5.0,The translated text is natural and captures th...,The translation didn't literally translated th...,Paul Ivan Enclonar/Alonzo Rimando
4,alam mo ma'am masaya naman topics natin sa phi...,"You know, ma'am, we have a lot of fun philosop...",4.0,"flawed translation is close, but failed to tra...",,Charibeth Cheng


In [6]:
# Prepare few-shot exemplars
def load_few_shot_jsonl(path):
    shots = []
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    shots.append(json.loads(line))
                except Exception as e:
                    print("Skipping malformed JSONL line:", e)
    return shots

few_shot_shots = load_few_shot_jsonl(FEW_SHOT_JSONL)
if len(few_shot_shots) > 0:
    print(f"Loaded {len(few_shot_shots)} curated few-shot exemplars from JSONL.")
else:
    print("No curated few-shot JSONL found. Sampling 3 examples from training set as exemplars (assumed correct).")
    n = 3
    sampled = df_train.sample(n, random_state=42)
    few_shot_shots = []
    for _, r in sampled.iterrows():
        eng = str(r.get(COL_ENG, r.get("english", "")))
        # prefer correct translation when available for exemplar clarity
        fil = str(r.get(COL_CORRECT, r.get(COL_FLAWED, r.get("Filipino-Correct", ""))))
        few_shot_shots.append({
            "english": eng,
            "filipino": fil,
            "criteria": {"accuracy":1,"fluency":1,"coherence":1,"cultural_appropriateness":1,"guideline_adherence":1,"completeness":1},
            "explanation": "Accurate, fluent, coherent, culturally appropriate, follows guidelines, complete."
        })
print("Prepared few-shot exemplars:", len(few_shot_shots))

Loaded 3 curated few-shot exemplars from JSONL.
Prepared few-shot exemplars: 3


In [7]:
# Prompt builder
import json

BASE_PROMPT = r"""
You are a translation judge for English→Filipino. Evaluate the translation using SIX criteria:

1) Accuracy: Does the Filipino translation correctly convey the English source text’s meaning, intent, and details? (0 or 1)
2) Fluency: Is the translation grammatically correct, natural, and idiomatic in Filipino? (0 or 1)
3) Coherence: Does the translation maintain logical flow, context, and structure from the source? (0 or 1)
4) Cultural Appropriateness: Does the translation respect Filipino cultural norms, idioms, and sensitivities (e.g., use of "po" and "opo", regional expressions)? (0 or 1)
5) Guideline Adherence: Does the translation follow domain-specific style, terminology, or guidelines (e.g., legal, medical precision)? (0 or 1)
6) Completeness: Are all elements of the English source text translated into Filipino without omissions or additions? (0 or 1)

For each criterion, assign 0 or 1 (1 = meets the criterion, 0 = does not).

Calculate:
- sum_points: sum of all criteria points (0 to 6)
- normalized_score: an integer from 1 to 5 computed as:
  normalized_score = round(1 + 4 * (sum_points / 6))
- label: based on normalized_score:
    5 → "excellent"
    4 → "good"
    3 → "fair"
    2 → "poor"
    1 → "very poor"

Return ONLY valid, parseable JSON with the exact schema below and NOTHING else:

{{
  "criteria": {{
    "accuracy": 0|1,
    "fluency": 0|1,
    "coherence": 0|1,
    "cultural_appropriateness": 0|1,
    "guideline_adherence": 0|1,
    "completeness": 0|1
  }},
  "sum_points": 0-6,
  "normalized_score": 1-5,
  "label": "excellent"|"good"|"fair"|"poor"|"very poor",
  "explanation": "<short explanation mentioning each criterion: Accuracy:..., Fluency:..., etc.>"
}}

Now evaluate the following translation and respond with ONLY the JSON described above.

English: "{english}"
Filipino: "{filipino}"
"""

def escape_quotes(text):
    if text is None:
        return ""
    return text.replace("\\", "\\\\").replace("\"", "\\\"").replace("\n", "\\n")

def build_prompt(english, filipino, reference=None, shots=None):
    shot_texts = []
    if shots is None:
        shots = []
    for s in shots:
        exemplar_json = {
            "criteria": s.get("criteria", {"accuracy":1,"fluency":1,"coherence":1,"cultural_appropriateness":1,"guideline_adherence":1,"completeness":1}),
        }
        exemplar_json["sum_points"] = sum(exemplar_json["criteria"].values())
        exemplar_json["normalized_score"] = 5 if exemplar_json["sum_points"]>=5 else (3 if exemplar_json["sum_points"]>=3 else 1)
        exemplar_json["label"] = "excellent" if exemplar_json["normalized_score"]==5 else ("good" if exemplar_json["normalized_score"]==3 else "poor")
        exemplar_json["explanation"] = s.get("explanation", "")
        shot_texts.append(
            f"### Example\nEnglish: \"{escape_quotes(s['english'])}\"\nFilipino: \"{escape_quotes(s['filipino'])}\"\nDesired JSON output:\n{json.dumps(exemplar_json, ensure_ascii=False)}\n"
        )
    ref_text = f"\nReference: \"{escape_quotes(reference)}\"\n" if reference else "\n"
    prompt = BASE_PROMPT + "\n\n" + "\n".join(shot_texts) + f"\n### Now evaluate:\nEnglish: \"{escape_quotes(english)}\"\nFilipino: \"{escape_quotes(filipino)}\"{ref_text}\nReturn JSON only.\n"
    return prompt


In [8]:
import time
import random
import numpy as np
import pandas as pd
from scipy.stats import spearmanr

assert df_val is not None, "Validation dataset (df_val) must be loaded to run evaluation."

# Normalize column names (lowercase, stripped)
df_val.columns = [col.strip().lower() for col in df_val.columns]

# Drop rows where all columns are empty
df_val = df_val.dropna(how='all')

def find_column(columns, keywords):
    """Find first column containing all keywords (case-insensitive)."""
    for col in columns:
        if all(k.lower() in col for k in keywords):
            return col
    return None

col_eng = find_column(df_val.columns, ["source", "english"])
col_fil = find_column(df_val.columns, ["target", "filipino"])
col_score = find_column(df_val.columns, ["final", "score"])

assert col_eng is not None and col_fil is not None, "Could not find English/Filipino columns in df_val."

CHUNK_SIZE = 10
all_results = []
processed_count = 0
start_time = time.time()
num_rows = len(df_val)

OUTPUT_JSONL = "/content/drive/MyDrive/results.jsonl"  # JSON Lines file path

with open(OUTPUT_JSONL, "w", encoding="utf-8") as f:
    pass

for idx, row in df_val.iterrows():
    try:
        eng_text = row[col_eng]
        fil_text = row[col_fil]
        reference = None  # No reference column available

        # Skip if either English or Filipino text is empty or not a string
        if not (isinstance(eng_text, str) and eng_text.strip()) or not (isinstance(fil_text, str) and fil_text.strip()):
            continue

        sums = []
        scores = []
        raw_outputs_list = []
        explanation_ok_count = 0

        if CONSISTENCY_RUNS >= 1:
            # Main evaluation run
            seed = random.randint(1, 2**30 - 1)
            res = evaluate_pair(eng_text, fil_text, reference=reference, tries=1, temp=TEMPERATURE, seed=seed)

            # Additional consistency runs
            for _ in range(CONSISTENCY_RUNS):
                seed = random.randint(1, 2**30 - 1)
                tmp_res = evaluate_pair(eng_text, fil_text, reference=reference, tries=1, temp=TEMPERATURE, seed=seed)
                raw_outputs_list.append(tmp_res.get("raw_output", tmp_res.get("raw", ""))[:500])
                if tmp_res.get("sum_points") is not None:
                    sums.append(tmp_res["sum_points"])
                    scores.append(tmp_res["normalized_score"])
                if tmp_res.get("explanation_ok"):
                    explanation_ok_count += 1
        else:
            # No consistency runs, just main tries=2
            res = evaluate_pair(eng_text, fil_text, reference=reference, tries=2)
            raw_outputs_list = []
            sums = []
            scores = []
            explanation_ok_count = 0

        # Compute variation percentage for consistency sums if applicable
        variation_pct = None
        if len(sums) >= 2:
            mean = np.mean(sums)
            std = np.std(sums)
            variation_pct = (std / mean * 100) if mean != 0 else (std * 100)

        # Parse human score safely
        human_score = None
        if col_score and not pd.isna(row[col_score]):
            try:
                human_score = int(row[col_score])
            except Exception:
                pass

        all_results.append({
            "idx": idx,
            "english": eng_text,
            "filipino": fil_text,
            "reference": reference,
            "model_criteria": res.get("criteria"),
            "model_sum": res.get("sum_points"),
            "model_score": res.get("normalized_score"),
            "model_label": res.get("label"),
            "model_explanation": res.get("explanation"),
            "explanation_ok": res.get("explanation_ok"),
            "consistency_sums": sums,
            "consistency_scores": scores,
            "consistency_variation_pct": variation_pct,
            "raw_outputs": raw_outputs_list,
            "human_score": human_score
        })

        processed_count += 1

        # Periodic progress update
        if processed_count % 5 == 0 or processed_count == num_rows:
            elapsed = time.time() - start_time
            avg_per_row = elapsed / processed_count
            est_total = avg_per_row * num_rows
            est_remaining = est_total - elapsed
            print(f"⏳ Processed {processed_count}/{num_rows} rows. "
                  f"Elapsed: {elapsed:.1f}s, Avg/row: {avg_per_row:.2f}s, Est remaining: {est_remaining:.1f}s")



        # Save chunks to JSON Lines file
        if processed_count % CHUNK_SIZE == 0:
            with open(OUTPUT_JSONL, "a", encoding="utf-8") as f:
                for record in all_results:
                    f.write(json.dumps(record) + "\n")
            all_results = []
            print(f"💾 Saved {processed_count} rows so far...")

    except Exception as e:
        print(f"⚠️ Error processing row {idx}: {e}")
        continue

# Save leftover rows after loop
if all_results:
    with open(OUTPUT_JSONL, "a", encoding="utf-8") as f:
        for record in all_results:
            f.write(json.dumps(record) + "\n")
    print(f"💾 Saved final chunk. Total rows processed: {processed_count}")

print(f"📂 All evaluations saved to: {OUTPUT_JSONL}")

# Load JSON Lines for analysis
out_df = pd.read_json(OUTPUT_JSONL, lines=True)

# Compute Spearman correlation if data is sufficient
valid_rows = out_df.dropna(subset=["human_score", "model_score"])
if len(valid_rows) >= 2:
    rho, pval = spearmanr(valid_rows["human_score"].astype(float), valid_rows["model_score"].astype(float))
    print(f"📊 Spearman rho = {rho:.4f} (p={pval:.4g}), n={len(valid_rows)}")
else:
    print("⚠️ Not enough pairs to compute Spearman correlation.")

print(f"🧠 Explainability coverage: {out_df['explanation_ok'].mean():.3f}")
print(f"📈 Average consistency variation %: {out_df['consistency_variation_pct'].dropna().mean():.3f}")


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'

⏳ Processed 5/57 rows. Elapsed: 153.7s, Avg/row: 30.75s, Est remaining: 1599.0s


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'

⏳ Processed 10/57 rows. Elapsed: 257.7s, Avg/row: 25.77s, Est remaining: 1211.1s
💾 Saved 10 rows so far...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


⏳ Processed 15/57 rows. Elapsed: 322.3s, Avg/row: 21.49s, Est remaining: 902.6s


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'

⏳ Processed 20/57 rows. Elapsed: 504.6s, Avg/row: 25.23s, Est remaining: 933.6s
💾 Saved 20 rows so far...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'

⏳ Processed 25/57 rows. Elapsed: 609.0s, Avg/row: 24.36s, Est remaining: 779.5s


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'

⏳ Processed 30/57 rows. Elapsed: 751.7s, Avg/row: 25.06s, Est remaining: 676.5s
💾 Saved 30 rows so far...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'

⏳ Processed 35/57 rows. Elapsed: 933.2s, Avg/row: 26.66s, Est remaining: 586.6s


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'

⏳ Processed 40/57 rows. Elapsed: 1154.0s, Avg/row: 28.85s, Est remaining: 490.4s
💾 Saved 40 rows so far...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'

⏳ Processed 45/57 rows. Elapsed: 1332.5s, Avg/row: 29.61s, Est remaining: 355.3s


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'

⏳ Processed 50/57 rows. Elapsed: 1472.3s, Avg/row: 29.45s, Est remaining: 206.1s
💾 Saved 50 rows so far...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p'

⏳ Processed 55/57 rows. Elapsed: 1574.7s, Avg/row: 28.63s, Est remaining: 57.3s


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


⏳ Processed 57/57 rows. Elapsed: 1600.0s, Avg/row: 28.07s, Est remaining: 0.0s
💾 Saved final chunk. Total rows processed: 57
📂 All evaluations saved to: /content/drive/MyDrive/results.jsonl
📊 Spearman rho = -0.0344 (p=0.8445), n=35
🧠 Explainability coverage: 0.971
📈 Average consistency variation %: nan


# Evaluate Translation Pair

In [12]:
import json
import random

# ---- Input: your single pair ----
english_text = "Hello, how are you?"
filipino_text = "Kamusta, kumusta ka?"
reference_text = None  # optional, can be left as None

# ---- Parameters ----
CONSISTENCY_RUNS = 0   # no extra runs
TEMPERATURE = 0.7

# ---- Evaluation ----
sums = []
scores = []
raw_outputs_list = []
explanation_ok_count = 0

# Only main evaluation run
seed = random.randint(1, 2**30 - 1)
res = evaluate_pair(
    english_text,
    filipino_text,
    reference=reference_text,
    tries=1,
    temp=TEMPERATURE,
    seed=seed,
    debug=True
)

# ---- Output results ----
result_record = {
    "english": english_text,
    "filipino": filipino_text,
    "reference": reference_text,
    "model_criteria": res.get("criteria"),
    "model_sum": res.get("sum_points"),
    "model_score": res.get("normalized_score"),
    "model_label": res.get("label"),
    "model_explanation": res.get("explanation"),
    "explanation_ok": res.get("explanation_ok"),
    "raw_outputs": raw_outputs_list,
}

OUTPUT_FILE = "/content/drive/MyDrive/single_pair_result.json"
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(result_record, f, ensure_ascii=False, indent=2)

print(f"💾 Result saved to: {OUTPUT_FILE}")

print(json.dumps(result_record, indent=2, ensure_ascii=False))


[evaluate_pair] Attempt 1/1, seed=88914704, temp=0.7


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


[evaluate_pair] Parsing failed, retrying once with temp=0.0
💾 Result saved to: /content/drive/MyDrive/single_pair_result.json
{
  "english": "Hello, how are you?",
  "filipino": "Kamusta, kumusta ka?",
  "reference": null,
  "model_criteria": {
    "accuracy": 0,
    "fluency": 1,
    "coherence": 1,
    "cultural_appropriateness": 1,
    "guideline_adherence": 1,
    "completeness": 1
  },
  "model_sum": 5,
  "model_score": 5,
  "model_label": "excellent",
  "model_explanation": "Accuracy is 0 because 'Kumusta' is a greeting but doesn't fully capture the full meaning of 'Hello, how are you?'. Fluency, coherence, cultural appropriateness, and guideline adherence are all 1 as the translation is grammatically correct, maintains context, respects cultural norms, and follows the greeting convention. Completeness is 1 as all elements are included.",
  "explanation_ok": true,
  "consistency_sums": [],
  "consistency_scores": [],
  "consistency_variation_pct": null,
  "raw_outputs": []
}
