## Initialization

In [1]:
!pip install transformers accelerate huggingface_hub pyyaml tqdm scipy

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import yaml
import json

In [3]:
# Fail fast when the runtime exposes no CUDA device; everything below assumes one.
if not torch.cuda.is_available():
    raise RuntimeError("❌ No GPU detected in Colab runtime!")

# Report the name and total memory of device 0.
gpu_name = torch.cuda.get_device_name(0)
# total_memory is divided by 1024**3 to express it in the "GB" unit printed below.
total_vram = torch.cuda.get_device_properties(0).total_memory / (1024**3)
print(f"✅ Using GPU: {gpu_name} ({total_vram:.1f} GB VRAM)")

✅ Using GPU: Tesla T4 (14.7 GB VRAM)


In [4]:
# Mount Google Drive at /content/drive; the project directory used by the
# following cells lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# EDIT this path to your project (quote path if it has spaces)
PROJECT_DIR = "/content/drive/MyDrive/LLM_Project"

# Change to project dir. IPython substitutes {PROJECT_DIR} before running the
# magic; the surrounding quotes keep a path containing spaces in one piece.
%cd "{PROJECT_DIR}"

# Confirm files: list the project root, then the LLM/ module directory.
# `|| true` makes the second command exit successfully even if LLM/ is absent.
!ls -la
!ls -la LLM || true

/content/drive/MyDrive/LLM_Project
total 274
-rw------- 1 root root 235640 Aug 11 12:27 agentic_judge
-rw------- 1 root root   2220 Aug 10 21:05 criteria.yml
-rw------- 1 root root    892 Aug 10 17:11 domain_glossary.csv
drwx------ 2 root root   4096 Aug 11 11:42 LLM
-rw------- 1 root root  22788 Aug 10 20:07 LLM_dataset.csv
-rw------- 1 root root   9634 Aug 11 12:23 main.py
drwx------ 2 root root   4096 Aug 11 11:42 results
total 47
-rw------- 1 root root  5595 Aug 10 19:41 1.py
-rw------- 1 root root 15818 Aug 10 19:32 2.py
-rw------- 1 root root  5445 Aug 10 23:21 agent_tools.py
-rw------- 1 root root     0 Aug 10 14:26 __init__.py
drwx------ 2 root root  4096 Aug 11 11:42 .ipynb_checkpoints
-rw------- 1 root root  1728 Aug 10 20:35 llm_api.py
-rw------- 1 root root   599 Aug 10 12:50 logger.py
-rw------- 1 root root   869 Aug 10 20:40 memory.py
-rw------- 1 root root  7981 Aug 10 22:06 planner.py
drwx------ 2 root root  4096 Aug 10 20:11 __pycache__


In [6]:
import importlib  # used to reload the local modules after they are edited
import os
import sys

# Put the project's LLM/ directory on sys.path so its files import as
# top-level modules (logger, llm_api, memory, agent_tools, planner).
LLM_DIR = os.path.join(PROJECT_DIR, 'LLM')
if LLM_DIR not in sys.path:
    sys.path.insert(0, LLM_DIR)

# Bind the module names this notebook refers to directly. A plain `import`
# also restores `logger` / `planner` to the modules if a later cell has
# rebound those names to instances.
import logger
import llm_api
import planner

# Reload every local module, not just some of them, so edits made to any of
# the files are picked up without restarting the kernel. `planner` is reloaded
# last so that, if it imports the other modules, it sees their fresh
# definitions. importlib.import_module is used for `memory` and `agent_tools`
# to avoid binding those names in the notebook namespace.
for _module_name in ("logger", "llm_api", "memory", "agent_tools", "planner"):
    importlib.reload(importlib.import_module(_module_name))

# Pull the classes from the freshly reloaded modules.
from logger import AgentLogger
from llm_api import LLM
from memory import Memory
from agent_tools import Toolset


print("Imported LLM package components successfully.")

Imported LLM package components successfully.


In [7]:
# Build the model wrapper (the first run downloads the weights).
# llm_api.LLM is given an explicit device: "cuda" when a GPU is visible,
# otherwise "cpu".
# NOTE(review): this `llm` object is not passed to the Planner created later in
# the notebook — confirm it is actually needed, since the model is large.
_llm_device = "cuda" if torch.cuda.is_available() else "cpu"
llm = LLM(device=_llm_device)
print("LLM object created.")

✅ Using GPU: Tesla T4 (14.7 GB VRAM)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

LLM object created.


In [8]:
# Paths to config files (adjust if names differ)
CRITERIA_PATH = os.path.join(PROJECT_DIR, "criteria.yml")
GLOSSARY_PATH = os.path.join(PROJECT_DIR, "domain_glossary.csv")
TRACE_DIR = os.path.join(PROJECT_DIR, "results", "agent_traces")
RESULTS_FILE = os.path.join(PROJECT_DIR, "results", "results_agent_judge.json")

# Make sure the output locations exist before anything tries to write to them.
os.makedirs(TRACE_DIR, exist_ok=True)
os.makedirs(os.path.dirname(RESULTS_FILE), exist_ok=True)

memory = Memory(criteria_path=CRITERIA_PATH, glossary_path=GLOSSARY_PATH)
tools = Toolset(glossary_path=GLOSSARY_PATH)
# From here on `logger` is the AgentLogger instance, not the logger module.
logger = AgentLogger(TRACE_DIR)

# Resolve the planner module under its own alias. The notebook reuses the name
# `planner` for the Planner instance, so calling `planner.Planner(...)` directly
# would fail with AttributeError the second time this cell is run.
import planner as planner_module

# Planner signature we used: Planner(memory, tools, logger, device)
planner = planner_module.Planner(
    memory=memory,
    tools=tools,
    logger=logger,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

print("Agent components initialized.")

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Using GPU: Tesla T4 (14.7 GB VRAM)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Agent components initialized.


## Evaluate Dataset

In [15]:
# Set the Dataset
import os

import pandas as pd

dataset_path = os.path.join(PROJECT_DIR, "LLM_dataset.csv")

# CSV header -> key used for each sample dict in the rest of the notebook.
# Headers are compared after collapsing runs of whitespace, so the lookup does
# not depend on the exact number of spaces inside the score header.
CSV_COLUMN_TO_KEY = {
    'Source Text (English)': "source",
    'Target Text (Filipino)': "translation",
    'Final Score                          (1 - lowest, 5 - highest)': "human_score",
}

# Two hand-written examples, used only when the CSV is missing or unreadable.
# Any metric computed on this fallback is a smoke test, not a real result.
FALLBACK_DATASET = [
    {
        "source": "The Department of Health announced a new vaccine rollout.",
        "translation": "Inanunsyo ng Kagawaran ng Kalusugan ang bagong programa ng pagbabakuna.",
        "human_score": 5
    },
    {
        "source": "Blood pressure must be monitored regularly for patients with diabetes.",
        "translation": "Ang presyon ng dugo ay dapat regular na sinusuri para sa mga pasyenteng may diabetes.",
        "human_score": 3
    }
]


def _collapse_whitespace(header):
    """Return ``header`` as a string with every whitespace run reduced to one space."""
    return " ".join(str(header).split())


# `None` means "nothing loaded yet"; an empty list is a valid (empty) CSV.
dataset = None
if os.path.exists(dataset_path):
    try:
        df = pd.read_csv(dataset_path, encoding='utf-8')
        df_normalized = df.rename(columns=_collapse_whitespace)
        column_to_key = {
            _collapse_whitespace(column): key
            for column, key in CSV_COLUMN_TO_KEY.items()
        }

        # Convert DataFrame rows to a list of dictionaries with short keys.
        loaded = []
        for _, row in df_normalized.iterrows():
            loaded.append({key: row[column] for column, key in column_to_key.items()})

        dataset = loaded
        print(f"Loaded {len(dataset)} examples from {os.path.basename(dataset_path)}")
    except Exception as e:
        # Deliberate best-effort: report the problem and continue with the
        # fallback samples below instead of stopping the notebook.
        print(f"Error loading CSV: {e}")

if dataset is None:
    # Copy the samples so later cells cannot mutate the constant.
    dataset = [dict(sample) for sample in FALLBACK_DATASET]
    print("Using fallback sample dataset:", len(dataset))

Error loading CSV: name 'pd' is not defined
Using fallback sample dataset: 2


In [None]:
# Run the Agentic AI judge on the Dataset
from tqdm import tqdm
import json

def run_and_save(dataset, planner, out_file, overwrite=False):
    """Evaluate every sample with ``planner`` and write one JSON record per line.

    Args:
        dataset: Iterable of dict samples. The keys ``source``, ``translation``,
            ``reference`` and ``human_score`` are copied into each record when
            present (missing keys are written as null).
        planner: Object exposing ``evaluate(sample)``; its return value is
            stored under the record's ``result`` key.
        out_file: Path of the output file. Despite a ``.json`` name, the file
            holds one JSON object per line.
        overwrite: When False (the default, matching the previous behaviour)
            records are appended to ``out_file``. Pass True to truncate the
            file first, so a re-run does not pile duplicate indices on top of
            earlier results.

    If ``planner.evaluate`` raises, the sample is not skipped: a placeholder
    result carrying the error message, ``label == "error"`` and
    ``overall_score == 1`` is written instead.
    NOTE(review): that placeholder score is indistinguishable from a real score
    of 1 unless consumers check for the ``error`` key.
    """
    mode = 'w' if overwrite else 'a'
    with open(out_file, mode, encoding='utf-8') as fout:
        for i, sample in enumerate(tqdm(dataset, desc="Evaluating")):
            try:
                result = planner.evaluate(sample)
            except Exception as e:
                # graceful fallback
                result = {
                    "error": str(e),
                    "criteria_scores": {},
                    "raw_sum": 0,
                    "overall_score": 1,
                    "label": "error",
                    "explanation": {}
                }
            # enrich result with identifiers if present
            result_meta = {
                "index": i,
                "source": sample.get("source"),
                "translation": sample.get("translation"),
                "reference": sample.get("reference", None),
                "human_score": sample.get("human_score", None),
                "result": result
            }
            fout.write(json.dumps(result_meta, ensure_ascii=False) + "\n")
            # Push each record out of the write buffer immediately so finished
            # evaluations are not lost if the session dies mid-run.
            fout.flush()
    print("Saved results to", out_file)

# Evaluate every sample in `dataset` with `planner`; one JSON record per sample
# is written to RESULTS_FILE.
run_and_save(dataset, planner, RESULTS_FILE)

Evaluating:   0%|          | 0/64 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Evaluating:   2%|▏         | 1/64 [02

Saved results to /content/drive/MyDrive/LLM_Project/results/results_agent_judge.json





In [None]:
# Calculate Spearman's Rho
import json, math
from scipy.stats import spearmanr

import os

# Derive the results path from PROJECT_DIR instead of repeating the absolute
# path, so this cell keeps reading the file the evaluation wrote when
# PROJECT_DIR is edited. Only the project-directory cell has to have run.
RESULTS_FILE = os.path.join(PROJECT_DIR, "results", "results_agent_judge.json")

llm_scores = []
human_scores = []

# The results file holds one JSON object per line.
with open(RESULTS_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        entry = json.loads(line)
        human = entry.get("human_score")
        model_out = entry.get("result", {})
        overall = model_out.get("overall_score")
        # Keep only records that have both a human and a model score.
        # NOTE(review): records written by the evaluation cell's error fallback
        # carry a placeholder overall_score of 1 and are included here too —
        # confirm whether they should count towards the correlation.
        if human is not None and overall is not None:
            human_scores.append(float(human))
            llm_scores.append(float(overall))

def is_nan(x):
    """Return True when ``x`` is NaN, as a number or as the text ``'nan'``.

    ``x`` is first converted with ``float``; numeric NaN and strings that
    ``float`` parses as NaN are detected that way. Values that cannot be
    converted fall back to a case-insensitive comparison of ``str(x)``
    with ``'nan'``.
    """
    try:
        # Check for float nan
        return math.isnan(float(x))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are expected here; anything else (for
        # example KeyboardInterrupt) must not be swallowed.
        return str(x).lower() == 'nan'

# Keep only the (human, model) pairs in which neither value is NaN.
filtered_pairs = [
    (human_value, model_value)
    for human_value, model_value in zip(human_scores, llm_scores)
    if not is_nan(human_value) and not is_nan(model_value)
]

# A rank correlation needs at least two observations.
if len(filtered_pairs) < 2:
    print("Not enough valid data points after removing NaNs.")
else:
    human_scores_clean = [pair[0] for pair in filtered_pairs]
    llm_scores_clean = [pair[1] for pair in filtered_pairs]
    # Spearman's rank correlation between human and model scores.
    rho, pval = spearmanr(human_scores_clean, llm_scores_clean)
    print(f"Spearman rho: {rho:.4f}, p-value: {pval:.4g} (N={len(human_scores_clean)})")


Spearman rho: -0.0678, p-value: 0.6165 (N=57)


In [None]:
# Check Explanation
import json
import math

# Criteria that every result is expected to explain (defined once, outside the loop).
expected_keys = {"accuracy", "fluency", "coherence", "cultural_appropriateness", "guideline_adherence", "completeness"}

missing_expl = []
with open(RESULTS_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines instead of failing in json.loads, as the Spearman
        # cell does when reading the same file.
        line = line.strip()
        if not line:
            continue
        entry = json.loads(line)
        source = entry.get("source")

        # Skip if source is missing or nan (float nan or string "nan")
        if source is None:
            continue
        if isinstance(source, float) and math.isnan(source):
            continue
        if isinstance(source, str) and source.strip().lower() == "nan":
            continue

        res = entry.get("result", {})
        # A null explanation counts as "every criterion missing" rather than crashing.
        expl = res.get("explanation") or {}
        missing = expected_keys - set(expl.keys())
        if missing:
            # Sorted so the report is identical from run to run.
            missing_expl.append({"index": entry.get("index"), "missing": sorted(missing)})

print("Number of entries missing some criterion explanations:", len(missing_expl))
print(missing_expl[:10])


Number of entries missing some criterion explanations: 0
[]


In [None]:
# Print JSON Output
import json

# Pretty-print every record of the results file (one JSON object per line).
try:
    with open(RESULTS_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line is not a record; skipping it keeps json.loads from
            # raising and cutting the dump short via the generic handler below.
            if not line.strip():
                continue
            # Load each line as JSON and pretty print it
            entry = json.loads(line)
            print(json.dumps(entry, ensure_ascii=False, indent=4))
            print("-" * 20) # Add a separator for clarity
except FileNotFoundError:
    print(f"Error: The file {RESULTS_FILE} was not found.")
except Exception as e:
    print(f"An error occurred while reading the file: {e}")

{
    "index": 0,
    "source": "The children laughed and played under the afternoon sun.",
    "translation": "Ang mga bata ay nagtawanan at naglaro sa ilalim ng hapon na araw.",
    "reference": null,
    "human_score": 4.0,
    "result": {
        "criteria_scores": {
            "accuracy": 0,
            "fluency": 1,
            "coherence": 1,
            "cultural_appropriateness": 0,
            "guideline_adherence": 1,
            "completeness": 0
        },
        "raw_sum": 3,
        "overall_score": 3,
        "label": "good",
        "explanation": {
            "accuracy": "While the translation captures the main action (laughing and playing), it lacks specific details such as the time of day ('afternoon') and the setting ('under the sun'). These elements are crucial for conveying the full meaning of the original sentence.",
            "fluency": "While the translation is correct and natural, it could benefit from a slight adjustment to improve its idiomatic flow wi

In [None]:
# Check Consistency
import random
import statistics
from tqdm import tqdm

def is_valid_text(text):
    """Return False for missing text, True for anything else.

    "Missing" means ``None``, a float NaN, or a string that equals ``"nan"``
    after stripping whitespace and lower-casing. Every other value, including
    the empty string, is treated as valid.
    """
    # Imported here because this cell does not import math itself; without it
    # the function only works if an earlier analysis cell happened to run first.
    import math

    if text is None:
        return False
    if isinstance(text, float) and math.isnan(text):
        return False
    if isinstance(text, str) and text.strip().lower() == "nan":
        return False
    return True

# Keep only samples whose source and translation are both usable text.
filtered_dataset = [
    sample for sample in dataset
    if is_valid_text(sample.get("source")) and is_valid_text(sample.get("translation"))
]

K = min(5, len(filtered_dataset))  # number of distinct samples to re-evaluate
N = 5  # number of repeats
CONSISTENCY_SEED = 42  # fixed seed so the same samples are drawn on every run
# A dedicated Random instance keeps the draw reproducible without touching the
# global random state.
indices = random.Random(CONSISTENCY_SEED).sample(range(len(filtered_dataset)), K)
consistency_report = []

for idx in indices:
    # NOTE: idx is a position in filtered_dataset, not in the original dataset.
    sample = filtered_dataset[idx]
    scores = []
    # Wrap inner loop with tqdm for progress bar on repeats
    for _ in tqdm(range(N), desc=f"Evaluating sample {idx}", leave=False):
        out = planner.evaluate(sample)
        scores.append(out.get("overall_score"))

    # A run that returned no overall_score is kept in `scores` for inspection
    # but left out of the statistics, which cannot handle None.
    numeric_scores = [score for score in scores if score is not None]
    if numeric_scores:
        mean = statistics.mean(numeric_scores)
        stdev = statistics.pstdev(numeric_scores)
        # Relative spread is undefined for a zero mean; report 0.0 in that case.
        pct_var = (stdev / mean) * 100 if mean != 0 else 0.0
    else:
        mean = stdev = pct_var = None

    consistency_report.append({
        "index": idx,
        "scores": scores,
        "mean": mean,
        "stdev": stdev,
        "pct_var": pct_var
    })

consistency_report


Evaluating sample 29:   0%|          | 0/5 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation fla

[{'index': 29,
  'scores': [2, 2, 2, 2, 2],
  'mean': 2,
  'stdev': 0.0,
  'pct_var': 0.0},
 {'index': 42,
  'scores': [2, 2, 2, 2, 2],
  'mean': 2,
  'stdev': 0.0,
  'pct_var': 0.0},
 {'index': 5,
  'scores': [0, 0, 0, 0, 0],
  'mean': 0,
  'stdev': 0.0,
  'pct_var': 0.0},
 {'index': 2,
  'scores': [2, 2, 2, 2, 2],
  'mean': 2,
  'stdev': 0.0,
  'pct_var': 0.0},
 {'index': 22,
  'scores': [4, 4, 4, 4, 4],
  'mean': 4,
  'stdev': 0.0,
  'pct_var': 0.0}]

## Evaluate Single Entry

In [13]:
import json
# Helper function to evaluate a single source/translation pair
def evaluate_single_pair(source_text, translation_text, planner):
    """Run ``planner.evaluate`` on one source/translation pair.

    Args:
        source_text: Source sentence, passed to the planner under ``"source"``.
        translation_text: Candidate translation, passed under ``"translation"``.
        planner: Object exposing ``evaluate(sample)``.

    Returns:
        A dict with ``source``, ``translation`` and ``result``. ``result`` is
        whatever the planner returned, or — if the planner raised — a
        placeholder holding the error message, ``label == "error"`` and
        ``overall_score == 1``.
    """
    pair = {"source": source_text, "translation": translation_text}

    try:
        verdict = planner.evaluate(pair)
    except Exception as exc:
        # Never propagate planner failures; hand back an error-shaped result.
        verdict = {
            "error": str(exc),
            "criteria_scores": {},
            "raw_sum": 0,
            "overall_score": 1,
            "label": "error",
            "explanation": {},
        }

    return {
        "source": pair["source"],
        "translation": pair["translation"],
        "result": verdict,
    }

In [16]:
# English source text (an idiom)
source_text = "It's raining cats and dogs."

# Tagalog translation to be judged (a word-for-word rendering of the idiom)
translation_text = "Umuulan ng pusa at aso."

# The returned dict is the cell's last expression, so the notebook displays it.
evaluate_single_pair(source_text, translation_text, planner)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


{'source': "It's raining cats and dogs.",
 'translation': 'Umuulan ng pusa at aso.',
 'result': {'criteria_scores': {'accuracy': 0,
   'fluency': 0,
   'coherence': 0,
   'cultural_appropriateness': 0,
   'guideline_adherence': 0,
   'completeness': 0},
  'raw_sum': 0,
  'overall_score': 0,
  'label': 'poor',
  'explanation': {'accuracy': 'The translation is inaccurate as it does not correctly represent the idiomatic expression of heavy rain in English.',
   'fluency': 'Although the translation is grammatically correct, it fails to accurately convey the idiomatic meaning of the source text, making it unnatural for a native speaker.',
   'coherence': 'The translation fails to preserve the metaphorical meaning and the logical structure of the source text.',
   'cultural_appropriateness': "The translation does not accurately represent the cultural idiom of 'it's raining cats and dogs' in Filipino, failing to capture the intended meaning and cultural nuances.",
   'guideline_adherence': "T

In [None]:
# English source text: replace the "INSERT" placeholder with the sentence to judge
source_text = "INSERT"

# Tagalog translation: replace the "INSERT" placeholder with the candidate translation
translation_text = "INSERT"

# The returned dict is the cell's last expression, so the notebook displays it.
evaluate_single_pair(source_text, translation_text, planner)