In [3]:

from analyze import analyze_data
from utils import RequestData
import json
from datetime import datetime
from zoneinfo import ZoneInfo
import re
# Run selection: exactly one run_log_dir is active; the commented-out lines
# are the log directories of the other model runs.
# run_log_dir = "/mnt/network_drive/lrq/logs/logs_2025-11-23-22-33-54/run_0" # deepseek-ocr
# run_log_dir = "/mnt/network_drive/lrq/logs/logs_2025-11-23-23-48-36/run_0" # llama3.1
run_log_dir = "/mnt/network_drive/lrq/logs/logs_2025-11-24-20-00-06/run_0" # qwen
dataset = "/mnt/network_drive/lrq/traces/longBench/longbench_data/gt_qasper.json"

loadgen_result_file = f"{run_log_dir}/loadgen_result.json"

# The result file is parsed line by line: each non-empty line must be one
# complete JSON document.
loadgen_results = []
# JSON text is UTF-8, so do not rely on the platform's default encoding.
with open(loadgen_result_file, "r", encoding="utf-8") as f:
    for line in f:
        # A stray empty/whitespace-only line would make json.loads raise
        # JSONDecodeError; skip it instead of aborting the whole load.
        if not line.strip():
            continue
        loadgen_result = json.loads(line)
        loadgen_results.append(loadgen_result)


In [4]:
# Build a lookup from prompt text to the model's prediction.
# Only requests with a non-empty response that has no 'error' entry are kept;
# if the same prompt appears more than once, the last occurrence wins.
results_map = {}
for loadgen_result in loadgen_results:
    body = json.loads(loadgen_result['body'])
    response = loadgen_result['response']['response']

    # Nothing to score for empty or failed requests.
    if not response or 'error' in response:
        continue

    # The prediction is the 'text' of the first entry in response['choices'].
    first_choice = response['choices'][0]
    results_map[body['prompt']] = {"prediction": first_choice['text']}

In [5]:
import json
import re
from collections import Counter

def normalize(text):
    """Lowercase ``text``, delete punctuation, and split it into word tokens.

    Every character that is neither a letter, a digit, nor whitespace is
    deleted (not replaced by a space), so "state-of-the-art" becomes the
    single token "stateoftheart". Letters and digits are matched
    Unicode-aware, so accented or non-Latin words stay intact instead of
    silently losing their non-ASCII characters.

    Args:
        text: The string to normalize.

    Returns:
        A list of lowercase tokens, split on whitespace.
    """
    text = text.lower().strip()
    # For str patterns \w is Unicode-aware. It also matches "_", which is
    # removed explicitly so ASCII input yields the same tokens as before.
    text = re.sub(r"[^\w\s]|_", "", text)
    return text.split()

def compute_f1(prediction, ground_truth):
    """Token-level F1 between a prediction and a reference answer.

    Both strings are tokenized with ``normalize``. The overlap is the
    multiset intersection of the two token lists, so a repeated token only
    counts as often as it appears on both sides.

    Args:
        prediction: Model output string.
        ground_truth: Reference answer string.

    Returns:
        The F1 score as a float; 0.0 when either side has no tokens or when
        no token is shared.
    """
    predicted = normalize(prediction)
    reference = normalize(ground_truth)

    if not predicted or not reference:
        return 0.0

    # Counter & Counter keeps the minimum count per token.
    overlap = sum((Counter(predicted) & Counter(reference)).values())
    if overlap == 0:
        return 0.0

    precision = overlap / len(predicted)
    recall = overlap / len(reference)
    return 2 * precision * recall / (precision + recall)

In [6]:
import json
import numpy as np
from rouge_score import rouge_scorer

# --- F1 helper functions ---
import re
from collections import Counter

def normalize(text):
    """Lowercase ``text``, delete punctuation, and split it into word tokens.

    Every character that is neither a letter, a digit, nor whitespace is
    deleted (not replaced by a space), so "state-of-the-art" becomes the
    single token "stateoftheart". Letters and digits are matched
    Unicode-aware, so accented or non-Latin words stay intact instead of
    silently losing their non-ASCII characters.

    Args:
        text: The string to normalize.

    Returns:
        A list of lowercase tokens, split on whitespace.
    """
    text = text.lower().strip()
    # For str patterns \w is Unicode-aware. It also matches "_", which is
    # removed explicitly so ASCII input yields the same tokens as before.
    text = re.sub(r"[^\w\s]|_", "", text)
    return text.split()

def compute_f1(prediction, ground_truth):
    """Token-level F1 between a prediction and a reference answer.

    Both strings are tokenized with ``normalize``. The overlap is the
    multiset intersection of the two token lists, so a repeated token only
    counts as often as it appears on both sides.

    Args:
        prediction: Model output string.
        ground_truth: Reference answer string.

    Returns:
        The F1 score as a float; 0.0 when either side has no tokens or when
        no token is shared.
    """
    predicted = normalize(prediction)
    reference = normalize(ground_truth)

    if not predicted or not reference:
        return 0.0

    # Counter & Counter keeps the minimum count per token.
    overlap = sum((Counter(predicted) & Counter(reference)).values())
    if overlap == 0:
        return 0.0

    precision = overlap / len(predicted)
    recall = overlap / len(reference)
    return 2 * precision * recall / (precision + recall)


# --- Load dataset ---
# JSON text is UTF-8, so do not rely on the platform's default encoding.
with open(dataset, "r", encoding="utf-8") as f:
    dataset_json = json.load(f)

# Lookup: prompt text -> reference answer.
gt_answer_map = {item["prompt"]: item["ground_truth_answer"]
                 for item in dataset_json}

# --- Validate inputs before scoring ---
# Without these checks an empty results_map yields a NaN mean plus a cryptic
# "max() arg is an empty sequence", and a prompt missing from the dataset
# fails halfway through the loop after results_map was partially updated.
if not results_map:
    raise ValueError("results_map is empty: there are no predictions to evaluate")

missing_prompts = [p for p in results_map if p not in gt_answer_map]
if missing_prompts:
    raise KeyError(
        f"{len(missing_prompts)} prompt(s) in results_map have no ground truth in {dataset}"
    )

# ROUGE scorer (ROUGE-1, ROUGE-2 and ROUGE-L, with stemming enabled)
scorer = rouge_scorer.RougeScorer(
    ["rouge1", "rouge2", "rougeL"],
    use_stemmer=True
)

# --- Compute metrics ---
# Scores are stored next to each prediction inside results_map, so re-running
# this cell simply overwrites the same keys.
for prompt in results_map:
    prediction = results_map[prompt]["prediction"]
    gt = gt_answer_map[prompt]

    # F1
    f1 = compute_f1(prediction, gt)
    results_map[prompt]["f1"] = f1

    # ROUGE: score() takes the reference first, then the prediction.
    rs = scorer.score(gt, prediction)
    results_map[prompt]["rouge1"] = rs["rouge1"].fmeasure
    results_map[prompt]["rouge2"] = rs["rouge2"].fmeasure
    results_map[prompt]["rougeL"] = rs["rougeL"].fmeasure


# --- Aggregate statistics ---
all_items = list(results_map.items())

avg_f1     = np.mean([v["f1"]      for _, v in all_items])
avg_r1     = np.mean([v["rouge1"]  for _, v in all_items])
avg_r2     = np.mean([v["rouge2"]  for _, v in all_items])
avg_rL     = np.mean([v["rougeL"]  for _, v in all_items])

# Best according to F1 (on ties, the first prompt encountered wins)
best_f1_prompt, best_f1_score = max(
    [(p, v["f1"]) for p, v in all_items],
    key=lambda x: x[1]
)

# Best according to ROUGE-L (on ties, the first prompt encountered wins)
best_rL_prompt, best_rL_score = max(
    [(p, v["rougeL"]) for p, v in all_items],
    key=lambda x: x[1]
)


# --- Print results ---
print("\n===== Evaluation Results =====")
print(f"Total Prompts Evaluated: {len(all_items)}\n")

print(f"Average F1 Score:      {avg_f1:.4f}")
print(f"Average ROUGE-1 Score: {avg_r1:.4f}")
print(f"Average ROUGE-2 Score: {avg_r2:.4f}")
print(f"Average ROUGE-L Score: {avg_rL:.4f}\n")

print("=== Best by F1 ===")
print(f"F1 Score:    {best_f1_score:.4f}")
print(f"Prediction:  {results_map[best_f1_prompt]['prediction']}")
print(f"Ground Truth:{gt_answer_map[best_f1_prompt]}\n")

print("=== Best by ROUGE-L ===")
print(f"ROUGE-L Score: {best_rL_score:.4f}")
print(f"Prediction:    {results_map[best_rL_prompt]['prediction']}")
print(f"Ground Truth:  {gt_answer_map[best_rL_prompt]}")


===== Evaluation Results =====
Total Prompts Evaluated: 200

Average F1 Score:      0.0516
Average ROUGE-1 Score: 0.0578
Average ROUGE-2 Score: 0.0316
Average ROUGE-L Score: 0.0522

=== Best by F1 ===
F1 Score:    0.5574
Prediction:  ####

The languages explored in the study are Bulgarian, Croatian, Czech, Danish, English, French, German, Indonesian, Italian, Norwegian, Persian, Polish, Portuguese, Slovenian, Spanish, and Swedish. These languages represent a diverse set of typologies, morphologies, and syntaxes, providing a comprehensive multilingual evaluation of the tagging models.
Ground Truth:Bulgarian, Croatian, Czech, Danish, English, French, German, Indonesian, Italian, Norwegian, Persian, Polish, Portuguese, Slovenian, Spanish and Swedish

=== Best by ROUGE-L ===
ROUGE-L Score: 0.5574
Prediction:    ####

The languages explored in the study are Bulgarian, Croatian, Czech, Danish, English, French, German, Indonesian, Italian, Norwegian, Persian, Polish, Portuguese, Slovenian, S

In [7]:
import json
import numpy as np
import os
import glob

# Directory containing your gt_*.json files
DATA_DIR = "/mnt/network_drive/lrq/traces/longBench/longbench_data"

# ---------- Token counter (word-based) ----------
def count_word_tokens(text):
    """Return the number of whitespace-separated words in ``text``."""
    words = text.split()
    return len(words)

# Word counts of every ground-truth answer, pooled over all gt_*.json files.
all_token_counts = []
files_processed = 0

gt_file_pattern = os.path.join(DATA_DIR, "gt_*.json")
for file_path in glob.glob(gt_file_pattern):
    print(f"Processing: {file_path}")
    files_processed += 1

    with open(file_path, "r") as f:
        dataset_json = json.load(f)

    # Items without a "ground_truth_answer" key count as an empty answer.
    all_token_counts.extend(
        count_word_tokens(item.get("ground_truth_answer", ""))
        for item in dataset_json
    )

print("\n===== Results Across All Files =====")
print("Files processed:", files_processed)
print("Total samples:", len(all_token_counts))

# Report the 99th percentile and the maximum, if any answers were found.
if not all_token_counts:
    print("No token counts found.")
else:
    p99 = np.percentile(all_token_counts, 99)
    max_val = max(all_token_counts)
    print("P99 token count:", p99)
    print("Max token count:", max_val)

Processing: /mnt/network_drive/lrq/traces/longBench/longbench_data/gt_qasper.json
Processing: /mnt/network_drive/lrq/traces/longBench/longbench_data/gt_multifieldqa_en.json
Processing: /mnt/network_drive/lrq/traces/longBench/longbench_data/gt_hotpotqa.json
Processing: /mnt/network_drive/lrq/traces/longBench/longbench_data/gt_2wikimqa.json
Processing: /mnt/network_drive/lrq/traces/longBench/longbench_data/gt_gov_report.json
Processing: /mnt/network_drive/lrq/traces/longBench/longbench_data/gt_multi_news.json
Processing: /mnt/network_drive/lrq/traces/longBench/longbench_data/gt_trec.json
Processing: /mnt/network_drive/lrq/traces/longBench/longbench_data/gt_triviaqa.json
Processing: /mnt/network_drive/lrq/traces/longBench/longbench_data/gt_samsum.json
Processing: /mnt/network_drive/lrq/traces/longBench/longbench_data/gt_passage_count.json
Processing: /mnt/network_drive/lrq/traces/longBench/longbench_data/gt_passage_retrieval_en.json
Processing: /mnt/network_drive/lrq/traces/longBench/long