In [None]:
!pip install -q \
  pandas \
  numpy \
  sacrebleu \
  rouge-score \
  scikit-learn \
  sentence-transformers

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [None]:
import json, re
import pandas as pd
import numpy as np
from pathlib import Path

# ==== FILE PATHS (change these) ====
# Inputs for the evaluation: the prediction log (JSONL), the RAG log (CSV)
# and the reference conversations (JSONL).
PRED_JSONL = "/content/pred_run_001.jsonl"        # e.g. logs/pred_run_001.jsonl
RAG_CSV    = "/content/rag_log_run_001.csv"       # e.g. logs/rag_log_run_001.csv
REF_JSONL  = "/content/loratrain_from_csv.jsonl"  # e.g. loratrain_fixed.jsonl

def read_jsonl(path):
    """Load a JSON Lines file into a list.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped and every remaining line is decoded with
    ``json.loads``. The file is read as UTF-8.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text) for text in stripped_lines if text]

# Parse the 4-line output (t+120, status, message, reference)
def parse_4lines(text: str):
    """Pull the recognised ``key: value`` fields out of a 4-line answer.

    Blank lines are dropped and only the first ten remaining lines are
    inspected. A line contributes when it contains a colon and the text
    before the first colon (stripped, lower-cased) is one of the known keys;
    a later occurrence of a key overwrites an earlier one.

    Args:
        text: Raw multi-line output.

    Returns:
        A dict of the recognised fields, or ``None`` when ``text`` is empty
        or not a string, or when any of ``durum``, ``mesaj`` or ``referans``
        is missing.
    """
    if not text or not isinstance(text, str):
        return None

    known_keys = ("t+120_glukoz_mgdl", "durum", "mesaj", "referans")
    required_keys = ("durum", "mesaj", "referans")

    stripped = (raw.strip() for raw in text.splitlines())
    non_blank = [entry for entry in stripped if entry]

    fields = {}
    for entry in non_blank[:10]:
        key, colon, value = entry.partition(":")
        if not colon:
            continue
        key = key.strip().lower()
        if key in known_keys:
            fields[key] = value.strip()

    # Minimum check: the three mandatory fields must all be present.
    if any(name not in fields for name in required_keys):
        return None
    return fields

def normalize_text(s: str):
    """Trim ``s`` and collapse every whitespace run into a single space.

    A falsy ``s`` (``None`` or an empty string) is treated as an empty string.
    """
    trimmed = (s or "").strip()
    return re.sub(r"\s+", " ", trimmed)

# Reference jsonl: take the assistant content out of {messages: [...assistant...]}
def extract_ref_from_jsonl(ref_obj):
    """Return the content of the first assistant message in ``ref_obj``.

    Entries of ``ref_obj["messages"]`` that are not dicts are ignored.

    Returns:
        The ``content`` value of the first dict whose ``role`` is
        ``"assistant"`` (``""`` when that dict has no ``content`` key), or
        ``""`` when there is no such message or no ``messages`` key.
    """
    assistant_turns = (
        turn
        for turn in ref_obj.get("messages", [])
        if isinstance(turn, dict) and turn.get("role") == "assistant"
    )
    first_turn = next(assistant_turns, None)
    if first_turn is None:
        return ""
    return first_turn.get("content", "")

# Load the prediction log and the reference conversations.
pred_rows, ref_rows = read_jsonl(PRED_JSONL), read_jsonl(REF_JSONL)

print("Pred rows:", len(pred_rows))
print("Ref rows :", len(ref_rows))

# If the two files differ in length, truncate both to the shorter one (pragmatic).
# NOTE(review): pairing is purely positional - row i of the predictions is
# compared with row i of the references. Confirm both files list the same
# samples in the same order.
n = min(map(len, (pred_rows, ref_rows)))
pred_rows, ref_rows = pred_rows[:n], ref_rows[:n]
print("Paired   :", n)

# Parse every prediction/reference pair into its structured fields.
# `pairs` keeps both the raw text and the parsed dict (None when parsing
# failed); `format_ok` counts predictions whose output parsed successfully.
pairs = []
format_ok = 0

for i in range(n):
    pred = pred_rows[i]
    ref  = ref_rows[i]

    # Prefer the dedicated 4-line field, fall back to the generic output field.
    pred_out = pred.get("llm_output_4lines") or pred.get("llm_output") or ""
    ref_out  = extract_ref_from_jsonl(ref)

    p4 = parse_4lines(pred_out)
    r4 = parse_4lines(ref_out)

    ok = (p4 is not None)
    format_ok += int(ok)

    # A missing or null "thresholds" entry yields None for both limits.
    thresholds = pred.get("thresholds") or {}

    pairs.append({
        "i": i,
        "pred_raw": pred_out,
        "ref_raw": ref_out,
        "pred": p4,
        "ref": r4,
        "xgb_pred_t120": pred.get("xgb_pred_t120"),
        "thr_hyper": thresholds.get("hyper"),
        "thr_hypo":  thresholds.get("hypo"),
    })

# Guard the ratio so an empty input does not raise ZeroDivisionError.
format_ok_ratio = format_ok / n if n else 0.0
print(f"Format OK (pred 4 satır parse): {format_ok}/{n} -> {format_ok_ratio:.3f}")

def expected_status(xgb_pred, hyper, hypo):
    """Derive the rule-based status label from a forecast and its thresholds.

    All three arguments are converted with ``float``.

    Returns:
        ``"Uyarı"`` when the forecast is strictly below ``hypo`` or at/above
        ``hyper``; ``"Normal"`` otherwise.
    """
    value, upper, lower = float(xgb_pred), float(hyper), float(hypo)
    if value < lower:
        return "Uyarı"
    if value >= upper:
        return "Uyarı"
    return "Normal"

# Compare the predicted "durum" line with the status derived from the
# forecast value and the per-row thresholds.
y_true, y_pred = [], []
skipped = 0

for row in pairs:
    parsed = row["pred"]
    if parsed is None:
        # Prediction output could not be parsed.
        skipped += 1
        continue

    rule_inputs = (row["xgb_pred_t120"], row["thr_hyper"], row["thr_hypo"])
    if any(item is None for item in rule_inputs):
        # Not enough information to derive the expected status.
        skipped += 1
        continue

    gt = expected_status(*rule_inputs)

    # Normalise the predicted label by prefix; unknown labels are kept as-is.
    pr = parsed.get("durum", "").strip()
    lowered = pr.lower()
    if lowered.startswith("uyar"):
        pr = "Uyarı"
    elif lowered.startswith("nor"):
        pr = "Normal"

    y_true.append(gt)
    y_pred.append(pr)

labels = ["Normal", "Uyarı"]

if y_true:
    acc = np.mean([truth == guess for truth, guess in zip(y_true, y_pred)])
else:
    acc = 0.0
print("Status pairs:", len(y_true), "skipped:", skipped)
print("Durum Accuracy:", acc)

# Confusion matrix: cm[true_label][predicted_label] -> count.
# Pairs whose true or predicted label is not in `labels` are not counted.
cm = {true_label: {pred_label: 0 for pred_label in labels} for true_label in labels}
for truth, guess in zip(y_true, y_pred):
    if truth not in labels or guess not in labels:
        continue
    cm[truth][guess] += 1

print("\nConfusion Matrix (rows=true, cols=pred)")
# DataFrame(cm) puts the true labels on the columns; transpose so they become rows.
cm_by_true = pd.DataFrame(cm).T
df_cm = cm_by_true[labels]
display(df_cm)

# =========================
# (ADDED) 1-2-3: False Alarm Rate, Recall, Macro-F1
# =========================
def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0.0


# Positive class = "Uyarı"
uyari_row, normal_row = cm["Uyarı"], cm["Normal"]
TP, FN = uyari_row["Uyarı"], uyari_row["Normal"]
FP, TN = normal_row["Uyarı"], normal_row["Normal"]

false_alarm_rate = _ratio(FP, FP + TN)  # False Positive Rate
recall_uyari = _ratio(TP, TP + FN)      # Recall (Uyarı)
precision_uyari = _ratio(TP, TP + FP)   # Precision (Uyarı)
f1_uyari = _ratio(2 * precision_uyari * recall_uyari, precision_uyari + recall_uyari)

# Macro-F1 = mean of F1(Normal) and F1(Uyarı).
# Same computation with "Normal" treated as the positive class:
TP_n = normal_row["Normal"]
FN_n = normal_row["Uyarı"]
FP_n = uyari_row["Normal"]
precision_normal = _ratio(TP_n, TP_n + FP_n)
recall_normal = _ratio(TP_n, TP_n + FN_n)
f1_normal = _ratio(2 * precision_normal * recall_normal, precision_normal + recall_normal)

macro_f1 = (f1_normal + f1_uyari) / 2.0

print("\nEk Durum Metrikleri")
print("False Alarm Rate (Normal->Uyarı):", float(false_alarm_rate))
print("Recall (Uyarı):", float(recall_uyari))
print("Macro-F1:", float(macro_f1))

from sacrebleu.metrics import BLEU
from rouge_score import rouge_scorer

# Text-overlap metrics on the "mesaj" field: corpus BLEU and mean ROUGE-L F1.
bleu = BLEU(smooth_method="exp")
# NOTE(review): rouge_score's default tokenizer appears to keep only [a-z0-9],
# which would drop Turkish letters (ı, ş, ğ, ü, ö, ç) and split words apart -
# confirm, and consider passing a custom tokenizer if so.
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)

pred_texts = []
ref_texts  = []

# Keep only pairs where both sides parsed and both messages are non-empty.
for row in pairs:
    p = row["pred"]
    r = row["ref"]
    if p is None or r is None:
        continue
    pred_msg = normalize_text(p.get("mesaj",""))
    ref_msg  = normalize_text(r.get("mesaj",""))
    if pred_msg and ref_msg:
        pred_texts.append(pred_msg)
        ref_texts.append(ref_msg)

print("Pairs for text metrics:", len(pred_texts))

# BLEU (corpus). With no usable pairs there is nothing to score, so report 0.0
# (mirroring the ROUGE fallback below) rather than handing sacrebleu an empty
# corpus, which its argument validation is presumably not written to accept.
if pred_texts:
    bleu_score = bleu.corpus_score(pred_texts, [ref_texts]).score
else:
    bleu_score = 0.0
print("BLEU:", round(bleu_score, 2))

# ROUGE-L (average F1); scorer.score takes (target, prediction).
rouge_f1 = [
    scorer.score(rm, pm)["rougeL"].fmeasure
    for pm, rm in zip(pred_texts, ref_texts)
]

print("ROUGE-L (F1):", float(np.mean(rouge_f1)) if rouge_f1 else 0.0)

# Load the RAG retrieval log and summarise coverage and best-match similarity.
rag = pd.read_csv(RAG_CSV)
print("RAG rows:", len(rag))
display(rag.head(3))

# Coverage: mean of the rag_found column (None when the column is absent).
coverage = rag["rag_found"].mean() if "rag_found" in rag.columns else None

# Best similarity: summary statistics of rag_best_sim (all None when absent).
avg_sim = med_sim = min_sim = max_sim = None
if "rag_best_sim" in rag.columns:
    best_sim = rag["rag_best_sim"]
    avg_sim = best_sim.mean()
    med_sim = best_sim.median()
    min_sim = best_sim.min()
    max_sim = best_sim.max()

print("\nRAG Metrics")
print("Coverage (rag_found mean):", coverage)
print("Best Similarity avg/median/min/max:", avg_sim, med_sim, min_sim, max_sim)

# Optional: mean of the top-K similarities (when retrieved_topk_sims holds a JSON list)
if "retrieved_topk_sims" in rag.columns:
    def parse_list(x):
        """Decode a JSON string; return [] for non-strings or invalid JSON."""
        if not isinstance(x, str):
            return []
        try:
            return json.loads(x)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return []

    topk_means = []
    for x in rag["retrieved_topk_sims"].tolist():
        arr = parse_list(x)
        if arr:
            topk_means.append(float(np.mean(arr)))
    if topk_means:
        print("Mean(sim) over TopK (avg of row-means):", float(np.mean(topk_means)))

# Final summary of all metrics. This is built at top level (not inside the
# optional top-K branch above) so that `report` exists even when the
# retrieved_topk_sims column is missing.
report = {
    "n_pairs_total": n,
    "format_ok_ratio": format_ok/n if n else 0,
    "status_pairs": len(y_true),
    "status_accuracy": float(acc),

    # (ADDED) 1-2-3
    "false_alarm_rate": float(false_alarm_rate),
    "recall_uyari": float(recall_uyari),
    "macro_f1": float(macro_f1),

    "bleu": float(bleu_score),
    "rougeL_f1": float(np.mean(rouge_f1)) if rouge_f1 else 0.0,
    "rag_coverage": float(coverage) if coverage is not None else None,
    "rag_best_sim_avg": float(avg_sim) if avg_sim is not None else None,
    "rag_best_sim_median": float(med_sim) if med_sim is not None else None,
}
report

Pred rows: 18
Ref rows : 1627
Paired   : 18
Format OK (pred 4 satır parse): 15/18 -> 0.833
Status pairs: 15 skipped: 3
Durum Accuracy: 1.0

Confusion Matrix (rows=true, cols=pred)


Unnamed: 0,Normal,Uyarı
Normal,4,0
Uyarı,0,11



Ek Durum Metrikleri
False Alarm Rate (Normal->Uyarı): 0.0
Recall (Uyarı): 1.0
Macro-F1: 1.0
Pairs for text metrics: 15
BLEU: 14.67
ROUGE-L (F1): 0.33582390098519127
RAG rows: 18


Unnamed: 0,ts,run_tag,image_idx,food101_best,food101_best_conf,chosen_label,rag_best_name,rag_best_sim,rag_found,retrieved_topk_names,retrieved_topk_sims
0,2026-01-12T17:45:00,run_001,1,pizza,0.814937,pizza,pizza,0.521144,1,"[""pizza"", ""garlic_bread"", ""lobster_roll_sandwi...","[0.5211443901062012, 0.3636852502822876, 0.337..."
1,2026-01-12T17:45:30,run_001,1,pizza,0.814937,pizza,pizza,0.521144,1,"[""pizza"", ""garlic_bread"", ""lobster_roll_sandwi...","[0.5211443901062012, 0.3636852502822876, 0.337..."
2,2026-01-12T17:46:03,run_001,1,pizza,0.814937,pizza,pizza,0.521144,1,"[""pizza"", ""garlic_bread"", ""lobster_roll_sandwi...","[0.5211443901062012, 0.3636852502822876, 0.337..."



RAG Metrics
Coverage (rag_found mean): 1.0
Best Similarity avg/median/min/max: 0.5211443901062012 0.5211443901062012 0.5211443901062012 0.5211443901062012
Mean(sim) over TopK (avg of row-means): 0.3695065677165985


{'n_pairs_total': 18,
 'format_ok_ratio': 0.8333333333333334,
 'status_pairs': 15,
 'status_accuracy': 1.0,
 'false_alarm_rate': 0.0,
 'recall_uyari': 1.0,
 'macro_f1': 1.0,
 'bleu': 14.671751886000918,
 'rougeL_f1': 0.33582390098519127,
 'rag_coverage': 1.0,
 'rag_best_sim_avg': 0.5211443901062012,
 'rag_best_sim_median': 0.5211443901062012}