# Model Performance Reporting Notebook

This notebook generates comprehensive evaluation reports for each AI-based evaluator model. It reads the `*_evaluations.csv` files produced by the `5_Evaluation.ipynb` pipeline, computes summary statistics, inter-rater reliability metrics (Cohen's Kappa), component-specific distance averages, and domain-specific distance averages, and writes a neatly formatted Markdown report for each model into the `model_reports/` directory.

**Notebook Structure**:
1. Imports and Setup
2. Directory Configuration
3. Helper Functions
4. Load Framework & Human Evaluations
5. Report Generation Function
6. Execute Report Generation

_Note: Make sure you have run `5_Evaluation.ipynb` and that the `model_evaluation_data/` folder contains your `*_evaluations.csv` files, and that you have access to your human evaluation data (`peru_cleaned_transcripts.csv`) and framework JSON (`Teach_1.json`)._

In [1]:
# 1. Imports and Setup
import os
import glob
import json
import math
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score

## 2. Directory Configuration

Define paths for input evaluation CSVs and output reports, and ensure the output directory exists.

In [2]:
# Directories
import sys
import os
from pathlib import Path

sys.path.append('/Users/mkrasnow/Desktop/montesa')

EVALS_DIR = 'model_evaluation_data'
REPORTS_DIR = 'model_reports'
os.makedirs(REPORTS_DIR, exist_ok=True)
print(f"Input evaluations directory: {EVALS_DIR}")
print(f"Output reports directory:    {REPORTS_DIR}")

Input evaluations directory: model_evaluation_data
Output reports directory:    model_reports


## 3. Helper Functions

Define utility functions for converting labels to numeric, computing normalized distances, and computing Cohen's Kappa.

In [3]:
def alpha_to_numeric(x):
    """
    Convert string labels to numeric values:
      - 'Y'/'y'/'Yes'/'1' -> 1.0
      - 'N'/'n'/'No'/'0'  -> 0.0
      - 'N/A' or empty   -> np.nan
      - 'L' -> 1.0, 'M' -> 2.0, 'H' -> 3.0
      - Numeric strings convertible to float -> float(x)
    """
    if x is None:
        return np.nan
    s = str(x).strip()
    if s in {'Y','y','Yes','1'}:
        return 1.0
    if s in {'N','n','No','0'}:
        return 0.0
    if s in {'N/A','','NA','na','nan'}:
        return np.nan
    if s in {'L','M','H'}:
        return {'L':1.0,'M':2.0,'H':3.0}[s]
    try:
        return float(s)
    except ValueError:
        return np.nan

def component_distance(human_score, ai_score, score_type):
    """
    Compute normalized distance between human and AI scores.
    score_type: 'YN' (max 1), 'LMH' (max 2), 'NUM' (1-5 scale, max diff 4).
    Returns float in [0,1].
    """
    h = alpha_to_numeric(human_score)
    a = alpha_to_numeric(ai_score)
    if math.isnan(h) and math.isnan(a):
        return 0.0
    if math.isnan(h) ^ math.isnan(a):
        return 1.0
    if score_type == 'YN':
        dmax = 1.0
    elif score_type == 'LMH':
        dmax = 2.0
    else:
        dmax = 4.0
    return min(max(abs(h - a) / dmax, 0.0), 1.0)

def encode_for_kappa(series, score_list):
    """Map categorical labels to integers for Cohen's Kappa."""
    mapping = {lbl: i for i, lbl in enumerate(score_list)}
    return series.map(lambda x: mapping.get(x, mapping.get('N/A', len(score_list)-1)))

def compute_component_kappa(human_series, ai_series, score_list, weight=None):
    """Compute Cohen's Kappa for a single component."""
    h_enc = encode_for_kappa(human_series, score_list)
    a_enc = encode_for_kappa(ai_series, score_list)
    return cohen_kappa_score(h_enc, a_enc, weights=weight)

def compute_distances_for_item(human_row, ai_row, framework):
    """
    Compute domain-level and overall normalized distances for one item.
    Returns (domain_distances: dict(domain_id->float), overall_distance:float).
    """
    domain_dist = {}
    num = 0.0
    wsum = 0.0
    for domain in framework['structure']['domains']:
        did = str(domain['id'])
        dnum = 0.0
        dwsum = 0.0
        stype = 'NUM'
        for comp in domain['components']:
            cname = comp['name']
            cweight = float(comp.get('weight',1.0))
            sl = comp.get('scoreList', [])
            if set(sl) <= {'Y','N','N/A'}:
                stype = 'YN'
            elif set(sl) <= {'L','M','H','N/A'}:
                stype = 'LMH'
            d = component_distance(human_row.get(cname), ai_row.get(cname), stype)
            dnum += cweight * d
            dwsum += cweight
        Dd = (dnum / dwsum) if dwsum>0 else 0.0
        domain_dist[did] = Dd
        num += float(domain.get('weight',1.0)) * Dd
        wsum += float(domain.get('weight',1.0))
    overall = (num/wsum) if wsum>0 else 0.0
    return domain_dist, overall

## 4. Load Framework & Human Evaluations

Functions to load the evaluation framework JSON and the cleaned transcripts with human evaluation scores.

In [4]:
from pathlib import Path

def load_framework(path):
    with open(path, 'r') as f:
        return json.load(f)

def load_human_evaluations(cleaned_csv_path, framework_json_path):
    df = pd.read_csv(cleaned_csv_path, dtype=str)
    # Extract base_id and clip_number
    clip_info = df['School_Clip'].str.extract(r'(?P<base_id>\d{6,7})\s*Clip\s*(?P<clip_num>[12])')
    df['base_id'] = clip_info['base_id']
    df['clip_number'] = clip_info['clip_num'].map({'1':'first','2':'last'})
    # Determine evaluation columns from framework
    framework = load_framework(framework_json_path)
    eval_cols = []
    for domain in framework['structure']['domains']:
        for comp in domain['components']:
            if comp['name'] in df.columns:
                eval_cols.append(comp['name'])
    return df[['base_id','clip_number'] + eval_cols]

# Example paths (adjust as needed)
FRAMEWORK_PATH   = '/Users/mkrasnow/Desktop/montesa/new/models/_context/Teach_1.json'
TRANSCRIPTS_PATH = '/Users/mkrasnow/Desktop/montesa/new/formattedData/peru_cleaned_transcripts.csv'

## 5. Report Generation Function

This function iterates over each `*_evaluations.csv` file, computes all metrics, and writes a Markdown report for each model.

In [5]:
def generate_reports(framework_path, transcripts_path, evals_dir, reports_dir):
    framework = load_framework(framework_path)
    human_df  = load_human_evaluations(transcripts_path, framework_path)

    # Gather component definitions
    components = []
    for domain in framework['structure']['domains']:
        for comp in domain['components']:
            components.append({
                'id': str(comp['id']),
                'name': comp['name'],
                'score_list': comp.get('scoreList', ['Y','N','N/A'])
            })

    # Process each evaluation CSV
    for eval_file in glob.glob(os.path.join(evals_dir, '*_evaluations.csv')):
        model_name = os.path.basename(eval_file).replace('_evaluations.csv','')
        ai_df = pd.read_csv(eval_file, dtype=str)

        # Compute distances for each item and track agreement
        dist_records = []
        for _, row in ai_df.iterrows():
            hr = human_df[(human_df.base_id==row.base_id)&(human_df.clip_number==row.clip_number)]
            if hr.empty: continue
            hr = hr.iloc[0]
            dom_dist, overall = compute_distances_for_item(hr, row, framework)
            rec = {'base_id':row.base_id,'clip_number':row.clip_number,'overall_distance':overall}
            for did, dval in dom_dist.items(): rec[f'domain_{did}_dist'] = dval
            dist_records.append(rec)
        distances_df = pd.DataFrame(dist_records)
        # Add LLM-human agreement column (accuracy) as 1 - distance
        distances_df['agreement'] = 1 - distances_df['overall_distance']

        # Summary statistics for distance and agreement
        overall_stats = distances_df['overall_distance'].describe()
        agreement_stats = distances_df['agreement'].describe()

        # Component-wise Cohen's Kappa
        kappa_records = []
        for comp in components:
            comp_name = comp['name']
            # Skip if component not present in either AI or human evaluations
            if comp_name not in ai_df.columns or comp_name not in human_df.columns:
                continue
            h_ser = human_df[comp_name]
            a_ser = ai_df[comp_name]
            weight = 'quadratic' if any(lbl in ['L','M','H'] for lbl in comp['score_list']) else None
            try:
                k = compute_component_kappa(h_ser, a_ser, comp['score_list'], weight)
            except:
                k = np.nan
            kappa_records.append({'component_name': comp_name, 'kappa': k})
        kappa_df = pd.DataFrame(kappa_records)

        # Component-specific distance averages
        comp_dist_avgs = []
        for comp in components:
            cname = comp['name']
            # Skip if component not present in either AI or human evaluations
            if cname not in ai_df.columns or cname not in human_df.columns:
                continue
            # Compute mean normalized diff per component
            diffs = ai_df.apply(lambda r: component_distance(
                human_df[(human_df.base_id == r.base_id) & (human_df.clip_number == r.clip_number)].iloc[0].get(cname),
                r.get(cname),
                'YN' if set(comp['score_list']) <= {'Y', 'N', 'N/A'} else ('LMH' if set(comp['score_list']) <= {'L', 'M', 'H', 'N/A'} else 'NUM')
            ), axis=1)
            comp_dist_avgs.append({'component_name': cname, 'avg_distance': diffs.mean()})
        comp_dists_df = pd.DataFrame(comp_dist_avgs)

        # Domain-specific distance averages
        domain_avgs = []
        for domain in framework['structure']['domains']:
            did = str(domain['id'])
            col = f'domain_{did}_dist'
            if col in distances_df.columns:
                domain_avgs.append({'domain_id':did,'domain_name':domain['name'],'avg_distance':distances_df[col].mean()})
        domain_dists_df = pd.DataFrame(domain_avgs)

        # Build Markdown report
        lines = []
        lines.append(f"# Report for Model: **{model_name}**\n")
        lines.append("## 1. Distance and LLM-Human Agreement Summary Statistics\n")
        lines.append("### 1.1 Overall Distance\n")
        lines.append(overall_stats.to_markdown() + "\n")
        lines.append("### 1.2 LLM-Human Agreement\n")
        lines.append(agreement_stats.to_markdown() + "\n")
        lines.append("## 2. Inter-Rater Reliability (Cohen's Kappa)\n")
        lines.append(kappa_df.to_markdown(index=False) + "\n")
        lines.append("## 3. Component-Specific Distance Averages\n")
        lines.append(comp_dists_df.to_markdown(index=False) + "\n")
        lines.append("## 4. Domain-Specific Distance Averages\n")
        lines.append(domain_dists_df.to_markdown(index=False) + "\n")

        report_md = "\n".join(lines)
        out_path = os.path.join(reports_dir, f"{model_name}_report.md")
        with open(out_path, 'w') as f:
            f.write(report_md)
        print(f"✅ Written report for {model_name} → {out_path}")

## 6. Execute Report Generation

Run the `generate_reports` function with your configured paths.

In [6]:
%pip install tabulate

Note: you may need to restart the kernel to use updated packages.


In [7]:
generate_reports(
    framework_path=FRAMEWORK_PATH,
    transcripts_path=TRANSCRIPTS_PATH,
    evals_dir=EVALS_DIR,
    reports_dir=REPORTS_DIR,
)

print("\nAll model reports generated successfully.")

✅ Written report for BaseEvaluator → model_reports/BaseEvaluator_report.md

All model reports generated successfully.


## 7. Reliability Exam Evaluation

Evaluate AI models against the Teach Reliability Exam criteria:
- **Time on Learning**: exact agreement on 2 of 3 snapshots per segment.
- **Quality Elements**: within 1 point of master codes on at least 8 of 9 high-inference elements per segment.

Two evaluation versions are implemented:
1. **Random Set Exam**: Two attempts with random sets of three segments.
2. **Average Exam**: Compute exam metrics across all available segments.

In [8]:
# Reliability Exam Functions
import random

# Define columns for Time on Learning snapshots and Quality elements
snapshot_teacher = ['Teacher provides learning activity - 1st Snapshot',
                   'Teacher provides learning activity - 2nd Snapshot',
                   'Teacher provides learning activity - 3rd Snapshot']
snapshot_students = ['Students are on task - 1st Snapshot',
                     'Students are on task - 2nd Snapshot',
                     'Students are on task - 3rd Snapshot']
quality_elements = [
    'Supportive Learning Environment',
    'Positive Behavioral Expectations',
    'Lesson Facilitation',
    'Checks for understanding',
    'Feedback',
    'Critical Thinking',
    'Autonomy',
    'Perseverance',
    'Social & Collaborative Skills'
]

def evaluate_segment(ai_row, human_row):
    # Time on Learning: agreement on snapshot if both teacher and students match
    time_agreements = []
    for t_col, s_col in zip(snapshot_teacher, snapshot_students):
        t_match = ai_row[t_col] == human_row[t_col]
        s_match = ai_row[s_col] == human_row[s_col]
        time_agreements.append(t_match and s_match)
    time_ok = sum(time_agreements) >= 2
    # Quality Elements: within 1 point on numeric scale
    quality_agreements = []
    for col in quality_elements:
        h = alpha_to_numeric(human_row[col])
        a = alpha_to_numeric(ai_row[col])
        if math.isnan(h) or math.isnan(a):
            quality_agreements.append(False)
        else:
            quality_agreements.append(abs(h - a) <= 1.0)
    quality_ok = sum(quality_agreements) >= 8
    return time_ok, quality_ok

def run_random_exam(models, ai_df, human_df, attempts=2, set_size=3, seed=42):
    random.seed(seed)
    results = {}
    for model in models:
        df_m = ai_df[ai_df['model_name'] == model]
        keys = df_m[['base_id','clip_number']].drop_duplicates().apply(tuple, axis=1).tolist()
        certified = False
        attempt_results = []
        for att in range(1, attempts+1):
            sample = random.sample(keys, k=set_size)
            time_pass = True
            quality_pass = True
            for seg in sample:
                row_ai = df_m[(df_m['base_id'] == seg[0]) & (df_m['clip_number'] == seg[1])].iloc[0]
                row_h = human_df[(human_df['base_id'] == seg[0]) & (human_df['clip_number'] == seg[1])].iloc[0]
                t_ok, q_ok = evaluate_segment(row_ai, row_h)
                if not (t_ok and q_ok):
                    time_pass = time_pass and t_ok
                    quality_pass = quality_pass and q_ok
            passed = time_pass and quality_pass
            attempt_results.append({'attempt': att, 'passed': passed, 'segments': sample})
            if passed:
                certified = True
                break
        results[model] = {'certified': certified, 'attempts': attempt_results}
    return results

def run_average_exam(models, ai_df, human_df):
    results = {}
    for model in models:
        df_m = ai_df[ai_df['model_name'] == model]
        time_ok_list = []
        quality_ok_list = []
        for _, row_ai in df_m.iterrows():
            row_h = human_df[(human_df['base_id'] == row_ai['base_id']) & (human_df['clip_number'] == row_ai['clip_number'])].iloc[0]
            t_ok, q_ok = evaluate_segment(row_ai, row_h)
            time_ok_list.append(t_ok)
            quality_ok_list.append(q_ok)
        time_rate = sum(time_ok_list) / len(time_ok_list) if time_ok_list else float('nan')
        quality_rate = sum(quality_ok_list) / len(quality_ok_list) if quality_ok_list else float('nan')
        results[model] = {'time_pass_rate': time_rate, 'quality_pass_rate': quality_rate}
    return results

In [9]:
# Execute Reliability Exams
models = [os.path.basename(f).replace('_evaluations.csv','') for f in glob.glob(os.path.join(EVALS_DIR, '*_evaluations.csv'))]
ai_df_all = pd.concat([pd.read_csv(f, dtype=str) for f in glob.glob(os.path.join(EVALS_DIR, '*_evaluations.csv'))], ignore_index=True)
human_df_all = load_human_evaluations(TRANSCRIPTS_PATH, FRAMEWORK_PATH)

random_results = run_random_exam(models, ai_df_all, human_df_all)
print("Random Exam Results:")
print(random_results)

avg_results = run_average_exam(models, ai_df_all, human_df_all)
print("\nAverage Exam Results:")
print(avg_results)

KeyError: 'Teacher provides learning activity - 3rd Snapshot'