In [6]:
import json
import os
import numpy as np
import pandas as pd

# --- Configuration: adjust these to your setup ---
llms = ['Llama-3.1-8B-Instruct', 'gemma-2-2b-it', 'mistral-instruct']
tasks = ['education', 'therapy', 'chatting']
n_files_per_llm = 4

# You can now specify a custom path for each task

task_paths = {
    "therapy": "therapy/exp/05.08.25/",
    "chitchat": "chatting/exp/04.26.25/",
    "education": "training_data/in_education/"
}


# Metric keys
metric_keys = ['P2_prompt_consistency_score', 'P2_index_consistency_score', 'P2_q&a_consistency_score']
rename_map = {
    'P2_prompt_consistency_score': 'prompt-to-line Consistency',
    'P2_index_consistency_score':   'line-to-line consistency',
    'P2_q&a_consistency_score':     'q&a consistency'
}

records = []

for task in tasks:
    task_dir = task_paths.get(task, "")
    if not os.path.isdir(task_dir):
        print(f"Warning: task directory not found: {task_dir}; skipping.")
        continue

    all_files = [f for f in os.listdir(task_dir) if f.endswith('.json')]
    for llm in llms:
        llm_files = sorted([f for f in all_files if llm in f])[:n_files_per_llm]
        metric_file_means = {k: [] for k in metric_keys}

        for fname in llm_files:
            path = os.path.join(task_dir, fname)
            try:
                data = json.load(open(path))
            except Exception as e:
                print(f"  • Couldn’t read {fname} ({e}); using zeros.")
                for k in metric_keys:
                    metric_file_means[k].append(0.0)
                continue

            # handle list-of-dicts format
            if isinstance(data, list):
                for k in metric_keys:
                    vals = []
                    for i, entry in enumerate(data):
                        v = entry.get(k, 0.0)
                        if not isinstance(v, (int, float)):
                            print(f"    • Bad '{k}' in element {i} of {fname}; using 0")
                            v = 0.0
                        vals.append(v)
                    metric_file_means[k].append(np.mean(vals) if vals else 0.0)

            # handle single-dict-with-list format
            elif isinstance(data, dict):
                for k in metric_keys:
                    arr = data.get(k, [])
                    if isinstance(arr, list) and len(arr) > 0:
                        metric_file_means[k].append(np.mean(arr))
                    else:
                        print(f"    • Missing/invalid '{k}' in {fname}; using 0")
                        metric_file_means[k].append(0.0)
            else:
                print(f"    • Unexpected structure in {fname}; padding zeros.")
                for k in metric_keys:
                    metric_file_means[k].append(0.0)

        # pad if fewer runs
        for k in metric_keys:
            missing = n_files_per_llm - len(metric_file_means[k])
            if missing > 0:
                metric_file_means[k].extend([0.0] * missing)

        rec = {'Task': task, 'LLM': llm}
        for k in metric_keys:
            rec[k] = np.mean(metric_file_means[k])
        records.append(rec)

# Build DataFrame with multi-index and three metric columns
df = pd.DataFrame(records)
table = df.set_index(['Task','LLM'])[metric_keys]
table.rename(columns=rename_map, inplace=True)

# Export to LaTeX
latex = table.to_latex(
    float_format="%.3f",
    caption="All Metrics (Prompt & Index Consistency) by Task and LLM",
    label="tab:all_metrics"
)
with open('all_llm_metrics.tex', 'w') as f:
    f.write(latex)

# Print or display
print(latex)
print(table)
print("\nLaTeX table written to all_llm_metrics.tex")


\begin{table}
\caption{All Metrics (Prompt & Index Consistency) by Task and LLM}
\label{tab:all_metrics}
\begin{tabular}{llrrr}
\toprule
 &  & prompt-to-line Consistency & line-to-line consistency & q&a consistency \\
Task & LLM &  &  &  \\
\midrule
\multirow[t]{3}{*}{education} & Llama-3.1-8B-Instruct & 0.824 & 0.800 & 0.000 \\
 & gemma-2-2b-it & 0.511 & 0.928 & 0.000 \\
 & mistral-instruct & 0.728 & 0.975 & 0.000 \\
\cline{1-5}
\multirow[t]{3}{*}{therapy} & Llama-3.1-8B-Instruct & 0.164 & 0.170 & 0.000 \\
 & gemma-2-2b-it & 0.166 & 0.246 & 0.000 \\
 & mistral-instruct & 0.216 & 0.241 & 0.000 \\
\cline{1-5}
\bottomrule
\end{tabular}
\end{table}

                                 prompt-to-line Consistency  \
Task      LLM                                                 
education Llama-3.1-8B-Instruct                    0.824292   
          gemma-2-2b-it                            0.511292   
          mistral-instruct                         0.728125   
therapy   Llama-3.1-8B-Instruc

In [3]:
import os
import json
import numpy as np
import glob
from collections import defaultdict

# Set your domain file paths here
domain_paths = {
    "therapy": "therapy/exp/05.11.25/",
    "teaching": "education/exp/05.06.25/",
    "chitchat": "chatting/exp/05.06.25/"
}

default_paths = {
    "therapy": "therapy/exp/05.08.25/",
    "chitchat": "chatting/exp/04.26.25/",
    "teaching": "training_data/in_education/"
}

# Only these algorithms are considered
algorithms = ["sft", "ppo", "kto"]

# Results structures
results = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
default_results = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
loaded_files = set()

def detect_algorithm(filename):
    name = filename.lower()
    if "kto" in name:
        return "kto"
    elif "ppo" in name:
        return "ppo"
    elif "sft" in name:
        return "sft"
    else:
        return None

# --- Process fine-tuned (sft, ppo, kto) files ---
for domain, base_path in domain_paths.items():
    pattern = os.path.join(base_path, "*.json")
    matched_files = glob.glob(pattern)

    for file_path in matched_files:
        file_path = os.path.abspath(file_path)
        if any(bad in file_path.lower() for bad in ["gemma", "mistral"]):
            continue
        if file_path in loaded_files:
            continue
        loaded_files.add(file_path)

        alg = detect_algorithm(file_path)
        if alg is None:
            continue

        try:
            with open(file_path, "r") as f:
                data = json.load(f)
            for entry in data:
                if "P2_prompt_consistency_score" in entry and "rounds" in entry:
                    score = entry["P2_prompt_consistency_score"]
                    rounds = entry["rounds"]
                    results[domain][alg][rounds].append(score)
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

# --- Process default files, excluding gemma/mistral ---
for domain, path in default_paths.items():
    pattern = os.path.join(path, "*.json")
    matched_files = glob.glob(pattern)

    for file_path in matched_files:
        file_path = os.path.abspath(file_path)
        if any(bad in file_path.lower() for bad in ["gemma", "mistral"]):
            continue
        if file_path in loaded_files:
            continue
        loaded_files.add(file_path)

        try:
            with open(file_path, "r") as f:
                data = json.load(f)
            for entry in data:
                if all(k in entry for k in ["P2_prompt_consistency_score", "rounds", "task_name"]):
                    score = entry["P2_prompt_consistency_score"]
                    rounds = entry["rounds"]
                    task = entry["task_name"]
                    default_results[domain][task][rounds].append(score)
        except Exception as e:
            print(f"Error reading default file {file_path}: {e}")

# --- Print default results grouped by task ---
print(f"{'Domain':<10} {'Task':<12} {'Rounds':<6} {'Mean':<8} {'Std Dev':<8} {'N':<4}")
print("-" * 70)
for domain in default_paths:
    domain_has_data = domain in default_results and len(default_results[domain]) > 0

    if not domain_has_data:
        print(f"{domain:<10} {'(no data)':<12} {'-':<6} {'-':<8} {'-':<8} {'0':<4}")
        print("-" * 70)
        continue

    for task in default_results[domain]:
        all_scores = []
        for rounds, scores in sorted(default_results[domain][task].items()):
            mean = round(np.mean(scores), 3)
            std = round(np.std(scores), 3)
            count = len(scores)
            all_scores.extend(scores)
            print(f"{domain:<10} {task:<12} {rounds:<6} {mean:<8} {std:<8} {count:<4}")

        if all_scores:
            avg_mean = round(np.mean(all_scores), 3)
            avg_std = round(np.std(all_scores), 3)
            total = len(all_scores)
            print(f"{domain:<10} {task:<12} {'Avg':<6} {avg_mean:<8} {avg_std:<8} {total:<4}")
        print("-" * 70)

# --- Print fine-tuned results grouped by algorithm ---
print(f"\n{'Domain':<10} {'Algorithm':<9} {'Rounds':<6} {'Mean':<8} {'Std Dev':<8} {'N':<4}")
print("-" * 65)
for domain in domain_paths:
    for alg in algorithms:
        if domain not in results or alg not in results[domain]:
            print(f"{domain:<10} {alg:<9} {'-':<6} {'-':<8} {'-':<8} {'0':<4}")
            print("-" * 65)
            continue

        all_scores = []
        for rounds, scores in sorted(results[domain][alg].items()):
            mean = round(np.mean(scores), 3)
            std = round(np.std(scores), 3)
            count = len(scores)
            all_scores.extend(scores)
            print(f"{domain:<10} {alg:<9} {rounds:<6} {mean:<8} {std:<8} {count:<4}")

        if all_scores:
            avg_mean = round(np.mean(all_scores), 3)
            avg_std = round(np.std(all_scores), 3)
            total = len(all_scores)
            print(f"{domain:<10} {alg:<9} {'Avg':<6} {avg_mean:<8} {avg_std:<8} {total:<4}")
        print("-" * 65)

Domain     Task         Rounds Mean     Std Dev  N   
----------------------------------------------------------------------
therapy    Therapy      10     0.738    0.222    100 
therapy    Therapy      20     0.68     0.194    100 
therapy    Therapy      40     0.638    0.186    100 
therapy    Therapy      60     0.571    0.185    100 
therapy    Therapy      Avg    0.657    0.207    400 
----------------------------------------------------------------------
chitchat   Chatting     10     0.488    0.273    100 
chitchat   Chatting     20     0.609    0.242    100 
chitchat   Chatting     40     0.665    0.211    100 
chitchat   Chatting     60     0.714    0.205    100 
chitchat   Chatting     Avg    0.619    0.249    400 
----------------------------------------------------------------------
teaching   Education    10     0.848    0.165    100 
teaching   Education    20     0.798    0.137    100 
teaching   Education    40     0.822    0.119    100 
teaching   Education    60     

In [16]:
import json
import os
import numpy as np
from collections import defaultdict

# --- Configuration ---
llms = ['Llama-3.1-8B-Instruct', 'gemma-2-2b-it', 'mistral-instruct']
tasks = ['education', 'therapy', 'chatting']
n_files_per_llm = 4

task_paths = {
    "therapy": "therapy/exp/05.08.25/",
    "chatting": "chatting/exp/04.26.25/",
    "education": "training_data/in_education/"
}

metrics_to_summarize = {
    'P2_prompt_consistency_score': 'Prompt Consistency',
    'P2_index_consistency_score': 'Index Consistency',
    'P2_q&a_consistency_score': 'Q&A Consistency'
}

# --- Generate LLM × Task × Metric LaTeX Table (mean ± std) ---
latex_llm_rows = []
for task in tasks:
    task_dir = task_paths.get(task, "")
    if not os.path.isdir(task_dir): continue

    for llm in llms:
        scores_by_metric = {k: [] for k in metrics_to_summarize}
        all_files = [f for f in os.listdir(task_dir) if f.endswith('.json') and llm in f][:n_files_per_llm]

        for fname in all_files:
            try:
                with open(os.path.join(task_dir, fname)) as f:
                    data = json.load(f)
            except:
                continue
            if not isinstance(data, list): continue

            for entry in data:
                for key in metrics_to_summarize:
                    val = entry.get(key)
                    if isinstance(val, (int, float)):
                        scores_by_metric[key].append(val)

        row = [f"\\textit{{{task.title()}}}" if llm == llms[0] else "", llm]
        for key in metrics_to_summarize:
            vals = scores_by_metric[key]
            if vals:
                mean = np.mean(vals)
                std = np.std(vals)
                row.append(f"${mean:.3f} \\pm {std:.3f}$")
            else:
                row.append("$\\text{--}$")
        latex_llm_rows.append(" & ".join(row) + " \\\\")

latex_llm_table = r"""
\begin{table*}[t]
    \centering
    \scriptsize
    \begin{tabular}{l l c c c}
        \toprule
        \textbf{Task} & \textbf{LLM} 
        & \textbf{prompt-to-line Consistency} 
        & \textbf{line-to-line Consistency} 
        & \textbf{Q\&A Consistency} \\
        \midrule
""" + "\n".join(latex_llm_rows) + r"""
        \bottomrule
    \end{tabular}
    \caption{\textbf{LLM Consistency Metrics across Tasks.} 
    Mean and standard deviation (mean $\pm$ std) of three consistency metrics—prompt-to-line, line-to-line, and Q\&A consistency—for each LLM across different dialogue tasks.}
    \label{tab:llm_consistency}
\end{table*}
"""

with open("llm_consistency_metrics.tex", "w") as f:
    f.write(latex_llm_table)

# --- Generate Conversation Length × Task × Metric LaTeX Table (mean ± std) ---
latex_round_rows = []
round_lengths = [10, 20, 40, 60]

for task in tasks:
    task_dir = task_paths.get(task, "")
    if not os.path.isdir(task_dir): continue

    for round_len in round_lengths + ["Avg"]:
        scores_by_metric = {k: [] for k in metrics_to_summarize}
        for llm in llms:
            files = [f for f in os.listdir(task_dir) if f.endswith('.json') and llm in f][:n_files_per_llm]
            for fname in files:
                try:
                    with open(os.path.join(task_dir, fname)) as f:
                        data = json.load(f)
                except:
                    continue
                if not isinstance(data, list): continue

                for entry in data:
                    if round_len != "Avg" and entry.get("rounds") != round_len:
                        continue
                    for key in metrics_to_summarize:
                        val = entry.get(key)
                        if isinstance(val, (int, float)):
                            scores_by_metric[key].append(val)

        row = [f"\\textit{{{task.title()}}}" if round_len == round_lengths[0] else "", str(round_len)]
        for key in metrics_to_summarize:
            vals = scores_by_metric[key]
            if vals:
                mean = np.mean(vals)
                std = np.std(vals)
                row.append(f"${mean:.3f} \\pm {std:.3f}$")
            else:
                row.append("$\\text{--}$")
        latex_round_rows.append(" & ".join(row) + " \\\\")

latex_round_table = r"""
\begin{table*}[t]
    \centering
    \scriptsize
    \begin{tabular}{l c c c c}
        \toprule
        \textbf{Task} & \textbf{Rounds} 
        & \textbf{prompt-to-line Consistency} 
        & \textbf{line-to-line Consistency} 
        & \textbf{Q\&A Consistency} \\
        \midrule
""" + "\n".join(latex_round_rows) + r"""
        \bottomrule
    \end{tabular}
    \caption{\textbf{Consistency Metrics across Conversation Lengths.} 
    Mean and standard deviation (mean $\pm$ std) of each consistency metric for each task, averaged across LLMs.}
    \label{tab:length_consistency}
\end{table*}
"""

with open("round_length_consistency_metrics.tex", "w") as f:
    f.write(latex_round_table)

print("\n===== LLM × Task Consistency Table (LaTeX) =====\n")
print(latex_llm_table)

print("\n===== Round Length × Task Consistency Table (LaTeX) =====\n")
print(latex_round_table)


===== LLM × Task Consistency Table (LaTeX) =====


\begin{table*}[t]
    \centering
    \scriptsize
    \begin{tabular}{l l c c c}
        \toprule
        \textbf{Task} & \textbf{LLM} 
        & \textbf{prompt-to-line Consistency} 
        & \textbf{line-to-line Consistency} 
        & \textbf{Q\&A Consistency} \\
        \midrule
\textit{Education} & Llama-3.1-8B-Instruct & $0.824 \pm 0.132$ & $0.800 \pm 0.148$ & $\text{--}$ \\
 & gemma-2-2b-it & $0.511 \pm 0.250$ & $0.928 \pm 0.092$ & $\text{--}$ \\
 & mistral-instruct & $0.728 \pm 0.191$ & $0.975 \pm 0.063$ & $\text{--}$ \\
\textit{Therapy} & Llama-3.1-8B-Instruct & $0.657 \pm 0.207$ & $0.681 \pm 0.168$ & $\text{--}$ \\
 & gemma-2-2b-it & $0.665 \pm 0.247$ & $0.984 \pm 0.040$ & $\text{--}$ \\
 & mistral-instruct & $0.863 \pm 0.186$ & $0.964 \pm 0.078$ & $\text{--}$ \\
\textit{Chatting} & Llama-3.1-8B-Instruct & $0.619 \pm 0.249$ & $0.992 \pm 0.025$ & $\text{--}$ \\
 & gemma-2-2b-it & $0.871 \pm 0.230$ & $0.900 \pm 0.123$ & $\text{