# Chatbot Evaluation Error Analysis

This notebook analyzes the results of a chatbot evaluation run. It provides: standard metrics by scenario, function call error analysis, and LLM-based root cause analysis for each test case.

In [None]:
import json
import pandas as pd
from collections import Counter, defaultdict
from pathlib import Path

# Path to evaluation_results.json (update if needed)
results_path = Path('evaluation_results.json')
with open(results_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Locate the per-test-case records. Supported layouts:
#   [{"rows": [...]}, ...] -> rows of the first entry (later entries are ignored)
#   {"rows": [...]}        -> rows of that single document
#   anything else          -> the loaded value is treated as the records themselves
# The emptiness / type checks keep an empty list from raising IndexError on data[0].
if isinstance(data, list) and data and isinstance(data[0], dict) and 'rows' in data[0]:
    rows = data[0]['rows']
elif isinstance(data, dict) and isinstance(data.get('rows'), list):
    rows = data['rows']
else:
    rows = data

# Flatten nested record fields into dotted column names (e.g. 'inputs.scenarioType').
df = pd.json_normalize(rows)
print(f'Loaded {len(df)} test cases.')

## 1. Standard Metrics by Scenario

In [None]:
# Score columns to average; the LLM analysis cell further down also reads `metrics`.
metrics = [
    'outputs.Precision_fn.score',
    'outputs.Recall_fn.score',
    'outputs.Precision_args.score',
    'outputs.Recall_args.score',
    'outputs.Reliability.score',
]
scenario_col = 'inputs.scenarioType'

# Mean of each score within every scenario type, rounded to two decimals.
summary = (
    df.groupby(scenario_col)[metrics]
    .mean()
    .round(2)
)
display(summary)

## 2. Function Call Error Analysis

This section compares expected vs actual function calls and arguments, and identifies which functions fail the most.

In [None]:
def extract_fn_calls(row, key):
    """Return the function names recorded under ``key`` in ``row``.

    Args:
        row: Mapping-like object exposing ``.get`` (a dict or a pandas row).
        key: Name of the field holding the function calls, either as a list
            of dicts or as a JSON-encoded string of such a list.

    Returns:
        list: The ``functionName`` value of every dict entry that has one, in
        their original order. Empty when the field is missing, is not valid
        JSON, or does not hold a list (e.g. ``None`` or a NaN placeholder for
        a missing value).
    """
    calls = row.get(key, [])
    if isinstance(calls, str):
        try:
            calls = json.loads(calls)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            return []
    # Missing values show up as None/NaN, which are not iterable; treat anything
    # that is not a sequence of calls as "no calls" instead of raising TypeError.
    if not isinstance(calls, (list, tuple)):
        return []
    return [c['functionName'] for c in calls if isinstance(c, dict) and 'functionName' in c]

# Per-test-case lists of function names: what the test expected vs. what was called.
# `args` forwards the column name to extract_fn_calls as its `key` argument.
df['expected_fns'] = df.apply(extract_fn_calls, axis=1, args=('inputs.expected_function_calls',))
df['actual_fns'] = df.apply(extract_fn_calls, axis=1, args=('outputs.function_calls',))

# Flatten the per-case lists, then tally how often each function name occurs.
all_expected = [name for case_fns in df['expected_fns'] for name in case_fns]
all_actual = [name for case_fns in df['actual_fns'] for name in case_fns]
expected_counts = Counter(all_expected)
actual_counts = Counter(all_actual)

# One single-column table per side, indexed by function name.
print('Expected function call counts:')
display(
    pd.DataFrame.from_dict(expected_counts, orient='index', columns=['expected'])
)
print('Actual function call counts:')
display(
    pd.DataFrame.from_dict(actual_counts, orient='index', columns=['actual'])
)

# Function-level error rates.
# Each test case contributes at most once per function, because the expected and
# actual names are compared as sets:
#   missed - the function was expected but not called in that case
#   extra  - the function was called but not expected in that case
#   total  - number of cases in which the function was expected
fn_errors = defaultdict(lambda: {'missed': 0, 'extra': 0, 'total': 0})
for exp_fns, act_fns in zip(df['expected_fns'], df['actual_fns']):
    exp, act = set(exp_fns), set(act_fns)
    for fn in exp - act:
        fn_errors[fn]['missed'] += 1
    for fn in act - exp:
        fn_errors[fn]['extra'] += 1
    for fn in exp:
        fn_errors[fn]['total'] += 1

# Build the table with explicit columns so it is well-formed (and the rate
# columns below can be computed) even when no function call was recorded.
fn_error_df = pd.DataFrame(
    list(fn_errors.values()),
    index=list(fn_errors),
    columns=['missed', 'extra', 'total'],
    dtype='int64',
)

# Both rates are relative to how often the function was expected. A function that
# was never expected (total == 0) has no meaningful rate, so its denominator is
# masked to NaN rather than letting a division by zero produce inf.
expected_total = fn_error_df['total'].where(fn_error_df['total'] > 0)
fn_error_df['missed_rate'] = (fn_error_df['missed'] / expected_total).round(2)
fn_error_df['extra_rate'] = (fn_error_df['extra'] / expected_total).round(2)
display(fn_error_df.sort_values('missed_rate', ascending=False))

## 3. LLM-based Root Cause Analysis (per test case)

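For each analyzed test case (the demo cell below is limited to the first 5), we use an LLM to examine the chat history and function call results and suggest root causes for any errors. This requires the `openai` package and an API key for OpenAI or a compatible service, supplied through the `OPENAI_API_KEY` environment variable.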

In [None]:
import os
import openai
# The key is read from the environment so it is never stored in the notebook;
# the value is None when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Helper to format chat history for LLM prompt
def format_chat_history(chat):
    """Render a chat transcript as ``role: content`` lines for an LLM prompt.

    Args:
        chat: Normally a list of message dicts. Any value that is not a list
            is returned unchanged as ``str(chat)``.

    Returns:
        str: One line per message that has non-empty content, joined with
        newlines. The speaker label is the message's ``role``, falling back to
        ``name`` and then to an empty string. List entries that are not dicts
        are included as plain text when non-empty instead of raising
        ``AttributeError``.
    """
    if not isinstance(chat, list):
        return str(chat)
    lines = []
    for message in chat:
        if not isinstance(message, dict):
            # Tolerate stray entries (plain strings, None) so a single malformed
            # message does not abort the analysis of the whole test case.
            if message:
                lines.append(str(message))
            continue
        speaker = message.get('role', message.get('name', ''))
        content = message.get('content', '')
        if content:
            lines.append(f"{speaker}: {content}")
    return '\n'.join(lines)

def analyze_case_with_llm(chat_history, expected_calls, actual_calls, metrics,
                          model="gpt-3.5-turbo", max_tokens=300, temperature=0.2):
    """Ask a chat model for a root-cause analysis of one evaluated test case.

    Args:
        chat_history: Transcript text to embed in the prompt.
        expected_calls: Expected function calls; interpolated into the prompt as-is.
        actual_calls: Function calls the chatbot made; interpolated as-is.
        metrics: Scores for this test case; interpolated as-is.
        model: Chat model name passed to the API.
        max_tokens: Upper bound on the length of the generated analysis.
        temperature: Sampling temperature passed to the API.

    Returns:
        str: The model's analysis text, or a message starting with
        ``"LLM analysis failed:"`` if the request raised any exception
        (failures are reported in-band so a batch run keeps going).
    """
    prompt = f'''
You are an expert in chatbot evaluation.

Chat history:
{chat_history}

Expected function calls: {expected_calls}
Actual function calls: {actual_calls}

Metrics: {metrics}

Analyze the root cause of any errors or mismatches. Provide actionable insights for improvement.
'''
    messages = [{"role": "system", "content": "You are a helpful AI assistant for chatbot evaluation."},
                {"role": "user", "content": prompt}]
    try:
        if hasattr(openai, 'OpenAI'):
            # openai>=1.0 removed openai.ChatCompletion; the module-level client
            # still honours settings such as openai.api_key.
            response = openai.chat.completions.create(
                model=model,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature
            )
            return response.choices[0].message.content
        # Legacy openai<1.0 interface.
        response = openai.ChatCompletion.create(
            model=model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature
        )
        return response['choices'][0]['message']['content']
    except Exception as e:
        return f"LLM analysis failed: {e}"

# Run LLM analysis for each test case (limit to the first N_LLM_CASES for demo)
N_LLM_CASES = 5
llm_results = []
for idx, row in df.head(N_LLM_CASES).iterrows():
    chat = row.get('outputs.chat_history', [])
    expected = row.get('inputs.expected_function_calls', [])
    actual = row.get('outputs.function_calls', [])
    # Keep the per-case scores under their own name: assigning them to `metrics`
    # would replace the notebook-level list of score columns and break a re-run
    # of the scenario summary cell.
    case_metrics = {k: row.get(k, None) for k in metrics}
    chat_str = format_chat_history(chat)
    result = analyze_case_with_llm(chat_str, expected, actual, case_metrics)
    llm_results.append({
        'test_case': idx,
        'scenario': row.get('inputs.scenarioType', ''),
        'llm_analysis': result
    })
# Report the number actually analysed, which is lower than N_LLM_CASES when the
# results contain fewer rows.
print(f'LLM root cause analysis complete for first {len(llm_results)} test cases.')

In [None]:
import pandas as pd
# Passing `columns` keeps the frame well-formed (and in a fixed column order)
# even when `llm_results` is empty, where column selection would raise KeyError.
llm_df = pd.DataFrame(llm_results, columns=['test_case', 'scenario', 'llm_analysis'])
# The default display.max_colwidth (50 characters) would cut the analyses short,
# so lift the limit for this one display call only.
with pd.option_context('display.max_colwidth', None):
    display(llm_df[['test_case', 'scenario', 'llm_analysis']])


---

## Usage Instructions

- Place this notebook in the output directory of a chatbot evaluation run (where `evaluation_results.json` is located).
- Set the `OPENAI_API_KEY` environment variable, then run all cells to generate the error analysis, including the LLM-based root cause analysis.
- Review the metrics, function call analysis, and LLM insights to identify areas for improvement in the chatbot.
