# Model Behavior Analysis

## Setup

In [None]:
import json
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Default look for matplotlib figures: 8x6 canvas, grid lines off.
plt.rcParams.update({"figure.figsize": (8, 6), "axes.grid": False})

# File names of the evaluation results for the two models being compared;
# change these if your result files are named differently.
BASELINE_CSV, STEP10_CSV = (
    "rft_eval_results_baseline.csv",
    "rft_eval_results_step10.csv",
)
BASELINE_JSONL, STEP10_JSONL = (
    "rft_eval_results_baseline.jsonl",
    "rft_eval_results_step10.jsonl",
)


## Global averages

### Reward, latency, output_tokens

In [None]:
# Read the evaluation result tables for both models.
baseline_df = pd.read_csv(f"build_hour/finqa_model_text/tool_evals/{BASELINE_CSV}")
step10_df = pd.read_csv(f"build_hour/finqa_model_text/tool_evals/{STEP10_CSV}")

# Map a reward of -1 to 0 in both tables before averaging
# (presumably -1 marks a failed or ungraded trace -- TODO confirm).
for _frame in (baseline_df, step10_df):
    _frame["reward"] = _frame["reward"].replace(-1, 0)

def describe_cols(df, name):
    """Summarize reward, latency and output-token columns of one result table.

    Args:
        df: DataFrame with "reward", "latency_s" and "output_tokens" columns.
        name: Label stored under the "model" key of the result.

    Returns:
        dict with "model" plus the mean and population standard deviation
        (NumPy's default, ddof=0) of each of the three columns.
    """
    rewards, latencies, token_counts = (
        df[column].to_numpy()
        for column in ("reward", "latency_s", "output_tokens")
    )

    summary = {"model": name}
    summary["reward_mean"] = np.mean(rewards)
    summary["reward_std"] = np.std(rewards)
    summary["lat_mean"] = np.mean(latencies)
    summary["lat_std"] = np.std(latencies)
    summary["tokens_mean"] = np.mean(token_counts)
    summary["tokens_std"] = np.std(token_counts)
    return summary

# Round every float statistic to 2 decimals; the "model" label passes through.
baseline_summary = {
    key: (round(value, 2) if isinstance(value, float) else value)
    for key, value in describe_cols(baseline_df, "baseline").items()
}
step10_summary = {
    key: (round(value, 2) if isinstance(value, float) else value)
    for key, value in describe_cols(step10_df, "step10").items()
}

baseline_summary, step10_summary


In [None]:
import plotly.express as px
import pandas as pd

# One row per model: mean reward against mean latency.
summary_df = pd.DataFrame(
    [
        {
            "model": summary["model"],
            "reward": summary["reward_mean"],
            "latency": summary["lat_mean"],
        }
        for summary in (baseline_summary, step10_summary)
    ]
)

fig = px.scatter(
    summary_df,
    x="latency",
    y="reward",
    color="model",
    text="model",
    title="Performance vs Latency — Baseline vs Step10",
)

# Axis labels and figure size, then larger markers with the model name
# printed above each point.
fig.update_layout(
    width=800,
    height=500,
    xaxis_title="Avg Latency (s)",
    yaxis_title="Avg Reward",
)
fig.update_traces(marker=dict(size=14), textposition="top center")
fig.show()


### Tool calls per trace

In [None]:
def load_jsonl(path):
    """Load a JSON Lines file into a list of decoded records.

    Blank lines are ignored. Lines that are not valid JSON are skipped
    (best effort), but a single warning reports how many were dropped so a
    damaged file does not silently shrink the analysis.

    Args:
        path: Path of the ``.jsonl`` file to read (decoded as UTF-8).

    Returns:
        list with one decoded JSON value per valid line, in file order.
    """
    import warnings

    rows = []
    skipped = 0
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError:
                skipped += 1
    if skipped:
        warnings.warn(
            f"{path}: skipped {skipped} line(s) that were not valid JSON",
            stacklevel=2,
        )
    return rows

# Load the per-trace JSONL records for both models.
baseline_rows, step10_rows = (
    load_jsonl(f"build_hour/finqa_model_text/tool_evals/{jsonl_name}")
    for jsonl_name in (BASELINE_JSONL, STEP10_JSONL)
)


def extract_tool_calls_from_row(row):
    """Return the names of the tool calls recorded in a row's trajectory.

    The row's "trajectory" value is expected to be a JSON-encoded list of
    message dicts; a value that is already a decoded list is accepted too.
    Messages whose "type" is "function_call" and that carry a non-empty
    "name" contribute that name, in trajectory order.

    Args:
        row: Mapping for one trace (must support ``.get``).

    Returns:
        list of tool names; empty when the trajectory is missing, empty,
        not decodable, or not a list.
    """
    traj = row.get("trajectory")
    if not traj:
        return []
    if isinstance(traj, (str, bytes, bytearray)):
        try:
            traj = json.loads(traj)
        except (ValueError, RecursionError):
            # JSONDecodeError / UnicodeDecodeError are ValueErrors: treat an
            # undecodable trajectory as "no calls" rather than failing.
            return []
    if not isinstance(traj, list):
        return []
    # Skip entries that are not dicts so one odd element cannot abort the run.
    return [
        msg["name"]
        for msg in traj
        if isinstance(msg, dict)
        and msg.get("type") == "function_call"
        and msg.get("name")
    ]


def analyze_tool_calls(rows):
    """Aggregate tool-call statistics over a collection of trace rows.

    Args:
        rows: Iterable of trace mappings, each passed to
            ``extract_tool_calls_from_row``.

    Returns:
        dict with:
            "per_trace_counts": number of tool calls in each trace, in order;
            "tool_counter": total calls per tool name;
            "first_tool_counter" / "last_tool_counter": how often each tool
                was the first / last call of a trace that had any calls;
            "seq_counter": counts of consecutive (tool, next_tool) pairs;
            "tool_calls_by_item": per "item_index", the call counts of its
                traces (rows without an "item_index" are left out).
    """
    stats = {
        "per_trace_counts": [],
        "tool_counter": Counter(),
        "first_tool_counter": Counter(),
        "last_tool_counter": Counter(),
        "seq_counter": Counter(),
        "tool_calls_by_item": defaultdict(list),
    }

    for row in rows:
        names = extract_tool_calls_from_row(row)
        n_calls = len(names)

        stats["per_trace_counts"].append(n_calls)
        stats["tool_counter"].update(names)

        if n_calls > 0:
            stats["first_tool_counter"][names[0]] += 1
            stats["last_tool_counter"][names[-1]] += 1
            # Each adjacent pair of calls is one bigram.
            stats["seq_counter"].update(zip(names, names[1:]))

        item_id = row.get("item_index")
        if item_id is not None:
            stats["tool_calls_by_item"][item_id].append(n_calls)

    return stats


# Aggregate tool-call statistics separately for each model's traces.
baseline_stats, step10_stats = (
    analyze_tool_calls(model_rows) for model_rows in (baseline_rows, step10_rows)
)

def summarize_counts(counts):
    """Summarize a sequence of per-trace counts.

    Args:
        counts: Sequence of numbers (e.g. tool calls per trace).

    Returns:
        dict with "mean" and "std" (population standard deviation, ddof=0)
        as floats and "min" / "max" as ints. For empty input there is nothing
        to summarize: "mean" and "std" are NaN and "min" / "max" are None,
        instead of NumPy's "zero-size array" ValueError.
    """
    arr = np.asarray(counts)
    if arr.size == 0:
        return {
            "mean": float("nan"),
            "std": float("nan"),
            "min": None,
            "max": None,
        }
    return {
        "mean": float(np.mean(arr)),
        "std": float(np.std(arr)),
        "min": int(arr.min()),
        "max": int(arr.max()),
    }

# Round the float statistics to 2 decimals; integer min/max pass through.
baseline_tool_summary = {
    key: (round(value, 2) if isinstance(value, float) else value)
    for key, value in summarize_counts(baseline_stats["per_trace_counts"]).items()
}
step10_tool_summary = {
    key: (round(value, 2) if isinstance(value, float) else value)
    for key, value in summarize_counts(step10_stats["per_trace_counts"]).items()
}

baseline_tool_summary, step10_tool_summary


## Per-item deltas

In [None]:
# Per-item mean reward and mean tool_calls for each model, taken from the
# CSV tables ("b_" = baseline, "s_" = step10).
b_item = baseline_df.groupby("item_index").agg(
    b_reward=("reward", "mean"),
    b_tools=("tool_calls", "mean"),
)
s10_item = step10_df.groupby("item_index").agg(
    s_reward=("reward", "mean"),
    s_tools=("tool_calls", "mean"),
)

# Keep only items present in both tables, then take step10 minus baseline.
item_delta = b_item.join(s10_item, how="inner")
item_delta = item_delta.assign(
    reward_diff=item_delta["s_reward"] - item_delta["b_reward"],
    tool_diff=item_delta["s_tools"] - item_delta["b_tools"],
)

# 2-decimal copy for display.
item_delta_plot = item_delta.round(2)

item_delta_plot.head()


## Quadrant plot

In [None]:
import plotly.express as px
import plotly.graph_objects as go

def classify_row(row):
    """Name the quadrant an item falls in from its reward and tool deltas.

    Args:
        row: Mapping with "reward_diff" and "tool_diff" entries.

    Returns:
        One of the four "<Improved|Worse> reward & <fewer|more> tools"
        labels, or "No change / neutral" when either delta is zero (or does
        not compare as positive or negative, e.g. NaN).
    """
    reward_delta = row["reward_diff"]
    tool_delta = row["tool_diff"]

    if reward_delta > 0:
        if tool_delta < 0:
            return "Improved reward & fewer tools"
        if tool_delta > 0:
            return "Improved reward & more tools"
    elif reward_delta < 0:
        if tool_delta < 0:
            return "Worse reward & fewer tools"
        if tool_delta > 0:
            return "Worse reward & more tools"

    return "No change / neutral"

# Assign each item its quadrant label. This runs unconditionally every time
# the cell executes and adds/overwrites the "category" column on item_delta.
item_delta["category"] = item_delta.apply(classify_row, axis=1)

# Put item_index back as a column for hover + plotting.
# NOTE(review): this rebinds item_delta_plot to the unrounded data, replacing
# the 2-decimal copy built in the per-item deltas cell -- confirm that
# unrounded hover values are intended.
item_delta_plot = item_delta.reset_index()  # item_index becomes a column

# Number of items per category, used in the legend labels
counts = item_delta_plot["category"].value_counts().to_dict()

# Legend label = category name followed by its item count, e.g. "... (n=12)"
item_delta_plot["category_with_n"] = item_delta_plot["category"].map(
    lambda c: f"{c} (n={counts.get(c, 0)})"
)

# Fixed color per category
colors = {
    "Improved reward & fewer tools": "green",
    "Improved reward & more tools": "blue",
    "Worse reward & fewer tools": "orange",
    "Worse reward & more tools": "red",
    "No change / neutral": "gray",
}

# Build discrete color map using the "(n=…)" labels, so its keys match the
# values of the "category_with_n" column; categories with no items get n=0.
color_map = {
    f"{cat} (n={counts.get(cat, 0)})": col
    for cat, col in colors.items()
}

# One point per item: x = change in mean tool calls, y = change in mean reward
fig = px.scatter(
    item_delta_plot,
    x="tool_diff",
    y="reward_diff",
    color="category_with_n",
    color_discrete_map=color_map,
    hover_data=["item_index", "b_reward", "s_reward", "b_tools", "s_tools"],
)

# Add quadrant lines at 0. Each dashed line only spans the observed data
# range on the other axis (min..max of the plotted column).
fig.add_shape(
    type="line",
    x0=0, x1=0,
    y0=item_delta_plot["reward_diff"].min(),
    y1=item_delta_plot["reward_diff"].max(),
    line=dict(color="black", width=1, dash="dash")
)
fig.add_shape(
    type="line",
    x0=item_delta_plot["tool_diff"].min(),
    x1=item_delta_plot["tool_diff"].max(),
    y0=0, y1=0,
    line=dict(color="black", width=1, dash="dash")
)

# Titles, size, and a legend anchored to the top-right corner of the plot area
fig.update_layout(
    title="Quadrant Comparison: Step10 vs Baseline",
    xaxis_title="Δ Tool Calls (Step10 - Baseline)",
    yaxis_title="Δ Reward (Step10 - Baseline)",
    width=900,
    height=600,  
    legend=dict(
        x=1.0,
        y=1.0,
        xanchor="right",
        yanchor="top"
    ),
    template="plotly_white"
)

# Use the same marker size for every trace
fig.update_traces(marker=dict(size=12))

fig.show()


## Trajectory / sequence analysis & example trace


### Per-tool averages per trace

In [None]:
def per_trace_tool_usage(stats):
    """Return each tool's average number of calls per trace.

    Args:
        stats: Mapping with "per_trace_counts" (one entry per trace) and
            "tool_counter" (total calls per tool name).

    Returns:
        dict mapping tool name to total calls divided by the number of
        traces, rounded to 2 decimals.
    """
    n_traces = len(stats["per_trace_counts"])
    usage = {}
    for tool_name, n_calls in stats["tool_counter"].items():
        usage[tool_name] = round(n_calls / n_traces, 2)
    return usage

# Average calls per trace for every tool, one dict per model.
baseline_per_trace, step10_per_trace = (
    per_trace_tool_usage(model_stats) for model_stats in (baseline_stats, step10_stats)
)

baseline_per_trace, step10_per_trace

### Repeat bigrams (search → search, cat → cat) and fraction of repeats

In [None]:
def repeat_fraction(stats):
    """Measure how often a tool call is immediately followed by the same tool.

    Args:
        stats: Mapping whose "seq_counter" maps (tool, next_tool) pairs to
            their counts.

    Returns:
        dict with "repeat" (count of pairs where both tools are equal),
        "total_bigrams" (count of all pairs) and "fraction" (repeat divided
        by total, rounded to 2 decimals; NaN when there are no pairs).
    """
    n_total = 0
    n_repeat = 0
    for (first, second), freq in stats["seq_counter"].items():
        n_total += freq
        if first == second:
            n_repeat += freq

    if n_total > 0:
        share = round(n_repeat / n_total, 2)
    else:
        share = np.nan

    return {
        "repeat": round(n_repeat, 2),
        "total_bigrams": round(n_total, 2),
        "fraction": share,
    }

# Same-tool-repeat statistics for each model.
baseline_repeats, step10_repeats = (
    repeat_fraction(model_stats) for model_stats in (baseline_stats, step10_stats)
)

baseline_repeats, step10_repeats


### Pull an example where baseline loops search/search/... and Step10 doesn’t

In [None]:
# Build mapping item_index -> list of seqs of tool names
from collections import defaultdict

def extract_call_names(row):
    """Return the "name" of every function-call message in a row's trajectory.

    The row's "trajectory" value is expected to be a JSON-encoded list of
    message dicts; a value that is already a decoded list is accepted too.
    Every message whose "type" is "function_call" contributes its "name",
    so a call without a name yields None in the result.

    Args:
        row: Mapping for one trace (must support ``.get``).

    Returns:
        list of names in trajectory order; empty when the trajectory is
        missing, empty, not decodable, or not a list.
    """
    traj = row.get("trajectory")
    if not traj:
        return []
    if isinstance(traj, (str, bytes, bytearray)):
        try:
            traj = json.loads(traj)
        except (ValueError, RecursionError):
            # JSONDecodeError / UnicodeDecodeError are ValueErrors: treat an
            # undecodable trajectory as "no calls" rather than failing.
            return []
    if not isinstance(traj, list):
        return []
    # Skip entries that are not dicts so one odd element cannot abort the run.
    return [
        m.get("name")
        for m in traj
        if isinstance(m, dict) and m.get("type") == "function_call"
    ]


# item_index -> list of tool-name sequences, one sequence per trace.
b_calls = defaultdict(list)
s_calls = defaultdict(list)

for grouped, source_rows in ((b_calls, baseline_rows), (s_calls, step10_rows)):
    for record in source_rows:
        grouped[record["item_index"]].append(extract_call_names(record))


def count_search_loops(seq):
    """Return the number of adjacent positions in seq where "search" follows "search"."""
    return sum(
        1
        for current, following in zip(seq, seq[1:])
        if current == "search" and following == "search"
    )


# Items where some baseline trace repeats search->search at least twice while
# no step10 trace repeats it at all. Traces without any tool call are ignored,
# and an item needs at least one non-empty trace on both sides.
candidates = []
for item, baseline_seqs in b_calls.items():
    if item not in s_calls:
        continue
    b_seqs = list(filter(None, baseline_seqs))
    s_seqs = list(filter(None, s_calls[item]))
    if not (b_seqs and s_seqs):
        continue
    b_loop = max(map(count_search_loops, b_seqs))
    s_loop = max(map(count_search_loops, s_seqs))
    if s_loop == 0 and b_loop >= 2:
        candidates.append(item)

candidates[:10], len(candidates)


In [None]:
# Use the first candidate as the worked example. When the candidate search
# found nothing, fail with an explanatory message instead of a bare
# "list index out of range".
if not candidates:
    raise IndexError(
        "No candidate items: no item has a baseline trace with >= 2 "
        "consecutive search repeats and no repeats in its step10 traces."
    )
example_item = candidates[0]

def best_baseline_trace_for_item(item):
    """Return the baseline call sequence for item with the most search repeats.

    Scans baseline_rows for rows whose "item_index" equals item; on a tie the
    earliest such row wins. Returns None when no row matches.
    """
    sequences = (
        extract_call_names(row)
        for row in baseline_rows
        if row["item_index"] == item
    )
    # max() keeps the first of several equally-scored sequences.
    return max(sequences, key=count_search_loops, default=None)

def any_step10_trace_for_item(item):
    """Return the call sequence of the first step10 row for item, or [] if none."""
    matching = (
        extract_call_names(row)
        for row in step10_rows
        if row["item_index"] == item
    )
    return next(matching, [])

# Baseline trace with the most search repeats vs. the first step10 trace
# for the same item.
b_seq, s_seq = (
    best_baseline_trace_for_item(example_item),
    any_step10_trace_for_item(example_item),
)

example_item, b_seq, s_seq