 ## NaN classifications by model

In [3]:
import os
import json
from collections import defaultdict
from llm_arithmetic.types import Trial

from rich.table import Table
from rich.console import Console

# Load every trial record from the .jsonl files in the results directory and
# group the resulting Trial objects by the model that produced them.

# Path to results directory
results_dir = "results"

# Group trials by model: model name -> list of Trial objects
model_trials = defaultdict(list)

# Values substituted when a record omits one of these fields.
field_defaults = {
    "extra_context": 0,
    "attempts": 1,
    "failed_to_get_reply": False,
}

# os.listdir() returns entries in arbitrary order; sorting makes the load order
# (and therefore the order of trials within each model's list) reproducible.
for fname in sorted(os.listdir(results_dir)):
    if not fname.endswith(".jsonl"):
        continue
    # Explicit UTF-8 so decoding does not depend on the platform's locale default.
    with open(os.path.join(results_dir, fname), "r", encoding="utf-8") as f:
        for line in f:
            # A blank line is not valid JSON and would make json.loads raise;
            # skip it instead of aborting the whole load.
            if not line.strip():
                continue
            data = json.loads(line)
            # Flatten a nested "tokens" dict into the two flat count fields.
            if "tokens" in data:
                tokens = data.pop("tokens")
                data["prompt_tokens"] = tokens.get("prompt_tokens", 0)
                data["completion_tokens"] = tokens.get("completion_tokens", 0)
            # Fill missing fields with defaults; values already present are kept.
            for field, default in field_defaults.items():
                data.setdefault(field, default)
            trial = Trial(**data)
            model_trials[trial.model].append(trial)

# For each model, record how many of its trials were classified "NaN", how many
# trials it has in total, and the NaN share as a percentage.
nan_data = {}
for model, trials in model_trials.items():
    total_count = len(trials)
    nan_count = sum(1 for t in trials if t.classification == "NaN")
    # Guard the division so a model with no trials reports 0 instead of raising.
    if total_count > 0:
        nan_percentage = nan_count / total_count * 100
    else:
        nan_percentage = 0
    nan_data[model] = (nan_count, total_count, nan_percentage)


def _by_nan_count(entry):
    """Sort key: the NaN count stored first in a (model, stats) entry's stats tuple."""
    _, stats = entry
    return stats[0]


# (model, stats) pairs ordered from the most NaN classifications to the fewest
sorted_data = sorted(nan_data.items(), key=_by_nan_count, reverse=True)

# Build a Rich table with one row per model, in the order given by sorted_data.
table = Table(title="NaN Classifications by Model")

# (column header, Rich style) pairs, added left to right
for column_header, column_style in (
    ("Model", "cyan"),
    ("NaN Classifications", "bold yellow"),
    ("Percentage", "bold green"),
):
    table.add_column(column_header, style=column_style)

for model, stats in sorted_data:
    nan_count, total_count, percentage = stats
    # Show the count as "nan/total" and the percentage with one decimal place.
    table.add_row(model, f"{nan_count}/{total_count}", f"{percentage:.1f}%")

# Render the table to the cell output
console = Console()
console.print(table)

In [6]:
# List every trial of grok-3-mini-beta-high that was classified "NaN", one per
# line, as: expected answer -> repr of the raw model response.
model_name = "grok-3-mini-beta-high"

if model_name not in model_trials:
    print(f"Model {model_name} not found in the data")
else:
    trials = model_trials[model_name]
    nan_trials = list(filter(lambda t: t.classification == "NaN", trials))

    # Three header lines: title with the case count, column legend, separator rule
    print(
        f"NaN trials for {model_name} ({len(nan_trials)} cases):",
        "Correct -> Raw Response",
        "-" * 50,
        sep="\n",
    )

    for trial in nan_trials:
        # repr() keeps empty responses and escape sequences visible
        print(f"{trial.correct} -> {repr(trial.raw_response)}")

NaN trials for grok-3-mini-beta-high (17 cases):
Correct -> Raw Response
--------------------------------------------------
1128 -> '\\boxed{1128}'
1164943 -> '\\boxed{1164943}'
5402825 -> '\\boxed{5402825}'
115755958 -> '\\boxed{115755958}'
52477 -> 'The subtraction 64051 - 11574 is computed step by step. After performing the calculation, the result is verified by addition.\n\n\\boxed{52477}'
1364 -> '\\boxed{1364}'
437552 -> '\\boxed{437552}'
112035 -> '\\boxed{112035}'
30818115 -> '\\boxed{30818115}'
15 -> '\\boxed{15}'
143.53 -> '\\boxed{143.53}'
7245.47 -> '\\boxed{7245.47}'
47363.87 -> '\\boxed{47363.87}'
-914.20 -> '\\boxed{-914.20}'
30089859.97 -> '\\boxed{30089859.97}'
3325.9781 -> '\\boxed{3325.9781}'
122953.0746 -> '\\boxed{122953.0746}'


In [7]:
# List every trial of claude-sonnet-3.7-20250219-thinking4096 that was
# classified "NaN": expected answer -> repr of the raw model response.
model_name = "claude-sonnet-3.7-20250219-thinking4096"

if model_name not in model_trials:
    print(f"Model {model_name} not found in the data")
else:
    trials = model_trials[model_name]
    nan_trials = [t for t in trials if t.classification == "NaN"]

    # Title with the case count, then the column legend and a separator rule
    heading = f"NaN trials for {model_name} ({len(nan_trials)} cases):"
    print(heading)
    print("Correct -> Raw Response")
    print("-" * 50)

    for trial in nan_trials:
        # repr() makes an empty response show up as '' rather than as nothing
        print(f"{trial.correct} -> {repr(trial.raw_response)}")

NaN trials for claude-sonnet-3.7-20250219-thinking4096 (97 cases):
Correct -> Raw Response
--------------------------------------------------
1291972577 -> ''
-17361554 -> ''
-280113821 -> ''
6610509564 -> ''
-4438377056 -> ''
4272353979 -> ''
446670810 -> ''
3020510016 -> ''
1570813062 -> ''
4859204972 -> ''
370241826750 -> ''
37559501262 -> ''
811096868772 -> ''
835511114673 -> ''
22134054139368 -> ''
5869222017048 -> ''
31084596661788 -> ''
1712500922391175 -> ''
5739094492829595 -> ''
4889600339740209 -> ''
914381944706580 -> ''
3582983788464320 -> ''
584445667966190 -> ''
515693301793758396 -> ''
161053607521505736 -> ''
125252323661604564 -> ''
61575565731670143 -> ''
184250880339512845 -> ''
239121724375175640 -> ''
348499316001641958 -> ''
26527589690037975304 -> ''
26204076529781503296 -> ''
11210450538159555144 -> ''
9949751046499996400 -> ''
14656027415155492888 -> ''
22423 -> ''
76037 -> ''
624872 -> ''
950332 -> ''
4496361 -> ''
7840349 -> ''
6320682 -> ''
8221992 -> ''
22

In [None]:
# List every trial of qwen3-32b@cerebras-thinking that was classified "NaN":
# expected answer -> repr of the raw model response.
#
# The name previously ended with a stray trailing space. The lookup below is an
# exact key match, so the padded name would not match a key stored without it.
# NOTE(review): confirm the exact model key against the results files.
# NOTE(review): this cell has no execution count; re-run it to refresh its output.
model_name = "qwen3-32b@cerebras-thinking"

if model_name in model_trials:
    trials = model_trials[model_name]
    nan_trials = [t for t in trials if t.classification == "NaN"]

    print(f"NaN trials for {model_name} ({len(nan_trials)} cases):")
    print("Correct -> Raw Response")
    print("-" * 50)

    for trial in nan_trials:
        # repr() keeps empty responses and escape sequences visible
        print(f"{trial.correct} -> {repr(trial.raw_response)}")
else:
    print(f"Model {model_name} not found in the data")

NaN trials for claude-sonnet-3.7-20250219-thinking4096 (97 cases):
Correct -> Raw Response
--------------------------------------------------
1291972577 -> ''
-17361554 -> ''
-280113821 -> ''
6610509564 -> ''
-4438377056 -> ''
4272353979 -> ''
446670810 -> ''
3020510016 -> ''
1570813062 -> ''
4859204972 -> ''
370241826750 -> ''
37559501262 -> ''
811096868772 -> ''
835511114673 -> ''
22134054139368 -> ''
5869222017048 -> ''
31084596661788 -> ''
1712500922391175 -> ''
5739094492829595 -> ''
4889600339740209 -> ''
914381944706580 -> ''
3582983788464320 -> ''
584445667966190 -> ''
515693301793758396 -> ''
161053607521505736 -> ''
125252323661604564 -> ''
61575565731670143 -> ''
184250880339512845 -> ''
239121724375175640 -> ''
348499316001641958 -> ''
26527589690037975304 -> ''
26204076529781503296 -> ''
11210450538159555144 -> ''
9949751046499996400 -> ''
14656027415155492888 -> ''
22423 -> ''
76037 -> ''
624872 -> ''
950332 -> ''
4496361 -> ''
7840349 -> ''
6320682 -> ''
8221992 -> ''
22