## 1. Setup and Data Loading

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
from pathlib import Path
import json
import warnings

warnings.filterwarnings('ignore')

# Set style
sns.set_style("whitegrid")
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (14, 8)

print("✓ Libraries imported successfully")

✓ Libraries imported successfully


### 1.1 Define Array Run Path and Task Configurations

In [2]:
# Path to the array run directory
ARRAY_RUN_DIR = Path('experiments/array_run_828901_20251215_130321')

# Task configurations
TASK_CONFIGS = {
    'task0': {
        'name': 'Multi-Game PD+HD→SH',
        'training_games': ['prisoners-dilemma', 'hawk-dove'],
        'test_game': 'stag-hunt',
        'path': ARRAY_RUN_DIR / 'multi_game_training_task0'
    },
    'task1': {
        'name': 'Single-Game PD→HD',
        'training_games': ['prisoners-dilemma'],
        'test_game': 'hawk-dove',
        'path': ARRAY_RUN_DIR / 'single_game_pd_test_hd_task1'
    },
    'task2': {
        'name': 'Single-Game HD→BOS',
        'training_games': ['hawk-dove'],
        'test_game': 'battle-of-sexes',
        'path': ARRAY_RUN_DIR / 'single_game_hd_test_bos_task2'
    },
    'task3': {
        'name': 'Multi-Game SH+BOS→PD',
        'training_games': ['stag-hunt', 'battle-of-sexes'],
        'test_game': 'prisoners-dilemma',
        'path': ARRAY_RUN_DIR / 'multi_game_sh_bos_task3'
    }
}

# Verify all directories exist
print("Checking task directories:")
for task_id, config in TASK_CONFIGS.items():
    exists = config['path'].exists()
    status = "✓" if exists else "✗"
    print(f"{status} {task_id}: {config['name']}")
    print(f"   Path: {config['path']}")
    print(f"   Training: {', '.join(config['training_games'])} → Test: {config['test_game']}")

Checking task directories:
✓ task0: Multi-Game PD+HD→SH
   Path: experiments\array_run_828901_20251215_130321\multi_game_training_task0
   Training: prisoners-dilemma, hawk-dove → Test: stag-hunt
✓ task1: Single-Game PD→HD
   Path: experiments\array_run_828901_20251215_130321\single_game_pd_test_hd_task1
   Training: prisoners-dilemma → Test: hawk-dove
✓ task2: Single-Game HD→BOS
   Path: experiments\array_run_828901_20251215_130321\single_game_hd_test_bos_task2
   Training: hawk-dove → Test: battle-of-sexes
✓ task3: Multi-Game SH+BOS→PD
   Path: experiments\array_run_828901_20251215_130321\multi_game_sh_bos_task3
   Training: stag-hunt, battle-of-sexes → Test: prisoners-dilemma


### 1.2 Load Training Data for All Tasks

In [3]:
def load_training_data(task_id, config):
    """Load training logs for a specific task."""
    training_log_path = config['path'] / 'checkpoints' / 'detailed_training_logs' / 'detailed_training_log.csv'
    
    if not training_log_path.exists():
        print(f"⚠️  Training log not found for {task_id}: {training_log_path}")
        return None
    
    try:
        df = pd.read_csv(training_log_path)
        df['task_id'] = task_id
        df['task_name'] = config['name']
        df['training_type'] = 'multi-game' if len(config['training_games']) > 1 else 'single-game'
        return df
    except Exception as e:
        print(f"⚠️  Error loading {task_id}: {e}")
        return None

# Load all training data
training_dfs = []
for task_id, config in TASK_CONFIGS.items():
    df = load_training_data(task_id, config)
    if df is not None:
        training_dfs.append(df)
        print(f"✓ Loaded {len(df)} training records for {task_id}")

# Combine all training data
if training_dfs:
    all_training = pd.concat(training_dfs, ignore_index=True)
    print(f"\n✓ Total training records: {len(all_training)}")
    print(f"  Tasks loaded: {all_training['task_id'].nunique()}")
    print(f"  Columns: {list(all_training.columns)}")
else:
    print("⚠️  No training data loaded!")
    all_training = None

✓ Loaded 53025 training records for task0
✓ Loaded 60525 training records for task1
✓ Loaded 60525 training records for task1
✓ Loaded 26525 training records for task2
✓ Loaded 26525 training records for task2
✓ Loaded 52025 training records for task3

✓ Total training records: 192100
  Tasks loaded: 4
  Columns: ['network_serial_id', 'iteration', 'epoch', 'game_step', 'timestamp', 'game_name', 'opponent_name', 'opponent_type', 'total_loss', 'rl_loss', 'rl_loss_normalized', 'opponent_policy_loss', 'opponent_policy_loss_normalized', 'loss_ratio', 'alpha_contribution', 'alpha', 'policy_logit_cooperate', 'policy_logit_defect', 'opponent_policy_logit_defect', 'opponent_policy_logit_cooperate', 'value_estimate', 'policy_prob_cooperate', 'policy_prob_defect', 'opponent_policy_prob_defect', 'opponent_policy_prob_cooperate', 'agent_action', 'opponent_action', 'agent_reward', 'opponent_reward', 'true_opponent_defect_prob', 'true_opponent_cooperate_prob', 'advantage', 'temperature', 'gradient_no

### 1.3 Load Testing Data for All Tasks

In [4]:
def load_testing_data(task_id, config):
    """Load testing logs for a specific task."""
    testing_log_path = config['path'] / 'logs' / 'detailed_testing_logs' / 'detailed_testing_log.csv'
    
    if not testing_log_path.exists():
        print(f"⚠️  Testing log not found for {task_id}: {testing_log_path}")
        return None
    
    try:
        df = pd.read_csv(testing_log_path)
        df['task_id'] = task_id
        df['task_name'] = config['name']
        df['test_game'] = config['test_game']
        df['training_type'] = 'multi-game' if len(config['training_games']) > 1 else 'single-game'
        return df
    except Exception as e:
        print(f"⚠️  Error loading {task_id}: {e}")
        return None

# Load all testing data
testing_dfs = []
for task_id, config in TASK_CONFIGS.items():
    df = load_testing_data(task_id, config)
    if df is not None:
        testing_dfs.append(df)
        print(f"✓ Loaded {len(df)} testing records for {task_id}")

# Combine all testing data
if testing_dfs:
    all_testing = pd.concat(testing_dfs, ignore_index=True)
    print(f"\n✓ Total testing records: {len(all_testing)}")
    print(f"  Tasks tested: {all_testing['task_id'].nunique()}")
    print(f"  Columns: {list(all_testing.columns)}")
else:
    print("⚠️  No testing data loaded!")
    all_testing = None

✓ Loaded 25000 testing records for task0
✓ Loaded 25000 testing records for task1
✓ Loaded 25000 testing records for task1
✓ Loaded 25000 testing records for task2
✓ Loaded 25000 testing records for task2
✓ Loaded 25000 testing records for task3

✓ Total testing records: 100000
  Tasks tested: 4
  Columns: ['network_serial_id', 'test_session', 'test_iteration', 'timestamp', 'game_name', 'opponent_name', 'opponent_type', 'game_step_in_session', 'predicted_opponent_policy_defect', 'predicted_opponent_policy_cooperate', 'predicted_opponent_value', 'predicted_opponent_cooperation_likelihood', 'agent_policy_logit_cooperate', 'agent_policy_logit_defect', 'agent_value_estimate', 'agent_policy_prob_cooperate', 'agent_policy_prob_defect', 'agent_sampled_action', 'opponent_actual_action', 'agent_reward', 'opponent_reward', 'total_reward', 'true_opponent_defect_prob', 'true_opponent_cooperate_prob', 'prediction_accuracy', 'action_prediction_error', 'value_prediction_error', 'cumulative_agent_rewa

### 1.4 Preview Data Structure

In [5]:
if all_training is not None:
    print("=" * 80)
    print("TRAINING DATA PREVIEW")
    print("=" * 80)
    print(all_training.head())
    print(f"\nShape: {all_training.shape}")
    print(f"\nKey metrics available:")
    metric_cols = [col for col in all_training.columns if any(x in col.lower() for x in ['loss', 'reward', 'action', 'coop', 'opponent'])]
    for col in metric_cols:
        print(f"  - {col}")

if all_testing is not None:
    print("\n" + "=" * 80)
    print("TESTING DATA PREVIEW")
    print("=" * 80)
    print(all_testing.head())
    print(f"\nShape: {all_testing.shape}")
    print(f"\nKey metrics available:")
    metric_cols = [col for col in all_testing.columns if any(x in col.lower() for x in ['loss', 'reward', 'action', 'coop', 'opponent'])]
    for col in metric_cols:
        print(f"  - {col}")

TRAINING DATA PREVIEW
              network_serial_id  iteration  epoch  game_step  \
0  NET_20251215_130326_4a236900          1      0          0   
1  NET_20251215_130326_4a236900          2      0          1   
2  NET_20251215_130326_4a236900          3      0          2   
3  NET_20251215_130326_4a236900          4      0          3   
4  NET_20251215_130326_4a236900          5      0          4   

                    timestamp          game_name        opponent_name  \
0  2025-12-15T13:04:36.177266  prisoners-dilemma  Probabilistic-p0.70   
1  2025-12-15T13:04:36.177501  prisoners-dilemma  Probabilistic-p0.70   
2  2025-12-15T13:04:36.177658  prisoners-dilemma  Probabilistic-p0.70   
3  2025-12-15T13:04:36.177823  prisoners-dilemma  Probabilistic-p0.70   
4  2025-12-15T13:04:36.177946  prisoners-dilemma  Probabilistic-p0.70   

   opponent_type  total_loss    rl_loss  ...  agent_reward  opponent_reward  \
0            0.5         2.0  22.993235  ...           0.0              0.0

## 2. Training Dynamics Analysis
### 2.1 Loss Components Over Time

In [6]:
# Identify loss columns
loss_cols = [col for col in all_training.columns if 'loss' in col.lower() and col != 'task_id']

print("Loss components found:")
for col in loss_cols:
    print(f"  - {col}")

# Plot loss curves for all tasks
if 'total_loss' in all_training.columns or 'loss' in all_training.columns:
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=[config['name'] for config in TASK_CONFIGS.values()],
        vertical_spacing=0.12,
        horizontal_spacing=0.10
    )
    
    colors = {'task0': '#FF6B6B', 'task1': '#4ECDC4', 'task2': '#45B7D1', 'task3': '#FFA07A'}
    
    for idx, (task_id, config) in enumerate(TASK_CONFIGS.items()):
        row = idx // 2 + 1
        col = idx % 2 + 1
        
        task_data = all_training[all_training['task_id'] == task_id]
        
        # Try to find the total loss column
        loss_col = 'total_loss' if 'total_loss' in task_data.columns else 'loss' if 'loss' in task_data.columns else None
        
        if loss_col and not task_data[loss_col].isna().all():
            # Group by epoch and take mean
            if 'epoch' in task_data.columns:
                epoch_loss = task_data.groupby('epoch')[loss_col].mean().reset_index()
                x_col = 'epoch'
            else:
                epoch_loss = task_data.reset_index()
                x_col = 'index'
            
            fig.add_trace(
                go.Scatter(
                    x=epoch_loss[x_col],
                    y=epoch_loss[loss_col],
                    mode='lines',
                    name=task_id,
                    line=dict(color=colors[task_id], width=2),
                    showlegend=False
                ),
                row=row, col=col
            )
    
    fig.update_xaxes(title_text="Epoch", row=2, col=1)
    fig.update_xaxes(title_text="Epoch", row=2, col=2)
    fig.update_yaxes(title_text="Total Loss", row=1, col=1)
    fig.update_yaxes(title_text="Total Loss", row=2, col=1)
    
    fig.update_layout(
        title_text="Training Loss Curves Across All Tasks",
        height=800,
        showlegend=False
    )
    
    fig.show()
else:
    print("⚠️  No loss columns found in training data")

Loss components found:
  - total_loss
  - rl_loss
  - rl_loss_normalized
  - opponent_policy_loss
  - opponent_policy_loss_normalized
  - loss_ratio


### 2.2 RL Loss vs Opponent Prediction Loss

In [7]:
# Check for RL and opponent prediction loss components
rl_loss_col = [col for col in all_training.columns if 'rl_loss' in col.lower() or 'reward_loss' in col.lower()]
opp_loss_col = [col for col in all_training.columns if 'opponent' in col.lower() and 'loss' in col.lower()]

print("RL Loss columns:", rl_loss_col)
print("Opponent Loss columns:", opp_loss_col)

if rl_loss_col and opp_loss_col:
    rl_col = rl_loss_col[0]
    opp_col = opp_loss_col[0]
    
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=[f"{config['name']}<br>RL vs OpPred Loss" for config in TASK_CONFIGS.values()],
        vertical_spacing=0.12,
        horizontal_spacing=0.10
    )
    
    for idx, (task_id, config) in enumerate(TASK_CONFIGS.items()):
        row = idx // 2 + 1
        col = idx % 2 + 1
        
        task_data = all_training[all_training['task_id'] == task_id].copy()
        
        if 'epoch' in task_data.columns:
            # Group by epoch
            epoch_stats = task_data.groupby('epoch').agg({
                rl_col: 'mean',
                opp_col: 'mean'
            }).reset_index()
        else:
            epoch_stats = task_data[[rl_col, opp_col]].reset_index()
            epoch_stats.rename(columns={'index': 'epoch'}, inplace=True)
        
        # Add RL loss
        fig.add_trace(
            go.Scatter(
                x=epoch_stats['epoch'],
                y=epoch_stats[rl_col],
                mode='lines',
                name='RL Loss',
                line=dict(color='#FF6B6B', width=2),
                showlegend=(idx == 0)
            ),
            row=row, col=col
        )
        
        # Add opponent prediction loss
        fig.add_trace(
            go.Scatter(
                x=epoch_stats['epoch'],
                y=epoch_stats[opp_col],
                mode='lines',
                name='Opponent Pred Loss',
                line=dict(color='#4ECDC4', width=2),
                showlegend=(idx == 0)
            ),
            row=row, col=col
        )
    
    fig.update_xaxes(title_text="Epoch", row=2, col=1)
    fig.update_xaxes(title_text="Epoch", row=2, col=2)
    fig.update_yaxes(title_text="Loss", row=1, col=1)
    fig.update_yaxes(title_text="Loss", row=2, col=1)
    
    fig.update_layout(
        title_text="RL Loss vs Opponent Prediction Loss During Training",
        height=800,
        legend=dict(x=1.05, y=0.5)
    )
    
    fig.show()
else:
    print("⚠️  Loss component columns not found")
    print("Available columns:", list(all_training.columns))

RL Loss columns: ['rl_loss', 'rl_loss_normalized']
Opponent Loss columns: ['opponent_policy_loss', 'opponent_policy_loss_normalized']


### 2.3 Action Distribution Evolution During Training

### ⚠️ Important Finding: Early Stopping Triggered

**All experiments stopped due to early stopping criterion, NOT due to reaching max_epochs (20,000):**

- **Task 0** (Multi-Game PD+HD→SH): Converged at **epoch 52** (early stopping)
- **Task 1** (Single-Game PD→HD): Converged at **epoch 120** (early stopping)  
- **Task 2** (Single-Game HD→BOS): Converged at **epoch 52** (early stopping)
- **Task 3** (Multi-Game SH+BOS→PD): Converged at **epoch 51** (early stopping)

**Early Stopping Criterion:**
- **Patience**: 100 epochs
- **Threshold**: 1e-4 (0.0001)
- Training stops if loss doesn't improve by ≥0.0001 for 100 consecutive epochs

**Why training stopped so early:**
1. The network quickly found a local minimum (likely pure cooperation or pure defection)
2. Loss stopped improving within the first ~50 epochs
3. After 100 epochs of no improvement, early stopping was triggered
4. This suggests the network converged to **suboptimal strategies** rather than learning nuanced ToM behaviors

**Task 1 exception (120 epochs):** This task ran slightly longer, suggesting the PD→HD transfer might have explored the solution space more before settling into a local minimum.

**Implications:**
- The rapid convergence indicates the network is NOT learning complex opponent modeling
- It's likely adopting simple fixed strategies (e.g., "always cooperate" or "always defect")
- The early stopping criterion may be too aggressive for this problem
- Consider: relaxing patience (e.g., 500 epochs) or using a stricter improvement threshold

In [12]:
# Look for action/cooperation columns
action_cols = [col for col in all_training.columns if any(x in col.lower() for x in ['action', 'coop', 'cooperation'])]

print("Action-related columns found:")
for col in action_cols:
    print(f"  - {col}")

# Compute cooperation rate from agent_action column
coop_col = None
if 'agent_action' in all_training.columns:
    print(f"\n✓ Computing cooperation rate from 'agent_action' column")
    print(f"  Convention: action=0 → cooperation, action=1 → defection")
    
    # Create cooperation rate column: 1 if action=0 (cooperate), 0 if action=1 (defect)
    all_training['cooperation_rate'] = (all_training['agent_action'] == 0).astype(float)
    coop_col = 'cooperation_rate'
    
    # Show distribution
    print(f"\n  Cooperation rate statistics:")
    print(f"    Mean: {all_training['cooperation_rate'].mean():.4f}")
    print(f"    Std: {all_training['cooperation_rate'].std():.4f}")
else:
    print("\n⚠️  'agent_action' column not found - cannot compute cooperation rate")
    print(f"  Available columns: {list(all_training.columns)}")

if coop_col:
    print(f"\n✓ Using cooperation column: {coop_col}")
    
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=[config['name'] for config in TASK_CONFIGS.values()],
        vertical_spacing=0.12,
        horizontal_spacing=0.10
    )
    
    for idx, (task_id, config) in enumerate(TASK_CONFIGS.items()):
        row = idx // 2 + 1
        col = idx % 2 + 1
        
        task_data = all_training[all_training['task_id'] == task_id].copy()
        
        if 'epoch' in task_data.columns:
            epoch_coop = task_data.groupby('epoch')[coop_col].mean().reset_index()
            x_col = 'epoch'
        else:
            epoch_coop = task_data[[coop_col]].reset_index()
            x_col = 'index'
            epoch_coop.rename(columns={'index': 'epoch'}, inplace=True)
        
        fig.add_trace(
            go.Scatter(
                x=epoch_coop[x_col],
                y=epoch_coop[coop_col],
                mode='lines',
                name=task_id,
                line=dict(width=2),
                fill='tozeroy',
                showlegend=False
            ),
            row=row, col=col
        )
        
        # Add 50% reference line
        fig.add_hline(y=0.5, line_dash="dash", line_color="gray", 
                     annotation_text="50%", row=row, col=col)
    
    fig.update_xaxes(title_text="Epoch", row=2, col=1)
    fig.update_xaxes(title_text="Epoch", row=2, col=2)
    fig.update_yaxes(title_text="Cooperation Rate", row=1, col=1)
    fig.update_yaxes(title_text="Cooperation Rate", row=2, col=1)
    fig.update_yaxes(range=[0, 1])
    
    fig.update_layout(
        title_text="Agent Cooperation Rate Evolution During Training",
        height=800
    )
    
    fig.show()

Action-related columns found:
  - policy_logit_cooperate
  - opponent_policy_logit_cooperate
  - policy_prob_cooperate
  - opponent_policy_prob_cooperate
  - agent_action
  - opponent_action
  - true_opponent_cooperate_prob
  - cooperation_rate

✓ Computing cooperation rate from 'agent_action' column
  Convention: action=0 → cooperation, action=1 → defection

  Cooperation rate statistics:
    Mean: 0.3803
    Std: 0.4855

✓ Using cooperation column: cooperation_rate
    Std: 0.4855

✓ Using cooperation column: cooperation_rate


### 2.4 Reward Accumulation During Training

In [13]:
# Look for reward columns
reward_cols = [col for col in all_training.columns if 'reward' in col.lower()]

print("Reward columns found:")
for col in reward_cols:
    print(f"  - {col}")

reward_col = None
for col in reward_cols:
    if 'loss' not in col.lower():  # Exclude reward_loss
        reward_col = col
        break

if reward_col:
    print(f"\nUsing reward column: {reward_col}")
    
    fig = go.Figure()
    
    for task_id, config in TASK_CONFIGS.items():
        task_data = all_training[all_training['task_id'] == task_id].copy()
        
        if 'epoch' in task_data.columns:
            epoch_reward = task_data.groupby('epoch')[reward_col].mean().reset_index()
            x_col = 'epoch'
        else:
            epoch_reward = task_data[[reward_col]].reset_index()
            x_col = 'index'
            epoch_reward.rename(columns={'index': 'epoch'}, inplace=True)
        
        fig.add_trace(
            go.Scatter(
                x=epoch_reward[x_col],
                y=epoch_reward[reward_col],
                mode='lines',
                name=config['name'],
                line=dict(width=2)
            )
        )
    
    fig.update_layout(
        title="Average Reward During Training (All Tasks)",
        xaxis_title="Epoch",
        yaxis_title="Average Reward",
        height=500,
        hovermode='x unified'
    )
    
    fig.show()
    
    # Summary statistics
    print("\n" + "=" * 80)
    print("TRAINING REWARD SUMMARY")
    print("=" * 80)
    for task_id, config in TASK_CONFIGS.items():
        task_data = all_training[all_training['task_id'] == task_id]
        if not task_data[reward_col].isna().all():
            print(f"\n{config['name']}:")
            print(f"  Mean Reward: {task_data[reward_col].mean():.4f}")
            print(f"  Std Reward: {task_data[reward_col].std():.4f}")
            print(f"  Final 10 epochs mean: {task_data[reward_col].tail(10).mean():.4f}")
else:
    print("⚠️  No suitable reward column found")

Reward columns found:
  - agent_reward
  - opponent_reward

Using reward column: agent_reward



TRAINING REWARD SUMMARY

Multi-Game PD+HD→SH:
  Mean Reward: 2.0335
  Std Reward: 2.6039
  Final 10 epochs mean: 2.1000

Single-Game PD→HD:
  Mean Reward: 2.7651
  Std Reward: 2.0084
  Final 10 epochs mean: 3.4000

Single-Game HD→BOS:
  Mean Reward: 1.7542
  Std Reward: 3.0997
  Final 10 epochs mean: 1.8000

Multi-Game SH+BOS→PD:
  Mean Reward: 1.6281
  Std Reward: 1.4169
  Final 10 epochs mean: 0.8000


## 3. Opponent Prediction Accuracy Analysis
### 3.1 Opponent Prediction Accuracy Over Training

In [None]:
# Look for opponent prediction accuracy
opp_acc_cols = [col for col in all_training.columns if 'opponent' in col.lower() and ('acc' in col.lower() or 'prediction' in col.lower())]

print("Opponent prediction columns found:")
for col in opp_acc_cols:
    print(f"  - {col}")

# Also check for opponent cooperation probability predictions
opp_prob_cols = [col for col in all_training.columns if 'opponent' in col.lower() and 'prob' in col.lower()]
print("\nOpponent probability columns:")
for col in opp_prob_cols:
    print(f"  - {col}")

# Try to find accuracy or prediction quality metric
acc_col = None
if opp_acc_cols:
    acc_col = opp_acc_cols[0]
elif 'opponent_prediction_accuracy' in all_training.columns:
    acc_col = 'opponent_prediction_accuracy'

if acc_col:
    print(f"\nUsing accuracy column: {acc_col}")
    
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=[config['name'] for config in TASK_CONFIGS.values()],
        vertical_spacing=0.12,
        horizontal_spacing=0.10
    )
    
    for idx, (task_id, config) in enumerate(TASK_CONFIGS.items()):
        row = idx // 2 + 1
        col = idx % 2 + 1
        
        task_data = all_training[all_training['task_id'] == task_id].copy()
        
        if 'epoch' in task_data.columns:
            epoch_acc = task_data.groupby('epoch')[acc_col].mean().reset_index()
            x_col = 'epoch'
        else:
            epoch_acc = task_data[[acc_col]].reset_index()
            x_col = 'index'
            epoch_acc.rename(columns={'index': 'epoch'}, inplace=True)
        
        fig.add_trace(
            go.Scatter(
                x=epoch_acc[x_col],
                y=epoch_acc[acc_col],
                mode='lines',
                name=task_id,
                line=dict(width=2),
                showlegend=False
            ),
            row=row, col=col
        )
        
        # Add 50% baseline (random guessing)
        fig.add_hline(y=0.5, line_dash="dash", line_color="red", 
                     annotation_text="Random", row=row, col=col)
    
    fig.update_xaxes(title_text="Epoch", row=2, col=1)
    fig.update_xaxes(title_text="Epoch", row=2, col=2)
    fig.update_yaxes(title_text="Opponent Prediction Accuracy", row=1, col=1)
    fig.update_yaxes(title_text="Opponent Prediction Accuracy", row=2, col=1)
    fig.update_yaxes(range=[0, 1])
    
    fig.update_layout(
        title_text="Theory of Mind Development: Opponent Prediction Accuracy",
        height=800
    )
    
    fig.show()
else:
    print("⚠️  No opponent prediction accuracy column found")
    print("\nTrying to compute accuracy from opponent predictions...")
    
    # If we have opponent actions and predictions, compute accuracy
    if 'opponent_action' in all_training.columns and any('opponent_coop_prob' in col for col in all_training.columns):
        print("✓ Can compute accuracy from opponent actions and predictions")

## 4. Test Performance Analysis
### 4.1 Test Performance Summary

In [None]:
if all_testing is not None:
    print("=" * 80)
    print("TEST PERFORMANCE SUMMARY")
    print("=" * 80)
    
    # Find reward column in testing data
    test_reward_col = None
    for col in all_testing.columns:
        if 'reward' in col.lower() and 'loss' not in col.lower():
            test_reward_col = col
            break
    
    if test_reward_col:
        for task_id, config in TASK_CONFIGS.items():
            task_test = all_testing[all_testing['task_id'] == task_id]
            if len(task_test) > 0:
                print(f"\n{config['name']}:")
                print(f"  Training: {', '.join(config['training_games'])}")
                print(f"  Test Game: {config['test_game']}")
                print(f"  Test Records: {len(task_test)}")
                print(f"  Mean Reward: {task_test[test_reward_col].mean():.4f} ± {task_test[test_reward_col].std():.4f}")
                print(f"  Median Reward: {task_test[test_reward_col].median():.4f}")
                print(f"  Min/Max Reward: {task_test[test_reward_col].min():.4f} / {task_test[test_reward_col].max():.4f}")
    else:
        print("⚠️  No reward column found in testing data")
        print("Available columns:", list(all_testing.columns))

### 4.2 Test Performance by Opponent Type

In [None]:
if all_testing is not None and test_reward_col:
    # Check for opponent type column
    opp_type_col = None
    for col in all_testing.columns:
        if 'opponent' in col.lower() and ('type' in col.lower() or 'prob' in col.lower() or 'name' in col.lower()):
            opp_type_col = col
            break
    
    if opp_type_col:
        print(f"Using opponent type column: {opp_type_col}")
        
        fig = go.Figure()
        
        for task_id, config in TASK_CONFIGS.items():
            task_test = all_testing[all_testing['task_id'] == task_id]
            if len(task_test) > 0:
                # Group by opponent type
                opp_stats = task_test.groupby(opp_type_col)[test_reward_col].agg(['mean', 'std']).reset_index()
                
                fig.add_trace(
                    go.Scatter(
                        x=opp_stats[opp_type_col],
                        y=opp_stats['mean'],
                        error_y=dict(type='data', array=opp_stats['std']),
                        mode='lines+markers',
                        name=config['name'],
                        line=dict(width=2),
                        marker=dict(size=8)
                    )
                )
        
        fig.update_layout(
            title="Test Performance by Opponent Type (All Tasks)",
            xaxis_title="Opponent Defection Probability",
            yaxis_title="Average Reward",
            height=500,
            hovermode='x unified'
        )
        
        fig.show()
    else:
        print("⚠️  No opponent type column found in testing data")

### 4.3 Training vs Testing Reward Comparison

In [None]:
if all_training is not None and all_testing is not None and reward_col and test_reward_col:
    print("=" * 80)
    print("TRAINING vs TESTING COMPARISON")
    print("=" * 80)
    
    comparison_data = []
    
    for task_id, config in TASK_CONFIGS.items():
        # Training performance (final epoch)
        train_data = all_training[all_training['task_id'] == task_id]
        if 'epoch' in train_data.columns:
            final_train = train_data[train_data['epoch'] == train_data['epoch'].max()]
        else:
            final_train = train_data.tail(10)  # Last 10 records
        
        train_reward = final_train[reward_col].mean() if len(final_train) > 0 else np.nan
        
        # Testing performance
        test_data = all_testing[all_testing['task_id'] == task_id]
        test_reward = test_data[test_reward_col].mean() if len(test_data) > 0 else np.nan
        
        comparison_data.append({
            'Task': config['name'],
            'Training Type': config['training_type'],
            'Training Reward': train_reward,
            'Test Reward': test_reward,
            'Generalization Gap': train_reward - test_reward
        })
        
        print(f"\n{config['name']}:")
        print(f"  Training Reward (final): {train_reward:.4f}")
        print(f"  Test Reward: {test_reward:.4f}")
        print(f"  Generalization Gap: {train_reward - test_reward:.4f}")
    
    # Create comparison DataFrame
    comparison_df = pd.DataFrame(comparison_data)
    
    # Plot comparison
    fig = go.Figure()
    
    x_labels = comparison_df['Task']
    
    fig.add_trace(go.Bar(
        name='Training Reward',
        x=x_labels,
        y=comparison_df['Training Reward'],
        marker_color='#4ECDC4'
    ))
    
    fig.add_trace(go.Bar(
        name='Test Reward',
        x=x_labels,
        y=comparison_df['Test Reward'],
        marker_color='#FF6B6B'
    ))
    
    fig.update_layout(
        title="Training vs Test Performance Comparison",
        xaxis_title="Task",
        yaxis_title="Average Reward",
        barmode='group',
        height=500
    )
    
    fig.show()
    
    print("\n" + "=" * 80)
    print("GENERALIZATION ANALYSIS")
    print("=" * 80)
    print(f"\nBest Generalization (smallest gap): {comparison_df.loc[comparison_df['Generalization Gap'].idxmin(), 'Task']}")
    print(f"  Gap: {comparison_df['Generalization Gap'].min():.4f}")
    print(f"\nWorst Generalization (largest gap): {comparison_df.loc[comparison_df['Generalization Gap'].idxmax(), 'Task']}")
    print(f"  Gap: {comparison_df['Generalization Gap'].max():.4f}")

## 5. Cross-Task Comparison
### 5.1 Multi-Game vs Single-Game Training

In [None]:
if all_testing is not None and test_reward_col:
    print("=" * 80)
    print("MULTI-GAME vs SINGLE-GAME TRAINING COMPARISON")
    print("=" * 80)
    
    # Group by training type
    multi_game_tasks = [tid for tid, cfg in TASK_CONFIGS.items() if len(cfg['training_games']) > 1]
    single_game_tasks = [tid for tid, cfg in TASK_CONFIGS.items() if len(cfg['training_games']) == 1]
    
    multi_test_rewards = all_testing[all_testing['task_id'].isin(multi_game_tasks)][test_reward_col]
    single_test_rewards = all_testing[all_testing['task_id'].isin(single_game_tasks)][test_reward_col]
    
    print(f"\nMulti-Game Training (Tasks {multi_game_tasks}):")
    print(f"  Mean Test Reward: {multi_test_rewards.mean():.4f} ± {multi_test_rewards.std():.4f}")
    print(f"  N = {len(multi_test_rewards)}")
    
    print(f"\nSingle-Game Training (Tasks {single_game_tasks}):")
    print(f"  Mean Test Reward: {single_test_rewards.mean():.4f} ± {single_test_rewards.std():.4f}")
    print(f"  N = {len(single_test_rewards)}")
    
    # Statistical test
    if len(multi_test_rewards) > 0 and len(single_test_rewards) > 0:
        t_stat, p_value = stats.ttest_ind(multi_test_rewards, single_test_rewards)
        print(f"\nIndependent t-test:")
        print(f"  t-statistic: {t_stat:.4f}")
        print(f"  p-value: {p_value:.4f}")
        print(f"  Significant difference: {'Yes' if p_value < 0.05 else 'No'} (α=0.05)")
    
    # Box plot comparison
    fig = go.Figure()
    
    fig.add_trace(go.Box(
        y=multi_test_rewards,
        name='Multi-Game',
        marker_color='#FF6B6B'
    ))
    
    fig.add_trace(go.Box(
        y=single_test_rewards,
        name='Single-Game',
        marker_color='#4ECDC4'
    ))
    
    fig.update_layout(
        title="Test Performance: Multi-Game vs Single-Game Training",
        yaxis_title="Test Reward",
        height=500
    )
    
    fig.show()

## 6. Key Findings Summary

In [None]:
print("=" * 80)
print("KEY FINDINGS SUMMARY")
print("=" * 80)

if all_training is not None:
    print("\n📊 TRAINING DYNAMICS:")
    print(f"  - Total training records analyzed: {len(all_training):,}")
    print(f"  - Tasks trained: {all_training['task_id'].nunique()}")
    
    if 'epoch' in all_training.columns:
        for task_id, config in TASK_CONFIGS.items():
            task_data = all_training[all_training['task_id'] == task_id]
            if len(task_data) > 0:
                max_epoch = task_data['epoch'].max()
                print(f"  - {config['name']}: {max_epoch} epochs")

if all_testing is not None:
    print("\n🎯 TEST PERFORMANCE:")
    print(f"  - Total test records analyzed: {len(all_testing):,}")
    if test_reward_col:
        best_task = all_testing.groupby('task_id')[test_reward_col].mean().idxmax()
        best_reward = all_testing.groupby('task_id')[test_reward_col].mean().max()
        print(f"  - Best performing task: {TASK_CONFIGS[best_task]['name']}")
        print(f"  - Best test reward: {best_reward:.4f}")

if 'comparison_df' in locals():
    print("\n🔄 GENERALIZATION:")
    best_gen_idx = comparison_df['Generalization Gap'].abs().idxmin()
    print(f"  - Best generalization: {comparison_df.loc[best_gen_idx, 'Task']}")
    print(f"  - Generalization gap: {comparison_df.loc[best_gen_idx, 'Generalization Gap']:.4f}")

print("\n✅ Analysis complete!")