# Cognitive Therapy AI - Training and Testing Analysis

This notebook provides comprehensive analysis of training and testing results from the ToM-RL (Theory of Mind Reinforcement Learning) experiments.

## Analysis Overview:
1. **Training Results by Schema**: Analyze loss curves, convergence patterns, and learning dynamics
2. **Test Performance**: Evaluate generalization to unseen games and opponents
3. **Training vs Testing Comparison**: Identify overfitting, transfer learning effectiveness
4. **Cross-Game Generalization**: Compare performance across different game types
5. **Statistical Analysis**: Quantify performance differences and correlations

## 1. Import Required Libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats
from pathlib import Path
import json
import glob
import warnings

warnings.filterwarnings('ignore')

# Set style for better-looking plots
sns.set_style("whitegrid")
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 6)

print("✓ Libraries imported successfully")

✓ Libraries imported successfully


## 2. Load Training and Testing Data

Load data from the experiments directory structure:
- Training logs: `experiments/.../checkpoints/detailed_training_logs/detailed_training_log.csv`
- Testing logs: `experiments/.../logs/detailed_testing_logs/detailed_testing_log.csv`
- Experiment config: `experiments/.../experiment_config.json`

### Diagnostic: Check JSON File Integrity

In [4]:
# Diagnostic: Find and inspect corrupted JSON files
from pathlib import Path
import json

experiments_path = Path('experiments')
experiment_dirs = sorted([d for d in experiments_path.glob('mixed_motive_experiment_*') if d.is_dir()])

print("Checking all experiment_config.json files...\n")

corrupted_files = []
for exp_dir in experiment_dirs:
    config_file = exp_dir / 'experiment_config.json'
    if config_file.exists():
        try:
            with open(config_file, 'r') as f:
                json.load(f)
            print(f"✓ {exp_dir.name}")
        except json.JSONDecodeError as e:
            print(f"✗ {exp_dir.name}")
            print(f"  Error: {e}")
            corrupted_files.append((config_file, e))
            
            # Show the problematic area
            with open(config_file, 'r') as f:
                content = f.read()
                lines = content.split('\n')
                error_line = e.lineno if hasattr(e, 'lineno') else None
                if error_line:
                    start = max(0, error_line - 3)
                    end = min(len(lines), error_line + 3)
                    print(f"\n  Content around line {error_line}:")
                    for i in range(start, end):
                        marker = " >>>" if i == error_line - 1 else "    "
                        print(f"  {marker} {i+1:3d}: {lines[i]}")
            print()

if corrupted_files:
    print(f"\n⚠️  Found {len(corrupted_files)} corrupted file(s)")
    print("\nTo fix: Open the file(s) and remove any content after the final }")
else:
    print("\n✓ All JSON files are valid!")

Checking all experiment_config.json files...

✓ mixed_motive_experiment_20251211_082327

✓ All JSON files are valid!


In [5]:
def load_experiment_data(experiments_dir='experiments'):
    """
    Load all training and testing data from experiments directory.
    
    Returns:
        training_data: DataFrame with all training logs
        testing_data: DataFrame with all testing logs
        experiment_metadata: Dict with experiment configurations
    """
    experiments_path = Path(experiments_dir)
    
    training_dfs = []
    testing_dfs = []
    metadata = {}
    
    # Find all experiment directories
    experiment_dirs = sorted([d for d in experiments_path.glob('mixed_motive_experiment_*') if d.is_dir()])
    
    print(f"Found {len(experiment_dirs)} experiments:")
    
    for exp_dir in experiment_dirs:
        exp_name = exp_dir.name
        print(f"  Loading: {exp_name}")
        
        # Load training data
        training_log = exp_dir / 'checkpoints' / 'detailed_training_logs' / 'detailed_training_log.csv'
        if training_log.exists():
            df_train = pd.read_csv(training_log)
            df_train['experiment_id'] = exp_name
            training_dfs.append(df_train)
        
        # Load testing data
        testing_log = exp_dir / 'logs' / 'detailed_testing_logs' / 'detailed_testing_log.csv'
        if testing_log.exists():
            df_test = pd.read_csv(testing_log)
            df_test['experiment_id'] = exp_name
            testing_dfs.append(df_test)
        
        # Load experiment config with error handling
        config_file = exp_dir / 'experiment_config.json'
        if config_file.exists():
            try:
                with open(config_file, 'r') as f:
                    metadata[exp_name] = json.load(f)
            except json.JSONDecodeError as e:
                print(f"    ⚠️  Warning: Could not parse config file - {e}")
                print(f"    Skipping config for {exp_name}")
                # Add minimal metadata to avoid missing key errors
                metadata[exp_name] = {
                    'training_games': [],
                    'test_game': 'Unknown',
                    'error': str(e)
                }
    
    # Combine all dataframes
    training_data = pd.concat(training_dfs, ignore_index=True) if training_dfs else pd.DataFrame()
    testing_data = pd.concat(testing_dfs, ignore_index=True) if testing_dfs else pd.DataFrame()
    
    print(f"\n✓ Loaded {len(training_data)} training records")
    print(f"✓ Loaded {len(testing_data)} testing records")
    print(f"✓ Loaded {len(metadata)} experiment configurations")
    
    return training_data, testing_data, metadata

# Load the data
training_df, testing_df, exp_metadata = load_experiment_data()

# Display basic info
print("\n" + "="*60)
print("Training Data Shape:", training_df.shape)
print("Testing Data Shape:", testing_df.shape)
if not training_df.empty:
    print("\nTraining Columns:", list(training_df.columns[:10]), "...")

Found 1 experiments:
  Loading: mixed_motive_experiment_20251211_082327

✓ Loaded 600100 training records
✓ Loaded 100000 testing records
✓ Loaded 1 experiment configurations

Training Data Shape: (600100, 35)
Testing Data Shape: (100000, 31)

Training Columns: ['network_serial_id', 'iteration', 'epoch', 'game_step', 'timestamp', 'game_name', 'opponent_name', 'opponent_type', 'total_loss', 'rl_loss'] ...

✓ Loaded 600100 training records
✓ Loaded 100000 testing records
✓ Loaded 1 experiment configurations

Training Data Shape: (600100, 35)
Testing Data Shape: (100000, 31)

Training Columns: ['network_serial_id', 'iteration', 'epoch', 'game_step', 'timestamp', 'game_name', 'opponent_name', 'opponent_type', 'total_loss', 'rl_loss'] ...


## 3. Preprocess and Structure Data by Training Schema

### 3.1 Check Available Columns in Data

In [6]:
# Display available columns to understand data structure
print("Training DataFrame Columns:")
if not training_df.empty:
    print(f"  Total columns: {len(training_df.columns)}")
    print(f"  Columns: {list(training_df.columns)}")
    print(f"\nFirst few rows:")
    print(training_df.head(3))
else:
    print("  (empty)")

print("\n" + "="*60)
print("Testing DataFrame Columns:")
if not testing_df.empty:
    print(f"  Total columns: {len(testing_df.columns)}")
    print(f"  Columns: {list(testing_df.columns)}")
    print(f"\nFirst few rows:")
    print(testing_df.head(3))
else:
    print("  (empty)")

Training DataFrame Columns:
  Total columns: 35
  Columns: ['network_serial_id', 'iteration', 'epoch', 'game_step', 'timestamp', 'game_name', 'opponent_name', 'opponent_type', 'total_loss', 'rl_loss', 'rl_loss_normalized', 'opponent_policy_loss', 'opponent_policy_loss_normalized', 'loss_ratio', 'alpha_contribution', 'alpha', 'policy_logit_cooperate', 'policy_logit_defect', 'opponent_policy_logit_defect', 'opponent_policy_logit_cooperate', 'value_estimate', 'policy_prob_cooperate', 'policy_prob_defect', 'opponent_policy_prob_defect', 'opponent_policy_prob_cooperate', 'agent_action', 'opponent_action', 'agent_reward', 'opponent_reward', 'true_opponent_defect_prob', 'true_opponent_cooperate_prob', 'advantage', 'temperature', 'gradient_norm', 'experiment_id']

First few rows:
              network_serial_id  iteration  epoch  game_step  \
0  NET_20251211_082328_b03f29b2          1      0          0   
1  NET_20251211_082328_b03f29b2          2      0          1   
2  NET_20251211_082328_b0

### 3.2 Calculate Derived Metrics

We need to calculate cooperation rate from action columns since it's not directly available.

In [7]:
# Calculate cooperation rate from actions
# agent_action: 0 = Cooperate, 1 = Defect (convention from framework)
if not training_df.empty:
    training_df['cooperation_rate'] = (training_df['agent_action'] == 0).astype(float)
    print("✓ Added 'cooperation_rate' to training data")

if not testing_df.empty:
    testing_df['cooperation_rate'] = (testing_df['agent_sampled_action'] == 0).astype(float)
    print("✓ Added 'cooperation_rate' to testing data")

# Verify the new columns
print("\nTraining columns now include:", 'cooperation_rate' in training_df.columns)
print("Testing columns now include:", 'cooperation_rate' in testing_df.columns)

✓ Added 'cooperation_rate' to training data
✓ Added 'cooperation_rate' to testing data

Training columns now include: True
Testing columns now include: True


In [8]:
# Extract training schema information from metadata
training_schemas = {}
for exp_id, config in exp_metadata.items():
    training_games = config.get('training_games', [])
    test_game = config.get('test_game', '')
    schema_name = f"{','.join(training_games)} → {test_game}"
    training_schemas[exp_id] = {
        'training_games': training_games,
        'test_game': test_game,
        'schema_name': schema_name,
        'num_training_games': len(training_games)
    }

# Add schema information to dataframes
if not training_df.empty:
    training_df['schema_name'] = training_df['experiment_id'].map(
        lambda x: training_schemas.get(x, {}).get('schema_name', 'Unknown')
    )
    training_df['test_game'] = training_df['experiment_id'].map(
        lambda x: training_schemas.get(x, {}).get('test_game', 'Unknown')
    )

if not testing_df.empty:
    testing_df['schema_name'] = testing_df['experiment_id'].map(
        lambda x: training_schemas.get(x, {}).get('schema_name', 'Unknown')
    )
    testing_df['training_games'] = testing_df['experiment_id'].map(
        lambda x: ','.join(training_schemas.get(x, {}).get('training_games', []))
    )

# Display unique schemas
print("Training Schemas Found:")
for schema_name, info in {v['schema_name']: v for v in training_schemas.values()}.items():
    print(f"  • {schema_name}")
    print(f"    Training: {', '.join(info['training_games'])}")
    print(f"    Testing: {info['test_game']}\n")

Training Schemas Found:
  • prisoners-dilemma → hawk-dove
    Training: prisoners-dilemma
    Testing: hawk-dove



## 4. Training Performance Analysis

In [9]:
# Calculate training statistics by schema and epoch
if not training_df.empty:
    training_summary = training_df.groupby(['schema_name', 'epoch']).agg({
        'total_loss': ['mean', 'std', 'min', 'max'],
        'rl_loss': ['mean', 'std'],
        'opponent_policy_loss': ['mean', 'std'],
        'agent_reward': ['mean', 'std'],
        'cooperation_rate': ['mean', 'std']
    }).reset_index()
    
    # Final epoch statistics
    max_epoch = training_df['epoch'].max()
    final_epoch_stats = training_df[training_df['epoch'] == max_epoch].groupby('schema_name').agg({
        'total_loss': ['mean', 'std'],
        'agent_reward': ['mean', 'std'],
        'cooperation_rate': ['mean', 'std']
    }).round(4)
    
    print("Final Epoch Training Performance by Schema:")
    print(final_epoch_stats)
    print(f"\n(Statistics from epoch {max_epoch})")
else:
    print("No training data available for analysis")

Final Epoch Training Performance by Schema:
                              total_loss         agent_reward          \
                                    mean     std         mean     std   
schema_name                                                             
prisoners-dilemma → hawk-dove     1.3594  0.3278       2.1335  2.6692   

                              cooperation_rate          
                                          mean     std  
schema_name                                             
prisoners-dilemma → hawk-dove           0.4006  0.4901  

(Statistics from epoch 199)


### 4.1 Training Loss Curves Over Epochs

In [10]:
if not training_df.empty:
    # Loss components over time
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Total Loss', 'RL Loss', 'Opponent Policy Loss', 'Loss Ratio (RL/OpPolicy)'),
        vertical_spacing=0.12,
        horizontal_spacing=0.1
    )
    
    loss_columns = [
        ('total_loss', 1, 1),
        ('rl_loss', 1, 2),
        ('opponent_policy_loss', 2, 1),
        ('loss_ratio', 2, 2)
    ]
    
    for schema in training_df['schema_name'].unique():
        schema_data = training_df[training_df['schema_name'] == schema].groupby('epoch').agg({
            'total_loss': 'mean',
            'rl_loss': 'mean',
            'opponent_policy_loss': 'mean',
            'loss_ratio': 'mean'
        }).reset_index()
        
        for col_name, row, col in loss_columns:
            fig.add_trace(
                go.Scatter(x=schema_data['epoch'], y=schema_data[col_name],
                          mode='lines', name=schema, showlegend=(row==1 and col==1)),
                row=row, col=col
            )
    
    fig.update_layout(
        title_text="Training Loss Components Over Epochs by Schema",
        height=700,
        hovermode='x unified'
    )
    fig.update_xaxes(title_text="Epoch")
    fig.update_yaxes(title_text="Loss")
    fig.show()
else:
    print("No training data available for visualization")

### 4.2 Training Reward and Cooperation Rate

In [11]:
if not training_df.empty:
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=('Average Reward Over Epochs', 'Cooperation Rate Over Epochs'),
        horizontal_spacing=0.12
    )
    
    for schema in training_df['schema_name'].unique():
        schema_data = training_df[training_df['schema_name'] == schema].groupby('epoch').agg({
            'agent_reward': 'mean',
            'cooperation_rate': 'mean'
        }).reset_index()
        
        fig.add_trace(
            go.Scatter(x=schema_data['epoch'], y=schema_data['agent_reward'],
                      mode='lines', name=schema, showlegend=True),
            row=1, col=1
        )
        
        fig.add_trace(
            go.Scatter(x=schema_data['epoch'], y=schema_data['cooperation_rate'],
                      mode='lines', name=schema, showlegend=False),
            row=1, col=2
        )
    
    fig.update_layout(
        title_text="Training Performance Metrics by Schema",
        height=400,
        hovermode='x unified'
    )
    fig.update_xaxes(title_text="Epoch")
    fig.update_yaxes(title_text="Average Reward", row=1, col=1)
    fig.update_yaxes(title_text="Cooperation Rate", row=1, col=2)
    fig.show()
else:
    print("No training data available for visualization")

## 5. Testing Performance Analysis

In [12]:
if not testing_df.empty:
    # Calculate testing statistics by schema
    testing_summary = testing_df.groupby(['schema_name', 'opponent_type', 'game_name']).agg({
        'agent_reward': ['mean', 'std', 'count'],
        'cooperation_rate': ['mean', 'std']
    }).reset_index()
    
    # Overall testing performance by schema
    overall_testing = testing_df.groupby('schema_name').agg({
        'agent_reward': ['mean', 'std'],
        'cooperation_rate': ['mean', 'std']
    }).round(4)
    
    print("Overall Testing Performance by Schema:")
    print(overall_testing)
    print("\nDetailed breakdown available in testing_summary dataframe")
    
    # Breakdown by test game
    test_game_performance = testing_df.groupby(['schema_name', 'game_name']).agg({
        'agent_reward': 'mean',
        'cooperation_rate': 'mean'
    }).round(4)
    
    print("\nTesting Performance by Test Game:")
    print(test_game_performance)
else:
    print("No testing data available for analysis")

Overall Testing Performance by Schema:
                              agent_reward         cooperation_rate        
                                      mean     std             mean     std
schema_name                                                                
prisoners-dilemma → hawk-dove       1.7494  2.2869           0.3257  0.4686

Detailed breakdown available in testing_summary dataframe

Testing Performance by Test Game:
                                                 agent_reward  \
schema_name                   game_name                         
prisoners-dilemma → hawk-dove Battle-of-Sexes          1.2262   
                              Hawk-Dove                2.0103   
                              Prisoners-Dilemma        1.7616   
                              Stag-Hunt                1.9995   

                                                 cooperation_rate  
schema_name                   game_name                            
prisoners-dilemma → hawk-dove Battle

### 5.1 Testing Performance Visualizations

In [13]:
if not testing_df.empty:
    # Box plots for reward distribution by schema
    fig = go.Figure()
    
    for schema in testing_df['schema_name'].unique():
        schema_data = testing_df[testing_df['schema_name'] == schema]
        fig.add_trace(go.Box(
            y=schema_data['agent_reward'],
            name=schema,
            boxmean='sd'
        ))
    
    fig.update_layout(
        title="Testing Reward Distribution by Schema",
        yaxis_title="Reward",
        xaxis_title="Training Schema",
        height=500,
        showlegend=True
    )
    fig.show()
    
    # Cooperation rate by opponent type
    fig2 = px.box(testing_df, x='opponent_type', y='cooperation_rate', 
                  color='schema_name',
                  title="Testing Cooperation Rate by Opponent Type and Schema",
                  labels={'opponent_type': 'Opponent Defection Probability',
                         'cooperation_rate': 'Cooperation Rate'})
    fig2.update_layout(height=500)
    fig2.show()
else:
    print("No testing data available for visualization")

## 6. Training vs Testing Comparison

In [14]:
if not training_df.empty and not testing_df.empty:
    # Get final epoch training performance
    max_epoch = training_df['epoch'].max()
    train_final = training_df[training_df['epoch'] == max_epoch].groupby('schema_name').agg({
        'agent_reward': 'mean',
        'cooperation_rate': 'mean'
    }).reset_index()
    train_final.columns = ['schema_name', 'train_reward', 'train_coop_rate']
    
    # Get testing performance
    test_agg = testing_df.groupby('schema_name').agg({
        'agent_reward': 'mean',
        'cooperation_rate': 'mean'
    }).reset_index()
    test_agg.columns = ['schema_name', 'test_reward', 'test_coop_rate']
    
    # Merge and calculate gaps
    comparison = train_final.merge(test_agg, on='schema_name')
    comparison['reward_gap'] = comparison['train_reward'] - comparison['test_reward']
    comparison['coop_gap'] = comparison['train_coop_rate'] - comparison['test_coop_rate']
    
    print("Training vs Testing Performance Comparison:")
    print(comparison.round(4))
    print("\nNote: Positive gap indicates training > testing (potential overfitting)")
    print("      Negative gap indicates testing > training (positive transfer)")
else:
    print("Both training and testing data needed for comparison")

Training vs Testing Performance Comparison:
                     schema_name  train_reward  train_coop_rate  test_reward  \
0  prisoners-dilemma → hawk-dove        2.1335           0.4006       1.7494   

   test_coop_rate  reward_gap  coop_gap  
0          0.3257      0.3841    0.0749  

Note: Positive gap indicates training > testing (potential overfitting)
      Negative gap indicates testing > training (positive transfer)


### 6.1 Side-by-Side Comparison Visualization

In [15]:
if not training_df.empty and not testing_df.empty:
    # Prepare data for grouped bar chart
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=('Reward: Training vs Testing', 'Cooperation Rate: Training vs Testing'),
        horizontal_spacing=0.15
    )
    
    schemas = comparison['schema_name'].tolist()
    
    # Reward comparison
    fig.add_trace(go.Bar(name='Training', x=schemas, y=comparison['train_reward'], 
                         marker_color='lightblue'), row=1, col=1)
    fig.add_trace(go.Bar(name='Testing', x=schemas, y=comparison['test_reward'],
                         marker_color='lightcoral'), row=1, col=1)
    
    # Cooperation rate comparison
    fig.add_trace(go.Bar(name='Training', x=schemas, y=comparison['train_coop_rate'],
                         marker_color='lightblue', showlegend=False), row=1, col=2)
    fig.add_trace(go.Bar(name='Testing', x=schemas, y=comparison['test_coop_rate'],
                         marker_color='lightcoral', showlegend=False), row=1, col=2)
    
    fig.update_layout(
        title_text="Training vs Testing Performance by Schema",
        height=500,
        barmode='group'
    )
    fig.update_xaxes(tickangle=45)
    fig.update_yaxes(title_text="Average Reward", row=1, col=1)
    fig.update_yaxes(title_text="Cooperation Rate", row=1, col=2)
    fig.show()
else:
    print("Both training and testing data needed for visualization")

## 7. Statistical Analysis

In [16]:
# Correlation between training and testing performance
if not training_df.empty and not testing_df.empty:
    print("="*60)
    print("CORRELATION ANALYSIS")
    print("="*60)
    
    # Pearson correlation
    pearson_reward = stats.pearsonr(comparison['train_reward'], comparison['test_reward'])
    pearson_coop = stats.pearsonr(comparison['train_coop_rate'], comparison['test_coop_rate'])
    
    print("\nPearson Correlation (Training vs Testing):")
    print(f"  Reward:          r = {pearson_reward[0]:.4f}, p-value = {pearson_reward[1]:.4f}")
    print(f"  Cooperation:     r = {pearson_coop[0]:.4f}, p-value = {pearson_coop[1]:.4f}")
    
    # Spearman correlation (non-parametric)
    spearman_reward = stats.spearmanr(comparison['train_reward'], comparison['test_reward'])
    spearman_coop = stats.spearmanr(comparison['train_coop_rate'], comparison['test_coop_rate'])
    
    print("\nSpearman Correlation (Training vs Testing):")
    print(f"  Reward:          ρ = {spearman_reward[0]:.4f}, p-value = {spearman_reward[1]:.4f}")
    print(f"  Cooperation:     ρ = {spearman_coop[0]:.4f}, p-value = {spearman_coop[1]:.4f}")
    
    # Scatter plot with regression line
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=('Reward Correlation', 'Cooperation Rate Correlation')
    )
    
    fig.add_trace(go.Scatter(
        x=comparison['train_reward'], y=comparison['test_reward'],
        mode='markers+text', text=comparison['schema_name'],
        textposition='top center', name='Schemas'
    ), row=1, col=1)
    
    fig.add_trace(go.Scatter(
        x=comparison['train_coop_rate'], y=comparison['test_coop_rate'],
        mode='markers+text', text=comparison['schema_name'],
        textposition='top center', name='Schemas', showlegend=False
    ), row=1, col=2)
    
    # Add diagonal reference line (perfect correlation)
    min_reward = min(comparison['train_reward'].min(), comparison['test_reward'].min())
    max_reward = max(comparison['train_reward'].max(), comparison['test_reward'].max())
    fig.add_trace(go.Scatter(
        x=[min_reward, max_reward], y=[min_reward, max_reward],
        mode='lines', line=dict(dash='dash', color='gray'),
        name='Perfect Correlation', showlegend=False
    ), row=1, col=1)
    
    fig.add_trace(go.Scatter(
        x=[0, 1], y=[0, 1],
        mode='lines', line=dict(dash='dash', color='gray'),
        showlegend=False
    ), row=1, col=2)
    
    fig.update_layout(title_text="Training-Testing Correlation Analysis", height=500)
    fig.update_xaxes(title_text="Training Performance")
    fig.update_yaxes(title_text="Testing Performance")
    fig.show()
else:
    print("Both training and testing data needed for correlation analysis")

CORRELATION ANALYSIS


ValueError: `x` and `y` must have length at least 2.

### 7.1 Statistical Tests for Schema Differences

In [None]:
if not testing_df.empty:
    print("="*60)
    print("STATISTICAL TESTS FOR SCHEMA DIFFERENCES")
    print("="*60)
    
    # Prepare groups for ANOVA
    schemas = testing_df['schema_name'].unique()
    if len(schemas) >= 2:
        # ANOVA for reward differences
        reward_groups = [testing_df[testing_df['schema_name'] == s]['reward'].values for s in schemas]
        f_stat_reward, p_value_reward = stats.f_oneway(*reward_groups)
        
        print(f"\nOne-Way ANOVA - Testing Reward by Schema:")
        print(f"  F-statistic = {f_stat_reward:.4f}")
        print(f"  p-value = {p_value_reward:.4f}")
        print(f"  Result: {'Significant' if p_value_reward < 0.05 else 'Not significant'} difference between schemas")
        
        # ANOVA for cooperation rate
        coop_groups = [testing_df[testing_df['schema_name'] == s]['cooperation_rate'].values for s in schemas]
        f_stat_coop, p_value_coop = stats.f_oneway(*coop_groups)
        
        print(f"\nOne-Way ANOVA - Testing Cooperation Rate by Schema:")
        print(f"  F-statistic = {f_stat_coop:.4f}")
        print(f"  p-value = {p_value_coop:.4f}")
        print(f"  Result: {'Significant' if p_value_coop < 0.05 else 'Not significant'} difference between schemas")
        
        # Pairwise t-tests (if significant)
        if p_value_reward < 0.05 and len(schemas) >= 2:
            print("\nPairwise t-tests for Reward (Bonferroni corrected):")
            from itertools import combinations
            pairs = list(combinations(schemas, 2))
            alpha_corrected = 0.05 / len(pairs)
            
            for s1, s2 in pairs:
                group1 = testing_df[testing_df['schema_name'] == s1]['reward']
                group2 = testing_df[testing_df['schema_name'] == s2]['reward']
                t_stat, p_val = stats.ttest_ind(group1, group2)
                sig = "***" if p_val < alpha_corrected else ""
                print(f"  {s1[:20]:<20} vs {s2[:20]:<20}: t={t_stat:6.3f}, p={p_val:.4f} {sig}")
    else:
        print("\nInsufficient schemas for ANOVA (need at least 2)")
else:
    print("No testing data available for statistical tests")

## 8. Cross-Game Performance Heatmap

In [None]:
if not testing_df.empty:
    # Create pivot table for heatmap
    heatmap_data = testing_df.groupby(['schema_name', 'game_name'])['agent_reward'].mean().reset_index()
    heatmap_pivot = heatmap_data.pivot(index='schema_name', columns='game_name', values='agent_reward')
    
    # Create heatmap using plotly
    fig = go.Figure(data=go.Heatmap(
        z=heatmap_pivot.values,
        x=heatmap_pivot.columns,
        y=heatmap_pivot.index,
        colorscale='RdYlGn',
        text=np.round(heatmap_pivot.values, 3),
        texttemplate='%{text}',
        textfont={"size": 12},
        colorbar=dict(title="Avg Reward")
    ))
    
    fig.update_layout(
        title="Average Reward by Schema and Test Game",
        xaxis_title="Test Game",
        yaxis_title="Training Schema",
        height=max(400, len(heatmap_pivot) * 60),
        width=max(600, len(heatmap_pivot.columns) * 100)
    )
    fig.show()
    
    # Cooperation rate heatmap
    coop_data = testing_df.groupby(['schema_name', 'game_name'])['cooperation_rate'].mean().reset_index()
    coop_pivot = coop_data.pivot(index='schema_name', columns='game_name', values='cooperation_rate')
    
    fig2 = go.Figure(data=go.Heatmap(
        z=coop_pivot.values,
        x=coop_pivot.columns,
        y=coop_pivot.index,
        colorscale='Blues',
        text=np.round(coop_pivot.values, 3),
        texttemplate='%{text}',
        textfont={"size": 12},
        colorbar=dict(title="Coop Rate")
    ))
    
    fig2.update_layout(
        title="Cooperation Rate by Schema and Test Game",
        xaxis_title="Test Game",
        yaxis_title="Training Schema",
        height=max(400, len(coop_pivot) * 60),
        width=max(600, len(coop_pivot.columns) * 100)
    )
    fig2.show()
else:
    print("No testing data available for heatmap")

## 9. Performance by Opponent Type

In [None]:
if not testing_df.empty:
    # Performance across different opponent types
    opponent_analysis = testing_df.groupby(['schema_name', 'opponent_type']).agg({
        'agent_reward': ['mean', 'std'],
        'cooperation_rate': ['mean', 'std']
    }).round(4)
    
    print("Performance by Opponent Type:")
    print(opponent_analysis)
    
    # Line plot showing performance vs opponent difficulty with smooth curves
    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=('Reward vs Opponent Defection Prob', 'Cooperation vs Opponent Defection Prob'),
        horizontal_spacing=0.12
    )
    
    for schema in testing_df['schema_name'].unique():
        schema_data = testing_df[testing_df['schema_name'] == schema].groupby('opponent_type').agg({
            'agent_reward': 'mean',
            'cooperation_rate': 'mean'
        }).reset_index()
        
        # Convert opponent_type to numeric for plotting
        schema_data['opp_numeric'] = schema_data['opponent_type'].astype(float)
        schema_data = schema_data.sort_values('opp_numeric')
        
        fig.add_trace(go.Scatter(
            x=schema_data['opp_numeric'], y=schema_data['agent_reward'],
            mode='lines+markers', name=schema,
            line=dict(shape='spline', smoothing=1.3)
        ), row=1, col=1)
        
        fig.add_trace(go.Scatter(
            x=schema_data['opp_numeric'], y=schema_data['cooperation_rate'],
            mode='lines+markers', name=schema, showlegend=False,
            line=dict(shape='spline', smoothing=1.3)
        ), row=1, col=2)
    
    fig.update_xaxes(title_text="Opponent Defection Probability")
    fig.update_yaxes(title_text="Average Reward", row=1, col=1)
    fig.update_yaxes(title_text="Cooperation Rate", row=1, col=2)
    fig.update_layout(
        title_text="Schema Performance Across Opponent Types",
        height=500,
        hovermode='x unified'
    )
    fig.show()
else:
    print("No testing data available for opponent analysis")

### 9.1 Agent Cooperation Rate vs Opponent Cooperation Rate

Analyze how the agent's cooperation strategy adapts to opponent cooperation levels.

In [None]:
if not testing_df.empty:
    # Calculate opponent cooperation rate (1 - defection probability)
    # opponent_type is defection probability, so cooperation = 1 - defection
    
    # Group by schema and opponent type, calculate agent cooperation rate
    coop_analysis = testing_df.groupby(['schema_name', 'opponent_type']).agg({
        'cooperation_rate': 'mean'  # Agent cooperation rate
    }).reset_index()
    
    # Calculate opponent cooperation rate
    coop_analysis['opponent_coop_rate'] = 1 - coop_analysis['opponent_type'].astype(float)
    
    print("Agent Cooperation vs Opponent Cooperation:")
    print(coop_analysis.round(4))
    
    # Create scatter plot with smoothed lines for each schema
    fig = go.Figure()
    
    for schema in coop_analysis['schema_name'].unique():
        schema_data = coop_analysis[coop_analysis['schema_name'] == schema].sort_values('opponent_coop_rate')
        
        # Original data points with line
        fig.add_trace(go.Scatter(
            x=schema_data['opponent_coop_rate'],
            y=schema_data['cooperation_rate'],
            mode='lines+markers',
            name=schema,
            marker=dict(size=10),
            line=dict(width=2, shape='spline', smoothing=1.3)
        ))
    
    # Add diagonal reference line (tit-for-tat strategy)
    fig.add_trace(go.Scatter(
        x=[0, 1], y=[0, 1],
        mode='lines',
        name='Tit-for-Tat (perfect matching)',
        line=dict(dash='dash', color='gray', width=2),
        showlegend=True
    ))
    
    fig.update_layout(
        title="Agent Cooperation Rate vs Opponent Cooperation Rate",
        xaxis_title="Opponent Cooperation Rate",
        yaxis_title="Agent Cooperation Rate",
        height=600,
        width=900,
        hovermode='x unified',
        xaxis=dict(range=[0, 1], dtick=0.1),
        yaxis=dict(range=[0, 1], dtick=0.1),
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="left",
            x=0.01
        )
    )
    
    fig.show()
    
    # Additional analysis: Calculate correlation between agent and opponent cooperation
    print("\n" + "="*60)
    print("COOPERATION MATCHING ANALYSIS")
    print("="*60)
    
    for schema in coop_analysis['schema_name'].unique():
        schema_data = coop_analysis[coop_analysis['schema_name'] == schema]
        correlation = schema_data[['opponent_coop_rate', 'cooperation_rate']].corr().iloc[0, 1]
        
        # Calculate mean absolute deviation from tit-for-tat
        tft_deviation = abs(schema_data['cooperation_rate'] - schema_data['opponent_coop_rate']).mean()
        
        print(f"\n{schema}:")
        print(f"  Correlation: {correlation:.4f}")
        print(f"  Avg deviation from Tit-for-Tat: {tft_deviation:.4f}")
        
else:
    print("No testing data available for cooperation analysis")

## 10. Summary Report and Key Findings

In [None]:
print("="*80)
print(" " * 20 + "COGNITIVE THERAPY AI - EXPERIMENT SUMMARY")
print("="*80)

if not training_df.empty and not testing_df.empty:
    # Best performing schema
    best_schema_train = comparison.loc[comparison['train_reward'].idxmax(), 'schema_name']
    best_schema_test = comparison.loc[comparison['test_reward'].idxmax(), 'schema_name']
    
    print("\n📊 OVERALL PERFORMANCE")
    print("-" * 80)
    print(f"Total Experiments Analyzed: {len(exp_metadata)}")
    print(f"Training Records: {len(training_df):,}")
    print(f"Testing Records: {len(testing_df):,}")
    print(f"Unique Training Schemas: {training_df['schema_name'].nunique()}")
    
    print("\n🏆 BEST PERFORMING SCHEMAS")
    print("-" * 80)
    print(f"Training Phase:  {best_schema_train}")
    print(f"  → Reward: {comparison.loc[comparison['schema_name'] == best_schema_train, 'train_reward'].values[0]:.4f}")
    print(f"\nTesting Phase:   {best_schema_test}")
    print(f"  → Reward: {comparison.loc[comparison['schema_name'] == best_schema_test, 'test_reward'].values[0]:.4f}")
    
    # Generalization gap
    print("\n📈 GENERALIZATION ANALYSIS")
    print("-" * 80)
    avg_train_reward = comparison['train_reward'].mean()
    avg_test_reward = comparison['test_reward'].mean()
    generalization_gap = avg_train_reward - avg_test_reward
    
    print(f"Average Training Reward:  {avg_train_reward:.4f}")
    print(f"Average Testing Reward:   {avg_test_reward:.4f}")
    print(f"Generalization Gap:       {generalization_gap:.4f}")
    if generalization_gap > 0:
        print(f"  → Models show {generalization_gap:.2%} performance drop in testing")
    else:
        print(f"  → Models show positive transfer (+{abs(generalization_gap):.2%} improvement)")
    
    # Schema with best generalization
    comparison['generalization_score'] = comparison['test_reward'] / comparison['train_reward']
    best_generalization = comparison.loc[comparison['generalization_score'].idxmax(), 'schema_name']
    gen_score = comparison['generalization_score'].max()
    
    print(f"\nBest Generalization:      {best_generalization}")
    print(f"  → Generalization Score: {gen_score:.4f} (test/train ratio)")
    
    # Training convergence
    if 'epoch' in training_df.columns:
        max_epoch = training_df['epoch'].max()
        early_epochs = training_df[training_df['epoch'] <= 10]
        late_epochs = training_df[training_df['epoch'] >= max_epoch - 10]
        
        avg_early_loss = early_epochs['total_loss'].mean()
        avg_late_loss = late_epochs['total_loss'].mean()
        loss_improvement = (avg_early_loss - avg_late_loss) / avg_early_loss
        
        print("\n📉 TRAINING CONVERGENCE")
        print("-" * 80)
        print(f"Total Training Epochs:    {max_epoch}")
        print(f"Early Loss (Epoch 1-10):  {avg_early_loss:.4f}")
        print(f"Late Loss (Final 10):     {avg_late_loss:.4f}")
        print(f"Loss Improvement:         {loss_improvement:.2%}")
    
    # Opponent adaptation
    if 'opponent_type' in testing_df.columns:
        print("\n🎯 OPPONENT ADAPTATION")
        print("-" * 80)
        for opp_type in sorted(testing_df['opponent_type'].unique()):
            opp_reward = testing_df[testing_df['opponent_type'] == opp_type]['reward'].mean()
            opp_coop = testing_df[testing_df['opponent_type'] == opp_type]['cooperation_rate'].mean()
            print(f"Opponent Defection {opp_type}:  Reward={opp_reward:.4f}, Coop={opp_coop:.4f}")
    
    print("\n" + "="*80)
    print(" " * 25 + "END OF SUMMARY REPORT")
    print("="*80)
    
elif not training_df.empty:
    print("\n⚠️  Only training data available. Run experiments with testing phase for full analysis.")
    print(f"Training records analyzed: {len(training_df):,}")
    
elif not testing_df.empty:
    print("\n⚠️  Only testing data available. Training data needed for comparative analysis.")
    print(f"Testing records analyzed: {len(testing_df):,}")
    
else:
    print("\n❌ No experiment data found. Please run experiments first using:")
    print("   python main_experiment.py --config config/default_config.json")