In [2]:
import os
import yaml
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm

# 1. Load Cricsheet YAMLs
def load_matches(data_dir):
    files = glob.glob(os.path.join(data_dir, '*.yaml'))
    print(f"🔍 Found {len(files)} YAML files in: {data_dir}")
    matches = []
    for file in tqdm(files, desc="Parsing YAML matches"):
        with open(file, 'r') as f:
            try:
                match = yaml.safe_load(f)
                matches.append(match)
            except Exception as e:
                print(f"⚠️ Failed to load {file}: {e}")
                continue
    return matches

# 2. Debug and estimate runs per win from run-margin results
def estimate_runs_per_win_debug(matches):
    run_margins = []
    total_matches = len(matches)
    matches_with_outcome = 0
    matches_with_by = 0
    matches_with_runs = 0
    matches_with_wickets = 0
    
    print("🔍 Debugging outcome structure...")
    
    for i, match in enumerate(matches):
        outcome = match.get('info', {}).get('outcome', {})
        
        if outcome:
            matches_with_outcome += 1
            
            # Debug: Print first few outcome structures
            if i < 3:
                print(f"Sample outcome {i+1}: {outcome}")
            
            if 'by' in outcome:
                matches_with_by += 1
                by_info = outcome['by']
                
                if 'runs' in by_info:
                    matches_with_runs += 1
                    margin = by_info['runs']
                    run_margins.append(margin)
                    
                    # Debug: Print first few margins
                    if len(run_margins) <= 5:
                        print(f"Run margin found: {margin}")
                        
                elif 'wickets' in by_info:
                    matches_with_wickets += 1
    
    print(f"\n📊 Debug Statistics:")
    print(f"  Total matches: {total_matches}")
    print(f"  Matches with outcome: {matches_with_outcome}")
    print(f"  Matches with 'by' field: {matches_with_by}")
    print(f"  Matches won by runs: {matches_with_runs}")
    print(f"  Matches won by wickets: {matches_with_wickets}")
    
    if not run_margins:
        print("⚠️ No run-margin victories found.")
        return None, []
    
    print(f"  📈 Run margins found: {len(run_margins)}")
    print(f"  📈 Min margin: {min(run_margins)}")
    print(f"  📈 Max margin: {max(run_margins)}")
    print(f"  📈 Sample margins: {run_margins[:10]}")
    
    return np.mean(run_margins), run_margins

# 3. Enhanced analysis with all outcome types
def analyze_all_outcomes(matches):
    outcomes_summary = {
        'by_runs': [],
        'by_wickets': [],
        'tie': 0,
        'no_result': 0,
        'other': []
    }
    
    for match in matches:
        outcome = match.get('info', {}).get('outcome', {})
        
        if not outcome:
            outcomes_summary['no_result'] += 1
        elif 'result' in outcome and outcome['result'] == 'tie':
            outcomes_summary['tie'] += 1
        elif 'by' in outcome:
            by_info = outcome['by']
            if 'runs' in by_info:
                outcomes_summary['by_runs'].append(by_info['runs'])
            elif 'wickets' in by_info:
                outcomes_summary['by_wickets'].append(by_info['wickets'])
            else:
                outcomes_summary['other'].append(by_info)
        else:
            outcomes_summary['other'].append(outcome)
    
    return outcomes_summary

# 4. Main pipeline with enhanced debugging
def run_pipeline(data_dir='./odis'):
    matches = load_matches(data_dir)
    if not matches:
        print("❌ No matches loaded.")
        return
    
    men_matches = [m for m in matches if m.get('info', {}).get('gender') == 'male']
    women_matches = [m for m in matches if m.get('info', {}).get('gender') == 'female']
    
    print(f"📦 Total matches: {len(matches)}")
    print(f"👨 Men's matches: {len(men_matches)}")
    print(f"👩 Women's matches: {len(women_matches)}")
    
    # Debug men's matches
    print(f"\n🔍 DEBUGGING MEN'S MATCHES:")
    men_rpw, men_margins = estimate_runs_per_win_debug(men_matches)
    men_outcomes = analyze_all_outcomes(men_matches)
    
    print(f"\n📊 Men's Outcome Breakdown:")
    print(f"  Wins by runs: {len(men_outcomes['by_runs'])}")
    print(f"  Wins by wickets: {len(men_outcomes['by_wickets'])}")
    print(f"  Ties: {men_outcomes['tie']}")
    print(f"  No results: {men_outcomes['no_result']}")
    print(f"  Other outcomes: {len(men_outcomes['other'])}")
    
    # Debug women's matches
    print(f"\n🔍 DEBUGGING WOMEN'S MATCHES:")
    women_rpw, women_margins = estimate_runs_per_win_debug(women_matches)
    women_outcomes = analyze_all_outcomes(women_matches)
    
    print(f"\n📊 Women's Outcome Breakdown:")
    print(f"  Wins by runs: {len(women_outcomes['by_runs'])}")
    print(f"  Wins by wickets: {len(women_outcomes['by_wickets'])}")
    print(f"  Ties: {women_outcomes['tie']}")
    print(f"  No results: {women_outcomes['no_result']}")
    print(f"  Other outcomes: {len(women_outcomes['other'])}")
    
    # Final results
    print("\n🎯 Average Run Margins (when teams win by runs):")
    if men_rpw is not None:
        print(f"  🏏 Men's ODIs: {men_rpw:.2f} runs")
    else:
        print("  🏏 Men's ODIs: Not available")
    
    if women_rpw is not None:
        print(f"  🏏 Women's ODIs: {women_rpw:.2f} runs")
    else:
        print("  🏏 Women's ODIs: Not available")

# 5. Execute
run_pipeline()

🔍 Found 2980 YAML files in: ./odis


Parsing YAML matches: 100%|████████████████████████████████████████████████████| 2980/2980 [15:38<00:00,  3.17it/s]


📦 Total matches: 2980
👨 Men's matches: 2460
👩 Women's matches: 520

🔍 DEBUGGING MEN'S MATCHES:
🔍 Debugging outcome structure...
Sample outcome 1: {'winner': 'Nepal', 'by': {'wickets': 3}}
Sample outcome 2: {'result': 'no result'}
Sample outcome 3: {'winner': 'Pakistan', 'by': {'runs': 12}}
Run margin found: 12
Run margin found: 40
Run margin found: 5
Run margin found: 110
Run margin found: 204

📊 Debug Statistics:
  Total matches: 2460
  Matches with outcome: 2460
  Matches with 'by' field: 2334
  Matches won by runs: 1139
  Matches won by wickets: 1195
  📈 Run margins found: 1139
  📈 Min margin: 1
  📈 Max margin: 317
  📈 Sample margins: [12, 40, 5, 110, 204, 91, 31, 88, 148, 121]

📊 Men's Outcome Breakdown:
  Wins by runs: 1139
  Wins by wickets: 1195
  Ties: 26
  No results: 0
  Other outcomes: 100

🔍 DEBUGGING WOMEN'S MATCHES:
🔍 Debugging outcome structure...
Sample outcome 1: {'winner': 'South Africa', 'by': {'wickets': 8}}
Sample outcome 2: {'winner': 'Thailand', 'by': {'runs': 8}