### Naive WAR (bowling)

In [7]:
import os
import yaml
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm

# 1. Load Cricsheet YAMLs
def load_matches(data_dir, max_files=10000):
    """Parse up to ``max_files`` ``*.yaml`` match files found in ``data_dir``.

    Files are visited in sorted path order so the ``max_files`` cut-off picks
    the same subset on every run (``glob`` gives no ordering guarantee).
    A file whose content cannot be decoded or parsed is skipped, and the
    number of skipped files is printed once loading finishes.

    Args:
        data_dir: Directory holding the match files (not searched recursively).
        max_files: Upper bound on the number of files to load.

    Returns:
        A list with one parsed YAML document per successfully loaded file.
        An empty file parses to ``None`` and is kept as such.
    """
    files = sorted(glob.glob(os.path.join(data_dir, '*.yaml')))[:max_files]

    # Use the libyaml-backed safe loader when PyYAML was built with it: it is
    # much faster than the pure-Python SafeLoader and equally restricted to
    # plain data types. Fall back to SafeLoader otherwise.
    loader = getattr(yaml, 'CSafeLoader', yaml.SafeLoader)

    matches = []
    skipped = 0
    for file in tqdm(files, desc="Loading YAMLs"):
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(file, 'r', encoding='utf-8') as f:
            try:
                match = yaml.load(f, Loader=loader)
            except Exception:
                # Best-effort load: one bad file must not abort the batch.
                # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
                # and SystemExit propagate during a long-running load.
                skipped += 1
                continue
        matches.append(match)

    if skipped:
        print(f"Skipped {skipped} file(s) that could not be parsed")
    return matches

# 2. Extract bowling records
def extract_bowling_data(matches):
    """Flatten parsed matches into a DataFrame with one row per delivery.

    Each row holds ``match_id``, ``team_bowling``, ``bowler``,
    ``runs_conceded`` (the delivery's ``runs.total``), ``wicket`` (1 when the
    delivery has a ``wicket`` entry, else 0) and ``gender``.

    NOTE(review): ``runs.total`` presumably includes extras not charged to the
    bowler, and a ``wicket`` entry presumably includes run-outs; both are kept
    as-is to preserve the naive definition. Confirm this is intended.

    Args:
        matches: Iterable of parsed match documents (dicts). Entries that are
            not dicts (e.g. ``None`` from an empty file) are ignored.

    Returns:
        A ``pandas.DataFrame`` with the six columns above. It is empty, but
        still has those columns, when no delivery could be extracted.
    """
    columns = ['match_id', 'team_bowling', 'bowler',
               'runs_conceded', 'wicket', 'gender']
    records = []
    for match in matches:
        if not isinstance(match, dict):
            continue
        try:
            records.extend(_match_bowling_records(match))
        except (AttributeError, TypeError, KeyError, IndexError):
            # Structurally malformed match: skip it entirely rather than
            # aborting the whole extraction.
            continue
    return pd.DataFrame(records, columns=columns)


def _match_bowling_records(match):
    """Return the per-delivery bowling rows of one parsed match dict."""
    info = match.get('info', {})
    gender = info.get('gender', 'unknown')
    teams = info.get('teams', [])

    # YAML parses an unquoted ``2020-01-01`` into a ``datetime.date``; str()
    # makes the concatenation work for both date objects and plain strings.
    # ``or`` also covers a present-but-empty ``dates`` list.
    dates = info.get('dates') or ['NA']
    match_id = str(dates[0]) + "_" + "_vs_".join(teams)

    rows = []
    for innings in match.get('innings', []):
        for details in innings.values():
            # The innings' ``team`` field names the batting side, so the
            # bowling side is the other team listed for the match.
            batting_team = details.get('team', 'NA')
            opponents = [t for t in teams if t != batting_team]
            bowling_team = opponents[0] if len(opponents) == 1 else 'NA'

            for delivery in details.get('deliveries', []):
                for ball_data in delivery.values():
                    bowler = ball_data.get('bowler')
                    if not bowler:
                        continue
                    rows.append({
                        'match_id': match_id,
                        'team_bowling': bowling_team,
                        'bowler': bowler,
                        'runs_conceded': ball_data.get('runs', {}).get('total', 0),
                        'wicket': int('wicket' in ball_data),
                        'gender': gender,
                    })
    return rows

# 3. Compute bowling WAR
def compute_bowling_war(df, runs_per_win=10, replacement_quantile=0.2):
    """Aggregate deliveries per bowler and score each against a replacement rate.

    The replacement rate is the ``1 - replacement_quantile`` quantile of the
    per-bowler runs-per-ball values. A bowler's WAR is the number of runs
    saved relative to that rate over the balls they bowled, divided by
    ``runs_per_win``.

    Args:
        df: Delivery-level frame with ``bowler``, ``runs_conceded`` and
            ``wicket`` columns.
        runs_per_win: Runs treated as equivalent to one win.
        replacement_quantile: Share of bowlers regarded as at or below
            replacement level.

    Returns:
        One row per bowler, sorted by ``bowling_war`` in descending order.
    """
    totals = (
        df.groupby('bowler')
        .agg(
            balls=('runs_conceded', 'count'),
            runs_conceded=('runs_conceded', 'sum'),
            wickets=('wicket', 'sum'),
        )
        .reset_index()
    )

    totals['runs_per_ball'] = totals['runs_conceded'] / totals['balls']

    # Runs-per-ball at the replacement threshold (higher means worse).
    baseline_rate = totals['runs_per_ball'].quantile(1 - replacement_quantile)

    # Runs a replacement-level bowler would concede over the same workload.
    totals['replacement_runs'] = totals['balls'] * baseline_rate
    runs_saved = totals['replacement_runs'] - totals['runs_conceded']
    totals['bowling_war'] = runs_saved / runs_per_win

    return totals.sort_values(by='bowling_war', ascending=False)

# 4. Full pipeline
def run_pipeline(data_dir='./odis'):
    """Load matches, extract deliveries, and report bowling WAR per gender.

    For each of 'male' and 'female' that has data, the WAR table is written
    to ``odi_bowling_war_<gender>.csv`` in the working directory and its top
    ten rows are printed. Returns ``None``.

    Args:
        data_dir: Directory containing the match YAML files.
    """
    deliveries = extract_bowling_data(load_matches(data_dir))

    # Nothing to score: report and stop.
    if deliveries.empty:
        print("❌ No bowling data extracted.")
        return

    delivery_count = len(deliveries)
    print(f"✅ Extracted {delivery_count} bowling deliveries")

    for gender in ('male', 'female'):
        subset = deliveries[deliveries['gender'] == gender]
        if not subset.empty:
            ranking = compute_bowling_war(subset)
            output_csv = f'odi_bowling_war_{gender}.csv'
            ranking.to_csv(output_csv, index=False)
            print(f"\n✅ Top bowlers by WAR ({gender}):")
            print(ranking.head(10))
            print(f"📄 Saved to: {output_csv}")
        else:
            print(f"⚠️ No data for gender = {gender}")

# Execute: run the full bowling-WAR pipeline with its default data directory
# ('./odis'); this prints the top bowlers and writes one CSV per gender.
run_pipeline()

Loading YAMLs: 100%|███████████████████████████████████████████████████████████| 2980/2980 [41:39<00:00,  1.19it/s]


✅ Extracted 798157 bowling deliveries

✅ Top bowlers by WAR (male):
             bowler  balls  runs_conceded  wickets  runs_per_ball  \
145      BM Scholtz   3073           1707       92       0.555483   
616        MRJ Watt   3590           2530       98       0.704735   
869  SN Netravalkar   3051           2042       96       0.669289   
422       JJ Bumrah   3720           2846      126       0.765054   
524   Kuldeep Yadav   5392           4562      175       0.846068   
558  M Muralitharan   2177           1313       65       0.603124   
598      MJ Santner   3832           3012       83       0.786013   
694      NP Kenjige   2760           1950       60       0.706522   
826    S Lamichhane   3186           2389      126       0.749843   
864      SM Pollock   2184           1371       55       0.627747   

     replacement_runs  bowling_war  
145       3134.374554   142.737455  
616       3661.700179   113.170018  
869       3111.935166   106.993517  
422       3794.296564   