In [4]:
import os
import yaml
import glob
import pandas as pd
from tqdm import tqdm
import numpy as np

# --- Step 1: Load matches ---
def load_matches(data_dir, max_files=10000):
    """Parse the ``*.yaml`` match files found directly inside ``data_dir``.

    Args:
        data_dir: Directory that is searched (non-recursively) for ``*.yaml``.
        max_files: Maximum number of files to read. Files are taken in sorted
            path order so the selection is reproducible across runs.

    Returns:
        list: One parsed YAML document per file that could be parsed, in
        sorted path order. Files that are not valid YAML are skipped and
        reported on stdout.

    Raises:
        OSError: If a matched file cannot be opened (not swallowed, same as
            before).
    """
    # glob returns files in OS-dependent order; sort before truncating so the
    # max_files cut-off always selects the same files.
    files = sorted(glob.glob(os.path.join(data_dir, '*.yaml')))[:max_files]

    # Prefer the libyaml-backed safe loader when PyYAML was built with it; it
    # uses the same safe constructor/resolver but parses much faster.
    loader = getattr(yaml, 'CSafeLoader', yaml.SafeLoader)

    matches = []
    skipped = 0
    for file in tqdm(files, desc="Loading Test YAMLs"):
        # Binary mode lets PyYAML do its own encoding detection (BOM, else
        # UTF-8) instead of depending on the platform's locale encoding.
        with open(file, 'rb') as f:
            try:
                match = yaml.load(f, Loader=loader)
            except yaml.YAMLError as exc:
                # Only parse/decoding problems are tolerated; anything else
                # (including KeyboardInterrupt) propagates to the caller.
                skipped += 1
                print(f"⚠️ Skipping {file}: {exc}")
                continue
        matches.append(match)

    if skipped:
        print(f"⚠️ Skipped {skipped} unparseable YAML file(s)")
    return matches

# --- Step 2: Extract contextual bowling data ---
def _extract_match_rows(match):
    """Return one row dict per delivery of a single parsed match document."""
    info = match.get('info', {})
    gender = info.get('gender', 'NA')
    venue = info.get('venue', 'NA')

    # A missing or unparseable first date yields year=None instead of raising,
    # so the match is kept like it is for a missing gender or venue.
    dates = info.get('dates') or [None]
    parsed_date = pd.to_datetime(dates[0], errors='coerce')
    year = None if pd.isna(parsed_date) else parsed_date.year

    rows = []
    for innings in match.get('innings', []):
        for details in innings.values():
            # The innings' 'team' is the batting side, i.e. the side the
            # bowler is bowling against.
            batting_team = details.get('team', 'NA')
            for delivery in details.get('deliveries', []):
                for ball_data in delivery.values():
                    rows.append({
                        'bowler': ball_data.get('bowler'),
                        'runs_conceded': ball_data.get('runs', {}).get('total', 0),
                        'wicket': int('wicket' in ball_data),
                        'year': year,
                        'venue': venue,
                        'opposition': batting_team,
                        'gender': gender
                    })
    return rows


def extract_bowling_data(matches):
    """Flatten parsed match documents into one row per delivery.

    Args:
        matches: Iterable of parsed match documents (dicts with ``info`` and
            ``innings`` keys), e.g. the list returned by ``load_matches``.

    Returns:
        pandas.DataFrame: Columns ``bowler``, ``runs_conceded``, ``wicket``
        (0/1), ``year``, ``venue``, ``opposition`` (the batting side faced by
        the bowler) and ``gender``. The columns are present even when no rows
        were extracted.

    Notes:
        A match whose structure cannot be walked is skipped as a whole, so it
        contributes no partial rows. The number of skipped matches is printed.

        NOTE(review): every delivery entry counts as a ball, ``runs.total`` is
        charged in full, and any ``wicket`` key counts as a wicket regardless
        of dismissal kind. Confirm whether extras and run outs should be
        excluded from the bowler's figures.
    """
    columns = ['bowler', 'runs_conceded', 'wicket', 'year',
               'venue', 'opposition', 'gender']
    data = []
    skipped = 0
    for match in matches:
        try:
            rows = _extract_match_rows(match)
        except (AttributeError, KeyError, IndexError, TypeError, ValueError):
            # Structural problems only (e.g. None document, non-dict nodes);
            # unrelated errors and interrupts are no longer swallowed.
            skipped += 1
            continue
        # Extend only after the whole match parsed, so a failure midway does
        # not leave a truncated match in the output.
        data.extend(rows)

    if skipped:
        print(f"⚠️ Skipped {skipped} malformed match(es)")
    return pd.DataFrame(data, columns=columns)

# --- Step 3: Compute WAR ---
def compute_bowling_war(df, runs_per_win=10, replacement_quantile=0.2, min_balls=0):
    """Aggregate per-delivery rows into a per-bowler WAR table, by gender.

    For each gender, a bowler's runs conceded are compared with what a
    replacement-level bowler would concede over the same number of balls; the
    difference is divided by ``runs_per_win``.

    Args:
        df: Per-delivery frame with columns ``bowler``, ``runs_conceded``,
            ``wicket`` and ``gender``.
        runs_per_win: Runs saved that count as one win. Must be positive.
        replacement_quantile: Replacement level is the runs-per-ball value at
            quantile ``1 - replacement_quantile`` of the bowler pool.
        min_balls: Only bowlers with at least this many balls form the pool
            used to derive the replacement level. The default of 0 uses every
            bowler. If no bowler qualifies, all bowlers are used instead.

    Returns:
        pandas.DataFrame: Columns ``bowler``, ``balls``, ``runs_conceded``,
        ``wickets``, ``runs_per_ball``, ``replacement_runs``, ``bowling_war``
        and ``gender``, sorted by ``bowling_war`` descending. Empty (with
        these columns) when ``df`` has no rows.

    Raises:
        ValueError: If ``runs_per_win`` is not positive.
    """
    if runs_per_win <= 0:
        # Division by zero would otherwise silently produce inf/NaN WAR values.
        raise ValueError("runs_per_win must be positive")

    columns = ['bowler', 'balls', 'runs_conceded', 'wickets',
               'runs_per_ball', 'replacement_runs', 'bowling_war', 'gender']
    if df.empty:
        return pd.DataFrame(columns=columns)

    result = []
    for gender, subdf in df.groupby('gender'):
        agg = subdf.groupby('bowler').agg(
            balls=('runs_conceded', 'count'),
            runs_conceded=('runs_conceded', 'sum'),
            wickets=('wicket', 'sum')
        ).reset_index()

        agg['runs_per_ball'] = agg['runs_conceded'] / agg['balls']

        # Replacement level comes from the qualifying pool only, so bowlers
        # with a handful of balls need not distort the quantile.
        pool = agg.loc[agg['balls'] >= min_balls, 'runs_per_ball']
        if pool.empty:
            pool = agg['runs_per_ball']
        # Higher runs per ball is worse, hence the upper quantile.
        replacement_rpb = pool.quantile(1 - replacement_quantile)

        agg['replacement_runs'] = agg['balls'] * replacement_rpb
        agg['bowling_war'] = (agg['replacement_runs'] - agg['runs_conceded']) / runs_per_win
        agg['gender'] = gender
        result.append(agg)

    if not result:
        # Every row had a missing gender, so groupby produced no groups.
        return pd.DataFrame(columns=columns)

    # ignore_index avoids duplicate index labels across gender groups.
    combined = pd.concat(result, ignore_index=True)
    return combined.sort_values(by='bowling_war', ascending=False)

# --- Step 4: Run pipeline ---
def run_pipeline(data_dir='./data/tests', output_csv='test_bowling_war.csv'):
    """Load matches, extract deliveries, compute bowling WAR and save it.

    Args:
        data_dir: Directory handed to ``load_matches``.
        output_csv: Path the WAR table is written to (without the index).

    Returns:
        The WAR DataFrame, or ``None`` when no deliveries were extracted.
    """
    deliveries = extract_bowling_data(load_matches(data_dir))

    if not deliveries.empty:
        print(f"✅ Extracted {len(deliveries)} deliveries")

        rankings = compute_bowling_war(deliveries)
        rankings.to_csv(output_csv, index=False)

        # Show a short leaderboard on stdout.
        print("✅ Top bowlers by WAR:")
        print(rankings.head(10))
        return rankings

    # Nothing to rank: report it and fall through with no result.
    print("❌ No data extracted. Check YAMLs.")
    return None

# --- Execute ---
# Runs the full pipeline with its default arguments (data_dir='./data/tests',
# output_csv='test_bowling_war.csv') as soon as this code is executed; there is
# no __main__ guard.
# NOTE(review): this looks like a notebook cell, so the returned DataFrame is
# presumably rendered as the cell's output — confirm before wrapping the call.
run_pipeline()

Loading Test YAMLs: 100%|████████████████████████████████████████████████████████| 869/869 [16:42<00:00,  1.15s/it]


✅ Extracted 1683114 deliveries
✅ Top bowlers by WAR:
              bowler  balls  runs_conceded  wickets  runs_per_ball  \
290      JM Anderson  39114          18880      705       0.482692   
486          NM Lyon  34360          17340      577       0.504657   
597        SCJ Broad  33896          17575      616       0.518498   
532         R Ashwin  27140          13302      553       0.490125   
229     HMRKB Herath  23563          11167      416       0.473921   
542        RA Jadeja  18760           8199      326       0.437047   
688       TG Southee  23557          12210      400       0.518317   
162       DL Vettori  14599           6328      190       0.433454   
233  Harbhajan Singh  17414           8739      240       0.501838   
302     JR Hazlewood  15254           7295      294       0.478235   

     replacement_runs  bowling_war gender  
290      26886.089665   800.608966   male  
486      23618.296285   627.829628   male  
597      23299.353052   572.435305   male  


Unnamed: 0,bowler,balls,runs_conceded,wickets,runs_per_ball,replacement_runs,bowling_war,gender
290,JM Anderson,39114,18880,705,0.482692,26886.089665,800.608966,male
486,NM Lyon,34360,17340,577,0.504657,23618.296285,627.829628,male
597,SCJ Broad,33896,17575,616,0.518498,23299.353052,572.435305,male
532,R Ashwin,27140,13302,553,0.490125,18655.429603,535.342960,male
229,HMRKB Herath,23563,11167,416,0.473921,16196.679725,502.967973,male
...,...,...,...,...,...,...,...,...
373,M Prasidh Krishna,545,481,13,0.882569,374.620823,-10.637918,male
73,B White,461,440,5,0.954447,316.881100,-12.311890,male
727,VR Aaron,1223,980,20,0.801308,840.662874,-13.933713,male
493,Nahid Rana,1361,1079,24,0.792799,935.520991,-14.347901,male
