### ODI Batting WAR

In [3]:
import os
import yaml
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm
from pygam import LinearGAM, s, f

# --- Step 1: Load YAML matches ---
def load_matches(data_dir, max_files=10000):
    """Parse the ``*.yaml`` files found directly inside ``data_dir``.

    Args:
        data_dir: Directory that is globbed for ``*.yaml`` files.
        max_files: Upper bound on the number of files that are read.

    Returns:
        A list with one parsed YAML document per successfully loaded file.
        Files that fail to parse are skipped silently (best-effort load).
    """
    # glob returns paths in filesystem order; sorting makes the max_files
    # cut select the same files on every run and every machine.
    paths = sorted(glob.glob(os.path.join(data_dir, '*.yaml')))[:max_files]

    # Use the libyaml-backed safe loader when PyYAML was built with it; it
    # yields the same objects as SafeLoader but parses much faster.
    loader = getattr(yaml, 'CSafeLoader', yaml.SafeLoader)

    matches = []
    # The handle is deliberately not called ``f``: that name is the pygam
    # factor term imported at the top of the file.
    for path in tqdm(paths, desc="Loading YAMLs"):
        with open(path, 'r', encoding='utf-8') as handle:
            try:
                match = yaml.load(handle, Loader=loader)
            except (yaml.YAMLError, ValueError):
                # ValueError covers undecodable bytes (UnicodeDecodeError)
                # and scalars that cannot be constructed, e.g. bad dates.
                continue
        matches.append(match)
    return matches

# --- Step 2: Extract contextual ball-by-ball data ---
def _odi_phase(over):
    """Return the phase label for a zero-based over index."""
    if over < 10:
        return 'powerplay'
    if over < 40:
        return 'middle'
    return 'death'


def extract_contextual_balls(matches):
    """Flatten parsed match documents into one row per delivery.

    Each match is expected to be a mapping with an ``info`` section
    (``gender``, ``dates``) and an ``innings`` list whose entries map an
    innings name to a mapping holding ``deliveries``. Every delivery maps a
    numeric ``over.ball`` key to the ball details (``batsman``,
    ``runs.batsman``).

    Args:
        matches: Iterable of parsed match documents.

    Returns:
        A DataFrame with the columns ``batsman``, ``runs_batsman``, ``year``,
        ``over``, ``batting_position``, ``phase`` and ``gender``. The columns
        are present even when no delivery could be extracted.

    Malformed matches or deliveries are skipped (best-effort); rows already
    collected from a match before the malformed part are kept.
    """
    columns = [
        'batsman', 'runs_batsman', 'year', 'over',
        'batting_position', 'phase', 'gender',
    ]
    data = []
    for match in matches:
        # ``except Exception`` (not a bare except) keeps the skip-on-bad-data
        # behaviour while letting KeyboardInterrupt/SystemExit propagate.
        try:
            info = match.get('info', {})
            gender = info.get('gender', 'unknown')
            date = info.get('dates', ['NA'])[0]
            year = pd.to_datetime(date).year
            innings_data = match.get('innings', [])

            for innings in innings_data:
                for details in innings.values():
                    deliveries = details.get('deliveries', [])
                    # Position = order in which a batter first faces a ball
                    # within this innings.
                    batter_order = {}

                    for delivery in deliveries:
                        for ball_key, ball_data in delivery.items():
                            try:
                                over = int(ball_key)
                                # Recover the digit after the decimal point;
                                # round() absorbs float error (10.3 - 10).
                                ball = int(round((ball_key - over) * 10))

                                batsman = ball_data.get('batsman')
                                runs_batsman = ball_data.get('runs', {}).get('batsman', 0)

                                if not batsman:
                                    continue

                                pos = batter_order.setdefault(
                                    batsman, len(batter_order) + 1
                                )

                                data.append({
                                    'batsman': batsman,
                                    'runs_batsman': runs_batsman,
                                    'year': year,
                                    'over': over + ball / 10,
                                    'batting_position': pos,
                                    'phase': _odi_phase(over),
                                    'gender': gender
                                })
                            except Exception:
                                continue
        except Exception:
            continue
    return pd.DataFrame(data, columns=columns)

# --- Step 3: Train GAM model ---
def train_gam(df):
    """Fit a LinearGAM for batsman runs per ball and attach its predictions.

    The model uses a spline term on ``over`` and factor terms on
    ``batting_position`` and ``year``.

    Args:
        df: Ball-level frame with the columns ``runs_batsman``, ``over``,
            ``batting_position`` and ``year``.

    Returns:
        A tuple ``(df, gam)``: a filtered copy of the input (rows with a
        non-null ``runs_batsman``) carrying an added ``expected_runs``
        column, and the fitted model. The caller's frame is not modified.
    """
    # Explicit copy: the boolean filter may return a slice of the caller's
    # frame, and assigning columns on it triggers SettingWithCopyWarning and
    # leaves it ambiguous which object is written to.
    df = df[df['runs_batsman'].notna()].copy()
    df['batting_position'] = df['batting_position'].astype('category')
    df['year'] = df['year'].astype('category')

    # Column order matters: the term indices below refer to positions in X.
    X = df[['over', 'batting_position', 'year']]
    y = df['runs_batsman']

    gam = LinearGAM(s(0) + f(1) + f(2)).fit(X, y)
    df['expected_runs'] = gam.predict(X)
    return df, gam

# --- Step 4: Compute WAR ---
def compute_war(df, replacement_ratio=0.85, runs_per_win=10):
    """Aggregate ball-level data into a per-batsman batting WAR table.

    Args:
        df: Ball-level frame with the columns ``batsman``, ``runs_batsman``
            and ``expected_runs``.
        replacement_ratio: Fraction of the expected runs attributed to a
            replacement-level batter.
        runs_per_win: Runs above replacement that count as one win. The
            default of 10 matches the previously hard-coded divisor.

    Returns:
        One row per batsman with ``actual_runs``, ``expected_runs``,
        ``balls``, ``replacement_exp`` and ``batting_war``, sorted by
        ``batting_war`` in descending order.

    Raises:
        ValueError: If ``runs_per_win`` is not positive.
    """
    if runs_per_win <= 0:
        raise ValueError("runs_per_win must be positive")

    agg = df.groupby('batsman').agg(
        actual_runs=('runs_batsman', 'sum'),
        expected_runs=('expected_runs', 'sum'),
        balls=('runs_batsman', 'count')
    ).reset_index()
    # Replacement level is modelled as a fixed share of the expected runs.
    agg['replacement_exp'] = agg['expected_runs'] * replacement_ratio
    agg['batting_war'] = (agg['actual_runs'] - agg['replacement_exp']) / runs_per_win
    return agg.sort_values(by='batting_war', ascending=False)

# --- Step 5: Full pipeline per gender ---
def run_pipeline(data_dir='./odis'):
    """Run load -> extract -> model -> WAR for each gender and save the CSVs.

    Reads the match files from ``data_dir``, then processes the 'male' and
    'female' deliveries separately. For each gender that has data, a WAR
    table is written to ``contextual_batting_war_<gender>.csv`` in the
    working directory and its first ten rows are printed. Returns ``None``.
    """
    df = extract_contextual_balls(load_matches(data_dir))

    # Nothing to model when extraction produced no rows at all.
    if df.empty:
        print("❌ No data extracted.")
        return

    print(f"✅ Total deliveries: {len(df)}")

    for gender in ('male', 'female'):
        df_gender = df.loc[df['gender'] == gender]

        if df_gender.empty:
            print(f"⚠️ No data for gender = {gender}")
        else:
            print(f"\n📊 Processing: {gender.upper()} ({len(df_gender)} deliveries)")

            # The fitted model is returned as well but is not needed here.
            modelled, _ = train_gam(df_gender)
            war_df = compute_war(modelled)

            out_csv = f"contextual_batting_war_{gender}.csv"
            war_df.to_csv(out_csv, index=False)
            print(f"✅ Saved WAR results to {out_csv}")
            print(war_df.head(10))

# --- Execute ---
# Runs the whole pipeline immediately with the default data directory
# ('./odis'); the CSV files are written to the current working directory.
run_pipeline()

Loading YAMLs: 100%|███████████████████████████████████████████████████████████| 2980/2980 [41:06<00:00,  1.21it/s]


✅ Total deliveries: 1578190

📊 Processing: MALE (1304637 deliveries)
✅ Saved WAR results to contextual_batting_war_male.csv
             batsman  actual_runs  expected_runs  balls  replacement_exp  \
1735         V Kohli        14059   12791.637483  15363     10872.891860   
43    AB de Villiers         9435    7668.069104   9488      6517.858738   
1737        V Sehwag         5692    4044.631255   5584      3437.936567   
1339       RG Sharma        10948   10274.853145  12067      8733.625173   
838    KC Sangakkara        11618   11338.445303  14634      9637.678508   
1583   Shahid Afridi         3913    2414.618262   2987      2052.425522   
1677      TM Dilshan         9212    8731.673334  10777      7421.922334   
49      AC Gilchrist         4355    3038.624541   4260      2582.830860   
710       JC Buttler         5232    4110.653759   4607      3494.055696   
1295       Q de Kock         6661    5864.772630   7024      4985.056736   

      batting_war  
1735   318.610814  