### Contextual WAR

In [2]:
import os
import yaml
import glob
import pandas as pd
from tqdm import tqdm
import numpy as np
from pygam import LinearGAM, s, f

# --- Step 1: Load ODI YAML matches (at most max_files of them) ---
def load_matches(data_dir, max_files=10000):
    """Load and parse up to ``max_files`` YAML files found in ``data_dir``.

    Args:
        data_dir: Directory searched (non-recursively) for ``*.yaml`` files.
        max_files: Upper bound on the number of files that are read.

    Returns:
        A list holding the ``yaml.safe_load`` result of every file that could
        be read and parsed. Files that cannot be read, decoded or parsed are
        skipped, and their count is printed once loading has finished.
    """
    # glob returns paths in arbitrary, OS-dependent order; sorting first makes
    # the subset kept by the [:max_files] slice reproducible between runs.
    files = sorted(glob.glob(os.path.join(data_dir, '*.yaml')))[:max_files]
    matches = []
    skipped = 0
    for path in tqdm(files, desc="Loading YAMLs"):
        try:
            # Explicit encoding so decoding does not depend on the platform
            # default. The handle is not called `f` because this file imports
            # `f` from pygam.
            with open(path, 'r', encoding='utf-8') as stream:
                matches.append(yaml.safe_load(stream))
        except (OSError, UnicodeDecodeError, yaml.YAMLError):
            # Best-effort loading: one bad file must not abort the batch, but
            # interrupts and programming errors are no longer swallowed.
            skipped += 1
    if skipped:
        print(f"Skipped {skipped} unreadable or invalid YAML file(s)")
    return matches

# --- Step 2: Extract contextual ball-by-ball data ---
def _phase_for_over(over):
    """Return the phase label for an over index parsed from a delivery key."""
    if over < 10:
        return 'powerplay'
    if over < 40:
        return 'middle'
    return 'death'


def extract_contextual_balls(matches):
    """Flatten parsed match dicts into one row per delivery.

    Args:
        matches: Iterable of parsed match objects. Each is expected to be a
            dict with ``info.dates`` and an ``innings`` list whose items map
            an innings name to a dict containing ``deliveries``. Each delivery
            is a one-entry dict ``{ball_key: ball_data}``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``batsman``, ``runs_batsman``,
        ``year``, ``over``, ``batting_position`` and ``phase``. It is empty
        (no columns) when nothing could be extracted.

    Malformed matches and deliveries are skipped. When a match fails part-way
    through, the rows already collected from it are kept and the remainder of
    that match is dropped.
    """
    # Errors that malformed input can plausibly produce. Anything else
    # (KeyboardInterrupt, MemoryError, genuine bugs) now propagates instead of
    # being hidden by a bare `except:`.
    skippable = (AttributeError, IndexError, KeyError, OverflowError,
                 TypeError, ValueError)

    data = []
    for match in matches:
        try:
            date = match.get('info', {}).get('dates', ['NA'])[0]
            year = pd.to_datetime(date).year

            for innings in match.get('innings', []):
                for details in innings.values():
                    # Position = order in which a batsman first appears as the
                    # striker in this innings.
                    # NOTE(review): a non-striker who has not yet faced a ball
                    # is not counted, so this may differ from the official
                    # batting order -- confirm this is intended.
                    batter_order = {}

                    for delivery in details.get('deliveries', []):
                        if not isinstance(delivery, dict):
                            continue

                        for ball_key, ball_data in delivery.items():
                            try:
                                # Go through float() so keys that arrive as
                                # strings (e.g. "10.3") are parsed instead of
                                # failing int() and being dropped silently.
                                key = float(ball_key)
                                over = int(key)
                                ball = int(round((key - over) * 10))

                                batsman = ball_data.get('batsman')
                                if not batsman:
                                    continue
                                runs_batsman = (
                                    ball_data.get('runs', {}).get('batsman', 0)
                                )

                                pos = batter_order.setdefault(
                                    batsman, len(batter_order) + 1
                                )

                                data.append({
                                    'batsman': batsman,
                                    'runs_batsman': runs_batsman,
                                    'year': year,
                                    'over': over + ball / 10,
                                    'batting_position': pos,
                                    'phase': _phase_for_over(over),
                                })
                            except skippable:
                                continue
        except skippable:
            continue
    return pd.DataFrame(data)

# --- Step 3: Train GAM model to predict runs per ball ---
def train_gam(df):
    """Fit a LinearGAM of batsman runs per ball and attach its predictions.

    The model uses a spline term on ``over`` and factor terms on
    ``batting_position`` and ``year``. Predictions are made on the same rows
    the model was fitted on (in-sample).

    Args:
        df: Ball-by-ball frame with the columns ``runs_batsman``, ``over``,
            ``batting_position`` and ``year``.

    Returns:
        A ``(df, gam)`` tuple: a copy of the rows whose ``runs_batsman`` is
        not null, with ``batting_position`` and ``year`` cast to category and
        a new ``expected_runs`` column, plus the fitted ``LinearGAM``. The
        caller's frame is left untouched.
    """
    # Take an explicit copy: assigning columns on a boolean-filtered slice
    # raises SettingWithCopyWarning and may write to a temporary object.
    df = df[df['runs_batsman'].notna()].copy()
    df['batting_position'] = df['batting_position'].astype('category')
    df['year'] = df['year'].astype('category')

    # NOTE(review): the factor terms receive the raw position/year values.
    # If pyGAM expects contiguous integer category codes, gaps (e.g. a year
    # with no matches) could merge categories -- confirm, and consider
    # passing `.cat.codes` instead.
    X = df[['over', 'batting_position', 'year']]
    y = df['runs_batsman']

    gam = LinearGAM(s(0) + f(1) + f(2)).fit(X, y)
    df['expected_runs'] = gam.predict(X)
    return df, gam

# --- Step 4: Compute WAR by comparing actual vs replacement expected runs ---
def compute_war(df, replacement_ratio=0.85, runs_per_win=10):
    """Aggregate per-ball rows into a per-batsman WAR table.

    For every batsman the actual and expected runs are summed and the balls
    counted. The replacement baseline is ``expected_runs * replacement_ratio``
    and ``batting_war`` is ``(actual_runs - replacement_exp) / runs_per_win``.

    Args:
        df: Frame with the columns ``batsman``, ``runs_batsman`` and
            ``expected_runs``.
        replacement_ratio: Fraction of the expected runs that forms the
            replacement baseline.
        runs_per_win: Divisor turning surplus runs into the WAR figure. The
            default of 10 is the value that used to be hard-coded.

    Returns:
        A DataFrame with the columns ``batsman``, ``actual_runs``,
        ``expected_runs``, ``balls``, ``replacement_exp`` and ``batting_war``,
        sorted by ``batting_war`` in descending order. The index is the
        pre-sort row number.

    Raises:
        ValueError: If ``runs_per_win`` is zero.
    """
    if runs_per_win == 0:
        raise ValueError("runs_per_win must be non-zero")

    agg = df.groupby('batsman').agg(
        actual_runs=('runs_batsman', 'sum'),
        expected_runs=('expected_runs', 'sum'),
        balls=('runs_batsman', 'count'),
    ).reset_index()

    agg['replacement_exp'] = agg['expected_runs'] * replacement_ratio
    agg['batting_war'] = (
        (agg['actual_runs'] - agg['replacement_exp']) / runs_per_win
    )
    return agg.sort_values(by='batting_war', ascending=False)

# --- Step 5: Run the full pipeline ---
def run_pipeline(data_dir='./odis', output_csv='contextual_odi_batting_war.csv'):
    """Run load -> extract -> model -> WAR and write the ranking to CSV.

    Args:
        data_dir: Directory handed to ``load_matches``.
        output_csv: Path the WAR table is written to (without the index).

    Returns:
        The WAR DataFrame produced by ``compute_war``, or ``None`` when no
        deliveries could be extracted (nothing is written in that case).
    """
    balls = extract_contextual_balls(load_matches(data_dir))

    if not balls.empty:
        print(f"✅ Extracted {len(balls)} deliveries")

        # The fitted model is not needed beyond the predictions it attached.
        scored, _model = train_gam(balls)
        ranking = compute_war(scored)
        ranking.to_csv(output_csv, index=False)

        print("✅ Top players by WAR:")
        print(ranking.head(10))
        return ranking

    print("❌ No data extracted. Please check YAML structure.")
    return None

# --- Execute ---
# Runs the whole pipeline with its default arguments (data_dir='./odis',
# output_csv='contextual_odi_batting_war.csv') as soon as this code executes;
# the returned WAR table is not bound to a name here.
run_pipeline()

Loading YAMLs: 100%|███████████████████████████████████████████████████████████| 2980/2980 [14:27<00:00,  3.44it/s]


✅ Extracted 1578190 deliveries
✅ Top players by WAR:
             batsman  actual_runs  expected_runs  balls  replacement_exp  \
2328         V Kohli        14059   12488.128187  15363     10614.908959   
78    AB de Villiers         9435    7548.484397   9488      6416.211738   
1790       RG Sharma        10948    9941.232313  12067      8450.047466   
2331        V Sehwag         5692    4025.609349   5584      3421.767947   
1100   KC Sangakkara        11618   11279.274183  14634      9587.383055   
2254      TM Dilshan         9212    8600.067378  10777      7310.057271   
2136   Shahid Afridi         3913    2384.330850   2987      2026.681223   
1732       Q de Kock         6661    5623.023188   7024      4779.569710   
942       JC Buttler         5232    3977.792748   4607      3381.123836   
514        DA Warner         6623    5683.147248   6974      4830.675161   

      batting_war  
2328   344.409104  
78     301.878826  
1790   249.795253  
2331   227.023205  
1100   203.061694  

Unnamed: 0,batsman,actual_runs,expected_runs,balls,replacement_exp,batting_war
2328,V Kohli,14059,12488.128187,15363,10614.908959,344.409104
78,AB de Villiers,9435,7548.484397,9488,6416.211738,301.878826
1790,RG Sharma,10948,9941.232313,12067,8450.047466,249.795253
2331,V Sehwag,5692,4025.609349,5584,3421.767947,227.023205
1100,KC Sangakkara,11618,11279.274183,14634,9587.383055,203.061694
...,...,...,...,...,...,...
2166,Sidra Ameen,1820,2403.971719,2919,2043.375961,-22.337596
1708,PRCS Kumarihami,283,598.634543,764,508.839361,-22.583936
2118,Sana Mir,714,1162.901311,1380,988.466114,-27.446611
1622,Nigar Sultana,1168,1724.273618,2128,1465.632575,-29.763258
