### Test WAR (batting)

In [22]:
import os
import yaml
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm

# --- Step 1: Load Test match YAMLs ---
def load_matches(data_dir, max_files=10000):
    files = glob.glob(os.path.join(data_dir, '*.yaml'))[:max_files]
    matches = []
    for file in tqdm(files, desc="Loading YAMLs"):
        with open(file, 'r') as f:
            try:
                match = yaml.safe_load(f)
                if match.get("info", {}).get("match_type") == "Test":
                    matches.append(match)
            except:
                continue
    return matches

# --- Step 2: Extract contextual batting records ---
def extract_batting_data(matches):
    records = []
    for match in matches:
        try:
            info = match.get("info", {})
            date = info.get("dates", [None])[0]
            year = pd.to_datetime(date).year if date else None
            venue = info.get("venue", "NA")
            teams = info.get("teams", ["NA", "NA"])
            innings_data = match.get("innings", [])

            for innings in innings_data:
                for _, details in innings.items():
                    deliveries = details.get("deliveries", [])
                    team = details.get("team", "NA")
                    opposition = teams[1] if team == teams[0] else teams[0]
                    batter_runs = {}

                    for delivery in deliveries:
                        for _, ball in delivery.items():
                            batsman = ball.get("batsman")
                            runs = ball.get("runs", {}).get("batsman", 0)
                            if batsman:
                                batter_runs[batsman] = batter_runs.get(batsman, 0) + runs

                    for batter, runs in batter_runs.items():
                        records.append({
                            "batsman": batter,
                            "team": team,
                            "opposition": opposition,
                            "venue": venue,
                            "year": year,
                            "runs": runs
                        })
        except:
            continue

    return pd.DataFrame(records)

# --- Step 3: Compute Contextual WAR ---
def compute_contextual_war(df, replacement_quantile=0.2, runs_per_win=100):
    group_cols = ["batsman"]
    agg = df.groupby(group_cols).agg(
        total_runs=("runs", "sum"),
        innings=("runs", "count")
    ).reset_index()

    # Compute replacement level (bottom 20% by runs per innings)
    agg["rpi"] = agg["total_runs"] / agg["innings"]
    rep_rpi = agg["rpi"].quantile(replacement_quantile)
    agg["replacement_runs"] = agg["innings"] * rep_rpi
    agg["test_batting_war"] = (agg["total_runs"] - agg["replacement_runs"]) / runs_per_win
    return agg.sort_values("test_batting_war", ascending=False)

# --- Step 4: Pipeline ---
def run_pipeline(data_dir="./data/tests", output_csv="test_batting_war.csv"):
    matches = load_matches(data_dir)
    df = extract_batting_data(matches)

    if df.empty:
        print("❌ No data extracted.")
        return

    war_df = compute_contextual_war(df)
    war_df.to_csv(output_csv, index=False)
    print("✅ Top batters by WAR:")
    print(war_df.head(10))
    return war_df

# --- Execute ---
run_pipeline()

Loading YAMLs: 100%|█████████████████████████████████████████████████████████████| 869/869 [13:37<00:00,  1.06it/s]


✅ Top batters by WAR:
             batsman  total_runs  innings        rpi  replacement_runs  \
421          JE Root       13087      281  46.572954            1770.3   
61           AN Cook       12472      291  42.859107            1833.3   
971        SPD Smith       10350      208  49.759615            1310.4   
514    KS Williamson        9276      186  49.870968            1171.8   
1106         V Kohli        9230      210  43.952381            1323.0   
348          HM Amla        9146      211  43.345972            1329.3   
482    KC Sangakkara        8489      151  56.218543             951.3   
221        DA Warner        8786      205  42.858537            1291.5   
35    AB de Villiers        8182      175  46.754286            1102.5   
40        AD Mathews        8073      211  38.260664            1329.3   

      test_batting_war  
421            113.167  
61             106.387  
971             90.396  
514             81.042  
1106            79.070  
348          

Unnamed: 0,batsman,total_runs,innings,rpi,replacement_runs,test_batting_war
421,JE Root,13087,281,46.572954,1770.3,113.167
61,AN Cook,12472,291,42.859107,1833.3,106.387
971,SPD Smith,10350,208,49.759615,1310.4,90.396
514,KS Williamson,9276,186,49.870968,1171.8,81.042
1106,V Kohli,9230,210,43.952381,1323.0,79.070
...,...,...,...,...,...,...
712,Mohammed Siraj,134,49,2.734694,308.7,-1.747
666,MS Panesar,217,63,3.444444,396.9,-1.799
439,JM Anderson,1290,251,5.139442,1581.3,-2.913
984,ST Gabriel,229,86,2.662791,541.8,-3.128
