In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json

In [None]:
# Names of JSON sections to skip.
# NOTE(review): no other cell visible in this notebook references this list — confirm it is still needed.
fields_to_skip = ["meta", "info"]

In [None]:
def populate_wicket_data(idx, delivery):
    """Build the flat wicket record for a single delivery.

    Args:
        idx: Value stored under ``"delivery_index"`` in the returned record.
        delivery: Mapping describing one delivery. An optional ``"wickets"``
            entry holds a list of dismissal mappings, each with ``"kind"``,
            ``"player_out"`` and optionally ``"fielders"`` (a list of
            mappings that each have a ``"name"``).

    Returns:
        A dict with the keys ``"delivery_index"``, ``"wicket.kind"``,
        ``"wicket.player_out"`` and ``"wicket.fielders"``. The three wicket
        fields are ``None`` when the delivery has no (or an empty)
        ``"wickets"`` entry; ``"wicket.fielders"`` is also ``None`` when the
        dismissal lists no fielders.

    Note:
        Exactly one record is returned per delivery. If a delivery lists
        several dismissals, only the last one is reported.
    """
    wickets = delivery.get("wickets")
    if not wickets:
        return {
            "delivery_index": idx,
            "wicket.kind": None,
            "wicket.player_out": None,
            "wicket.fielders": None,
        }

    # One record per delivery: when several dismissals are listed, the last wins.
    wicket = wickets[-1]
    fielders = wicket.get("fielders")
    return {
        "delivery_index": idx,
        "wicket.kind": wicket["kind"],
        "wicket.player_out": wicket["player_out"],
        "wicket.fielders": (
            [fielder["name"] for fielder in fielders]
            if fielders is not None
            else None
        ),
    }

In [None]:
# Load every match file under data/ into one ball-by-ball frame (df_merged)
# plus a parallel per-delivery wicket table (df_wickets).
# sorted() pins the processing order; os.listdir makes no ordering guarantee.
file_names = sorted(
    name for name in os.listdir(path="data") if str(name).endswith(".json")
)
# For a quick trial run, restrict the input to a few matches, e.g.:
# file_names = ["829813.json", "829817.json", "829823.json"]

delivery_frames = []  # one frame per processed innings; concatenated once below
wickets_data = []  # one record per delivery, in the same order as the frames' rows
index = 0  # running delivery counter across all files; becomes "delivery_index"
match_number = 1

for file_idx, file_name in enumerate(file_names):
    file_path = "data/" + file_name
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file=file_path, mode="r", encoding="utf-8") as file:
        print("processing file: ", file_idx, file_name, sep=", ")
        data = json.load(file)
    # The file is fully parsed at this point, so it does not need to stay open.
    info = data["info"]
    date = info["dates"][0]
    # Fall back to the event's stage when it has no match_number.
    event = info["event"]
    match_number = event.get("match_number")
    if match_number is None:
        match_number = event.get("stage")
    innings = data["innings"]
    for idx, inning in enumerate(innings):
        if inning.get("super_over") is not None:
            print("skipping super_over: ", date, match_number)
            continue
        overs = inning["overs"]
        df_deliveries = pd.json_normalize(
            overs, record_path=["deliveries"], meta=["over"]
        )
        df_deliveries["date"] = date
        df_deliveries["match_number"] = match_number
        df_deliveries["innings"] = idx + 1
        # Collect the frame instead of concatenating inside the loop, which
        # would re-copy every previously loaded row on each iteration.
        delivery_frames.append(df_deliveries)
        for over in overs:
            for delivery in over["deliveries"]:
                wickets_data.append(populate_wicket_data(index, delivery))
                index += 1

if not delivery_frames:
    raise ValueError("no deliveries were loaded from the .json files under data/")

df_wickets = pd.DataFrame(wickets_data)

# reset_index() keeps each innings' original row position in an "index" column
# and gives the combined frame a fresh 0..n-1 index.
merged_deliveries = pd.concat(delivery_frames, axis=0).reset_index()

# The join below is positional: row i of merged_deliveries must describe the
# same delivery as the wicket record whose delivery_index is i.
assert len(merged_deliveries) == len(df_wickets), (
    "deliveries and wicket records are out of step"
)

merged_deliveries["delivery_index"] = merged_deliveries.index

df_merged = pd.merge(merged_deliveries, df_wickets, on="delivery_index", how="left")

df_merged = df_merged.drop("delivery_index", axis=1)
# The raw nested "wickets" column is deliberately kept at this stage.

In [None]:
# Column selection and order for the ball-by-ball frame. It is applied with
# DataFrame.reindex, so any frame column not listed here is dropped and any
# listed column missing from the frame is added as all-NaN.
ordered_cols = [
    "date",
    "match_number",
    "innings",
    "over",
    "batter",
    "bowler",
    "non_striker",
    "runs.batter",
    "runs.extras",
    "runs.total",
    "extras.legbyes",
    "extras.wides",
    "extras.byes",
    "extras.noballs",
    "wicket.kind",
    "wicket.player_out",
    "wicket.fielders",
    "wickets",
]

In [None]:
# NOTE(review): this replaces the df_merged built by the loading cell with the
# CSV that a later cell writes. On a fresh checkout it fails until that file
# exists, and otherwise it picks up the previous run's output — confirm that
# this reload is intended.
df_merged = pd.read_csv("./output/ipl_ball_by_ball_output.csv")

In [None]:
# Keep only the columns named in ordered_cols (in that order), then order the
# rows by date and match.
df_merged = df_merged.reindex(columns=ordered_cols).sort_values(
    by=["date", "match_number"]
)

In [None]:
# Season label = first four characters of the date string
# (the year, assuming dates start with YYYY — TODO confirm).
df_merged["season"] = df_merged["date"].str[:4]

In [None]:
# DataFrame.to_csv does not create missing directories, so make sure output/ exists.
os.makedirs("output", exist_ok=True)
df_wickets.to_csv("output/wickets.csv", index=False)

In [None]:
# DataFrame.to_csv does not create missing directories, so make sure output/ exists.
os.makedirs("output", exist_ok=True)
df_merged.to_csv("output/ipl_ball_by_ball_output.csv", index=False)

In [None]:
# Preview the first five rows of the frame.
df_merged.head()

## Validate the resultant dataframe


In [None]:
# "date" holds strings (the season column is derived from it with .str), so
# min/max compare lexicographically; that equals chronological order only for
# ISO-style YYYY-MM-DD values — TODO confirm the format.
print("min date: ", df_merged["date"].min())
print("max date: ", df_merged["date"].max())

### Since we have data for IPL matches from 2008 to 2024, the above date range looks right.


In [None]:
# The loading cell stores the event's match_number when present and its stage
# otherwise, so both kinds of value can appear here.
df_merged["match_number"].unique()

### The above values for match_number look right.


In [None]:
# The loading cell numbers innings from 1 by position within each match file
# and skips super-over innings.
df_merged["innings"].unique()

In [None]:
# Distinct values of the "over" field taken from each over record in the source JSON.
df_merged["over"].unique()

In [None]:
# Range check: every distinct runs.batter value, in ascending order.
sorted(df_merged["runs.batter"].unique())

In [None]:
# Range check: every distinct runs.extras value, in ascending order.
sorted(df_merged["runs.extras"].unique())

In [None]:
# Range check: every distinct runs.total value, in ascending order.
sorted(df_merged["runs.total"].unique())

In [None]:
# np.sort puts NaN last; the built-in sorted() gives an unreliable order when
# NaN is present, and this column is presumably NaN for deliveries without leg byes.
np.sort(df_merged["extras.legbyes"].unique()).tolist()

In [None]:
# np.sort puts NaN last; the built-in sorted() gives an unreliable order when
# NaN is present, and this column is presumably NaN for deliveries without wides.
np.sort(df_merged["extras.wides"].unique()).tolist()

In [None]:
# np.sort puts NaN last; the built-in sorted() gives an unreliable order when
# NaN is present, and this column is presumably NaN for deliveries without byes.
np.sort(df_merged["extras.byes"].unique()).tolist()

In [None]:
# np.sort puts NaN last; the built-in sorted() gives an unreliable order when
# NaN is present, and this column is presumably NaN for deliveries without no-balls.
np.sort(df_merged["extras.noballs"].unique()).tolist()

In [None]:
# Distinct dismissal kinds; a missing value marks deliveries without a wicket.
df_merged["wicket.kind"].unique()

In [None]:
# Spot check: only the first ten distinct dismissed-player values are shown.
df_merged["wicket.player_out"].unique()[:10]

In [None]:
# Number of deliveries with a non-null wicket.fielders value
# (Series.count already ignores nulls).
df_merged["wicket.fielders"].count()

In [None]:
# Number of deliveries with a non-null raw "wickets" value
# (Series.count already ignores nulls).
df_merged["wickets"].count()

In [None]:
# Display the frame (pandas truncates the rendering of long frames).
df_merged

In [None]:
# NOTE(review): same display as the preceding cell — likely a leftover that can be removed.
df_merged

In [None]:
# Drop the raw nested "wickets" column. errors="ignore" keeps the cell
# re-runnable: without it a second run raises KeyError because the column is
# already gone.
df_merged.drop(columns='wickets', inplace=True, errors='ignore')

In [None]:
# Per batter and innings: runs off the bat, extras on those deliveries, and the
# number of delivery rows. String aggregation names are used because passing
# NumPy callables such as np.sum to agg is deprecated in recent pandas.
df_merged.groupby(['date','match_number','innings','batter']).agg({'runs.batter':'sum','runs.extras':'sum','bowler':'count'})

In [None]:
# Per bowler and innings, split by the runs.batter value of each delivery:
# total runs, count of non-null wicket.kind values, and number of delivery rows.
df_merged.groupby(['date','match_number','innings','bowler','runs.batter']).agg({'runs.total':'sum','wicket.kind':'count','batter':'count'})