In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json

In [2]:
fields_to_skip = ["meta", "info"]

In [3]:
def populate_wicket_data(idx, delivery):
    wicket_record = {}
    if "wickets" in delivery and delivery["wickets"]:
        for wicket in delivery["wickets"]:
            wicket_record = {
                "delivery_index": idx,
                "wicket.kind": wicket["kind"],
                "wicket.player_out": wicket["player_out"],
                "wicket.fielders": (
                    [fielder["name"] for fielder in wicket["fielders"]]
                    if wicket.get("fielders") != None
                    else None
                ),
            }
    else:
        wicket_record = {
            "delivery_index": idx,
            "wicket.kind": None,
            "wicket.player_out": None,
            "wicket.fielders": None,
        }
    return wicket_record

In [4]:
file_names = os.listdir(path="data")
file_names = filter(lambda x: str(x).endswith(".json"), file_names)

merged_deliveries = pd.DataFrame()
wickets_data = []
index = 0
match_number = 1

# file_names = ["829813.json", "829817.json", "829823.json"]

for file_idx, file_name in enumerate(file_names):
    file_path = "data/" + file_name
    with open(file=file_path, mode="r") as file:
        print("processing file: ", file_idx, file_name, sep=", ")
        data = json.load(file)
        info = data["info"]
        date = info["dates"][0]
        match_number = (
            info["event"].get("match_number")
            if info["event"].get("match_number") != None
            else info["event"].get("stage")
        )
        innings = data["innings"]
        for idx, inning in enumerate(innings):
            if inning.get("super_over") is not None:
                print("skipping super_over: ", date, match_number)
                continue
            overs = inning["overs"]
            df_deliveries = pd.json_normalize(
                overs, record_path=["deliveries"], meta=["over"]
            )
            df_deliveries["date"] = date
            df_deliveries["match_number"] = match_number
            df_deliveries["innings"] = idx + 1
            merged_deliveries = pd.concat([merged_deliveries, df_deliveries], axis=0)
            for over in overs:
                for delivery in over["deliveries"]:
                    wicket_data = populate_wicket_data(index, delivery)
                    wickets_data.append(wicket_data)
                    index += 1

df_wickets = pd.DataFrame(wickets_data)

merged_deliveries.reset_index(inplace=True)

merged_deliveries["delivery_index"] = merged_deliveries.index

df_merged = pd.merge(merged_deliveries, df_wickets, on="delivery_index", how="left")

df_merged.drop("delivery_index", inplace=True, axis=1)
# df_merged.drop("wickets", inplace=True, axis=1)

processing file: , 0, 1254103.json
processing file: , 1, 1136605.json
processing file: , 2, 1216501.json
processing file: , 3, 1082595.json
processing file: , 4, 829731.json
processing file: , 5, 1359507.json
processing file: , 6, 548315.json
processing file: , 7, 1304079.json
processing file: , 8, 1422119.json
processing file: , 9, 1178418.json
processing file: , 10, 1359487.json
processing file: , 11, 1359515.json
processing file: , 12, 336025.json
processing file: , 13, 729309.json
processing file: , 14, 1178407.json
processing file: , 15, 501239.json
processing file: , 16, 1178423.json
processing file: , 17, 1426305.json
processing file: , 18, 419107.json
processing file: , 19, 336036.json
processing file: , 20, 1426311.json
processing file: , 21, 1082637.json
processing file: , 22, 392232.json
processing file: , 23, 1216522.json
processing file: , 24, 1426307.json
processing file: , 25, 829713.json
processing file: , 26, 1136576.json
processing file: , 27, 1359498.json
processing 

In [5]:
ordered_cols = [
    "date",
    "match_number",
    "innings",
    "over",
    "batter",
    "bowler",
    "non_striker",
    "runs.batter",
    "runs.extras",
    "runs.total",
    "extras.legbyes",
    "extras.wides",
    "extras.byes",
    "extras.noballs",
    "wicket.kind",
    "wicket.player_out",
    "wicket.fielders",
    "wickets",
]

In [6]:
df_merged = df_merged.reindex(columns=ordered_cols)
df_merged.sort_values(by=["date", "match_number"], inplace=True)

In [7]:
df_wickets.to_csv("output/wickets.csv", index=False)

In [8]:
df_merged.to_csv("output/ipl_ball_by_ball_output.csv", index=False)

In [9]:
df_merged.head()

Unnamed: 0,date,match_number,innings,over,batter,bowler,non_striker,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,extras.byes,extras.noballs,wicket.kind,wicket.player_out,wicket.fielders,wickets
79344,2008-04-18,1,1,0,SC Ganguly,P Kumar,BB McCullum,0,1,1,1.0,,,,,,,
79345,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,
79346,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,1,1,,1.0,,,,,,
79347,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,
79348,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,


## Validate the resultant dataframe


In [10]:
print("min date: ", df_merged["date"].min())
print("max date: ", df_merged["date"].max())

min date:  2008-04-18
max date:  2024-05-26


### Since we have data for IPL matches between 2008 to 2024, the above date range looks right


In [11]:
df_merged["match_number"].unique()

array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
       37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 50, 51, 52, 53, 54,
       55, 56, 'Semi Final', 'Final', 47, '3rd Place Play-Off', 57, 58,
       59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 'Qualifier 1',
       'Elimination Final', 'Qualifier 2', 71, 72, 'Eliminator'],
      dtype=object)

### The above values for match_number looks right


In [12]:
df_merged["innings"].unique()

array([1, 2])

In [13]:
df_merged["over"].unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19], dtype=object)

In [14]:
sorted(df_merged["runs.batter"].unique())

[np.int64(0),
 np.int64(1),
 np.int64(2),
 np.int64(3),
 np.int64(4),
 np.int64(5),
 np.int64(6)]

In [15]:
sorted(df_merged["runs.extras"].unique())

[np.int64(0),
 np.int64(1),
 np.int64(2),
 np.int64(3),
 np.int64(4),
 np.int64(5),
 np.int64(7)]

In [16]:
sorted(df_merged["runs.total"].unique())

[np.int64(0),
 np.int64(1),
 np.int64(2),
 np.int64(3),
 np.int64(4),
 np.int64(5),
 np.int64(6),
 np.int64(7)]

In [17]:
sorted(df_merged["extras.legbyes"].unique())

[np.float64(1.0),
 np.float64(nan),
 np.float64(2.0),
 np.float64(3.0),
 np.float64(4.0),
 np.float64(5.0)]

In [18]:
sorted(df_merged["extras.wides"].unique())

[np.float64(nan),
 np.float64(1.0),
 np.float64(2.0),
 np.float64(3.0),
 np.float64(4.0),
 np.float64(5.0)]

In [19]:
sorted(df_merged["extras.byes"].unique())

[np.float64(nan),
 np.float64(1.0),
 np.float64(2.0),
 np.float64(3.0),
 np.float64(4.0)]

In [20]:
sorted(df_merged["extras.noballs"].unique())

[np.float64(nan),
 np.float64(1.0),
 np.float64(2.0),
 np.float64(3.0),
 np.float64(5.0)]

In [21]:
df_merged["wicket.kind"].unique()

array([None, 'caught', 'bowled', 'run out', 'lbw', 'retired hurt',
       'stumped', 'caught and bowled', 'hit wicket',
       'obstructing the field', 'retired out'], dtype=object)

In [22]:
df_merged["wicket.player_out"].unique()[:10]

array([None, 'SC Ganguly', 'RT Ponting', 'DJ Hussey', 'R Dravid',
       'V Kohli', 'JH Kallis', 'W Jaffer', 'MV Boucher', 'B Akhil'],
      dtype=object)

In [23]:
df_merged[~pd.isnull(df_merged["wicket.fielders"])]["wicket.fielders"].count()

np.int64(9342)

In [24]:
df_merged[~ pd.isnull(df_merged["wickets"])]['wickets'].count()

np.int64(12923)