In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json

In [3]:
fields_to_skip = ["meta", "info"]

In [4]:
def populate_wicket_data(idx, delivery):
    wicket_record = {}
    if "wickets" in delivery and delivery["wickets"]:
        for wicket in delivery["wickets"]:
            wicket_record = {
                "delivery_index": idx,
                "wicket.kind": wicket["kind"],
                "wicket.player_out": wicket["player_out"],
                "wicket.fielders": (
                    [fielder["name"] for fielder in wicket["fielders"]]
                    if wicket.get("fielders") != None
                    else None
                ),
            }
    else:
        wicket_record = {
            "delivery_index": idx,
            "wicket.kind": None,
            "wicket.player_out": None,
            "wicket.fielders": None,
        }
    return wicket_record

In [5]:
file_names = os.listdir(path="data")
file_names = filter(lambda x: str(x).endswith(".json"), file_names)

merged_deliveries = pd.DataFrame()
wickets_data = []
index = 0
match_number = 1

# file_names = ["829813.json", "829817.json", "829823.json"]

for file_idx, file_name in enumerate(file_names):
    file_path = "data/" + file_name
    with open(file=file_path, mode="r") as file:
        print("processing file: ", file_idx, file_name, sep=", ")
        data = json.load(file)
        info = data["info"]
        date = info["dates"][0]
        match_number = (
            info["event"].get("match_number")
            if info["event"].get("match_number") != None
            else info["event"].get("stage")
        )
        innings = data["innings"]
        for idx, inning in enumerate(innings):
            if inning.get("super_over") is not None:
                print("skipping super_over: ", date, match_number)
                continue
            overs = inning["overs"]
            df_deliveries = pd.json_normalize(
                overs, record_path=["deliveries"], meta=["over"]
            )
            df_deliveries["date"] = date
            df_deliveries["match_number"] = match_number
            df_deliveries["innings"] = idx + 1
            merged_deliveries = pd.concat([merged_deliveries, df_deliveries], axis=0)
            for over in overs:
                for delivery in over["deliveries"]:
                    wicket_data = populate_wicket_data(index, delivery)
                    wickets_data.append(wicket_data)
                    index += 1

df_wickets = pd.DataFrame(wickets_data)

merged_deliveries.reset_index(inplace=True)

merged_deliveries["delivery_index"] = merged_deliveries.index

df_merged = pd.merge(merged_deliveries, df_wickets, on="delivery_index", how="left")

df_merged.drop("delivery_index", inplace=True, axis=1)
# df_merged.drop("wickets", inplace=True, axis=1)

processing file: , 0, 548368.json
processing file: , 1, 1254064.json
processing file: , 2, 1254107.json
processing file: , 3, 548343.json
processing file: , 4, 829817.json
processing file: , 5, 980945.json
processing file: , 6, 729291.json
processing file: , 7, 1422121.json
processing file: , 8, 598007.json
processing file: , 9, 1422128.json
processing file: , 10, 1178421.json
processing file: , 11, 1304103.json
processing file: , 12, 1304109.json
processing file: , 13, 598068.json
processing file: , 14, 829767.json
processing file: , 15, 1175358.json
processing file: , 16, 548380.json
processing file: , 17, 729283.json
processing file: , 18, 981009.json
processing file: , 19, 548328.json
processing file: , 20, 1136591.json
processing file: , 21, 829793.json
processing file: , 22, 1426270.json
processing file: , 23, 598065.json
processing file: , 24, 501243.json
processing file: , 25, 1254063.json
processing file: , 26, 1254089.json
processing file: , 27, 392214.json
processing file: ,

In [6]:
ordered_cols = [
    "date",
    "match_number",
    "innings",
    "over",
    "batter",
    "bowler",
    "non_striker",
    "runs.batter",
    "runs.extras",
    "runs.total",
    "extras.legbyes",
    "extras.wides",
    "extras.byes",
    "extras.noballs",
    "wicket.kind",
    "wicket.player_out",
    "wicket.fielders",
    "wickets",
]

In [7]:
df_merged = df_merged.reindex(columns=ordered_cols)
df_merged.sort_values(by=["date", "match_number"], inplace=True)

In [8]:
df_wickets.to_csv("output/wickets.csv", index=False)

In [9]:
df_merged.to_csv("output/ipl_ball_by_ball_output.csv", index=False)

In [10]:
df_merged.head()

Unnamed: 0,date,match_number,innings,over,batter,bowler,non_striker,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,extras.byes,extras.noballs,wicket.kind,wicket.player_out,wicket.fielders,wickets
126631,2008-04-18,1,1,0,SC Ganguly,P Kumar,BB McCullum,0,1,1,1.0,,,,,,,
126632,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,
126633,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,1,1,,1.0,,,,,,
126634,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,
126635,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,


## Validate the resultant dataframe


In [11]:
print("min date: ", df_merged["date"].min())
print("max date: ", df_merged["date"].max())

min date:  2008-04-18
max date:  2024-05-26


### Since we have data for IPL matches between 2008 to 2024, the above date range looks right


In [12]:
df_merged["match_number"].unique()

array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
       37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 50, 51, 52, 53, 54,
       55, 56, 'Semi Final', 'Final', 47, '3rd Place Play-Off', 57, 58,
       59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 'Qualifier 1',
       'Elimination Final', 'Qualifier 2', 71, 72, 'Eliminator'],
      dtype=object)

### The above values for match_number looks right


In [13]:
df_merged["innings"].unique()

array([1, 2])

In [14]:
df_merged["over"].unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19], dtype=object)

In [15]:
sorted(df_merged["runs.batter"].unique())

[0, 1, 2, 3, 4, 5, 6]

In [16]:
sorted(df_merged["runs.extras"].unique())

[0, 1, 2, 3, 4, 5, 7]

In [17]:
sorted(df_merged["runs.total"].unique())

[0, 1, 2, 3, 4, 5, 6, 7]

In [18]:
sorted(df_merged["extras.legbyes"].unique())

[1.0, nan, 2.0, 3.0, 4.0, 5.0]

In [19]:
sorted(df_merged["extras.wides"].unique())

[nan, 1.0, 2.0, 3.0, 4.0, 5.0]

In [20]:
sorted(df_merged["extras.byes"].unique())

[nan, 1.0, 2.0, 3.0, 4.0]

In [21]:
sorted(df_merged["extras.noballs"].unique())

[nan, 1.0, 2.0, 3.0, 5.0]

In [22]:
df_merged["wicket.kind"].unique()

array([None, 'caught', 'bowled', 'run out', 'lbw', 'retired hurt',
       'stumped', 'caught and bowled', 'hit wicket',
       'obstructing the field', 'retired out'], dtype=object)

In [23]:
df_merged["wicket.player_out"].unique()[:10]

array([None, 'SC Ganguly', 'RT Ponting', 'DJ Hussey', 'R Dravid',
       'V Kohli', 'JH Kallis', 'W Jaffer', 'MV Boucher', 'B Akhil'],
      dtype=object)

In [24]:
df_merged[~pd.isnull(df_merged["wicket.fielders"])]["wicket.fielders"].count()

9342

In [25]:
df_merged[~ pd.isnull(df_merged["wickets"])]['wickets'].count()

12923

In [26]:
df_merged

Unnamed: 0,date,match_number,innings,over,batter,bowler,non_striker,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,extras.byes,extras.noballs,wicket.kind,wicket.player_out,wicket.fielders,wickets
126631,2008-04-18,1,1,0,SC Ganguly,P Kumar,BB McCullum,0,1,1,1.0,,,,,,,
126632,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,
126633,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,1,1,,1.0,,,,,,
126634,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,
126635,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254573,2024-05-26,Final,2,9,SS Iyer,AK Markram,VR Iyer,1,0,1,,,,,,,,
254574,2024-05-26,Final,2,9,VR Iyer,AK Markram,SS Iyer,1,0,1,,,,,,,,
254575,2024-05-26,Final,2,10,VR Iyer,Shahbaz Ahmed,SS Iyer,1,0,1,,,,,,,,
254576,2024-05-26,Final,2,10,SS Iyer,Shahbaz Ahmed,VR Iyer,1,0,1,,,,,,,,


In [50]:
df_merged['season'] = df_merged['date'].str.slice(0,4)

In [57]:
df_merged

Unnamed: 0,date,match_number,innings,over,batter,bowler,non_striker,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,extras.byes,extras.noballs,wicket.kind,wicket.player_out,wicket.fielders,wickets,season
126631,2008-04-18,1,1,0,SC Ganguly,P Kumar,BB McCullum,0,1,1,1.0,,,,,,,,2008
126632,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,,2008
126633,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,1,1,,1.0,,,,,,,2008
126634,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,,2008
126635,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254573,2024-05-26,Final,2,9,SS Iyer,AK Markram,VR Iyer,1,0,1,,,,,,,,,2024
254574,2024-05-26,Final,2,9,VR Iyer,AK Markram,SS Iyer,1,0,1,,,,,,,,,2024
254575,2024-05-26,Final,2,10,VR Iyer,Shahbaz Ahmed,SS Iyer,1,0,1,,,,,,,,,2024
254576,2024-05-26,Final,2,10,SS Iyer,Shahbaz Ahmed,VR Iyer,1,0,1,,,,,,,,,2024


In [58]:
df_merged.groupby(['date','match_number','innings','batter']).agg({'runs.batter':np.sum,'runs.extras':np.sum,'bowler':'count'})

  df_merged.groupby(['date','match_number','innings','batter']).agg({'runs.batter':np.sum,'runs.extras':np.sum,'bowler':'count'})
  df_merged.groupby(['date','match_number','innings','batter']).agg({'runs.batter':np.sum,'runs.extras':np.sum,'bowler':'count'})


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,runs.batter,runs.extras,bowler
date,match_number,innings,batter,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-04-18,1,1,BB McCullum,158,11,77
2008-04-18,1,1,DJ Hussey,12,0,12
2008-04-18,1,1,Mohammad Hafeez,5,0,3
2008-04-18,1,1,RT Ponting,20,4,20
2008-04-18,1,1,SC Ganguly,10,2,12
...,...,...,...,...,...,...
2024-05-26,Final,1,TM Head,0,0,1
2024-05-26,Final,2,Rahmanullah Gurbaz,39,7,35
2024-05-26,Final,2,SP Narine,6,0,2
2024-05-26,Final,2,SS Iyer,6,0,3


In [59]:
df_merged.groupby(['date','match_number','innings','bowler']).agg({'runs.total':np.sum,'wickets':'count','batter':'count'})

  df_merged.groupby(['date','match_number','innings','bowler']).agg({'runs.total':np.sum,'wickets':'count','batter':'count'})


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,runs.total,wickets,batter
date,match_number,innings,bowler,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-04-18,1,1,AA Noffke,41,1,25
2008-04-18,1,1,CL White,24,0,7
2008-04-18,1,1,JH Kallis,52,1,25
2008-04-18,1,1,P Kumar,41,0,25
2008-04-18,1,1,SB Joshi,26,0,18
...,...,...,...,...,...,...
2024-05-26,Final,2,B Kumar,25,0,13
2024-05-26,Final,2,JD Unadkat,9,0,6
2024-05-26,Final,2,PJ Cummins,18,1,13
2024-05-26,Final,2,Shahbaz Ahmed,28,1,15


In [65]:
df_merged.groupby(['batter','season']).agg({'runs.total':[np.sum,np.average],'bowler':np.max})

  df_merged.groupby(['batter','season']).agg({'runs.total':[np.sum,np.average],'bowler':np.max})
  df_merged.groupby(['batter','season']).agg({'runs.total':[np.sum,np.average],'bowler':np.max})


Unnamed: 0_level_0,Unnamed: 1_level_0,runs.total,runs.total,bowler
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,average,max
batter,season,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A Ashish Reddy,2012,40,1.333333,SW Tait
A Ashish Reddy,2013,126,1.400000,UT Yadav
A Ashish Reddy,2015,74,1.608696,TG Southee
A Ashish Reddy,2016,48,1.600000,YS Chahal
A Badoni,2022,173,1.244604,YS Chahal
...,...,...,...,...
Z Khan,2011,21,0.875000,SB Jakati
Z Khan,2012,12,0.705882,Shakib Al Hasan
Z Khan,2014,10,1.428571,IK Pathan
Z Khan,2016,6,0.461538,PP Chawla


In [63]:
df_merged.groupby(['bowler','season']).agg({'wickets':['count'],'batter':['count',np.max]})

  df_merged.groupby(['bowler','season']).agg({'wickets':['count'],'batter':['count',np.max]})


Unnamed: 0_level_0,Unnamed: 1_level_0,wickets,batter,batter
Unnamed: 0_level_1,Unnamed: 1_level_1,count,count,max
bowler,season,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A Ashish Reddy,2012,11,169,V Kohli
A Ashish Reddy,2013,3,41,V Kohli
A Ashish Reddy,2015,4,37,Yuvraj Singh
A Ashish Reddy,2016,1,23,V Kohli
A Badoni,2022,2,12,YBK Jaiswal
...,...,...,...,...
Z Khan,2013,5,38,SK Raina
Z Khan,2014,6,138,Yuvraj Singh
Z Khan,2015,8,155,WP Saha
Z Khan,2016,12,271,YK Pathan
