In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json

In [36]:
fields_to_skip = ["meta", "info"]

In [37]:
# def populate_wicket_data(idx, delivery):
#     wicket_record = {}
#     if "wickets" in delivery and delivery["wickets"]:
#         for wicket in delivery["wickets"]:
#             wicket_record = {
#                 "delivery_index": idx,
#                 "wicket.kind": wicket["kind"],
#                 "wicket.player_out": wicket["player_out"],
#                 "wicket.fielders": (
#                     [fielder["name"] for fielder in wicket["fielders"]]
#                     if wicket.get("fielders") != None
#                     else None
#                 ),
#             }
#     else:
#         wicket_record = {
#             "delivery_index": idx,
#             "wicket.kind": None,
#             "wicket.player_out": None,
#             "wicket.fielders": None,
#         }
#     return wicket_record

In [38]:
def populate_wicket_data(idx, delivery):
    wicket_record = {}
    if "wickets" in delivery and delivery["wickets"]:
        for wicket in delivery["wickets"]:
            wicket_record = {
                "delivery_index": idx,
                "wicket.kind": wicket["kind"],
                "wicket.player_out": wicket["player_out"],
                "wicket.fielders": (
                    [fielder["name"] for fielder in wicket["fielders"]]
                    if wicket.get("fielders") != None
                    else None
                ),
            }
    else:
        wicket_record = {
            "delivery_index": idx,
            "wicket.kind": None,
            "wicket.player_out": None,
            "wicket.fielders": None,
        }
    return wicket_record

In [39]:
file_names = os.listdir(path="data")
print(file_names[:10])
file_names = list(filter(lambda x: str(x).endswith(".json"), file_names))
file_names.sort()
print(file_names[:10])

merged_deliveries = pd.DataFrame()
wickets_data = []
index = 0
match_number = 1
prev_match_number = 0
date_file_name_dict = {}
prev_match_number = 0

for file_idx, file_name in enumerate(file_names):
    file_path = "data/" + file_name
    with open(file=file_path, mode="r") as file:
        data = json.load(file)
        info = data["info"]
        date = info["dates"][0]
        if info["event"].get("match_number") != None:
            match_number = info["event"].get("match_number")
        else:
            match_number = prev_match_number + 1
        
        if date_file_name_dict.get(date) != None:
            if prev_match_number < match_number:
                date_file_name_dict[date +"_1"] = file_name
            else:
                date_file_name_dict[date +"_1"] = date_file_name_dict[date] 
                date_file_name_dict[date] = file_name
        else:
            date_file_name_dict[date] = file_name
        prev_match_number = match_number

['1254103.json', '1136605.json', '1216501.json', '1082595.json', '829731.json', '1359507.json', '548315.json', '1304079.json', '1422119.json', '1178418.json']
['1082591.json', '1082592.json', '1082593.json', '1082594.json', '1082595.json', '1082596.json', '1082597.json', '1082598.json', '1082599.json', '1082600.json']


In [40]:
match_dates = list(date_file_name_dict.keys())
match_dates.sort()

In [41]:
# file_names = ["829813.json", "829817.json", "336038.json"]
index = 0
match_number = 1
prev_match_number = 0
prev_year = ""
stage = ""

for file_idx, match_date in enumerate(match_dates):
    file_name = date_file_name_dict[match_date]
    file_path = "data/" + file_name
    with open(file=file_path, mode="r") as file:
        data = json.load(file)
        info = data["info"]
        date = info["dates"][0]
        curr_year = date.split("-")[0]
        if curr_year != prev_year:
            print(curr_year, prev_year)
            prev_match_number = 0
        print("processing file: ", file_idx, file_name, match_date, date, prev_match_number, sep=", ")
        
        if info["event"].get("match_number") != None:
            match_number = info["event"].get("match_number")
        else:
            match_number = prev_match_number + 1
            print("match number not presentin file_name:", file_name, "populating with: ", match_number)
        
        if info["event"].get("stage") == None:
            stage = "group"
        else:
            stage = info["event"].get("stage")

        prev_match_number = match_number
        prev_year = date.split("-")[0]
        
        innings = data["innings"]
        for idx, inning in enumerate(innings):
            if inning.get("super_over") is not None:
                print("skipping super_over: ", date, match_number)
                continue
            overs = inning["overs"]
            df_deliveries = pd.json_normalize(
                overs, record_path=["deliveries"], meta=["over"]
            )
            df_deliveries["date"] = date
            df_deliveries["match_number"] = match_number
            df_deliveries["innings"] = idx + 1
            df_deliveries['stage'] = stage
            merged_deliveries = pd.concat([merged_deliveries, df_deliveries], axis=0)
            for over in overs:
                for delivery in over["deliveries"]:
                    wicket_data = populate_wicket_data(index, delivery)
                    wickets_data.append(wicket_data)
                    index += 1
                    
df_wickets = pd.DataFrame(wickets_data)

merged_deliveries.reset_index(inplace=True)

merged_deliveries["delivery_index"] = merged_deliveries.index

df_merged = pd.merge(merged_deliveries, df_wickets, on="delivery_index", how="left")

df_merged.drop("delivery_index", inplace=True, axis=1)
# df_merged.drop("wickets", inplace=True, axis=1)

2008 
processing file: , 0, 335982.json, 2008-04-18, 2008-04-18, 0
processing file: , 1, 335983.json, 2008-04-19, 2008-04-19, 1
processing file: , 2, 335984.json, 2008-04-19_1, 2008-04-19, 2
processing file: , 3, 335986.json, 2008-04-20, 2008-04-20, 3
processing file: , 4, 335985.json, 2008-04-20_1, 2008-04-20, 4
processing file: , 5, 335987.json, 2008-04-21, 2008-04-21, 5
processing file: , 6, 335988.json, 2008-04-22, 2008-04-22, 6
processing file: , 7, 335989.json, 2008-04-23, 2008-04-23, 7
processing file: , 8, 335990.json, 2008-04-24, 2008-04-24, 8
processing file: , 9, 335991.json, 2008-04-25, 2008-04-25, 9
processing file: , 10, 335993.json, 2008-04-26, 2008-04-26, 10
processing file: , 11, 335992.json, 2008-04-26_1, 2008-04-26, 11
processing file: , 12, 335995.json, 2008-04-27, 2008-04-27, 12
processing file: , 13, 335994.json, 2008-04-27_1, 2008-04-27, 13
processing file: , 14, 335996.json, 2008-04-28, 2008-04-28, 14
processing file: , 15, 335997.json, 2008-04-29, 2008-04-29, 1

In [8]:
df_merged.columns

Index(['index', 'batter', 'bowler', 'non_striker', 'extras.legbyes',
       'runs.batter', 'runs.extras', 'runs.total', 'extras.wides', 'wickets',
       'extras.byes', 'over', 'date', 'match_number', 'innings', 'stage',
       'extras.noballs', 'extras.penalty', 'replacements.role',
       'runs.non_boundary', 'review.by', 'review.umpire', 'review.batter',
       'review.decision', 'review.umpires_call', 'review.type',
       'replacements.match', 'wicket.kind', 'wicket.player_out',
       'wicket.fielders'],
      dtype='object')

In [9]:
ordered_cols = [
    "date",
    "match_number",
    "innings",
    "over",
    "batter",
    "bowler",
    "stage",
    "non_striker",
    "runs.batter",
    "runs.extras",
    "runs.total",
    "extras.legbyes",
    "extras.wides",
    "extras.byes",
    "extras.noballs",
    "wicket.kind",
    "wicket.player_out",
    "wicket.fielders",
    "wickets",
]

In [10]:
df_merged

Unnamed: 0,index,batter,bowler,non_striker,extras.legbyes,runs.batter,runs.extras,runs.total,extras.wides,wickets,...,review.by,review.umpire,review.batter,review.decision,review.umpires_call,review.type,replacements.match,wicket.kind,wicket.player_out,wicket.fielders
0,0,SC Ganguly,P Kumar,BB McCullum,1.0,0,1,1,,,...,,,,,,,,,,
1,1,BB McCullum,P Kumar,SC Ganguly,,0,0,0,,,...,,,,,,,,,,
2,2,BB McCullum,P Kumar,SC Ganguly,,0,1,1,1.0,,...,,,,,,,,,,
3,3,BB McCullum,P Kumar,SC Ganguly,,0,0,0,,,...,,,,,,,,,,
4,4,BB McCullum,P Kumar,SC Ganguly,,0,0,0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260754,62,SS Iyer,AK Markram,VR Iyer,,1,0,1,,,...,,,,,,,,,,
260755,63,VR Iyer,AK Markram,SS Iyer,,1,0,1,,,...,,,,,,,,,,
260756,64,VR Iyer,Shahbaz Ahmed,SS Iyer,,1,0,1,,,...,,,,,,,,,,
260757,65,SS Iyer,Shahbaz Ahmed,VR Iyer,,1,0,1,,,...,,,,,,,,,,


In [11]:
# df_merged = pd.read_csv("./output/ipl_ball_by_ball_output.csv")

In [12]:
df_merged = df_merged.reindex(columns=ordered_cols)
df_merged.sort_values(by=["date", "match_number"], inplace=True)

In [13]:
df_merged['season'] = df_merged['date'].str.slice(0,4)

In [14]:
df_wickets.to_csv("output/wickets.csv", index=False)

In [15]:
df_merged.to_csv("output/ipl_ball_by_ball_output.csv", index=False)

In [16]:
df_merged.head()

Unnamed: 0,date,match_number,innings,over,batter,bowler,stage,non_striker,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,extras.byes,extras.noballs,wicket.kind,wicket.player_out,wicket.fielders,wickets,season
0,2008-04-18,1,1,0,SC Ganguly,P Kumar,group,BB McCullum,0,1,1,1.0,,,,,,,,2008
1,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
2,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,1,1,,1.0,,,,,,,2008
3,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
4,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008


In [17]:
df_merged.stage.value_counts()

stage
group                 244937
Final                   4086
Qualifier 1             3406
Qualifier 2             3392
Eliminator              2597
Semi Final              1409
Elimination Final        734
3rd Place Play-Off       198
Name: count, dtype: int64

## Validate the resultant dataframe


In [18]:
df_merged.columns

Index(['date', 'match_number', 'innings', 'over', 'batter', 'bowler', 'stage',
       'non_striker', 'runs.batter', 'runs.extras', 'runs.total',
       'extras.legbyes', 'extras.wides', 'extras.byes', 'extras.noballs',
       'wicket.kind', 'wicket.player_out', 'wicket.fielders', 'wickets',
       'season'],
      dtype='object')

In [19]:
print("min date: ", df_merged["date"].min())
print("max date: ", df_merged["date"].max())

min date:  2008-04-18
max date:  2024-05-26


### Since we have data for IPL matches between 2008 to 2024, the above date range looks right


In [20]:
df_merged["match_number"].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 50, 51, 52,
       53, 54, 55, 56, 57, 58, 59, 47, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76])

### The above values for match_number looks right


In [21]:
df_merged["innings"].unique()

array([1, 2])

In [22]:
df_merged["over"].unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19], dtype=object)

In [23]:
sorted(df_merged["runs.batter"].unique())

[np.int64(0),
 np.int64(1),
 np.int64(2),
 np.int64(3),
 np.int64(4),
 np.int64(5),
 np.int64(6)]

In [24]:
sorted(df_merged["runs.extras"].unique())

[np.int64(0),
 np.int64(1),
 np.int64(2),
 np.int64(3),
 np.int64(4),
 np.int64(5),
 np.int64(7)]

In [25]:
sorted(df_merged["runs.total"].unique())

[np.int64(0),
 np.int64(1),
 np.int64(2),
 np.int64(3),
 np.int64(4),
 np.int64(5),
 np.int64(6),
 np.int64(7)]

In [26]:
sorted(df_merged["extras.legbyes"].unique())

[np.float64(1.0),
 np.float64(nan),
 np.float64(2.0),
 np.float64(3.0),
 np.float64(4.0),
 np.float64(5.0)]

In [27]:
sorted(df_merged["extras.wides"].unique())

[np.float64(nan),
 np.float64(1.0),
 np.float64(2.0),
 np.float64(3.0),
 np.float64(4.0),
 np.float64(5.0)]

In [28]:
sorted(df_merged["extras.byes"].unique())

[np.float64(nan),
 np.float64(1.0),
 np.float64(2.0),
 np.float64(3.0),
 np.float64(4.0)]

In [29]:
sorted(df_merged["extras.noballs"].unique())

[np.float64(nan),
 np.float64(1.0),
 np.float64(2.0),
 np.float64(3.0),
 np.float64(5.0)]

In [30]:
df_merged["wicket.kind"].unique()

array([None, 'caught', 'bowled', 'run out', 'lbw', 'retired hurt',
       'stumped', 'caught and bowled', 'hit wicket',
       'obstructing the field', 'retired out'], dtype=object)

In [31]:
df_merged["wicket.player_out"].unique()[:10]

array([None, 'SC Ganguly', 'RT Ponting', 'DJ Hussey', 'R Dravid',
       'V Kohli', 'JH Kallis', 'W Jaffer', 'MV Boucher', 'B Akhil'],
      dtype=object)

In [32]:
df_merged[~pd.isnull(df_merged["wicket.fielders"])]["wicket.fielders"].count()

np.int64(9342)

In [33]:
df_merged[~ pd.isnull(df_merged["wickets"])]['wickets'].count()

np.int64(12923)

In [34]:
df_merged

Unnamed: 0,date,match_number,innings,over,batter,bowler,stage,non_striker,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,extras.byes,extras.noballs,wicket.kind,wicket.player_out,wicket.fielders,wickets,season
0,2008-04-18,1,1,0,SC Ganguly,P Kumar,group,BB McCullum,0,1,1,1.0,,,,,,,,2008
1,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
2,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,1,1,,1.0,,,,,,,2008
3,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
4,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260754,2024-05-26,73,2,9,SS Iyer,AK Markram,Final,VR Iyer,1,0,1,,,,,,,,,2024
260755,2024-05-26,73,2,9,VR Iyer,AK Markram,Final,SS Iyer,1,0,1,,,,,,,,,2024
260756,2024-05-26,73,2,10,VR Iyer,Shahbaz Ahmed,Final,SS Iyer,1,0,1,,,,,,,,,2024
260757,2024-05-26,73,2,10,SS Iyer,Shahbaz Ahmed,Final,VR Iyer,1,0,1,,,,,,,,,2024


In [None]:
df_merged

In [None]:
df_merged.drop(columns='wickets',inplace=True)

In [None]:
df_merged.groupby(['date','match_number','innings','batter']).agg({'runs.batter':np.sum,'runs.extras':np.sum,'bowler':'count'})

In [None]:
df_merged.groupby(['date','match_number','innings','bowler','runs.batter']).agg({'runs.total':'sum','wicket.kind':'count','batter':'count'})