In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json

In [2]:
fields_to_skip = ["meta", "info"]

In [3]:
def populate_wicket_data(idx, delivery):
    wicket_record = {}
    if "wickets" in delivery and delivery["wickets"]:
        for wicket in delivery["wickets"]:
            wicket_record = {
                "delivery_index": idx,
                "wicket.kind": wicket["kind"],
                "wicket.player_out": wicket["player_out"],
                "wicket.fielders": (
                    [fielder["name"] for fielder in wicket["fielders"]]
                    if wicket.get("fielders") != None
                    else None
                ),
            }
    else:
        wicket_record = {
            "delivery_index": idx,
            "wicket.kind": None,
            "wicket.player_out": None,
            "wicket.fielders": None,
        }
    return wicket_record

In [4]:
file_names = os.listdir(path="data")
print(file_names[:10])
file_names = list(filter(lambda x: str(x).endswith(".json"), file_names))
file_names.sort()
print(file_names[:10])

merged_deliveries = pd.DataFrame()
wickets_data = []
index = 0
match_number = 1
prev_match_number = 0
date_file_name_dict = {}
prev_match_number = 0

for file_idx, file_name in enumerate(file_names):
    file_path = "data/" + file_name
    with open(file=file_path, mode="r") as file:
        data = json.load(file)
        info = data["info"]
        date = info["dates"][0]
        if info["event"].get("match_number") != None:
            match_number = info["event"].get("match_number")
        else:
            match_number = prev_match_number + 1
        
        if date_file_name_dict.get(date) != None:
            if prev_match_number < match_number:
                date_file_name_dict[date +"_1"] = file_name
            else:
                date_file_name_dict[date +"_1"] = date_file_name_dict[date] 
                date_file_name_dict[date] = file_name
        else:
            date_file_name_dict[date] = file_name
        prev_match_number = match_number

['1254103.json', '1136605.json', '1216501.json', '1082595.json', '829731.json', '1359507.json', '548315.json', '1304079.json', '1422119.json', '1178418.json']
['1082591.json', '1082592.json', '1082593.json', '1082594.json', '1082595.json', '1082596.json', '1082597.json', '1082598.json', '1082599.json', '1082600.json']


In [5]:
match_dates = list(date_file_name_dict.keys())
match_dates.sort()

In [6]:
# file_names = ["829813.json", "829817.json", "336038.json"]
index = 0
match_number = 1
prev_match_number = 0
prev_year = ""
stage = ""

for file_idx, match_date in enumerate(match_dates):
    file_name = date_file_name_dict[match_date]
    file_path = "data/" + file_name
    with open(file=file_path, mode="r") as file:
        data = json.load(file)
        info = data["info"]
        date = info["dates"][0]
        curr_year = date.split("-")[0]
        if curr_year != prev_year:
            print(curr_year, prev_year)
            prev_match_number = 0
        print("processing file: ", file_idx, file_name, match_date, date, prev_match_number, sep=", ")
        
        if info["event"].get("match_number") != None:
            match_number = info["event"].get("match_number")
        else:
            match_number = prev_match_number + 1
            print("match number not presentin file_name:", file_name, "populating with: ", match_number)
        
        if info["event"].get("stage") == None:
            stage = "group"
        else:
            stage = info["event"].get("stage")

        prev_match_number = match_number
        prev_year = date.split("-")[0]
        
        innings = data["innings"]
        for idx, inning in enumerate(innings):
            if inning.get("super_over") is not None:
                print("skipping super_over: ", date, match_number)
                continue
            overs = inning["overs"]
            df_deliveries = pd.json_normalize(
                overs, record_path=["deliveries"], meta=["over"]
            )
            df_deliveries["date"] = date
            df_deliveries["match_number"] = match_number
            df_deliveries["innings"] = idx + 1
            df_deliveries['stage'] = stage
            merged_deliveries = pd.concat([merged_deliveries, df_deliveries], axis=0)
            for over in overs:
                for delivery in over["deliveries"]:
                    wicket_data = populate_wicket_data(index, delivery)
                    wickets_data.append(wicket_data)
                    index += 1
                    
df_wickets = pd.DataFrame(wickets_data)

merged_deliveries.reset_index(inplace=True)

merged_deliveries["delivery_index"] = merged_deliveries.index

df_merged = pd.merge(merged_deliveries, df_wickets, on="delivery_index", how="left")

df_merged.drop("delivery_index", inplace=True, axis=1)
# df_merged.drop("wickets", inplace=True, axis=1)

2008 
processing file: , 0, 335982.json, 2008-04-18, 2008-04-18, 0
processing file: , 1, 335983.json, 2008-04-19, 2008-04-19, 1
processing file: , 2, 335984.json, 2008-04-19_1, 2008-04-19, 2
processing file: , 3, 335986.json, 2008-04-20, 2008-04-20, 3
processing file: , 4, 335985.json, 2008-04-20_1, 2008-04-20, 4
processing file: , 5, 335987.json, 2008-04-21, 2008-04-21, 5
processing file: , 6, 335988.json, 2008-04-22, 2008-04-22, 6
processing file: , 7, 335989.json, 2008-04-23, 2008-04-23, 7
processing file: , 8, 335990.json, 2008-04-24, 2008-04-24, 8
processing file: , 9, 335991.json, 2008-04-25, 2008-04-25, 9
processing file: , 10, 335993.json, 2008-04-26, 2008-04-26, 10
processing file: , 11, 335992.json, 2008-04-26_1, 2008-04-26, 11
processing file: , 12, 335995.json, 2008-04-27, 2008-04-27, 12
processing file: , 13, 335994.json, 2008-04-27_1, 2008-04-27, 13
processing file: , 14, 335996.json, 2008-04-28, 2008-04-28, 14
processing file: , 15, 335997.json, 2008-04-29, 2008-04-29, 1

processing file: , 23, 336005.json, 2008-05-04_1, 2008-05-04, 23
processing file: , 24, 336006.json, 2008-05-05, 2008-05-05, 24
processing file: , 25, 336007.json, 2008-05-06, 2008-05-06, 25
processing file: , 26, 336008.json, 2008-05-07, 2008-05-07, 26
processing file: , 27, 336009.json, 2008-05-08, 2008-05-08, 27
processing file: , 28, 336010.json, 2008-05-08_1, 2008-05-08, 28
processing file: , 29, 336011.json, 2008-05-09, 2008-05-09, 29
processing file: , 30, 336013.json, 2008-05-10, 2008-05-10, 30
processing file: , 31, 336014.json, 2008-05-11, 2008-05-11, 31
processing file: , 32, 336015.json, 2008-05-11_1, 2008-05-11, 32
processing file: , 33, 336016.json, 2008-05-12, 2008-05-12, 33
processing file: , 34, 336017.json, 2008-05-13, 2008-05-13, 34
processing file: , 35, 336018.json, 2008-05-14, 2008-05-14, 35
processing file: , 36, 336020.json, 2008-05-15, 2008-05-15, 36
processing file: , 37, 336021.json, 2008-05-16, 2008-05-16, 37
processing file: , 38, 336023.json, 2008-05-17, 2

In [7]:
df_merged.columns

Index(['index', 'batter', 'bowler', 'non_striker', 'extras.legbyes',
       'runs.batter', 'runs.extras', 'runs.total', 'extras.wides', 'wickets',
       'extras.byes', 'over', 'date', 'match_number', 'innings', 'stage',
       'extras.noballs', 'extras.penalty', 'replacements.role',
       'runs.non_boundary', 'review.by', 'review.umpire', 'review.batter',
       'review.decision', 'review.umpires_call', 'review.type',
       'replacements.match', 'wicket.kind', 'wicket.player_out',
       'wicket.fielders'],
      dtype='object')

In [8]:
ordered_cols = [
    "date",
    "match_number",
    "innings",
    "over",
    "batter",
    "bowler",
    "stage",
    "non_striker",
    "runs.batter",
    "runs.extras",
    "runs.total",
    "extras.legbyes",
    "extras.wides",
    "extras.byes",
    "extras.noballs",
    "wicket.kind",
    "wicket.player_out",
    "wicket.fielders",
    "wickets",
]

In [9]:
df_merged

Unnamed: 0,index,batter,bowler,non_striker,extras.legbyes,runs.batter,runs.extras,runs.total,extras.wides,wickets,...,review.by,review.umpire,review.batter,review.decision,review.umpires_call,review.type,replacements.match,wicket.kind,wicket.player_out,wicket.fielders
0,0,SC Ganguly,P Kumar,BB McCullum,1.0,0,1,1,,,...,,,,,,,,,,
1,1,BB McCullum,P Kumar,SC Ganguly,,0,0,0,,,...,,,,,,,,,,
2,2,BB McCullum,P Kumar,SC Ganguly,,0,1,1,1.0,,...,,,,,,,,,,
3,3,BB McCullum,P Kumar,SC Ganguly,,0,0,0,,,...,,,,,,,,,,
4,4,BB McCullum,P Kumar,SC Ganguly,,0,0,0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260754,62,SS Iyer,AK Markram,VR Iyer,,1,0,1,,,...,,,,,,,,,,
260755,63,VR Iyer,AK Markram,SS Iyer,,1,0,1,,,...,,,,,,,,,,
260756,64,VR Iyer,Shahbaz Ahmed,SS Iyer,,1,0,1,,,...,,,,,,,,,,
260757,65,SS Iyer,Shahbaz Ahmed,VR Iyer,,1,0,1,,,...,,,,,,,,,,


In [10]:
# df_merged = pd.read_csv("./output/ipl_ball_by_ball_output.csv")

In [11]:
df_merged = df_merged.reindex(columns=ordered_cols)
df_merged.sort_values(by=["date", "match_number"], inplace=True)

In [12]:
df_merged['season'] = df_merged['date'].str.slice(0,4)

In [13]:
# df_wickets.to_csv("output/wickets.csv", index=False)

In [14]:
# df_merged.to_csv("output/ipl_ball_by_ball_output.csv", index=False)

In [15]:
df_merged.head()

Unnamed: 0,date,match_number,innings,over,batter,bowler,stage,non_striker,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,extras.byes,extras.noballs,wicket.kind,wicket.player_out,wicket.fielders,wickets,season
0,2008-04-18,1,1,0,SC Ganguly,P Kumar,group,BB McCullum,0,1,1,1.0,,,,,,,,2008
1,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
2,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,1,1,,1.0,,,,,,,2008
3,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
4,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008


## Validate the resultant dataframe


In [16]:
print("min date: ", df_merged["date"].min())
print("max date: ", df_merged["date"].max())

min date:  2008-04-18
max date:  2024-05-26


### Since we have data for IPL matches between 2008 to 2024, the above date range looks right


In [17]:
df_merged["match_number"].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 50, 51, 52,
       53, 54, 55, 56, 57, 58, 59, 47, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76])

### The above values for match_number looks right


In [18]:
df_merged["innings"].unique()

array([1, 2])

In [19]:
df_merged["over"].unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19], dtype=object)

In [20]:
sorted(df_merged["runs.batter"].unique())

[np.int64(0),
 np.int64(1),
 np.int64(2),
 np.int64(3),
 np.int64(4),
 np.int64(5),
 np.int64(6)]

In [21]:
sorted(df_merged["runs.extras"].unique())

[np.int64(0),
 np.int64(1),
 np.int64(2),
 np.int64(3),
 np.int64(4),
 np.int64(5),
 np.int64(7)]

In [22]:
sorted(df_merged["runs.total"].unique())

[np.int64(0),
 np.int64(1),
 np.int64(2),
 np.int64(3),
 np.int64(4),
 np.int64(5),
 np.int64(6),
 np.int64(7)]

In [23]:
sorted(df_merged["extras.legbyes"].unique())

[np.float64(1.0),
 np.float64(nan),
 np.float64(2.0),
 np.float64(3.0),
 np.float64(4.0),
 np.float64(5.0)]

In [24]:
sorted(df_merged["extras.wides"].unique())

[np.float64(nan),
 np.float64(1.0),
 np.float64(2.0),
 np.float64(3.0),
 np.float64(4.0),
 np.float64(5.0)]

In [25]:
sorted(df_merged["extras.byes"].unique())

[np.float64(nan),
 np.float64(1.0),
 np.float64(2.0),
 np.float64(3.0),
 np.float64(4.0)]

In [26]:
sorted(df_merged["extras.noballs"].unique())

[np.float64(nan),
 np.float64(1.0),
 np.float64(2.0),
 np.float64(3.0),
 np.float64(5.0)]

In [27]:
df_merged["wicket.kind"].unique()

array([None, 'caught', 'bowled', 'run out', 'lbw', 'retired hurt',
       'stumped', 'caught and bowled', 'hit wicket',
       'obstructing the field', 'retired out'], dtype=object)

In [28]:
df_merged["wicket.player_out"].unique()[:10]

array([None, 'SC Ganguly', 'RT Ponting', 'DJ Hussey', 'R Dravid',
       'V Kohli', 'JH Kallis', 'W Jaffer', 'MV Boucher', 'B Akhil'],
      dtype=object)

In [29]:
df_merged[~pd.isnull(df_merged["wicket.fielders"])]["wicket.fielders"].count()

np.int64(9342)

In [30]:
df_merged[~ pd.isnull(df_merged["wickets"])]['wickets'].count()

np.int64(12923)

In [31]:
df_merged

Unnamed: 0,date,match_number,innings,over,batter,bowler,stage,non_striker,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,extras.byes,extras.noballs,wicket.kind,wicket.player_out,wicket.fielders,wickets,season
0,2008-04-18,1,1,0,SC Ganguly,P Kumar,group,BB McCullum,0,1,1,1.0,,,,,,,,2008
1,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
2,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,1,1,,1.0,,,,,,,2008
3,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
4,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260754,2024-05-26,73,2,9,SS Iyer,AK Markram,Final,VR Iyer,1,0,1,,,,,,,,,2024
260755,2024-05-26,73,2,9,VR Iyer,AK Markram,Final,SS Iyer,1,0,1,,,,,,,,,2024
260756,2024-05-26,73,2,10,VR Iyer,Shahbaz Ahmed,Final,SS Iyer,1,0,1,,,,,,,,,2024
260757,2024-05-26,73,2,10,SS Iyer,Shahbaz Ahmed,Final,VR Iyer,1,0,1,,,,,,,,,2024


In [32]:
df_merged

Unnamed: 0,date,match_number,innings,over,batter,bowler,stage,non_striker,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,extras.byes,extras.noballs,wicket.kind,wicket.player_out,wicket.fielders,wickets,season
0,2008-04-18,1,1,0,SC Ganguly,P Kumar,group,BB McCullum,0,1,1,1.0,,,,,,,,2008
1,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
2,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,1,1,,1.0,,,,,,,2008
3,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
4,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260754,2024-05-26,73,2,9,SS Iyer,AK Markram,Final,VR Iyer,1,0,1,,,,,,,,,2024
260755,2024-05-26,73,2,9,VR Iyer,AK Markram,Final,SS Iyer,1,0,1,,,,,,,,,2024
260756,2024-05-26,73,2,10,VR Iyer,Shahbaz Ahmed,Final,SS Iyer,1,0,1,,,,,,,,,2024
260757,2024-05-26,73,2,10,SS Iyer,Shahbaz Ahmed,Final,VR Iyer,1,0,1,,,,,,,,,2024


In [33]:
df_merged.drop(columns='wickets',inplace=True)

In [34]:
df_merged.groupby(['date','match_number','innings','batter']).agg({'runs.batter':np.sum,'runs.extras':np.sum,'bowler':'count'})

  df_merged.groupby(['date','match_number','innings','batter']).agg({'runs.batter':np.sum,'runs.extras':np.sum,'bowler':'count'})
  df_merged.groupby(['date','match_number','innings','batter']).agg({'runs.batter':np.sum,'runs.extras':np.sum,'bowler':'count'})


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,runs.batter,runs.extras,bowler
date,match_number,innings,batter,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-04-18,1,1,BB McCullum,158,11,77
2008-04-18,1,1,DJ Hussey,12,0,12
2008-04-18,1,1,Mohammad Hafeez,5,0,3
2008-04-18,1,1,RT Ponting,20,4,20
2008-04-18,1,1,SC Ganguly,10,2,12
...,...,...,...,...,...,...
2024-05-26,73,1,TM Head,0,0,1
2024-05-26,73,2,Rahmanullah Gurbaz,39,7,35
2024-05-26,73,2,SP Narine,6,0,2
2024-05-26,73,2,SS Iyer,6,0,3


In [35]:
df_merged.groupby(['date','match_number','innings','bowler','runs.batter']).agg({'runs.total':'sum','wicket.kind':'count','batter':'count'})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,runs.total,wicket.kind,batter
date,match_number,innings,bowler,runs.batter,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2008-04-18,1,1,AA Noffke,0,6,1,8
2008-04-18,1,1,AA Noffke,1,11,0,11
2008-04-18,1,1,AA Noffke,2,4,0,2
2008-04-18,1,1,AA Noffke,4,8,0,2
2008-04-18,1,1,AA Noffke,6,12,0,2
...,...,...,...,...,...,...,...
2024-05-26,73,2,Shahbaz Ahmed,6,12,0,2
2024-05-26,73,2,T Natarajan,0,2,0,4
2024-05-26,73,2,T Natarajan,1,5,0,5
2024-05-26,73,2,T Natarajan,4,16,0,4


In [36]:
ball_by_ball = df_merged

### THE SUMMARY DATA

In [91]:
fields_to_skip = ["meta", "players", "registry", "innings"]

In [92]:
def flatten_json(y):
    out = {}

    def flatten(x, name=""):
        if type(x) is dict:
            for a in x:
                if a in fields_to_skip:
                    # print("skipping field: ", a)
                    pass
                else:
                    flatten(x[a], name + a + "_")
        elif type(x) is list:
            i = 1
            for a in x:
                flatten(a, name + str(i) + "_")
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out

In [93]:
files = os.listdir(path="data")
print(files)

['1254103.json', '1136605.json', '1216501.json', '1082595.json', '829731.json', '1359507.json', '548315.json', '1304079.json', '1422119.json', '1178418.json', '1359487.json', '1359515.json', '336025.json', '729309.json', '1178407.json', '501239.json', '1178423.json', '1426305.json', '419107.json', '336036.json', '1426311.json', '1082637.json', '392232.json', '1216522.json', '1426307.json', '829713.json', '1136576.json', '1359498.json', '1082629.json', '1178413.json', '1216506.json', '1426282.json', '1082613.json', '1304105.json', '1082642.json', '1359514.json', '1216530.json', '980967.json', '1254067.json', '392225.json', '336032.json', '1304057.json', '548359.json', '1304061.json', '548314.json', '1136561.json', '335995.json', '1136620.json', '1359538.json', '598005.json', '1426266.json', '1359533.json', '1216494.json', '829807.json', '829767.json', '1304094.json', '1426281.json', '.ipynb_checkpoints', '1136595.json', '980929.json', '1359496.json', '419114.json', '1304095.json', '1178

In [94]:
rows = list()
files = os.listdir(path="data")

filter(lambda x: str(x).endswith(".json"), files)

for file in files:
    try:
        file_name = "data/" + file
        with open(file_name, "r") as file:
            json_data = json.load(file)
            flattened_data = flatten_json(json_data)
            rows.append(flattened_data)
    except:
        print(file_name)

data/.ipynb_checkpoints
data/test.txt


In [95]:
# Create a DataFrame with a single row
ipl_summary = pd.DataFrame(rows)

In [96]:
ipl_summary

Unnamed: 0,info_balls_per_over,info_city,info_dates_1,info_event_name,info_event_match_number,info_gender,info_match_type,info_officials_match_referees_1,info_officials_reserve_umpires_1,info_officials_tv_umpires_1,...,cells_3_id,metadata_kernelspec_display_name,metadata_kernelspec_language,metadata_kernelspec_name,metadata_language_info_codemirror_mode_name,metadata_language_info_file_extension,metadata_language_info_mimetype,metadata_language_info_name,nbformat,nbformat_minor
0,6.0,Dubai,2021-09-29,Indian Premier League,43.0,male,T20,M Nayyar,K Srinivasan,RK Illingworth,...,,,,,,,,,,
1,6.0,Delhi,2018-05-12,Indian Premier League,45.0,male,T20,Prakash Bhatt,K Srinath,AK Chaudhary,...,,,,,,,,,,
2,6.0,Abu Dhabi,2020-10-07,Indian Premier League,21.0,male,T20,V Narayan Kutty,K Srinivasan,C Shamshuddin,...,,,,,,,,,,
3,6.0,Bengaluru,2017-04-08,Indian Premier League,5.0,male,T20,J Srinath,Navdeep Singh,A Nand Kishore,...,,,,,,,,,,
4,6.0,Visakhapatnam,2015-04-18,Indian Premier League,13.0,male,T20,M Nayyar,VK Sharma,C Shamshuddin,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,6.0,Delhi,2008-05-08,Indian Premier League,28.0,male,T20,CH Lloyd,,K Hariharan,...,,,,,,,,,,
1092,6.0,Delhi,2008-04-19,Indian Premier League,3.0,male,T20,GR Viswanath,,IL Howell,...,,,,,,,,,,
1093,6.0,Mohali,2024-03-23,Indian Premier League,2.0,male,T20,V Narayan Kutty,PM Joshi,AG Wharf,...,,,,,,,,,,
1094,6.0,Navi Mumbai,2022-04-23,Indian Premier League,35.0,male,T20,V Narayan Kutty,Vinod Seshan,BNJ Oxenford,...,,,,,,,,,,


In [97]:
ipl_summary.to_csv("output/ipl_summary_raw.csv", index=False)

In [44]:
ipl_summary.head()

Unnamed: 0,info_balls_per_over,info_city,info_dates_1,info_event_name,info_event_match_number,info_gender,info_match_type,info_officials_match_referees_1,info_officials_reserve_umpires_1,info_officials_tv_umpires_1,...,cells_3_id,metadata_kernelspec_display_name,metadata_kernelspec_language,metadata_kernelspec_name,metadata_language_info_codemirror_mode_name,metadata_language_info_file_extension,metadata_language_info_mimetype,metadata_language_info_name,nbformat,nbformat_minor
0,6.0,Dubai,2021-09-29,Indian Premier League,43.0,male,T20,M Nayyar,K Srinivasan,RK Illingworth,...,,,,,,,,,,
1,6.0,Delhi,2018-05-12,Indian Premier League,45.0,male,T20,Prakash Bhatt,K Srinath,AK Chaudhary,...,,,,,,,,,,
2,6.0,Abu Dhabi,2020-10-07,Indian Premier League,21.0,male,T20,V Narayan Kutty,K Srinivasan,C Shamshuddin,...,,,,,,,,,,
3,6.0,Bengaluru,2017-04-08,Indian Premier League,5.0,male,T20,J Srinath,Navdeep Singh,A Nand Kishore,...,,,,,,,,,,
4,6.0,Visakhapatnam,2015-04-18,Indian Premier League,13.0,male,T20,M Nayyar,VK Sharma,C Shamshuddin,...,,,,,,,,,,


In [45]:
def remove_prefix(x: str, to_replace: str, replace_by: str):
    return x.replace(to_replace, replace_by)

In [46]:
ipl_summary.rename(
    lambda x: remove_prefix(str(x), to_replace="info_", replace_by=""),
    inplace=True,
    axis=1,
)

In [47]:
ipl_summary.rename(
    lambda x: remove_prefix(str(x), to_replace=" ", replace_by="_"),
    inplace=True,
    axis=1,
)

In [48]:
ipl_summary.rename(lambda x: str(x).lower(), inplace=True, axis=1)

In [49]:
ipl_summary.head()

Unnamed: 0,balls_per_over,city,dates_1,event_name,event_match_number,gender,match_type,officials_match_referees_1,officials_reserve_umpires_1,officials_tv_umpires_1,...,cells_3_id,metadata_kernelspec_display_name,metadata_kernelspec_language,metadata_kernelspec_name,metadata_language_codemirror_mode_name,metadata_language_file_extension,metadata_language_mimetype,metadata_language_name,nbformat,nbformat_minor
0,6.0,Dubai,2021-09-29,Indian Premier League,43.0,male,T20,M Nayyar,K Srinivasan,RK Illingworth,...,,,,,,,,,,
1,6.0,Delhi,2018-05-12,Indian Premier League,45.0,male,T20,Prakash Bhatt,K Srinath,AK Chaudhary,...,,,,,,,,,,
2,6.0,Abu Dhabi,2020-10-07,Indian Premier League,21.0,male,T20,V Narayan Kutty,K Srinivasan,C Shamshuddin,...,,,,,,,,,,
3,6.0,Bengaluru,2017-04-08,Indian Premier League,5.0,male,T20,J Srinath,Navdeep Singh,A Nand Kishore,...,,,,,,,,,,
4,6.0,Visakhapatnam,2015-04-18,Indian Premier League,13.0,male,T20,M Nayyar,VK Sharma,C Shamshuddin,...,,,,,,,,,,


In [50]:
def populate_team_1(row):
    toss_winner = row["toss_winner"]
    if row["toss_decision"] == "bat":
        team_1 = row["toss_winner"]
    else:
        teams_1 = row["teams_1"]
        teams_2 = row["teams_2"]
        team_1 = teams_2 if toss_winner == teams_1 else teams_1

    return team_1

In [51]:
ipl_summary["team_1"] = ipl_summary.apply(populate_team_1, axis=1)

In [52]:
def populate_team_2(row):
    toss_winner = row["toss_winner"]
    if row["toss_decision"] == "field":
        team_2 = row["toss_winner"]
    else:
        teams_1 = row["teams_1"]
        teams_2 = row["teams_2"]
        team_2 = teams_2 if toss_winner == teams_1 else teams_1

    return team_2

In [53]:
ipl_summary["team_2"] = ipl_summary.apply(populate_team_2, axis=1)

In [54]:
ipl_summary.columns

Index(['balls_per_over', 'city', 'dates_1', 'event_name', 'event_match_number',
       'gender', 'match_type', 'officials_match_referees_1',
       'officials_reserve_umpires_1', 'officials_tv_umpires_1',
       'officials_umpires_1', 'officials_umpires_2', 'outcome_winner',
       'outcome_by_wickets', 'overs', 'player_of_match_1', 'season',
       'team_type', 'teams_1', 'teams_2', 'toss_decision', 'toss_winner',
       'venue', 'outcome_by_runs', 'outcome_method', 'event_stage',
       'outcome_result', 'outcome_eliminator', 'dates_2', 'cells_1_cell_type',
       'cells_1_execution_count', 'cells_1_id', 'cells_1_source_1',
       'cells_1_source_2', 'cells_2_cell_type', 'cells_2_execution_count',
       'cells_2_id', 'cells_2_outputs_1_name', 'cells_2_outputs_1_output_type',
       'cells_2_outputs_1_text_1', 'cells_2_source_1', 'cells_3_cell_type',
       'cells_3_execution_count', 'cells_3_id',
       'metadata_kernelspec_display_name', 'metadata_kernelspec_language',
       'meta

In [55]:
sel_cols = [
    "teams_1",
    "teams_2",
    "toss_decision",
    "toss_winner",
    "outcome_winner",
    "team_1",
    "team_2",
]
ipl_summary[sel_cols].head()

Unnamed: 0,teams_1,teams_2,toss_decision,toss_winner,outcome_winner,team_1,team_2
0,Rajasthan Royals,Royal Challengers Bangalore,field,Royal Challengers Bangalore,Royal Challengers Bangalore,Rajasthan Royals,Royal Challengers Bangalore
1,Delhi Daredevils,Royal Challengers Bangalore,field,Royal Challengers Bangalore,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore
2,Kolkata Knight Riders,Chennai Super Kings,bat,Kolkata Knight Riders,Kolkata Knight Riders,Kolkata Knight Riders,Chennai Super Kings
3,Royal Challengers Bangalore,Delhi Daredevils,bat,Royal Challengers Bangalore,Royal Challengers Bangalore,Royal Challengers Bangalore,Delhi Daredevils
4,Sunrisers Hyderabad,Delhi Daredevils,bat,Delhi Daredevils,Delhi Daredevils,Delhi Daredevils,Sunrisers Hyderabad


In [56]:
ipl_summary.drop(columns=["teams_1", "teams_2"], inplace=True)

### Following columns have only a single value and so can be dropped from the dataframe

1. balls_per_over
2. event_name
3. gender
4. match_type
5. overs
6. team_type


In [57]:
ipl_summary["balls_per_over"].value_counts()

balls_per_over
6.0    1095
Name: count, dtype: int64

In [58]:
ipl_summary["event_name"].value_counts()

event_name
Indian Premier League    1095
Name: count, dtype: int64

In [59]:
ipl_summary["gender"].value_counts()

gender
male    1095
Name: count, dtype: int64

In [60]:
ipl_summary["match_type"].value_counts()

match_type
T20    1095
Name: count, dtype: int64

In [61]:
ipl_summary["overs"].value_counts()

overs
20.0    1095
Name: count, dtype: int64

In [62]:
ipl_summary["team_type"].value_counts()

team_type
club    1095
Name: count, dtype: int64

In [63]:
ipl_summary.drop(
    columns=[
        "balls_per_over",
        "event_name",
        "gender",
        "match_type",
        "overs",
        "team_type",
    ],
    inplace=True,
)

### Season has 2016 represented as string and numeric and so it is showing up as two different values. Convert to str type


In [64]:
ipl_summary[ipl_summary["season"].apply(lambda x: str(x).strip().find("2016") != -1)][
    "season"
].value_counts()

season
2016    59
2016     1
Name: count, dtype: int64

In [65]:
ipl_summary["season"] = ipl_summary["season"].astype(str)

In [66]:
ipl_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1096 entries, 0 to 1095
Data columns (total 47 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   city                                    1044 non-null   object 
 1   dates_1                                 1095 non-null   object 
 2   event_match_number                      1029 non-null   float64
 3   officials_match_referees_1              1095 non-null   object 
 4   officials_reserve_umpires_1             1071 non-null   object 
 5   officials_tv_umpires_1                  1091 non-null   object 
 6   officials_umpires_1                     1095 non-null   object 
 7   officials_umpires_2                     1095 non-null   object 
 8   outcome_winner                          1076 non-null   object 
 9   outcome_by_wickets                      578 non-null    float64
 10  player_of_match_1                       1090 non-null   obje

In [90]:
ipl_summary.to_csv("iplsummary.csv")

In [67]:
ipl_summary.loc[ipl_summary["event_match_number"].isnull(), "event_match_number"] = -1

In [68]:
ipl_summary["event_match_number"] = ipl_summary["event_match_number"].astype(int)

In [69]:
ipl_summary["event_match_number"].unique()

array([43, 45, 21,  5, 13, 33, 10,  1, 41, 16, 32, 42, 48, 67,  2, 53, -1,
       47, 52, 69, 24, 39, 38, 44, 23, 59, 40, 54, 34, 50, 11, 15,  9, 64,
        8, 28, 31, 35, 22, 49,  6, 14, 19, 55, 30,  7, 51, 62,  4, 63, 57,
       37, 20,  3, 56, 25, 26, 60, 65, 46, 36, 12, 27, 29, 17, 18, 66, 58,
       72, 68, 61, 70, 71])

In [70]:
ipl_summary.describe()

Unnamed: 0,event_match_number,outcome_by_wickets,outcome_by_runs,cells_1_execution_count,cells_2_execution_count,cells_3_execution_count,nbformat,nbformat_minor
count,1096.0,578.0,498.0,1.0,1.0,0.0,1.0,1.0
mean,29.451642,6.192042,30.104418,2.0,3.0,,4.0,5.0
std,19.401912,1.845733,26.739844,,,,,
min,-1.0,1.0,1.0,2.0,3.0,,4.0,5.0
25%,13.0,5.0,11.0,2.0,3.0,,4.0,5.0
50%,29.0,6.0,22.0,2.0,3.0,,4.0,5.0
75%,45.0,7.0,41.0,2.0,3.0,,4.0,5.0
max,72.0,10.0,146.0,2.0,3.0,,4.0,5.0


In [71]:
for col in ipl_summary.columns:
    print(col, ipl_summary[col].dtype, sep=" => ")

city => object
dates_1 => object
event_match_number => int64
officials_match_referees_1 => object
officials_reserve_umpires_1 => object
officials_tv_umpires_1 => object
officials_umpires_1 => object
officials_umpires_2 => object
outcome_winner => object
outcome_by_wickets => float64
player_of_match_1 => object
season => object
toss_decision => object
toss_winner => object
venue => object
outcome_by_runs => float64
outcome_method => object
event_stage => object
outcome_result => object
outcome_eliminator => object
dates_2 => object
cells_1_cell_type => object
cells_1_execution_count => float64
cells_1_id => object
cells_1_source_1 => object
cells_1_source_2 => object
cells_2_cell_type => object
cells_2_execution_count => float64
cells_2_id => object
cells_2_outputs_1_name => object
cells_2_outputs_1_output_type => object
cells_2_outputs_1_text_1 => object
cells_2_source_1 => object
cells_3_cell_type => object
cells_3_execution_count => float64
cells_3_id => object
metadata_kernelspec_di

# Handle Missing Values


### Populate missing city values based on the stadium

In [72]:

ipl_summary.loc[ipl_summary["venue"] == "Sharjah Cricket Stadium", "city"] = "Sharjah"
ipl_summary.loc[ipl_summary["venue"] == "Dubai International Cricket Stadium", "city"] = "Dubai"

In [73]:
# Drop dates_2 and outcome_eliminator columns as it has 98% missing values
ipl_summary = ipl_summary.drop(columns=["dates_2", "outcome_eliminator"])

In [74]:
# Defaulting the event_stage to group_stage as it is left blank
ipl_summary.loc[ipl_summary["event_stage"].isnull(), "event_stage"] = "group_stage"

In [75]:
ipl_summary.loc[ipl_summary["outcome_winner"].isnull(), "outcome_winner"] = ipl_summary["outcome_result"]

In [76]:
ipl_summary.loc[ipl_summary["outcome_method"].isnull(), "outcome_method"] = "regular"

In [77]:
# drop outcome_result as it is merged in the outcome_winner column
ipl_summary.drop(columns=["outcome_result"], inplace=True)

In [78]:
# Fix season values
ipl_summary.loc[ipl_summary["season"] == "2007/08", "season"] = "2008"
ipl_summary.loc[ipl_summary["season"] == "2009/10", "season"] = "2010"
ipl_summary.loc[ipl_summary["season"] == "2020/21", "season"] = "2020"

In [87]:
ipl_summary["season"].value_counts().sort_index()

season
2008    58
2009    57
2010    60
2011    73
2012    74
2013    76
2014    60
2015    59
2016    60
2017    59
2018    60
2019    60
2020    60
2021    60
2022    74
2023    74
2024    71
nan      1
Name: count, dtype: int64

In [88]:
ipl_summary[ipl_summary.season.isna()]

Unnamed: 0,city,dates_1,event_match_number,officials_match_referees_1,officials_reserve_umpires_1,officials_tv_umpires_1,officials_umpires_1,officials_umpires_2,outcome_winner,outcome_by_wickets,...,metadata_kernelspec_language,metadata_kernelspec_name,metadata_language_codemirror_mode_name,metadata_language_file_extension,metadata_language_mimetype,metadata_language_name,nbformat,nbformat_minor,team_1,team_2


In [89]:
ipl_summary["season"] = ipl_summary["season"].astype(int)

ValueError: invalid literal for int() with base 10: 'nan'

In [None]:
# Handle event_match_number
print(len(ipl_summary.loc[ipl_summary["event_match_number"] == -1, "event_match_number"]))
ipl_summary.loc[ipl_summary["event_match_number"] == -1, ["season", "dates_1"]].sort_values(by="dates_1")

66


Unnamed: 0,season,dates_1
853,2008,2008-05-30
207,2008,2008-05-31
610,2008,2008-06-01
1015,2009,2009-05-22
546,2009,2009-05-23
...,...,...
1094,2023,2023-05-29
563,2024,2024-05-21
759,2024,2024-05-22
20,2024,2024-05-24


In [None]:
# Not using this as we are populating the missing match numbers with event stage
def populate_match_numbers(df, season):
    df_part = df.loc[
        (df["season"] == season) & (df["event_match_number"].isnull()),
        ["event_match_number", "dates_1"],
    ].sort_values(by="dates_1")
    max_match_number = df[df["season"] == season]["event_match_number"].max() + 1
    print(df_part)
    for index, _ in df_part.iterrows():
        df.loc[index, "event_match_number"] = max_match_number
        max_match_number += 1

    print(df)

In [None]:
# Not using this as we are populating the missing match numbers with event stage
def populate_match_numbers_across_seasons(df):
    for seas in df.season.unique():
        populate_match_numbers(df, seas)

In [None]:
# Not using this as we are populating the missing match numbers with event stage
# populate_match_numbers_across_seasons(df)

In [None]:
# note that this would convert the null values to string "nan"
ipl_summary["event_match_number"] = ipl_summary["event_match_number"].astype(str)

In [None]:
ipl_summary.loc[ipl_summary["event_match_number"] == "-1", ["event_match_number"]] = ipl_summary["event_stage"]

In [None]:
ipl_summary["event_match_number"].unique()

array(['43', '45', '21', '5', '13', '33', '10', '1', '41', '16', '32',
       '42', '48', '67', '2', '53', 'Qualifier 2', '47', '52', '69', '24',
       '39', '38', '44', '23', '59', '40', '54', '34', '50', '11', '15',
       '9', 'Final', '64', '8', '28', '31', '35', '22', '49', '6', '14',
       '19', '55', '30', '7', 'Qualifier 1', '51', '62', '4', '63',
       'Semi Final', '57', '37', '20', '3', '56', '25', '26', '60', '65',
       '46', '36', '12', '27', '29', '17', '18', '66', '58', '72', '68',
       'Eliminator', '61', '70', 'Elimination Final',
       '3rd Place Play-Off', '71'], dtype=object)

In [None]:
pd.Series(ipl_summary["venue"].unique())

0                   Dubai International Cricket Stadium
1                                  Arun Jaitley Stadium
2                                  Sheikh Zayed Stadium
3                                 M.Chinnaswamy Stadium
4     Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket St...
5                                 Eden Gardens, Kolkata
6                                 M Chinnaswamy Stadium
7                    Dr DY Patil Sports Academy, Mumbai
8              MA Chidambaram Stadium, Chepauk, Chennai
9                                          Eden Gardens
10                     Narendra Modi Stadium, Ahmedabad
11                              Sharjah Cricket Stadium
12         Punjab Cricket Association IS Bindra Stadium
13            Rajiv Gandhi International Stadium, Uppal
14                   Rajiv Gandhi International Stadium
15                             Wankhede Stadium, Mumbai
16                                    Brabourne Stadium
17                               Sawai Mansingh 

In [None]:
ipl_summary["venue"] = ipl_summary["venue"].str.partition(",")[0]

In [None]:
ipl_summary["venue"] = ipl_summary["venue"].str.replace(".", "")

In [None]:
ipl_summary["venue"] = ipl_summary["venue"].str.replace(" ", "")

In [None]:
pd.Series(ipl_summary["venue"].unique())

0                      DubaiInternationalCricketStadium
1                                    ArunJaitleyStadium
2                                    SheikhZayedStadium
3                                   MChinnaswamyStadium
4            DrYSRajasekharaReddyACA-VDCACricketStadium
5                                           EdenGardens
6                                DrDYPatilSportsAcademy
7                                  MAChidambaramStadium
8                                   NarendraModiStadium
9                                 SharjahCricketStadium
10              PunjabCricketAssociationISBindraStadium
11                      RajivGandhiInternationalStadium
12                                      WankhedeStadium
13                                     BrabourneStadium
14                                 SawaiMansinghStadium
15                                  NewWanderersStadium
16                 MaharashtraCricketAssociationStadium
17    BharatRatnaShriAtalBihariVajpayeeEkanaCric

In [None]:
ipl_summary.columns

Index(['city', 'dates_1', 'event_match_number', 'officials_match_referees_1',
       'officials_reserve_umpires_1', 'officials_tv_umpires_1',
       'officials_umpires_1', 'officials_umpires_2', 'outcome_winner',
       'outcome_by_wickets', 'player_of_match_1', 'season', 'toss_decision',
       'toss_winner', 'venue', 'outcome_by_runs', 'outcome_method',
       'event_stage', 'team_1', 'team_2'],
      dtype='object')

In [None]:
ipl_summary = ipl_summary.rename(
    columns={
        "dates_1": "date",
        "officials_match_referees_1": "officials_match_referees",
        "officials_reserve_umpires_1": "officials_reserve_umpires",
        "officials_tv_umpires_1": "officials_tv_umpires",
        "teams_1": "team_1",
        "teams_2": "team_2",
        "player_of_match_1": "player_of_match",
    }
)

In [None]:
ipl_summary.columns

Index(['city', 'date', 'event_match_number', 'officials_match_referees',
       'officials_reserve_umpires', 'officials_tv_umpires',
       'officials_umpires_1', 'officials_umpires_2', 'outcome_winner',
       'outcome_by_wickets', 'player_of_match', 'season', 'toss_decision',
       'toss_winner', 'venue', 'outcome_by_runs', 'outcome_method',
       'event_stage', 'team_1', 'team_2'],
      dtype='object')

In [None]:
ipl_summary.rename(columns={"event_match_number": "match_number"}, inplace=True)

In [None]:
cols = [
    "date",
    "match_number",
    "city",
    "team_1",
    "team_2",
    "outcome_winner",
    "player_of_match",
    "toss_winner",
    "toss_decision",
    "officials_match_referees",
    "officials_reserve_umpires",
    "officials_tv_umpires",
    "officials_umpires_1",
    "officials_umpires_2",
    "outcome_by_wickets",
    "season",
    "venue",
    "outcome_by_runs",
    "event_stage",
    "outcome_method",
]
ipl_summary = ipl_summary.reindex(columns=cols)

In [None]:
ipl_summary = ipl_summary.sort_values(by=["date", "match_number"])

In [None]:
ipl_summary.to_csv("output/ipl_summary.csv", index=False)

In [None]:
ball_by_ball 

In [None]:
ball_by_ball.shape

(260759, 18)

In [None]:
ball_by_ball.head()

Unnamed: 0,date,match_number,innings,over,batter,bowler,non_striker,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,extras.byes,extras.noballs,wicket.kind,wicket.player_out,wicket.fielders,wickets
0,2008-04-18,1,1,0,SC Ganguly,P Kumar,BB McCullum,0,1,1,1.0,,,,,,,
1,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,
2,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,1,1,,1.0,,,,,,
3,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,
4,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,


In [None]:
ipl_summary["match_number"] = ipl_summary["match_number"].astype("str")

In [None]:
ipl_summary.shape

(1095, 20)

In [None]:
ipl_summary.head()

Unnamed: 0,date,match_number,city,team_1,team_2,outcome_winner,player_of_match,toss_winner,toss_decision,officials_match_referees,officials_reserve_umpires,officials_tv_umpires,officials_umpires_1,officials_umpires_2,outcome_by_wickets,season,venue,outcome_by_runs,event_stage,outcome_method
0,2008-04-18,1,Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,Kolkata Knight Riders,BB McCullum,Royal Challengers Bangalore,field,J Srinath,VN Kulkarni,AM Saheba,Asad Rauf,RE Koertzen,,2008,MChinnaswamyStadium,140.0,group_stage,regular
1,2008-04-19,2,Chandigarh,Chennai Super Kings,Kings XI Punjab,Chennai Super Kings,MEK Hussey,Chennai Super Kings,bat,S Venkataraghavan,MSS Ranawat,RB Tiffin,MR Benson,SL Shastri,,2008,PunjabCricketAssociationStadium,33.0,group_stage,regular
2,2008-04-19,3,Delhi,Rajasthan Royals,Delhi Daredevils,Delhi Daredevils,MF Maharoof,Rajasthan Royals,bat,GR Viswanath,,IL Howell,Aleem Dar,GA Pratapkumar,9.0,2008,FerozShahKotla,,group_stage,regular
3,2008-04-20,4,Kolkata,Deccan Chargers,Kolkata Knight Riders,Kolkata Knight Riders,DJ Hussey,Deccan Chargers,bat,FM Engineer,F Gomes,Asad Rauf,BF Bowden,K Hariharan,5.0,2008,EdenGardens,,group_stage,regular
4,2008-04-20,5,Mumbai,Mumbai Indians,Royal Challengers Bangalore,Royal Challengers Bangalore,MV Boucher,Mumbai Indians,bat,J Srinath,SN Bandekar,AV Jayaprakash,SJ Davis,DJ Harper,5.0,2008,WankhedeStadium,,group_stage,regular


In [None]:
def get_items_by_team(field, agg="sum"):
    if agg == "sum":
        grouped_df = ball_by_ball.groupby(["date", "match_number", "innings"])[field].sum()
    elif agg == "max":
        grouped_df = ball_by_ball.groupby(["date", "match_number", "innings"])[field].max()

    grouped_df = grouped_df.reset_index()

    df_pivot = grouped_df.pivot(
        index=["date", "match_number"], columns="innings", values=field
    )

    df_pivot.reset_index(inplace=True)

    df_pivot = df_pivot.rename(columns={1: "team_1_" + field, 2: "team_2_" + field})

    return df_pivot

In [None]:
df_pivot = get_items_by_team("runs.total")
df_merged = pd.merge(ipl_summary, df_pivot, how="outer", on=["date", "match_number"])

In [None]:
df_pivot = get_items_by_team("runs.extras")
df_merged = pd.merge(df_merged, df_pivot, how="outer", on=["date", "match_number"])

In [None]:
df_pivot = get_items_by_team("extras.legbyes")
df_merged = pd.merge(df_merged, df_pivot, how="outer", on=["date", "match_number"])

In [None]:
df_pivot = get_items_by_team("extras.wides")
df_merged = pd.merge(df_merged, df_pivot, how="outer", on=["date", "match_number"])

In [None]:
df_pivot = get_items_by_team("extras.byes")
df_merged = pd.merge(df_merged, df_pivot, how="outer", on=["date", "match_number"])

In [None]:
df_pivot = get_items_by_team("extras.noballs")
df_merged = pd.merge(df_merged, df_pivot, how="outer", on=["date", "match_number"])

In [None]:
df_pivot = get_items_by_team("over", "max")
df_merged = pd.merge(df_merged, df_pivot, how="outer", on=["date", "match_number"])

In [None]:
pd.set_option("display.max_columns", 35)
df_merged.head()

Unnamed: 0,date,match_number,city,team_1,team_2,outcome_winner,player_of_match,toss_winner,toss_decision,officials_match_referees,officials_reserve_umpires,officials_tv_umpires,officials_umpires_1,officials_umpires_2,outcome_by_wickets,season,venue,outcome_by_runs,event_stage,outcome_method,team_1_runs.total,team_2_runs.total,team_1_runs.extras,team_2_runs.extras,team_1_extras.legbyes,team_2_extras.legbyes,team_1_extras.wides,team_2_extras.wides,team_1_extras.byes,team_2_extras.byes,team_1_extras.noballs,team_2_extras.noballs,team_1_over,team_2_over
0,2008-04-18,1,Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,Kolkata Knight Riders,BB McCullum,Royal Challengers Bangalore,field,J Srinath,VN Kulkarni,AM Saheba,Asad Rauf,RE Koertzen,,2008,MChinnaswamyStadium,140.0,group_stage,regular,222.0,82.0,17.0,19.0,4.0,8.0,9.0,11.0,4.0,0.0,0.0,0.0,19.0,15.0
1,2008-04-19,2,Chandigarh,Chennai Super Kings,Kings XI Punjab,Chennai Super Kings,MEK Hussey,Chennai Super Kings,bat,S Venkataraghavan,MSS Ranawat,RB Tiffin,MR Benson,SL Shastri,,2008,PunjabCricketAssociationStadium,33.0,group_stage,regular,240.0,207.0,6.0,11.0,2.0,4.0,3.0,5.0,0.0,2.0,1.0,0.0,19.0,19.0
2,2008-04-19,3,Delhi,Rajasthan Royals,Delhi Daredevils,Delhi Daredevils,MF Maharoof,Rajasthan Royals,bat,GR Viswanath,,IL Howell,Aleem Dar,GA Pratapkumar,9.0,2008,FerozShahKotla,,group_stage,regular,129.0,132.0,7.0,10.0,3.0,0.0,3.0,10.0,1.0,0.0,0.0,0.0,19.0,15.0
3,2008-04-20,4,Kolkata,Deccan Chargers,Kolkata Knight Riders,Kolkata Knight Riders,DJ Hussey,Deccan Chargers,bat,FM Engineer,F Gomes,Asad Rauf,BF Bowden,K Hariharan,5.0,2008,EdenGardens,,group_stage,regular,110.0,112.0,10.0,28.0,4.0,8.0,4.0,15.0,0.0,4.0,2.0,1.0,18.0,18.0
4,2008-04-20,5,Mumbai,Mumbai Indians,Royal Challengers Bangalore,Royal Challengers Bangalore,MV Boucher,Mumbai Indians,bat,J Srinath,SN Bandekar,AV Jayaprakash,SJ Davis,DJ Harper,5.0,2008,WankhedeStadium,,group_stage,regular,165.0,166.0,11.0,5.0,6.0,0.0,3.0,5.0,2.0,0.0,0.0,0.0,19.0,19.0


In [None]:
cols = [
    "date",
    "match_number",
    "city",
    "team_1",
    "team_2",
    "team_1_runs.total",
    "team_2_runs.total",
    "outcome_winner",
    "player_of_match",
    "team_1_over",
    "team_2_over",
    "toss_winner",
    "toss_decision",
    "team_1_runs.extras",
    "team_2_runs.extras",
    "team_1_extras.legbyes",
    "team_2_extras.legbyes",
    "team_1_extras.wides",
    "team_2_extras.wides",
    "team_1_extras.byes",
    "team_2_extras.byes",
    "team_1_extras.noballs",
    "team_2_extras.noballs",
    "officials_match_referees",
    "officials_reserve_umpires",
    "officials_tv_umpires",
    "officials_umpires_1",
    "officials_umpires_2",
    "outcome_by_wickets",
    "season",
    "venue",
    "outcome_by_runs",
    "event_stage",
    "outcome_method",
]
df_merged = df_merged.reindex(columns=cols)

In [None]:
pd.set_option('display.max_columns', 40)

df_merged.head()

Unnamed: 0,date,match_number,city,team_1,team_2,team_1_runs.total,team_2_runs.total,outcome_winner,player_of_match,team_1_over,team_2_over,toss_winner,toss_decision,team_1_runs.extras,team_2_runs.extras,team_1_extras.legbyes,team_2_extras.legbyes,team_1_extras.wides,team_2_extras.wides,team_1_extras.byes,team_2_extras.byes,team_1_extras.noballs,team_2_extras.noballs,officials_match_referees,officials_reserve_umpires,officials_tv_umpires,officials_umpires_1,officials_umpires_2,outcome_by_wickets,season,venue,outcome_by_runs,event_stage,outcome_method
0,2008-04-18,1,Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,222.0,82.0,Kolkata Knight Riders,BB McCullum,19.0,15.0,Royal Challengers Bangalore,field,17.0,19.0,4.0,8.0,9.0,11.0,4.0,0.0,0.0,0.0,J Srinath,VN Kulkarni,AM Saheba,Asad Rauf,RE Koertzen,,2008,MChinnaswamyStadium,140.0,group_stage,regular
1,2008-04-19,2,Chandigarh,Chennai Super Kings,Kings XI Punjab,240.0,207.0,Chennai Super Kings,MEK Hussey,19.0,19.0,Chennai Super Kings,bat,6.0,11.0,2.0,4.0,3.0,5.0,0.0,2.0,1.0,0.0,S Venkataraghavan,MSS Ranawat,RB Tiffin,MR Benson,SL Shastri,,2008,PunjabCricketAssociationStadium,33.0,group_stage,regular
2,2008-04-19,3,Delhi,Rajasthan Royals,Delhi Daredevils,129.0,132.0,Delhi Daredevils,MF Maharoof,19.0,15.0,Rajasthan Royals,bat,7.0,10.0,3.0,0.0,3.0,10.0,1.0,0.0,0.0,0.0,GR Viswanath,,IL Howell,Aleem Dar,GA Pratapkumar,9.0,2008,FerozShahKotla,,group_stage,regular
3,2008-04-20,4,Kolkata,Deccan Chargers,Kolkata Knight Riders,110.0,112.0,Kolkata Knight Riders,DJ Hussey,18.0,18.0,Deccan Chargers,bat,10.0,28.0,4.0,8.0,4.0,15.0,0.0,4.0,2.0,1.0,FM Engineer,F Gomes,Asad Rauf,BF Bowden,K Hariharan,5.0,2008,EdenGardens,,group_stage,regular
4,2008-04-20,5,Mumbai,Mumbai Indians,Royal Challengers Bangalore,165.0,166.0,Royal Challengers Bangalore,MV Boucher,19.0,19.0,Mumbai Indians,bat,11.0,5.0,6.0,0.0,3.0,5.0,2.0,0.0,0.0,0.0,J Srinath,SN Bandekar,AV Jayaprakash,SJ Davis,DJ Harper,5.0,2008,WankhedeStadium,,group_stage,regular


In [None]:
df_merged.rename(columns={"outcome_winner": "match_winner"}, inplace=True)

In [None]:
df_merged["match_extras"] = df_merged["team_1_runs.extras"] + df_merged["team_2_runs.extras"]

In [None]:
df_merged["match_legbyes"] = df_merged["team_1_extras.legbyes"] + df_merged["team_2_extras.legbyes"]

In [None]:
df_merged["match_wides"] = df_merged["team_1_extras.wides"] + df_merged["team_2_extras.wides"]

In [None]:
df_merged["match_byes"] = df_merged["team_1_extras.byes"] + df_merged["team_2_extras.byes"]

In [None]:
df_merged["match_byes"] = df_merged["team_1_extras.byes"] + df_merged["team_2_extras.byes"]

In [None]:
df_merged["match_noballs"] = df_merged["team_1_extras.noballs"] + df_merged["team_2_extras.noballs"]

In [None]:
df_merged.head()

Unnamed: 0,date,match_number,city,team_1,team_2,team_1_runs.total,team_2_runs.total,match_winner,player_of_match,team_1_over,team_2_over,toss_winner,toss_decision,team_1_runs.extras,team_2_runs.extras,team_1_extras.legbyes,team_2_extras.legbyes,team_1_extras.wides,team_2_extras.wides,team_1_extras.byes,team_2_extras.byes,team_1_extras.noballs,team_2_extras.noballs,officials_match_referees,officials_reserve_umpires,officials_tv_umpires,officials_umpires_1,officials_umpires_2,outcome_by_wickets,season,venue,outcome_by_runs,event_stage,outcome_method,match_extras,match_legbyes,match_wides,match_byes,match_noballs
0,2008-04-18,1,Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,222.0,82.0,Kolkata Knight Riders,BB McCullum,19.0,15.0,Royal Challengers Bangalore,field,17.0,19.0,4.0,8.0,9.0,11.0,4.0,0.0,0.0,0.0,J Srinath,VN Kulkarni,AM Saheba,Asad Rauf,RE Koertzen,,2008,MChinnaswamyStadium,140.0,group_stage,regular,36.0,12.0,20.0,4.0,0.0
1,2008-04-19,2,Chandigarh,Chennai Super Kings,Kings XI Punjab,240.0,207.0,Chennai Super Kings,MEK Hussey,19.0,19.0,Chennai Super Kings,bat,6.0,11.0,2.0,4.0,3.0,5.0,0.0,2.0,1.0,0.0,S Venkataraghavan,MSS Ranawat,RB Tiffin,MR Benson,SL Shastri,,2008,PunjabCricketAssociationStadium,33.0,group_stage,regular,17.0,6.0,8.0,2.0,1.0
2,2008-04-19,3,Delhi,Rajasthan Royals,Delhi Daredevils,129.0,132.0,Delhi Daredevils,MF Maharoof,19.0,15.0,Rajasthan Royals,bat,7.0,10.0,3.0,0.0,3.0,10.0,1.0,0.0,0.0,0.0,GR Viswanath,,IL Howell,Aleem Dar,GA Pratapkumar,9.0,2008,FerozShahKotla,,group_stage,regular,17.0,3.0,13.0,1.0,0.0
3,2008-04-20,4,Kolkata,Deccan Chargers,Kolkata Knight Riders,110.0,112.0,Kolkata Knight Riders,DJ Hussey,18.0,18.0,Deccan Chargers,bat,10.0,28.0,4.0,8.0,4.0,15.0,0.0,4.0,2.0,1.0,FM Engineer,F Gomes,Asad Rauf,BF Bowden,K Hariharan,5.0,2008,EdenGardens,,group_stage,regular,38.0,12.0,19.0,4.0,3.0
4,2008-04-20,5,Mumbai,Mumbai Indians,Royal Challengers Bangalore,165.0,166.0,Royal Challengers Bangalore,MV Boucher,19.0,19.0,Mumbai Indians,bat,11.0,5.0,6.0,0.0,3.0,5.0,2.0,0.0,0.0,0.0,J Srinath,SN Bandekar,AV Jayaprakash,SJ Davis,DJ Harper,5.0,2008,WankhedeStadium,,group_stage,regular,16.0,6.0,8.0,2.0,0.0


In [None]:
ball_by_ball.head(10)

Unnamed: 0,date,match_number,innings,over,batter,bowler,non_striker,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,extras.byes,extras.noballs,wicket.kind,wicket.player_out,wicket.fielders,wickets
0,2008-04-18,1,1,0,SC Ganguly,P Kumar,BB McCullum,0,1,1,1.0,,,,,,,
1,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,
2,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,1,1,,1.0,,,,,,
3,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,
4,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,
5,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,0,0,,,,,,,,
6,2008-04-18,1,1,0,BB McCullum,P Kumar,SC Ganguly,0,1,1,1.0,,,,,,,
7,2008-04-18,1,1,1,BB McCullum,Z Khan,SC Ganguly,0,0,0,,,,,,,,
8,2008-04-18,1,1,1,BB McCullum,Z Khan,SC Ganguly,4,0,4,,,,,,,,
9,2008-04-18,1,1,1,BB McCullum,Z Khan,SC Ganguly,4,0,4,,,,,,,,


In [None]:
runs_groupby= ball_by_ball.groupby(['date', 'match_number', 'runs.batter'])['runs.batter'].agg(["count"])

In [None]:
runs_groupby

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
date,match_number,runs.batter,Unnamed: 3_level_1
2008-04-18,1,0,110
2008-04-18,1,1,66
2008-04-18,1,2,14
2008-04-18,1,4,18
2008-04-18,1,6,17
...,...,...,...
2024-05-26,Final,0,90
2024-05-26,Final,1,57
2024-05-26,Final,2,10
2024-05-26,Final,4,18


In [None]:
runs_df = runs_groupby.reset_index()

In [None]:
runs_df = runs_df[(runs_df['runs.batter'] == 4) | (runs_df['runs.batter'] == 6) ].sort_values(by=['date','match_number'])

In [None]:
runs_df

Unnamed: 0,date,match_number,runs.batter,count
3,2008-04-18,1,4,18
4,2008-04-18,1,6,17
9,2008-04-19,2,6,25
8,2008-04-19,2,4,38
14,2008-04-19,3,6,4
...,...,...,...,...
6057,2024-05-22,Eliminator,6,13
6062,2024-05-24,Qualifier 2,6,14
6061,2024-05-24,Qualifier 2,4,24
6067,2024-05-26,Final,6,9


In [None]:
runs_pivot = runs_df.pivot(index=["date", "match_number"], columns="runs.batter", values=["count"])

In [None]:
runs_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,count,count,count
Unnamed: 0_level_1,runs.batter,0,4,6
date,match_number,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2008-04-18,1,110,18,17
2008-04-19,2,72,38,25
2008-04-19,3,97,32,4
2008-04-20,4,147,11,10
2008-04-20,5,94,33,11
...,...,...,...,...
2024-05-19,69,82,31,26
2024-05-21,Qualifier 1,80,29,16
2024-05-22,Eliminator,81,33,13
2024-05-24,Qualifier 2,107,24,14


In [None]:
runs_pivot.reset_index(inplace=True)

In [None]:
runs_pivot.columns.values

array([('date', ''), ('match_number', ''), ('count', 0), ('count', 4),
       ('count', 6)], dtype=object)

In [None]:
runs_pivot.columns = [col[0] + "_" + str(col[1]) for col in runs_pivot.columns.values]

In [None]:
runs_pivot.columns

Index(['date_', 'match_number_', 'count_0', 'count_4', 'count_6'], dtype='object')

In [None]:
runs_pivot = runs_pivot.rename(columns={"date_": "date", "match_number_": "match_number", "count_0": "match_dotballs" , "count_4": "match_4's" , "count_6": "match_6's"} )


In [None]:
runs_pivot

Unnamed: 0,date,match_number,match_dotballs,match_4's,match_6's
0,2008-04-18,1,110,18,17
1,2008-04-19,2,72,38,25
2,2008-04-19,3,97,32,4
3,2008-04-20,4,147,11,10
4,2008-04-20,5,94,33,11
...,...,...,...,...,...
1090,2024-05-19,69,82,31,26
1091,2024-05-21,Qualifier 1,80,29,16
1092,2024-05-22,Eliminator,81,33,13
1093,2024-05-24,Qualifier 2,107,24,14


In [None]:
df_merged = pd.merge(df_merged, runs_pivot, how="outer", on=["date", "match_number"])

In [None]:
df_merged

Unnamed: 0,date,match_number,city,team_1,team_2,team_1_runs.total,team_2_runs.total,match_winner,player_of_match,team_1_over,team_2_over,toss_winner,toss_decision,team_1_runs.extras,team_2_runs.extras,team_1_extras.legbyes,team_2_extras.legbyes,team_1_extras.wides,team_2_extras.wides,team_1_extras.byes,...,officials_tv_umpires,officials_umpires_1,officials_umpires_2,outcome_by_wickets,season,venue,outcome_by_runs,event_stage,outcome_method,match_extras,match_legbyes,match_wides,match_byes,match_noballs,match_dotballs_x,match_4's_x,match_6's_x,match_dotballs_y,match_4's_y,match_6's_y
0,2008-04-18,1,Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,222.0,82.0,Kolkata Knight Riders,BB McCullum,19.0,15.0,Royal Challengers Bangalore,field,17.0,19.0,4.0,8.0,9.0,11.0,4.0,...,AM Saheba,Asad Rauf,RE Koertzen,,2008,MChinnaswamyStadium,140.0,group_stage,regular,36.0,12.0,20.0,4.0,0.0,110,18,17,110,18,17
1,2008-04-19,2,Chandigarh,Chennai Super Kings,Kings XI Punjab,240.0,207.0,Chennai Super Kings,MEK Hussey,19.0,19.0,Chennai Super Kings,bat,6.0,11.0,2.0,4.0,3.0,5.0,0.0,...,RB Tiffin,MR Benson,SL Shastri,,2008,PunjabCricketAssociationStadium,33.0,group_stage,regular,17.0,6.0,8.0,2.0,1.0,72,38,25,72,38,25
2,2008-04-19,3,Delhi,Rajasthan Royals,Delhi Daredevils,129.0,132.0,Delhi Daredevils,MF Maharoof,19.0,15.0,Rajasthan Royals,bat,7.0,10.0,3.0,0.0,3.0,10.0,1.0,...,IL Howell,Aleem Dar,GA Pratapkumar,9.0,2008,FerozShahKotla,,group_stage,regular,17.0,3.0,13.0,1.0,0.0,97,32,4,97,32,4
3,2008-04-20,4,Kolkata,Deccan Chargers,Kolkata Knight Riders,110.0,112.0,Kolkata Knight Riders,DJ Hussey,18.0,18.0,Deccan Chargers,bat,10.0,28.0,4.0,8.0,4.0,15.0,0.0,...,Asad Rauf,BF Bowden,K Hariharan,5.0,2008,EdenGardens,,group_stage,regular,38.0,12.0,19.0,4.0,3.0,147,11,10,147,11,10
4,2008-04-20,5,Mumbai,Mumbai Indians,Royal Challengers Bangalore,165.0,166.0,Royal Challengers Bangalore,MV Boucher,19.0,19.0,Mumbai Indians,bat,11.0,5.0,6.0,0.0,3.0,5.0,2.0,...,AV Jayaprakash,SJ Davis,DJ Harper,5.0,2008,WankhedeStadium,,group_stage,regular,16.0,6.0,8.0,2.0,0.0,94,33,11,94,33,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,2024-05-19,69,Hyderabad,Punjab Kings,Sunrisers Hyderabad,214.0,215.0,Sunrisers Hyderabad,Abhishek Sharma,19.0,19.0,Punjab Kings,bat,10.0,17.0,5.0,6.0,4.0,10.0,0.0,...,HAS Khalid,Nitin Menon,VK Sharma,4.0,2024,RajivGandhiInternationalStadium,,group_stage,regular,27.0,11.0,14.0,0.0,2.0,82,31,26,82,31,26
1091,2024-05-21,Qualifier 1,Ahmedabad,Sunrisers Hyderabad,Kolkata Knight Riders,159.0,164.0,Kolkata Knight Riders,MA Starc,19.0,13.0,Sunrisers Hyderabad,bat,7.0,11.0,0.0,9.0,5.0,2.0,0.0,...,KN Ananthapadmanabhan,AK Chaudhary,R Pandit,8.0,2024,NarendraModiStadium,,Qualifier 1,regular,18.0,9.0,7.0,0.0,2.0,80,29,16,80,29,16
1092,2024-05-22,Eliminator,Ahmedabad,Royal Challengers Bengaluru,Rajasthan Royals,172.0,174.0,Rajasthan Royals,R Ashwin,19.0,18.0,Rajasthan Royals,field,4.0,6.0,2.0,1.0,2.0,5.0,0.0,...,AK Chaudhary,KN Ananthapadmanabhan,MV Saidharshan Kumar,4.0,2024,NarendraModiStadium,,Eliminator,regular,10.0,3.0,7.0,0.0,0.0,81,33,13,81,33,13
1093,2024-05-24,Qualifier 2,Chennai,Sunrisers Hyderabad,Rajasthan Royals,175.0,139.0,Sunrisers Hyderabad,Shahbaz Ahmed,19.0,19.0,Rajasthan Royals,field,8.0,5.0,1.0,1.0,7.0,3.0,0.0,...,MA Gough,Nitin Menon,VK Sharma,,2024,MAChidambaramStadium,36.0,Qualifier 2,regular,13.0,2.0,10.0,0.0,1.0,107,24,14,107,24,14


In [None]:
df_merged.to_csv("output/ipl_match_level_stats.csv", index=False)