In [183]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import plotly.express as px


def gapminder(
    df: pd.DataFrame,
    x_col: str | pd.Series,
    y_col: str | pd.Series,
    size_col: str | pd.Series | None = None,
    color_col: str | pd.Series | None = None,
    hover_name: str | None = None,
    title: str = "Gapminder Plot",
    labels: dict | None = None,
) -> None:

    fig = px.scatter(
        df,
        x=x_col,
        y=y_col,
        size=size_col,
        color=color_col,
        hover_name=hover_name,
        title=title,
        labels=labels,
    )
    fig.show()


def parallelplot(
    df: pd.DataFrame,
    columns: list[str | pd.Series],
    color_col: str | pd.Series | None = None,
    title: str = "Parallel Coordinates Plot",
    labels: pd.Series | str | None | dict[str, str] = None,
) -> None:

    fig = px.parallel_coordinates(
        df, dimensions=columns, color=color_col, title=title, labels=labels
    )
    fig.show()

### BALL BY BALL DATA


In [2]:
fields_to_skip = ["meta", "info"]

In [3]:
# def populate_wicket_data(idx, delivery):
#     wicket_record = {}
#     if "wickets" in delivery and delivery["wickets"]:
#         for wicket in delivery["wickets"]:
#             wicket_record = {
#                 "delivery_index": idx,
#                 "wicket.kind": wicket["kind"],
#                 "wicket.player_out": wicket["player_out"],
#                 "wicket.fielders": (
#                     [fielder["name"] for fielder in wicket["fielders"]]
#                     if wicket.get("fielders") != None
#                     else None
#                 ),
#             }
#     else:
#         wicket_record = {
#             "delivery_index": idx,
#             "wicket.kind": None,
#             "wicket.player_out": None,
#             "wicket.fielders": None,
#         }
#     return wicket_record

In [4]:
def populate_wicket_data(idx, delivery):
    wicket_record = {}
    if "wickets" in delivery and delivery["wickets"]:
        for wicket in delivery["wickets"]:
            wicket_record = {
                "delivery_index": idx,
                "wicket.kind": wicket["kind"],
                "wicket.player_out": wicket["player_out"],
                "wicket.fielders": (
                    [fielder["name"] for fielder in wicket["fielders"]]
                    if wicket.get("fielders") != None
                    else None
                ),
            }
    else:
        wicket_record = {
            "delivery_index": idx,
            "wicket.kind": None,
            "wicket.player_out": None,
            "wicket.fielders": None,
        }
    return wicket_record

In [5]:
file_names = os.listdir(path="data")
print(file_names[:10])
file_names = list(filter(lambda x: str(x).endswith(".json"), file_names))
file_names.sort()
print(file_names[:10])

merged_deliveries = pd.DataFrame()
wickets_data = []
index = 0
match_number = 1
prev_match_number = 0
date_file_name_dict = {}
prev_match_number = 0

for file_idx, file_name in enumerate(file_names):
    file_path = "data/" + file_name
    with open(file=file_path, mode="r") as file:
        data = json.load(file)
        info = data["info"]
        date = info["dates"][0]
        if info["event"].get("match_number") != None:
            match_number = info["event"].get("match_number")
        else:
            match_number = prev_match_number + 1
        
        if date_file_name_dict.get(date) != None:
            if prev_match_number < match_number:
                date_file_name_dict[date +"_1"] = file_name
            else:
                date_file_name_dict[date +"_1"] = date_file_name_dict[date] 
                date_file_name_dict[date] = file_name
        else:
            date_file_name_dict[date] = file_name
        prev_match_number = match_number

['1254103.json', '1136605.json', '1216501.json', '1082595.json', '829731.json', '1359507.json', '548315.json', '1304079.json', '1422119.json', '1178418.json']
['1082591.json', '1082592.json', '1082593.json', '1082594.json', '1082595.json', '1082596.json', '1082597.json', '1082598.json', '1082599.json', '1082600.json']


In [6]:
match_dates = list(date_file_name_dict.keys())
match_dates.sort()

In [7]:
# file_names = ["829813.json", "829817.json", "336038.json"]
index = 0
match_number = 1
prev_match_number = 0
prev_year = ""
stage = ""

for file_idx, match_date in enumerate(match_dates):
    file_name = date_file_name_dict[match_date]
    file_path = "data/" + file_name
    with open(file=file_path, mode="r") as file:
        data = json.load(file)
        info = data["info"]
        date = info["dates"][0]
        curr_year = date.split("-")[0]
        if curr_year != prev_year:
            print(curr_year, prev_year)
            prev_match_number = 0
        print("processing file: ", file_idx, file_name, match_date, date, prev_match_number, sep=", ")
        
        if info["event"].get("match_number") != None:
            match_number = info["event"].get("match_number")
        else:
            match_number = prev_match_number + 1
            print("match number not presentin file_name:", file_name, "populating with: ", match_number)
        
        if info["event"].get("stage") == None:
            stage = "group"
        else:
            stage = info["event"].get("stage")

        prev_match_number = match_number
        prev_year = date.split("-")[0]
        
        innings = data["innings"]
        for idx, inning in enumerate(innings):
            if inning.get("super_over") is not None:
                print("skipping super_over: ", date, match_number)
                continue
            overs = inning["overs"]
            df_deliveries = pd.json_normalize(
                overs, record_path=["deliveries"], meta=["over"]
            )
            df_deliveries["date"] = date
            df_deliveries["match_number"] = match_number
            df_deliveries["innings"] = idx + 1
            df_deliveries['stage'] = stage
            merged_deliveries = pd.concat([merged_deliveries, df_deliveries], axis=0)
            for over in overs:
                for delivery in over["deliveries"]:
                    wicket_data = populate_wicket_data(index, delivery)
                    wickets_data.append(wicket_data)
                    index += 1
                    
df_wickets = pd.DataFrame(wickets_data)

merged_deliveries.reset_index(inplace=True)

merged_deliveries["delivery_index"] = merged_deliveries.index

df_merged = pd.merge(merged_deliveries, df_wickets, on="delivery_index", how="left")

df_merged.drop("delivery_index", inplace=True, axis=1)
# df_merged.drop("wickets", inplace=True, axis=1)

2008 
processing file: , 0, 335982.json, 2008-04-18, 2008-04-18, 0
processing file: , 1, 335983.json, 2008-04-19, 2008-04-19, 1
processing file: , 2, 335984.json, 2008-04-19_1, 2008-04-19, 2
processing file: , 3, 335986.json, 2008-04-20, 2008-04-20, 3
processing file: , 4, 335985.json, 2008-04-20_1, 2008-04-20, 4
processing file: , 5, 335987.json, 2008-04-21, 2008-04-21, 5
processing file: , 6, 335988.json, 2008-04-22, 2008-04-22, 6
processing file: , 7, 335989.json, 2008-04-23, 2008-04-23, 7
processing file: , 8, 335990.json, 2008-04-24, 2008-04-24, 8
processing file: , 9, 335991.json, 2008-04-25, 2008-04-25, 9
processing file: , 10, 335993.json, 2008-04-26, 2008-04-26, 10
processing file: , 11, 335992.json, 2008-04-26_1, 2008-04-26, 11
processing file: , 12, 335995.json, 2008-04-27, 2008-04-27, 12
processing file: , 13, 335994.json, 2008-04-27_1, 2008-04-27, 13
processing file: , 14, 335996.json, 2008-04-28, 2008-04-28, 14
processing file: , 15, 335997.json, 2008-04-29, 2008-04-29, 1

In [8]:
df_merged.columns

Index(['index', 'batter', 'bowler', 'non_striker', 'extras.legbyes',
       'runs.batter', 'runs.extras', 'runs.total', 'extras.wides', 'wickets',
       'extras.byes', 'over', 'date', 'match_number', 'innings', 'stage',
       'extras.noballs', 'extras.penalty', 'replacements.role',
       'runs.non_boundary', 'review.by', 'review.umpire', 'review.batter',
       'review.decision', 'review.umpires_call', 'review.type',
       'replacements.match', 'wicket.kind', 'wicket.player_out',
       'wicket.fielders'],
      dtype='object')

In [9]:
ordered_cols = [
    "date",
    "match_number",
    "innings",
    "over",
    "batter",
    "bowler",
    "stage",
    "non_striker",
    "runs.batter",
    "runs.extras",
    "runs.total",
    "extras.legbyes",
    "extras.wides",
    "extras.byes",
    "extras.noballs",
    "wicket.kind",
    "wicket.player_out",
    "wicket.fielders",
    "wickets",
]

In [10]:
df_merged

Unnamed: 0,index,batter,bowler,non_striker,extras.legbyes,runs.batter,runs.extras,runs.total,extras.wides,wickets,...,review.by,review.umpire,review.batter,review.decision,review.umpires_call,review.type,replacements.match,wicket.kind,wicket.player_out,wicket.fielders
0,0,SC Ganguly,P Kumar,BB McCullum,1.0,0,1,1,,,...,,,,,,,,,,
1,1,BB McCullum,P Kumar,SC Ganguly,,0,0,0,,,...,,,,,,,,,,
2,2,BB McCullum,P Kumar,SC Ganguly,,0,1,1,1.0,,...,,,,,,,,,,
3,3,BB McCullum,P Kumar,SC Ganguly,,0,0,0,,,...,,,,,,,,,,
4,4,BB McCullum,P Kumar,SC Ganguly,,0,0,0,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260754,62,SS Iyer,AK Markram,VR Iyer,,1,0,1,,,...,,,,,,,,,,
260755,63,VR Iyer,AK Markram,SS Iyer,,1,0,1,,,...,,,,,,,,,,
260756,64,VR Iyer,Shahbaz Ahmed,SS Iyer,,1,0,1,,,...,,,,,,,,,,
260757,65,SS Iyer,Shahbaz Ahmed,VR Iyer,,1,0,1,,,...,,,,,,,,,,


In [11]:
# df_merged = pd.read_csv("./output/ipl_ball_by_ball_output.csv")

In [12]:
df_merged = df_merged.reindex(columns=ordered_cols)
df_merged.sort_values(by=["date", "match_number"], inplace=True)

In [13]:
df_merged['season'] = df_merged['date'].str.slice(0,4)

In [14]:
df_wickets.to_csv("output/wickets.csv", index=False)

In [15]:
df_merged.to_csv("output/ipl_ball_by_ball_output.csv", index=False)

In [16]:
df_merged.head()

Unnamed: 0,date,match_number,innings,over,batter,bowler,stage,non_striker,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,extras.byes,extras.noballs,wicket.kind,wicket.player_out,wicket.fielders,wickets,season
0,2008-04-18,1,1,0,SC Ganguly,P Kumar,group,BB McCullum,0,1,1,1.0,,,,,,,,2008
1,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
2,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,1,1,,1.0,,,,,,,2008
3,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
4,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008


In [17]:
df_merged.stage.value_counts()

stage
group                 244937
Final                   4086
Qualifier 1             3406
Qualifier 2             3392
Eliminator              2597
Semi Final              1409
Elimination Final        734
3rd Place Play-Off       198
Name: count, dtype: int64

#### Validate the resultant dataframe


In [18]:
df_merged.columns

Index(['date', 'match_number', 'innings', 'over', 'batter', 'bowler', 'stage',
       'non_striker', 'runs.batter', 'runs.extras', 'runs.total',
       'extras.legbyes', 'extras.wides', 'extras.byes', 'extras.noballs',
       'wicket.kind', 'wicket.player_out', 'wicket.fielders', 'wickets',
       'season'],
      dtype='object')

In [19]:
print("min date: ", df_merged["date"].min())
print("max date: ", df_merged["date"].max())

min date:  2008-04-18
max date:  2024-05-26


#### Since we have data for IPL matches between 2008 to 2024, the above date range looks right


In [20]:
df_merged["match_number"].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 50, 51, 52,
       53, 54, 55, 56, 57, 58, 59, 47, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76])

##### The above values for match_number looks right


In [21]:
df_merged["innings"].unique()

array([1, 2])

In [22]:
df_merged["over"].unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19], dtype=object)

In [23]:
sorted(df_merged["runs.batter"].unique())

[np.int64(0),
 np.int64(1),
 np.int64(2),
 np.int64(3),
 np.int64(4),
 np.int64(5),
 np.int64(6)]

In [24]:
sorted(df_merged["runs.extras"].unique())

[np.int64(0),
 np.int64(1),
 np.int64(2),
 np.int64(3),
 np.int64(4),
 np.int64(5),
 np.int64(7)]

In [25]:
sorted(df_merged["runs.total"].unique())

[np.int64(0),
 np.int64(1),
 np.int64(2),
 np.int64(3),
 np.int64(4),
 np.int64(5),
 np.int64(6),
 np.int64(7)]

In [26]:
sorted(df_merged["extras.legbyes"].unique())

[np.float64(1.0),
 np.float64(nan),
 np.float64(2.0),
 np.float64(3.0),
 np.float64(4.0),
 np.float64(5.0)]

In [27]:
sorted(df_merged["extras.wides"].unique())

[np.float64(nan),
 np.float64(1.0),
 np.float64(2.0),
 np.float64(3.0),
 np.float64(4.0),
 np.float64(5.0)]

In [28]:
sorted(df_merged["extras.byes"].unique())

[np.float64(nan),
 np.float64(1.0),
 np.float64(2.0),
 np.float64(3.0),
 np.float64(4.0)]

In [29]:
sorted(df_merged["extras.noballs"].unique())

[np.float64(nan),
 np.float64(1.0),
 np.float64(2.0),
 np.float64(3.0),
 np.float64(5.0)]

In [30]:
df_merged["wicket.kind"].unique()

array([None, 'caught', 'bowled', 'run out', 'lbw', 'retired hurt',
       'stumped', 'caught and bowled', 'hit wicket',
       'obstructing the field', 'retired out'], dtype=object)

In [31]:
df_merged["wicket.player_out"].unique()[:10]

array([None, 'SC Ganguly', 'RT Ponting', 'DJ Hussey', 'R Dravid',
       'V Kohli', 'JH Kallis', 'W Jaffer', 'MV Boucher', 'B Akhil'],
      dtype=object)

In [32]:
df_merged[~pd.isnull(df_merged["wicket.fielders"])]["wicket.fielders"].count()

np.int64(9342)

In [33]:
df_merged[~ pd.isnull(df_merged["wickets"])]['wickets'].count()

np.int64(12923)

In [34]:
df_merged

Unnamed: 0,date,match_number,innings,over,batter,bowler,stage,non_striker,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,extras.byes,extras.noballs,wicket.kind,wicket.player_out,wicket.fielders,wickets,season
0,2008-04-18,1,1,0,SC Ganguly,P Kumar,group,BB McCullum,0,1,1,1.0,,,,,,,,2008
1,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
2,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,1,1,,1.0,,,,,,,2008
3,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
4,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260754,2024-05-26,73,2,9,SS Iyer,AK Markram,Final,VR Iyer,1,0,1,,,,,,,,,2024
260755,2024-05-26,73,2,9,VR Iyer,AK Markram,Final,SS Iyer,1,0,1,,,,,,,,,2024
260756,2024-05-26,73,2,10,VR Iyer,Shahbaz Ahmed,Final,SS Iyer,1,0,1,,,,,,,,,2024
260757,2024-05-26,73,2,10,SS Iyer,Shahbaz Ahmed,Final,VR Iyer,1,0,1,,,,,,,,,2024


In [35]:
df_merged

Unnamed: 0,date,match_number,innings,over,batter,bowler,stage,non_striker,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,extras.byes,extras.noballs,wicket.kind,wicket.player_out,wicket.fielders,wickets,season
0,2008-04-18,1,1,0,SC Ganguly,P Kumar,group,BB McCullum,0,1,1,1.0,,,,,,,,2008
1,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
2,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,1,1,,1.0,,,,,,,2008
3,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
4,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260754,2024-05-26,73,2,9,SS Iyer,AK Markram,Final,VR Iyer,1,0,1,,,,,,,,,2024
260755,2024-05-26,73,2,9,VR Iyer,AK Markram,Final,SS Iyer,1,0,1,,,,,,,,,2024
260756,2024-05-26,73,2,10,VR Iyer,Shahbaz Ahmed,Final,SS Iyer,1,0,1,,,,,,,,,2024
260757,2024-05-26,73,2,10,SS Iyer,Shahbaz Ahmed,Final,VR Iyer,1,0,1,,,,,,,,,2024


In [36]:
df_merged.drop(columns='wickets',inplace=True)

In [37]:
df_merged.groupby(['date','match_number','innings','batter']).agg({'runs.batter':np.sum,'runs.extras':np.sum,'bowler':'count'})

  df_merged.groupby(['date','match_number','innings','batter']).agg({'runs.batter':np.sum,'runs.extras':np.sum,'bowler':'count'})
  df_merged.groupby(['date','match_number','innings','batter']).agg({'runs.batter':np.sum,'runs.extras':np.sum,'bowler':'count'})


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,runs.batter,runs.extras,bowler
date,match_number,innings,batter,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-04-18,1,1,BB McCullum,158,11,77
2008-04-18,1,1,DJ Hussey,12,0,12
2008-04-18,1,1,Mohammad Hafeez,5,0,3
2008-04-18,1,1,RT Ponting,20,4,20
2008-04-18,1,1,SC Ganguly,10,2,12
...,...,...,...,...,...,...
2024-05-26,73,1,TM Head,0,0,1
2024-05-26,73,2,Rahmanullah Gurbaz,39,7,35
2024-05-26,73,2,SP Narine,6,0,2
2024-05-26,73,2,SS Iyer,6,0,3


In [38]:
df_merged.groupby(['date','match_number','innings','bowler','runs.batter']).agg({'runs.total':'sum','wicket.kind':'count','batter':'count'})

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,runs.total,wicket.kind,batter
date,match_number,innings,bowler,runs.batter,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2008-04-18,1,1,AA Noffke,0,6,1,8
2008-04-18,1,1,AA Noffke,1,11,0,11
2008-04-18,1,1,AA Noffke,2,4,0,2
2008-04-18,1,1,AA Noffke,4,8,0,2
2008-04-18,1,1,AA Noffke,6,12,0,2
...,...,...,...,...,...,...,...
2024-05-26,73,2,Shahbaz Ahmed,6,12,0,2
2024-05-26,73,2,T Natarajan,0,2,0,4
2024-05-26,73,2,T Natarajan,1,5,0,5
2024-05-26,73,2,T Natarajan,4,16,0,4


In [39]:
ball_by_ball = df_merged

### THE SUMMARY DATA

In [42]:
fields_to_skip = ["meta", "players", "registry", "innings"]

In [43]:
def flatten_json(y):
    out = {}

    def flatten(x, name=""):
        if type(x) is dict:
            for a in x:
                if a in fields_to_skip:
                    # print("skipping field: ", a)
                    pass
                else:
                    flatten(x[a], name + a + "_")
        elif type(x) is list:
            i = 1
            for a in x:
                flatten(a, name + str(i) + "_")
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out

In [44]:
files = os.listdir(path="data")
print(files)

['1254103.json', '1136605.json', '1216501.json', '1082595.json', '829731.json', '1359507.json', '548315.json', '1304079.json', '1422119.json', '1178418.json', '1359487.json', '1359515.json', '336025.json', '729309.json', '1178407.json', '501239.json', '1178423.json', '1426305.json', '419107.json', '336036.json', '1426311.json', '1082637.json', '392232.json', '1216522.json', '1426307.json', '829713.json', '1136576.json', '1359498.json', '1082629.json', '1178413.json', '1216506.json', '1426282.json', '1082613.json', '1304105.json', '1082642.json', '1359514.json', '1216530.json', '980967.json', '1254067.json', '392225.json', '336032.json', '1304057.json', '548359.json', '1304061.json', '548314.json', '1136561.json', '335995.json', '1136620.json', '1359538.json', '598005.json', '1426266.json', '1359533.json', '1216494.json', '829807.json', '829767.json', '1304094.json', '1426281.json', '1136595.json', '980929.json', '1359496.json', '419114.json', '1304095.json', '1178406.json', '980911.jso

In [45]:
rows = list()
files = os.listdir(path="data")

files = list(filter(lambda x: str(x).endswith(".json"), files))
prev_match_number = -1
for file in files:
    print('procesaing file',file)
    try:
        file_name = "data/" + file
        with open(file_name, "r") as file:
            json_data = json.load(file)
            flattened_data = flatten_json(json_data)
            # if not   flattened_data['match_number'].isnumeric():
            #     flattened_data['match_number'] = prev_match_number + 1
            #     print("match number not presentin file_name:", file_name, "populating with: ", flattened_data['match_number'])
            # prev_match_number = flattened_data['match_number']
            rows.append(flattened_data)
    except:
        print(file_name)

procesaing file 1254103.json
procesaing file 1136605.json
procesaing file 1216501.json
procesaing file 1082595.json
procesaing file 829731.json
procesaing file 1359507.json
procesaing file 548315.json
procesaing file 1304079.json
procesaing file 1422119.json
procesaing file 1178418.json
procesaing file 1359487.json
procesaing file 1359515.json
procesaing file 336025.json
procesaing file 729309.json
procesaing file 1178407.json
procesaing file 501239.json
procesaing file 1178423.json
procesaing file 1426305.json
procesaing file 419107.json
procesaing file 336036.json
procesaing file 1426311.json
procesaing file 1082637.json
procesaing file 392232.json
procesaing file 1216522.json
procesaing file 1426307.json
procesaing file 829713.json
procesaing file 1136576.json
procesaing file 1359498.json
procesaing file 1082629.json
procesaing file 1178413.json
procesaing file 1216506.json
procesaing file 1426282.json
procesaing file 1082613.json
procesaing file 1304105.json
procesaing file 1082642

In [46]:
# Create a DataFrame with a single row
ipl_summary = pd.DataFrame(rows)

In [47]:
ipl_summary

Unnamed: 0,info_balls_per_over,info_city,info_dates_1,info_event_name,info_event_match_number,info_gender,info_match_type,info_officials_match_referees_1,info_officials_reserve_umpires_1,info_officials_tv_umpires_1,...,info_teams_2,info_toss_decision,info_toss_winner,info_venue,info_outcome_by_runs,info_outcome_method,info_event_stage,info_outcome_result,info_outcome_eliminator,info_dates_2
0,6,Dubai,2021-09-29,Indian Premier League,43.0,male,T20,M Nayyar,K Srinivasan,RK Illingworth,...,Royal Challengers Bangalore,field,Royal Challengers Bangalore,Dubai International Cricket Stadium,,,,,,
1,6,Delhi,2018-05-12,Indian Premier League,45.0,male,T20,Prakash Bhatt,K Srinath,AK Chaudhary,...,Royal Challengers Bangalore,field,Royal Challengers Bangalore,Arun Jaitley Stadium,,,,,,
2,6,Abu Dhabi,2020-10-07,Indian Premier League,21.0,male,T20,V Narayan Kutty,K Srinivasan,C Shamshuddin,...,Chennai Super Kings,bat,Kolkata Knight Riders,Sheikh Zayed Stadium,10.0,,,,,
3,6,Bengaluru,2017-04-08,Indian Premier League,5.0,male,T20,J Srinath,Navdeep Singh,A Nand Kishore,...,Delhi Daredevils,bat,Royal Challengers Bangalore,M.Chinnaswamy Stadium,15.0,,,,,
4,6,Visakhapatnam,2015-04-18,Indian Premier League,13.0,male,T20,M Nayyar,VK Sharma,C Shamshuddin,...,Delhi Daredevils,bat,Delhi Daredevils,Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket St...,4.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,6,Delhi,2008-05-08,Indian Premier League,28.0,male,T20,CH Lloyd,,K Hariharan,...,Chennai Super Kings,field,Chennai Super Kings,Feroz Shah Kotla,,,,,,
1091,6,Delhi,2008-04-19,Indian Premier League,3.0,male,T20,GR Viswanath,,IL Howell,...,Rajasthan Royals,bat,Rajasthan Royals,Feroz Shah Kotla,,,,,,
1092,6,Mohali,2024-03-23,Indian Premier League,2.0,male,T20,V Narayan Kutty,PM Joshi,AG Wharf,...,Punjab Kings,field,Punjab Kings,Maharaja Yadavindra Singh International Cricke...,,,,,,
1093,6,Navi Mumbai,2022-04-23,Indian Premier League,35.0,male,T20,V Narayan Kutty,Vinod Seshan,BNJ Oxenford,...,Kolkata Knight Riders,bat,Gujarat Titans,"Dr DY Patil Sports Academy, Mumbai",8.0,,,,,


In [48]:
ipl_summary.to_csv("output/ipl_summary_raw.csv", index=False)

In [49]:
ipl_summary.head()

Unnamed: 0,info_balls_per_over,info_city,info_dates_1,info_event_name,info_event_match_number,info_gender,info_match_type,info_officials_match_referees_1,info_officials_reserve_umpires_1,info_officials_tv_umpires_1,...,info_teams_2,info_toss_decision,info_toss_winner,info_venue,info_outcome_by_runs,info_outcome_method,info_event_stage,info_outcome_result,info_outcome_eliminator,info_dates_2
0,6,Dubai,2021-09-29,Indian Premier League,43.0,male,T20,M Nayyar,K Srinivasan,RK Illingworth,...,Royal Challengers Bangalore,field,Royal Challengers Bangalore,Dubai International Cricket Stadium,,,,,,
1,6,Delhi,2018-05-12,Indian Premier League,45.0,male,T20,Prakash Bhatt,K Srinath,AK Chaudhary,...,Royal Challengers Bangalore,field,Royal Challengers Bangalore,Arun Jaitley Stadium,,,,,,
2,6,Abu Dhabi,2020-10-07,Indian Premier League,21.0,male,T20,V Narayan Kutty,K Srinivasan,C Shamshuddin,...,Chennai Super Kings,bat,Kolkata Knight Riders,Sheikh Zayed Stadium,10.0,,,,,
3,6,Bengaluru,2017-04-08,Indian Premier League,5.0,male,T20,J Srinath,Navdeep Singh,A Nand Kishore,...,Delhi Daredevils,bat,Royal Challengers Bangalore,M.Chinnaswamy Stadium,15.0,,,,,
4,6,Visakhapatnam,2015-04-18,Indian Premier League,13.0,male,T20,M Nayyar,VK Sharma,C Shamshuddin,...,Delhi Daredevils,bat,Delhi Daredevils,Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket St...,4.0,,,,,


In [50]:
def remove_prefix(x: str, to_replace: str, replace_by: str):
    return x.replace(to_replace, replace_by)

In [51]:
ipl_summary.rename(
    lambda x: remove_prefix(str(x), to_replace="info_", replace_by=""),
    inplace=True,
    axis=1,
)

In [52]:
ipl_summary.rename(
    lambda x: remove_prefix(str(x), to_replace=" ", replace_by="_"),
    inplace=True,
    axis=1,
)

In [53]:
ipl_summary.rename(lambda x: str(x).lower(), inplace=True, axis=1)

In [54]:
ipl_summary.head()

Unnamed: 0,balls_per_over,city,dates_1,event_name,event_match_number,gender,match_type,officials_match_referees_1,officials_reserve_umpires_1,officials_tv_umpires_1,...,teams_2,toss_decision,toss_winner,venue,outcome_by_runs,outcome_method,event_stage,outcome_result,outcome_eliminator,dates_2
0,6,Dubai,2021-09-29,Indian Premier League,43.0,male,T20,M Nayyar,K Srinivasan,RK Illingworth,...,Royal Challengers Bangalore,field,Royal Challengers Bangalore,Dubai International Cricket Stadium,,,,,,
1,6,Delhi,2018-05-12,Indian Premier League,45.0,male,T20,Prakash Bhatt,K Srinath,AK Chaudhary,...,Royal Challengers Bangalore,field,Royal Challengers Bangalore,Arun Jaitley Stadium,,,,,,
2,6,Abu Dhabi,2020-10-07,Indian Premier League,21.0,male,T20,V Narayan Kutty,K Srinivasan,C Shamshuddin,...,Chennai Super Kings,bat,Kolkata Knight Riders,Sheikh Zayed Stadium,10.0,,,,,
3,6,Bengaluru,2017-04-08,Indian Premier League,5.0,male,T20,J Srinath,Navdeep Singh,A Nand Kishore,...,Delhi Daredevils,bat,Royal Challengers Bangalore,M.Chinnaswamy Stadium,15.0,,,,,
4,6,Visakhapatnam,2015-04-18,Indian Premier League,13.0,male,T20,M Nayyar,VK Sharma,C Shamshuddin,...,Delhi Daredevils,bat,Delhi Daredevils,Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket St...,4.0,,,,,


In [55]:
def populate_team_1(row):
    toss_winner = row["toss_winner"]
    if row["toss_decision"] == "bat":
        team_1 = row["toss_winner"]
    else:
        teams_1 = row["teams_1"]
        teams_2 = row["teams_2"]
        team_1 = teams_2 if toss_winner == teams_1 else teams_1

    return team_1

In [56]:
ipl_summary["team_1"] = ipl_summary.apply(populate_team_1, axis=1)

In [57]:
def populate_team_2(row):
    toss_winner = row["toss_winner"]
    if row["toss_decision"] == "field":
        team_2 = row["toss_winner"]
    else:
        teams_1 = row["teams_1"]
        teams_2 = row["teams_2"]
        team_2 = teams_2 if toss_winner == teams_1 else teams_1

    return team_2

In [58]:
ipl_summary["team_2"] = ipl_summary.apply(populate_team_2, axis=1)

In [59]:
ipl_summary.columns

Index(['balls_per_over', 'city', 'dates_1', 'event_name', 'event_match_number',
       'gender', 'match_type', 'officials_match_referees_1',
       'officials_reserve_umpires_1', 'officials_tv_umpires_1',
       'officials_umpires_1', 'officials_umpires_2', 'outcome_winner',
       'outcome_by_wickets', 'overs', 'player_of_match_1', 'season',
       'team_type', 'teams_1', 'teams_2', 'toss_decision', 'toss_winner',
       'venue', 'outcome_by_runs', 'outcome_method', 'event_stage',
       'outcome_result', 'outcome_eliminator', 'dates_2', 'team_1', 'team_2'],
      dtype='object')

In [60]:
sel_cols = [
    "teams_1",
    "teams_2",
    "toss_decision",
    "toss_winner",
    "outcome_winner",
    "team_1",
    "team_2",
]
ipl_summary[sel_cols].head()

Unnamed: 0,teams_1,teams_2,toss_decision,toss_winner,outcome_winner,team_1,team_2
0,Rajasthan Royals,Royal Challengers Bangalore,field,Royal Challengers Bangalore,Royal Challengers Bangalore,Rajasthan Royals,Royal Challengers Bangalore
1,Delhi Daredevils,Royal Challengers Bangalore,field,Royal Challengers Bangalore,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore
2,Kolkata Knight Riders,Chennai Super Kings,bat,Kolkata Knight Riders,Kolkata Knight Riders,Kolkata Knight Riders,Chennai Super Kings
3,Royal Challengers Bangalore,Delhi Daredevils,bat,Royal Challengers Bangalore,Royal Challengers Bangalore,Royal Challengers Bangalore,Delhi Daredevils
4,Sunrisers Hyderabad,Delhi Daredevils,bat,Delhi Daredevils,Delhi Daredevils,Delhi Daredevils,Sunrisers Hyderabad


In [61]:
ipl_summary.drop(columns=["teams_1", "teams_2"], inplace=True)

### Following columns have only a single value and so can be dropped from the dataframe

1. balls_per_over
2. event_name
3. gender
4. match_type
5. overs
6. team_type


In [62]:
ipl_summary["balls_per_over"].value_counts()

balls_per_over
6    1095
Name: count, dtype: int64

In [63]:
ipl_summary["event_name"].value_counts()

event_name
Indian Premier League    1095
Name: count, dtype: int64

In [64]:
ipl_summary["gender"].value_counts()

gender
male    1095
Name: count, dtype: int64

In [65]:
ipl_summary["match_type"].value_counts()

match_type
T20    1095
Name: count, dtype: int64

In [66]:
ipl_summary["overs"].value_counts()

overs
20    1095
Name: count, dtype: int64

In [67]:
ipl_summary["team_type"].value_counts()

team_type
club    1095
Name: count, dtype: int64

In [68]:
ipl_summary.drop(
    columns=[
        "balls_per_over",
        "event_name",
        "gender",
        "match_type",
        "overs",
        "team_type",
    ],
    inplace=True,
)

### Season has 2016 represented as string and numeric and so it is showing up as two different values. Convert to str type


In [69]:
ipl_summary[ipl_summary["season"].apply(lambda x: str(x).strip().find("2016") != -1)][
    "season"
].value_counts()

season
2016    59
2016     1
Name: count, dtype: int64

In [70]:
ipl_summary["season"] = ipl_summary["season"].astype(str)

In [71]:
ipl_summary.season.value_counts()

season
2013       76
2023       74
2012       74
2022       74
2011       73
2024       71
2021       60
2020/21    60
2018       60
2016       60
2009/10    60
2014       60
2019       60
2017       59
2015       59
2007/08    58
2009       57
Name: count, dtype: int64

In [72]:
ipl_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1095 entries, 0 to 1094
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   city                         1044 non-null   object 
 1   dates_1                      1095 non-null   object 
 2   event_match_number           1029 non-null   float64
 3   officials_match_referees_1   1095 non-null   object 
 4   officials_reserve_umpires_1  1071 non-null   object 
 5   officials_tv_umpires_1       1091 non-null   object 
 6   officials_umpires_1          1095 non-null   object 
 7   officials_umpires_2          1095 non-null   object 
 8   outcome_winner               1076 non-null   object 
 9   outcome_by_wickets           578 non-null    float64
 10  player_of_match_1            1090 non-null   object 
 11  season                       1095 non-null   object 
 12  toss_decision                1095 non-null   object 
 13  toss_winner       

In [73]:
ipl_summary.loc[ipl_summary["event_match_number"].isnull(), "event_match_number"] = -1

In [74]:
ipl_summary["event_match_number"] = ipl_summary["event_match_number"].astype(int)

In [75]:
ipl_summary["event_match_number"].unique()

array([43, 45, 21,  5, 13, 33, 10,  1, 41, 16, 32, 42, 48, 67,  2, 53, -1,
       47, 52, 69, 24, 39, 38, 44, 23, 59, 40, 54, 34, 50, 11, 15,  9, 64,
        8, 28, 31, 35, 22, 49,  6, 14, 19, 55, 30,  7, 51, 62,  4, 63, 57,
       37, 20,  3, 56, 25, 26, 60, 65, 46, 36, 12, 27, 29, 17, 18, 66, 58,
       72, 68, 61, 70, 71])

In [76]:
ipl_summary.describe()

Unnamed: 0,event_match_number,outcome_by_wickets,outcome_by_runs
count,1095.0,578.0,498.0
mean,29.479452,6.192042,30.104418
std,19.388911,1.845733,26.739844
min,-1.0,1.0,1.0
25%,13.0,5.0,11.0
50%,29.0,6.0,22.0
75%,45.0,7.0,41.0
max,72.0,10.0,146.0


In [77]:
for col in ipl_summary.columns:
    print(col, ipl_summary[col].dtype, sep=" => ")

city => object
dates_1 => object
event_match_number => int64
officials_match_referees_1 => object
officials_reserve_umpires_1 => object
officials_tv_umpires_1 => object
officials_umpires_1 => object
officials_umpires_2 => object
outcome_winner => object
outcome_by_wickets => float64
player_of_match_1 => object
season => object
toss_decision => object
toss_winner => object
venue => object
outcome_by_runs => float64
outcome_method => object
event_stage => object
outcome_result => object
outcome_eliminator => object
dates_2 => object
team_1 => object
team_2 => object


# Handle Missing Values


In [78]:
# populate missing city values based on the stadium
ipl_summary.loc[ipl_summary["venue"] == "Sharjah Cricket Stadium", "city"] = "Sharjah"
ipl_summary.loc[ipl_summary["venue"] == "Dubai International Cricket Stadium", "city"] = "Dubai"

In [79]:
# Drop dates_2 and outcome_eliminator columns as it has 98% missing values
ipl_summary = ipl_summary.drop(columns=["dates_2", "outcome_eliminator"])

In [80]:
# Defaulting the event_stage to group_stage as it is left blank
ipl_summary.loc[ipl_summary["event_stage"].isnull(), "event_stage"] = "group_stage"

In [81]:
ipl_summary['outcome_result']

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
1090    NaN
1091    NaN
1092    NaN
1093    NaN
1094    NaN
Name: outcome_result, Length: 1095, dtype: object

In [82]:
ipl_summary.loc[ipl_summary["outcome_winner"].isnull(), "outcome_winner"] = ipl_summary["outcome_result"]

In [83]:
ipl_summary.loc[ipl_summary["outcome_method"].isnull(), "outcome_method"] = "regular"

In [84]:
# drop outcome_result as it is merged in the outcome_winner column
ipl_summary.drop(columns=["outcome_result"], inplace=True)

In [85]:
# Fix season values
ipl_summary.loc[ipl_summary["season"] == "2007/08", "season"] = "2008"
ipl_summary.loc[ipl_summary["season"] == "2009/10", "season"] = "2010"
ipl_summary.loc[ipl_summary["season"] == "2020/21", "season"] = "2020"

In [86]:
ipl_summary["season"].value_counts().sort_index()

season
2008    58
2009    57
2010    60
2011    73
2012    74
2013    76
2014    60
2015    59
2016    60
2017    59
2018    60
2019    60
2020    60
2021    60
2022    74
2023    74
2024    71
Name: count, dtype: int64

In [87]:
ipl_summary["season"] = ipl_summary["season"].astype(int)

In [88]:
# Handle event_match_number
print(len(ipl_summary.loc[ipl_summary["event_match_number"] == -1, "event_match_number"]))
ipl_summary.loc[ipl_summary["event_match_number"] == -1, ["season", "dates_1"]].sort_values(by="dates_1")

66


Unnamed: 0,season,dates_1
853,2008,2008-05-30
207,2008,2008-05-31
610,2008,2008-06-01
1015,2009,2009-05-22
546,2009,2009-05-23
...,...,...
1094,2023,2023-05-29
563,2024,2024-05-21
759,2024,2024-05-22
20,2024,2024-05-24


In [89]:
# Not using this as we are populating the missing match numbers with event stage
def populate_match_numbers(df, season):
    df_part = df.loc[
        (df["season"] == season) & (df["event_match_number"].isnull()),
        ["event_match_number", "dates_1"],
    ].sort_values(by="dates_1")
    max_match_number = df[df["season"] == season]["event_match_number"].max() + 1
    print(df_part)
    for index, _ in df_part.iterrows():
        df.loc[index, "event_match_number"] = max_match_number
        max_match_number += 1

    print(df)

In [90]:
# Not using this as we are populating the missing match numbers with event stage
def populate_match_numbers_across_seasons(df):
    for seas in df.season.unique():
        populate_match_numbers(df, seas)

In [91]:
# Not using this as we are populating the missing match numbers with event stage
# populate_match_numbers_across_seasons(df)

In [92]:
# note that this would convert the null values to string "nan"
ipl_summary["event_match_number"] = ipl_summary["event_match_number"].astype(str)

In [93]:
ipl_summary.sort_values(by='dates_1',inplace=True)

In [94]:
ipl_summary.loc[ipl_summary["event_match_number"] == "-1", ["event_match_number"]] = ipl_summary["event_stage"]

In [95]:
ipl_summary['match_number'] = 0

In [96]:
prev_match_number = -1
for match_number,season in zip(ipl_summary['event_match_number'],ipl_summary['season']):

    if match_number.isnumeric():
        ipl_summary.loc[(ipl_summary["event_match_number"] == match_number) &(ipl_summary['season'] == season) , ["match_number"]] = int(
            match_number
        )
    else:
        int_match_number= prev_match_number+1
        ipl_summary.loc[
            (ipl_summary["event_match_number"] == match_number) & (ipl_summary["season"] == season),
            ["match_number"],
        ] = int_match_number
        match_number = int_match_number
    prev_match_number = int(match_number)

In [97]:
ipl_summary['match_number'].value_counts()

match_number
56    22
58    19
1     17
4     17
5     17
      ..
69     6
66     5
63     5
74     5
75     2
Name: count, Length: 75, dtype: int64

In [98]:
ipl_summary["match_number"].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       19, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 42, 41, 43, 44, 45, 46, 48, 49, 50, 51, 52,
       53, 54, 55, 56, 58, 59, 47, 57, 60, 61, 62, 63, 64, 65, 66, 68, 67,
       69, 70, 71, 72, 73, 74, 75])

In [99]:
pd.Series(ipl_summary["venue"].unique())

0                                 M Chinnaswamy Stadium
1            Punjab Cricket Association Stadium, Mohali
2                                      Feroz Shah Kotla
3                                          Eden Gardens
4                                      Wankhede Stadium
5                                Sawai Mansingh Stadium
6             Rajiv Gandhi International Stadium, Uppal
7                       MA Chidambaram Stadium, Chepauk
8                            Dr DY Patil Sports Academy
9                                              Newlands
10                                     St George's Park
11                                            Kingsmead
12                                      SuperSport Park
13                                         Buffalo Park
14                                New Wanderers Stadium
15                                De Beers Diamond Oval
16                                      OUTsurance Oval
17                                    Brabourne 

In [100]:
ipl_summary["venue"] = ipl_summary["venue"].str.partition(",")[0]

In [101]:
ipl_summary["venue"] = ipl_summary["venue"].str.replace(".", "")

In [102]:
ipl_summary["venue"] = ipl_summary["venue"].str.replace(" ", "")

In [103]:
pd.Series(ipl_summary["venue"].unique())

0                                   MChinnaswamyStadium
1                       PunjabCricketAssociationStadium
2                                        FerozShahKotla
3                                           EdenGardens
4                                       WankhedeStadium
5                                  SawaiMansinghStadium
6                       RajivGandhiInternationalStadium
7                                  MAChidambaramStadium
8                                DrDYPatilSportsAcademy
9                                              Newlands
10                                       StGeorge'sPark
11                                            Kingsmead
12                                       SuperSportPark
13                                          BuffaloPark
14                                  NewWanderersStadium
15                                   DeBeersDiamondOval
16                                       OUTsuranceOval
17                                     Brabourne

In [104]:
ipl_summary.columns

Index(['city', 'dates_1', 'event_match_number', 'officials_match_referees_1',
       'officials_reserve_umpires_1', 'officials_tv_umpires_1',
       'officials_umpires_1', 'officials_umpires_2', 'outcome_winner',
       'outcome_by_wickets', 'player_of_match_1', 'season', 'toss_decision',
       'toss_winner', 'venue', 'outcome_by_runs', 'outcome_method',
       'event_stage', 'team_1', 'team_2', 'match_number'],
      dtype='object')

In [105]:
ipl_summary = ipl_summary.rename(
    columns={
        "dates_1": "date",
        "officials_match_referees_1": "officials_match_referees",
        "officials_reserve_umpires_1": "officials_reserve_umpires",
        "officials_tv_umpires_1": "officials_tv_umpires",
        "teams_1": "team_1",
        "teams_2": "team_2",
        "player_of_match_1": "player_of_match",
    }
)

In [106]:
ipl_summary.columns

Index(['city', 'date', 'event_match_number', 'officials_match_referees',
       'officials_reserve_umpires', 'officials_tv_umpires',
       'officials_umpires_1', 'officials_umpires_2', 'outcome_winner',
       'outcome_by_wickets', 'player_of_match', 'season', 'toss_decision',
       'toss_winner', 'venue', 'outcome_by_runs', 'outcome_method',
       'event_stage', 'team_1', 'team_2', 'match_number'],
      dtype='object')

In [107]:
ipl_summary.rename(columns={"event_match_number": "season_match_no"}, inplace=True)

In [108]:
cols = [
    "date",
    "match_number",
    "city",
    "team_1",
    "team_2",
    "season_match_no",
    "outcome_winner",
    "player_of_match",
    "toss_winner",
    "toss_decision",
    "officials_match_referees",
    "officials_reserve_umpires",
    "officials_tv_umpires",
    "officials_umpires_1",
    "officials_umpires_2",
    "outcome_by_wickets",
    "season",
    "venue",
    "outcome_by_runs",
    "event_stage",
    "outcome_method",
]
ipl_summary = ipl_summary.reindex(columns=cols)

In [109]:
ipl_summary = ipl_summary.sort_values(by=["date", "match_number"])

In [110]:
ipl_summary["season"] = ipl_summary["date"].str.slice(0, 4)

In [111]:
ipl_summary.season.value_counts()

season
2013    76
2022    74
2012    74
2023    74
2011    73
2024    71
2019    60
2016    60
2010    60
2021    60
2020    60
2014    60
2018    60
2017    59
2015    59
2008    58
2009    57
Name: count, dtype: int64

In [112]:
ipl_summary.to_csv("output/ipl_summary.csv", index=False)

In [113]:
ball_by_ball 

Unnamed: 0,date,match_number,innings,over,batter,bowler,stage,non_striker,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,extras.byes,extras.noballs,wicket.kind,wicket.player_out,wicket.fielders,season
0,2008-04-18,1,1,0,SC Ganguly,P Kumar,group,BB McCullum,0,1,1,1.0,,,,,,,2008
1,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,2008
2,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,1,1,,1.0,,,,,,2008
3,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,2008
4,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260754,2024-05-26,73,2,9,SS Iyer,AK Markram,Final,VR Iyer,1,0,1,,,,,,,,2024
260755,2024-05-26,73,2,9,VR Iyer,AK Markram,Final,SS Iyer,1,0,1,,,,,,,,2024
260756,2024-05-26,73,2,10,VR Iyer,Shahbaz Ahmed,Final,SS Iyer,1,0,1,,,,,,,,2024
260757,2024-05-26,73,2,10,SS Iyer,Shahbaz Ahmed,Final,VR Iyer,1,0,1,,,,,,,,2024


In [114]:
ball_by_ball.shape

(260759, 19)

In [115]:
ball_by_ball.head()

Unnamed: 0,date,match_number,innings,over,batter,bowler,stage,non_striker,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,extras.byes,extras.noballs,wicket.kind,wicket.player_out,wicket.fielders,season
0,2008-04-18,1,1,0,SC Ganguly,P Kumar,group,BB McCullum,0,1,1,1.0,,,,,,,2008
1,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,2008
2,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,1,1,,1.0,,,,,,2008
3,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,2008
4,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,2008


In [116]:
ipl_summary["match_number"] = ipl_summary["match_number"].astype("str")

In [117]:
ipl_summary.shape

(1095, 21)

In [118]:
ipl_summary.head()

Unnamed: 0,date,match_number,city,team_1,team_2,season_match_no,outcome_winner,player_of_match,toss_winner,toss_decision,...,officials_reserve_umpires,officials_tv_umpires,officials_umpires_1,officials_umpires_2,outcome_by_wickets,season,venue,outcome_by_runs,event_stage,outcome_method
334,2008-04-18,1,Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,1,Kolkata Knight Riders,BB McCullum,Royal Challengers Bangalore,field,...,VN Kulkarni,AM Saheba,Asad Rauf,RE Koertzen,,2008,MChinnaswamyStadium,140.0,group_stage,regular
308,2008-04-19,2,Chandigarh,Chennai Super Kings,Kings XI Punjab,2,Chennai Super Kings,MEK Hussey,Chennai Super Kings,bat,...,MSS Ranawat,RB Tiffin,MR Benson,SL Shastri,,2008,PunjabCricketAssociationStadium,33.0,group_stage,regular
1091,2008-04-19,3,Delhi,Rajasthan Royals,Delhi Daredevils,3,Delhi Daredevils,MF Maharoof,Rajasthan Royals,bat,...,,IL Howell,Aleem Dar,GA Pratapkumar,9.0,2008,FerozShahKotla,,group_stage,regular
486,2008-04-20,4,Kolkata,Deccan Chargers,Kolkata Knight Riders,4,Kolkata Knight Riders,DJ Hussey,Deccan Chargers,bat,...,F Gomes,Asad Rauf,BF Bowden,K Hariharan,5.0,2008,EdenGardens,,group_stage,regular
461,2008-04-20,5,Mumbai,Mumbai Indians,Royal Challengers Bangalore,5,Royal Challengers Bangalore,MV Boucher,Mumbai Indians,bat,...,SN Bandekar,AV Jayaprakash,SJ Davis,DJ Harper,5.0,2008,WankhedeStadium,,group_stage,regular


In [119]:
def get_items_by_team(field, agg="sum"):
    if agg == "sum":
        grouped_df = ball_by_ball.groupby(["date", "match_number", "innings"])[field].sum()
    elif agg == "max":
        grouped_df = ball_by_ball.groupby(["date", "match_number", "innings"])[field].max()

    grouped_df = grouped_df.reset_index()

    df_pivot = grouped_df.pivot(
        index=["date", "match_number"], columns="innings", values=field
    )

    df_pivot.reset_index(inplace=True)

    df_pivot = df_pivot.rename(columns={1: "team_1_" + field, 2: "team_2_" + field})

    return df_pivot

In [121]:
df_pivot = get_items_by_team("runs.total")

In [123]:
df_pivot.match_number.dtype

dtype('int64')

In [125]:
ipl_summary['match_number'] = ipl_summary.match_number.astype(int)

In [126]:
df_merged = pd.merge(ipl_summary, df_pivot, how="outer", on=["date", "match_number"])

In [127]:
df_pivot = get_items_by_team("runs.extras")
df_merged = pd.merge(df_merged, df_pivot, how="outer", on=["date", "match_number"])

In [128]:
df_pivot = get_items_by_team("extras.legbyes")
df_merged = pd.merge(df_merged, df_pivot, how="outer", on=["date", "match_number"])

In [129]:
df_pivot = get_items_by_team("extras.wides")
df_merged = pd.merge(df_merged, df_pivot, how="outer", on=["date", "match_number"])

In [130]:
df_pivot = get_items_by_team("extras.byes")
df_merged = pd.merge(df_merged, df_pivot, how="outer", on=["date", "match_number"])

In [131]:
df_pivot = get_items_by_team("extras.noballs")
df_merged = pd.merge(df_merged, df_pivot, how="outer", on=["date", "match_number"])

In [132]:
df_pivot = get_items_by_team("over", "max")
df_merged = pd.merge(df_merged, df_pivot, how="outer", on=["date", "match_number"])

In [133]:
pd.set_option("display.max_columns", 35)
df_merged.head()

Unnamed: 0,date,match_number,city,team_1,team_2,season_match_no,outcome_winner,player_of_match,toss_winner,toss_decision,officials_match_referees,officials_reserve_umpires,officials_tv_umpires,officials_umpires_1,officials_umpires_2,outcome_by_wickets,season,venue,outcome_by_runs,event_stage,outcome_method,team_1_runs.total,team_2_runs.total,team_1_runs.extras,team_2_runs.extras,team_1_extras.legbyes,team_2_extras.legbyes,team_1_extras.wides,team_2_extras.wides,team_1_extras.byes,team_2_extras.byes,team_1_extras.noballs,team_2_extras.noballs,team_1_over,team_2_over
0,2008-04-18,1,Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,1,Kolkata Knight Riders,BB McCullum,Royal Challengers Bangalore,field,J Srinath,VN Kulkarni,AM Saheba,Asad Rauf,RE Koertzen,,2008,MChinnaswamyStadium,140.0,group_stage,regular,222.0,82.0,17.0,19.0,4.0,8.0,9.0,11.0,4.0,0.0,0.0,0.0,19,15
1,2008-04-19,2,Chandigarh,Chennai Super Kings,Kings XI Punjab,2,Chennai Super Kings,MEK Hussey,Chennai Super Kings,bat,S Venkataraghavan,MSS Ranawat,RB Tiffin,MR Benson,SL Shastri,,2008,PunjabCricketAssociationStadium,33.0,group_stage,regular,240.0,207.0,6.0,11.0,2.0,4.0,3.0,5.0,0.0,2.0,1.0,0.0,19,19
2,2008-04-19,3,Delhi,Rajasthan Royals,Delhi Daredevils,3,Delhi Daredevils,MF Maharoof,Rajasthan Royals,bat,GR Viswanath,,IL Howell,Aleem Dar,GA Pratapkumar,9.0,2008,FerozShahKotla,,group_stage,regular,129.0,132.0,7.0,10.0,3.0,0.0,3.0,10.0,1.0,0.0,0.0,0.0,19,15
3,2008-04-20,4,Kolkata,Deccan Chargers,Kolkata Knight Riders,4,Kolkata Knight Riders,DJ Hussey,Deccan Chargers,bat,FM Engineer,F Gomes,Asad Rauf,BF Bowden,K Hariharan,5.0,2008,EdenGardens,,group_stage,regular,110.0,112.0,10.0,28.0,4.0,8.0,4.0,15.0,0.0,4.0,2.0,1.0,18,18
4,2008-04-20,5,Mumbai,Mumbai Indians,Royal Challengers Bangalore,5,Royal Challengers Bangalore,MV Boucher,Mumbai Indians,bat,J Srinath,SN Bandekar,AV Jayaprakash,SJ Davis,DJ Harper,5.0,2008,WankhedeStadium,,group_stage,regular,165.0,166.0,11.0,5.0,6.0,0.0,3.0,5.0,2.0,0.0,0.0,0.0,19,19


In [134]:
cols = [
    "date",
    "match_number",
    "city",
    "team_1",
    "team_2",
    "team_1_runs.total",
    "team_2_runs.total",
    "outcome_winner",
    "player_of_match",
    "team_1_over",
    "team_2_over",
    "toss_winner",
    "toss_decision",
    "team_1_runs.extras",
    "team_2_runs.extras",
    "team_1_extras.legbyes",
    "team_2_extras.legbyes",
    "team_1_extras.wides",
    "team_2_extras.wides",
    "team_1_extras.byes",
    "team_2_extras.byes",
    "team_1_extras.noballs",
    "team_2_extras.noballs",
    "officials_match_referees",
    "officials_reserve_umpires",
    "officials_tv_umpires",
    "officials_umpires_1",
    "officials_umpires_2",
    "outcome_by_wickets",
    "season",
    "venue",
    "outcome_by_runs",
    "event_stage",
    "outcome_method",
]
df_merged = df_merged.reindex(columns=cols)

In [135]:
pd.set_option('display.max_columns', 40)

df_merged.head()

Unnamed: 0,date,match_number,city,team_1,team_2,team_1_runs.total,team_2_runs.total,outcome_winner,player_of_match,team_1_over,team_2_over,toss_winner,toss_decision,team_1_runs.extras,team_2_runs.extras,team_1_extras.legbyes,team_2_extras.legbyes,team_1_extras.wides,team_2_extras.wides,team_1_extras.byes,team_2_extras.byes,team_1_extras.noballs,team_2_extras.noballs,officials_match_referees,officials_reserve_umpires,officials_tv_umpires,officials_umpires_1,officials_umpires_2,outcome_by_wickets,season,venue,outcome_by_runs,event_stage,outcome_method
0,2008-04-18,1,Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,222.0,82.0,Kolkata Knight Riders,BB McCullum,19,15,Royal Challengers Bangalore,field,17.0,19.0,4.0,8.0,9.0,11.0,4.0,0.0,0.0,0.0,J Srinath,VN Kulkarni,AM Saheba,Asad Rauf,RE Koertzen,,2008,MChinnaswamyStadium,140.0,group_stage,regular
1,2008-04-19,2,Chandigarh,Chennai Super Kings,Kings XI Punjab,240.0,207.0,Chennai Super Kings,MEK Hussey,19,19,Chennai Super Kings,bat,6.0,11.0,2.0,4.0,3.0,5.0,0.0,2.0,1.0,0.0,S Venkataraghavan,MSS Ranawat,RB Tiffin,MR Benson,SL Shastri,,2008,PunjabCricketAssociationStadium,33.0,group_stage,regular
2,2008-04-19,3,Delhi,Rajasthan Royals,Delhi Daredevils,129.0,132.0,Delhi Daredevils,MF Maharoof,19,15,Rajasthan Royals,bat,7.0,10.0,3.0,0.0,3.0,10.0,1.0,0.0,0.0,0.0,GR Viswanath,,IL Howell,Aleem Dar,GA Pratapkumar,9.0,2008,FerozShahKotla,,group_stage,regular
3,2008-04-20,4,Kolkata,Deccan Chargers,Kolkata Knight Riders,110.0,112.0,Kolkata Knight Riders,DJ Hussey,18,18,Deccan Chargers,bat,10.0,28.0,4.0,8.0,4.0,15.0,0.0,4.0,2.0,1.0,FM Engineer,F Gomes,Asad Rauf,BF Bowden,K Hariharan,5.0,2008,EdenGardens,,group_stage,regular
4,2008-04-20,5,Mumbai,Mumbai Indians,Royal Challengers Bangalore,165.0,166.0,Royal Challengers Bangalore,MV Boucher,19,19,Mumbai Indians,bat,11.0,5.0,6.0,0.0,3.0,5.0,2.0,0.0,0.0,0.0,J Srinath,SN Bandekar,AV Jayaprakash,SJ Davis,DJ Harper,5.0,2008,WankhedeStadium,,group_stage,regular


In [136]:
df_merged.rename(columns={"outcome_winner": "match_winner"}, inplace=True)

In [137]:
df_merged["match_extras"] = df_merged["team_1_runs.extras"] + df_merged["team_2_runs.extras"]

In [138]:
df_merged["match_legbyes"] = df_merged["team_1_extras.legbyes"] + df_merged["team_2_extras.legbyes"]

In [139]:
df_merged["match_wides"] = df_merged["team_1_extras.wides"] + df_merged["team_2_extras.wides"]

In [140]:
df_merged["match_byes"] = df_merged["team_1_extras.byes"] + df_merged["team_2_extras.byes"]

In [141]:
df_merged["match_byes"] = df_merged["team_1_extras.byes"] + df_merged["team_2_extras.byes"]

In [142]:
df_merged["match_noballs"] = df_merged["team_1_extras.noballs"] + df_merged["team_2_extras.noballs"]

In [143]:
df_merged.head()

Unnamed: 0,date,match_number,city,team_1,team_2,team_1_runs.total,team_2_runs.total,match_winner,player_of_match,team_1_over,team_2_over,toss_winner,toss_decision,team_1_runs.extras,team_2_runs.extras,team_1_extras.legbyes,team_2_extras.legbyes,team_1_extras.wides,team_2_extras.wides,team_1_extras.byes,team_2_extras.byes,team_1_extras.noballs,team_2_extras.noballs,officials_match_referees,officials_reserve_umpires,officials_tv_umpires,officials_umpires_1,officials_umpires_2,outcome_by_wickets,season,venue,outcome_by_runs,event_stage,outcome_method,match_extras,match_legbyes,match_wides,match_byes,match_noballs
0,2008-04-18,1,Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,222.0,82.0,Kolkata Knight Riders,BB McCullum,19,15,Royal Challengers Bangalore,field,17.0,19.0,4.0,8.0,9.0,11.0,4.0,0.0,0.0,0.0,J Srinath,VN Kulkarni,AM Saheba,Asad Rauf,RE Koertzen,,2008,MChinnaswamyStadium,140.0,group_stage,regular,36.0,12.0,20.0,4.0,0.0
1,2008-04-19,2,Chandigarh,Chennai Super Kings,Kings XI Punjab,240.0,207.0,Chennai Super Kings,MEK Hussey,19,19,Chennai Super Kings,bat,6.0,11.0,2.0,4.0,3.0,5.0,0.0,2.0,1.0,0.0,S Venkataraghavan,MSS Ranawat,RB Tiffin,MR Benson,SL Shastri,,2008,PunjabCricketAssociationStadium,33.0,group_stage,regular,17.0,6.0,8.0,2.0,1.0
2,2008-04-19,3,Delhi,Rajasthan Royals,Delhi Daredevils,129.0,132.0,Delhi Daredevils,MF Maharoof,19,15,Rajasthan Royals,bat,7.0,10.0,3.0,0.0,3.0,10.0,1.0,0.0,0.0,0.0,GR Viswanath,,IL Howell,Aleem Dar,GA Pratapkumar,9.0,2008,FerozShahKotla,,group_stage,regular,17.0,3.0,13.0,1.0,0.0
3,2008-04-20,4,Kolkata,Deccan Chargers,Kolkata Knight Riders,110.0,112.0,Kolkata Knight Riders,DJ Hussey,18,18,Deccan Chargers,bat,10.0,28.0,4.0,8.0,4.0,15.0,0.0,4.0,2.0,1.0,FM Engineer,F Gomes,Asad Rauf,BF Bowden,K Hariharan,5.0,2008,EdenGardens,,group_stage,regular,38.0,12.0,19.0,4.0,3.0
4,2008-04-20,5,Mumbai,Mumbai Indians,Royal Challengers Bangalore,165.0,166.0,Royal Challengers Bangalore,MV Boucher,19,19,Mumbai Indians,bat,11.0,5.0,6.0,0.0,3.0,5.0,2.0,0.0,0.0,0.0,J Srinath,SN Bandekar,AV Jayaprakash,SJ Davis,DJ Harper,5.0,2008,WankhedeStadium,,group_stage,regular,16.0,6.0,8.0,2.0,0.0


In [144]:
ball_by_ball.head(10)

Unnamed: 0,date,match_number,innings,over,batter,bowler,stage,non_striker,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,extras.byes,extras.noballs,wicket.kind,wicket.player_out,wicket.fielders,season
0,2008-04-18,1,1,0,SC Ganguly,P Kumar,group,BB McCullum,0,1,1,1.0,,,,,,,2008
1,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,2008
2,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,1,1,,1.0,,,,,,2008
3,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,2008
4,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,2008
5,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,2008
6,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,1,1,1.0,,,,,,,2008
7,2008-04-18,1,1,1,BB McCullum,Z Khan,group,SC Ganguly,0,0,0,,,,,,,,2008
8,2008-04-18,1,1,1,BB McCullum,Z Khan,group,SC Ganguly,4,0,4,,,,,,,,2008
9,2008-04-18,1,1,1,BB McCullum,Z Khan,group,SC Ganguly,4,0,4,,,,,,,,2008


In [145]:
runs_groupby= ball_by_ball.groupby(['date', 'match_number', 'runs.batter'])['runs.batter'].agg(["count"])

In [146]:
runs_groupby

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
date,match_number,runs.batter,Unnamed: 3_level_1
2008-04-18,1,0,110
2008-04-18,1,1,66
2008-04-18,1,2,14
2008-04-18,1,4,18
2008-04-18,1,6,17
...,...,...,...
2024-05-26,73,0,90
2024-05-26,73,1,57
2024-05-26,73,2,10
2024-05-26,73,4,18


In [147]:
runs_df = runs_groupby.reset_index()

In [148]:
runs_df = runs_df[(runs_df['runs.batter'] == 4) | (runs_df['runs.batter'] == 6) ].sort_values(by=['date','match_number'])

In [149]:
runs_df

Unnamed: 0,date,match_number,runs.batter,count
3,2008-04-18,1,4,18
4,2008-04-18,1,6,17
8,2008-04-19,2,4,38
9,2008-04-19,2,6,25
13,2008-04-19,3,4,32
...,...,...,...,...
6057,2024-05-22,71,6,13
6061,2024-05-24,72,4,24
6062,2024-05-24,72,6,14
6066,2024-05-26,73,4,18


In [150]:
runs_pivot = runs_df.pivot(index=["date", "match_number"], columns="runs.batter", values=["count"])

In [151]:
runs_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,count,count
Unnamed: 0_level_1,runs.batter,4,6
date,match_number,Unnamed: 2_level_2,Unnamed: 3_level_2
2008-04-18,1,18,17
2008-04-19,2,38,25
2008-04-19,3,32,4
2008-04-20,4,11,10
2008-04-20,5,33,11
...,...,...,...
2024-05-19,69,31,26
2024-05-21,70,29,16
2024-05-22,71,33,13
2024-05-24,72,24,14


In [152]:
runs_pivot.reset_index(inplace=True)

In [153]:
runs_pivot.columns.values

array([('date', ''), ('match_number', ''), ('count', 4), ('count', 6)],
      dtype=object)

In [154]:
runs_pivot.columns = [col[0] + "_" + str(col[1]) for col in runs_pivot.columns.values]

In [155]:
runs_pivot.columns

Index(['date_', 'match_number_', 'count_4', 'count_6'], dtype='object')

In [156]:
runs_pivot = runs_pivot.rename(columns={"date_": "date", "match_number_": "match_number", "count_0": "match_dotballs" , "count_4": "match_4's" , "count_6": "match_6's"} )


In [157]:
runs_pivot

Unnamed: 0,date,match_number,match_4's,match_6's
0,2008-04-18,1,18,17
1,2008-04-19,2,38,25
2,2008-04-19,3,32,4
3,2008-04-20,4,11,10
4,2008-04-20,5,33,11
...,...,...,...,...
1090,2024-05-19,69,31,26
1091,2024-05-21,70,29,16
1092,2024-05-22,71,33,13
1093,2024-05-24,72,24,14


In [158]:
df_merged = pd.merge(df_merged, runs_pivot, how="outer", on=["date", "match_number"])

In [159]:
df_merged

Unnamed: 0,date,match_number,city,team_1,team_2,team_1_runs.total,team_2_runs.total,match_winner,player_of_match,team_1_over,team_2_over,toss_winner,toss_decision,team_1_runs.extras,team_2_runs.extras,team_1_extras.legbyes,team_2_extras.legbyes,team_1_extras.wides,team_2_extras.wides,team_1_extras.byes,...,team_1_extras.noballs,team_2_extras.noballs,officials_match_referees,officials_reserve_umpires,officials_tv_umpires,officials_umpires_1,officials_umpires_2,outcome_by_wickets,season,venue,outcome_by_runs,event_stage,outcome_method,match_extras,match_legbyes,match_wides,match_byes,match_noballs,match_4's,match_6's
0,2008-04-18,1,Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,222.0,82.0,Kolkata Knight Riders,BB McCullum,19,15,Royal Challengers Bangalore,field,17.0,19.0,4.0,8.0,9.0,11.0,4.0,...,0.0,0.0,J Srinath,VN Kulkarni,AM Saheba,Asad Rauf,RE Koertzen,,2008,MChinnaswamyStadium,140.0,group_stage,regular,36.0,12.0,20.0,4.0,0.0,18.0,17.0
1,2008-04-19,2,Chandigarh,Chennai Super Kings,Kings XI Punjab,240.0,207.0,Chennai Super Kings,MEK Hussey,19,19,Chennai Super Kings,bat,6.0,11.0,2.0,4.0,3.0,5.0,0.0,...,1.0,0.0,S Venkataraghavan,MSS Ranawat,RB Tiffin,MR Benson,SL Shastri,,2008,PunjabCricketAssociationStadium,33.0,group_stage,regular,17.0,6.0,8.0,2.0,1.0,38.0,25.0
2,2008-04-19,3,Delhi,Rajasthan Royals,Delhi Daredevils,129.0,132.0,Delhi Daredevils,MF Maharoof,19,15,Rajasthan Royals,bat,7.0,10.0,3.0,0.0,3.0,10.0,1.0,...,0.0,0.0,GR Viswanath,,IL Howell,Aleem Dar,GA Pratapkumar,9.0,2008,FerozShahKotla,,group_stage,regular,17.0,3.0,13.0,1.0,0.0,32.0,4.0
3,2008-04-20,4,Kolkata,Deccan Chargers,Kolkata Knight Riders,110.0,112.0,Kolkata Knight Riders,DJ Hussey,18,18,Deccan Chargers,bat,10.0,28.0,4.0,8.0,4.0,15.0,0.0,...,2.0,1.0,FM Engineer,F Gomes,Asad Rauf,BF Bowden,K Hariharan,5.0,2008,EdenGardens,,group_stage,regular,38.0,12.0,19.0,4.0,3.0,11.0,10.0
4,2008-04-20,5,Mumbai,Mumbai Indians,Royal Challengers Bangalore,165.0,166.0,Royal Challengers Bangalore,MV Boucher,19,19,Mumbai Indians,bat,11.0,5.0,6.0,0.0,3.0,5.0,2.0,...,0.0,0.0,J Srinath,SN Bandekar,AV Jayaprakash,SJ Davis,DJ Harper,5.0,2008,WankhedeStadium,,group_stage,regular,16.0,6.0,8.0,2.0,0.0,33.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1122,2024-05-19,69,Hyderabad,Punjab Kings,Sunrisers Hyderabad,214.0,215.0,Sunrisers Hyderabad,Abhishek Sharma,19,19,Punjab Kings,bat,10.0,17.0,5.0,6.0,4.0,10.0,0.0,...,1.0,1.0,Prakash Bhatt,Bhavesh Patel,HAS Khalid,Nitin Menon,VK Sharma,4.0,2024,RajivGandhiInternationalStadium,,group_stage,regular,27.0,11.0,14.0,0.0,2.0,31.0,26.0
1123,2024-05-21,70,Ahmedabad,Sunrisers Hyderabad,Kolkata Knight Riders,159.0,164.0,Kolkata Knight Riders,MA Starc,19,13,Sunrisers Hyderabad,bat,7.0,11.0,0.0,9.0,5.0,2.0,0.0,...,2.0,0.0,J Srinath,MV Saidharshan Kumar,KN Ananthapadmanabhan,AK Chaudhary,R Pandit,8.0,2024,NarendraModiStadium,,Qualifier 1,regular,18.0,9.0,7.0,0.0,2.0,29.0,16.0
1124,2024-05-22,71,Ahmedabad,Royal Challengers Bengaluru,Rajasthan Royals,172.0,174.0,Rajasthan Royals,R Ashwin,19,18,Rajasthan Royals,field,4.0,6.0,2.0,1.0,2.0,5.0,0.0,...,0.0,0.0,V Narayan Kutty,R Pandit,AK Chaudhary,KN Ananthapadmanabhan,MV Saidharshan Kumar,4.0,2024,NarendraModiStadium,,Eliminator,regular,10.0,3.0,7.0,0.0,0.0,33.0,13.0
1125,2024-05-24,72,Chennai,Sunrisers Hyderabad,Rajasthan Royals,175.0,139.0,Sunrisers Hyderabad,Shahbaz Ahmed,19,19,Rajasthan Royals,field,8.0,5.0,1.0,1.0,7.0,3.0,0.0,...,0.0,1.0,J Srinath,J Madanagopal,MA Gough,Nitin Menon,VK Sharma,,2024,MAChidambaramStadium,36.0,Qualifier 2,regular,13.0,2.0,10.0,0.0,1.0,24.0,14.0


In [160]:
df_merged.to_csv("output/ipl_match_level_stats.csv", index=False)

In [161]:
ball_by_ball  = pd.read_csv("output/ipl_ball_by_ball_output.csv")
ipl_match_stats = pd.read_csv("output/ipl_match_level_stats.csv")

In [162]:
ball_by_ball

Unnamed: 0,date,match_number,innings,over,batter,bowler,stage,non_striker,runs.batter,runs.extras,runs.total,extras.legbyes,extras.wides,extras.byes,extras.noballs,wicket.kind,wicket.player_out,wicket.fielders,wickets,season
0,2008-04-18,1,1,0,SC Ganguly,P Kumar,group,BB McCullum,0,1,1,1.0,,,,,,,,2008
1,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
2,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,1,1,,1.0,,,,,,,2008
3,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
4,2008-04-18,1,1,0,BB McCullum,P Kumar,group,SC Ganguly,0,0,0,,,,,,,,,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260754,2024-05-26,73,2,9,SS Iyer,AK Markram,Final,VR Iyer,1,0,1,,,,,,,,,2024
260755,2024-05-26,73,2,9,VR Iyer,AK Markram,Final,SS Iyer,1,0,1,,,,,,,,,2024
260756,2024-05-26,73,2,10,VR Iyer,Shahbaz Ahmed,Final,SS Iyer,1,0,1,,,,,,,,,2024
260757,2024-05-26,73,2,10,SS Iyer,Shahbaz Ahmed,Final,VR Iyer,1,0,1,,,,,,,,,2024


In [163]:
ipl_match_stats

Unnamed: 0,date,match_number,city,team_1,team_2,team_1_runs.total,team_2_runs.total,match_winner,player_of_match,team_1_over,team_2_over,toss_winner,toss_decision,team_1_runs.extras,team_2_runs.extras,team_1_extras.legbyes,team_2_extras.legbyes,team_1_extras.wides,team_2_extras.wides,team_1_extras.byes,...,team_1_extras.noballs,team_2_extras.noballs,officials_match_referees,officials_reserve_umpires,officials_tv_umpires,officials_umpires_1,officials_umpires_2,outcome_by_wickets,season,venue,outcome_by_runs,event_stage,outcome_method,match_extras,match_legbyes,match_wides,match_byes,match_noballs,match_4's,match_6's
0,2008-04-18,1,Bangalore,Kolkata Knight Riders,Royal Challengers Bangalore,222.0,82.0,Kolkata Knight Riders,BB McCullum,19.0,15.0,Royal Challengers Bangalore,field,17.0,19.0,4.0,8.0,9.0,11.0,4.0,...,0.0,0.0,J Srinath,VN Kulkarni,AM Saheba,Asad Rauf,RE Koertzen,,2008.0,MChinnaswamyStadium,140.0,group_stage,regular,36.0,12.0,20.0,4.0,0.0,18.0,17.0
1,2008-04-19,2,Chandigarh,Chennai Super Kings,Kings XI Punjab,240.0,207.0,Chennai Super Kings,MEK Hussey,19.0,19.0,Chennai Super Kings,bat,6.0,11.0,2.0,4.0,3.0,5.0,0.0,...,1.0,0.0,S Venkataraghavan,MSS Ranawat,RB Tiffin,MR Benson,SL Shastri,,2008.0,PunjabCricketAssociationStadium,33.0,group_stage,regular,17.0,6.0,8.0,2.0,1.0,38.0,25.0
2,2008-04-19,3,Delhi,Rajasthan Royals,Delhi Daredevils,129.0,132.0,Delhi Daredevils,MF Maharoof,19.0,15.0,Rajasthan Royals,bat,7.0,10.0,3.0,0.0,3.0,10.0,1.0,...,0.0,0.0,GR Viswanath,,IL Howell,Aleem Dar,GA Pratapkumar,9.0,2008.0,FerozShahKotla,,group_stage,regular,17.0,3.0,13.0,1.0,0.0,32.0,4.0
3,2008-04-20,4,Kolkata,Deccan Chargers,Kolkata Knight Riders,110.0,112.0,Kolkata Knight Riders,DJ Hussey,18.0,18.0,Deccan Chargers,bat,10.0,28.0,4.0,8.0,4.0,15.0,0.0,...,2.0,1.0,FM Engineer,F Gomes,Asad Rauf,BF Bowden,K Hariharan,5.0,2008.0,EdenGardens,,group_stage,regular,38.0,12.0,19.0,4.0,3.0,11.0,10.0
4,2008-04-20,5,Mumbai,Mumbai Indians,Royal Challengers Bangalore,165.0,166.0,Royal Challengers Bangalore,MV Boucher,19.0,19.0,Mumbai Indians,bat,11.0,5.0,6.0,0.0,3.0,5.0,2.0,...,0.0,0.0,J Srinath,SN Bandekar,AV Jayaprakash,SJ Davis,DJ Harper,5.0,2008.0,WankhedeStadium,,group_stage,regular,16.0,6.0,8.0,2.0,0.0,33.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1122,2024-05-19,69,Hyderabad,Punjab Kings,Sunrisers Hyderabad,214.0,215.0,Sunrisers Hyderabad,Abhishek Sharma,19.0,19.0,Punjab Kings,bat,10.0,17.0,5.0,6.0,4.0,10.0,0.0,...,1.0,1.0,Prakash Bhatt,Bhavesh Patel,HAS Khalid,Nitin Menon,VK Sharma,4.0,2024.0,RajivGandhiInternationalStadium,,group_stage,regular,27.0,11.0,14.0,0.0,2.0,31.0,26.0
1123,2024-05-21,70,Ahmedabad,Sunrisers Hyderabad,Kolkata Knight Riders,159.0,164.0,Kolkata Knight Riders,MA Starc,19.0,13.0,Sunrisers Hyderabad,bat,7.0,11.0,0.0,9.0,5.0,2.0,0.0,...,2.0,0.0,J Srinath,MV Saidharshan Kumar,KN Ananthapadmanabhan,AK Chaudhary,R Pandit,8.0,2024.0,NarendraModiStadium,,Qualifier 1,regular,18.0,9.0,7.0,0.0,2.0,29.0,16.0
1124,2024-05-22,71,Ahmedabad,Royal Challengers Bengaluru,Rajasthan Royals,172.0,174.0,Rajasthan Royals,R Ashwin,19.0,18.0,Rajasthan Royals,field,4.0,6.0,2.0,1.0,2.0,5.0,0.0,...,0.0,0.0,V Narayan Kutty,R Pandit,AK Chaudhary,KN Ananthapadmanabhan,MV Saidharshan Kumar,4.0,2024.0,NarendraModiStadium,,Eliminator,regular,10.0,3.0,7.0,0.0,0.0,33.0,13.0
1125,2024-05-24,72,Chennai,Sunrisers Hyderabad,Rajasthan Royals,175.0,139.0,Sunrisers Hyderabad,Shahbaz Ahmed,19.0,19.0,Rajasthan Royals,field,8.0,5.0,1.0,1.0,7.0,3.0,0.0,...,0.0,1.0,J Srinath,J Madanagopal,MA Gough,Nitin Menon,VK Sharma,,2024.0,MAChidambaramStadium,36.0,Qualifier 2,regular,13.0,2.0,10.0,0.0,1.0,24.0,14.0


In [164]:
master_df = pd.merge(left=ball_by_ball,right=ipl_match_stats,on=['date','match_number'])

In [165]:
master_df.drop(columns='season_y',inplace=True)

In [166]:
master_df.rename(columns={'season_x':'season'},inplace=True)

In [167]:
master_df.columns

Index(['date', 'match_number', 'innings', 'over', 'batter', 'bowler', 'stage',
       'non_striker', 'runs.batter', 'runs.extras', 'runs.total',
       'extras.legbyes', 'extras.wides', 'extras.byes', 'extras.noballs',
       'wicket.kind', 'wicket.player_out', 'wicket.fielders', 'wickets',
       'season', 'city', 'team_1', 'team_2', 'team_1_runs.total',
       'team_2_runs.total', 'match_winner', 'player_of_match', 'team_1_over',
       'team_2_over', 'toss_winner', 'toss_decision', 'team_1_runs.extras',
       'team_2_runs.extras', 'team_1_extras.legbyes', 'team_2_extras.legbyes',
       'team_1_extras.wides', 'team_2_extras.wides', 'team_1_extras.byes',
       'team_2_extras.byes', 'team_1_extras.noballs', 'team_2_extras.noballs',
       'officials_match_referees', 'officials_reserve_umpires',
       'officials_tv_umpires', 'officials_umpires_1', 'officials_umpires_2',
       'outcome_by_wickets', 'venue', 'outcome_by_runs', 'event_stage',
       'outcome_method', 'match_extras', '

In [168]:
master_df.drop(columns='wickets',inplace=True)

In [169]:
ball_by_ball.drop(columns='wickets',inplace=True)


In [170]:
ball_by_ball.groupby(['date','match_number','innings','batter']).agg({'runs.batter':np.sum,'runs.extras':np.sum,'bowler':'count'})


  ball_by_ball.groupby(['date','match_number','innings','batter']).agg({'runs.batter':np.sum,'runs.extras':np.sum,'bowler':'count'})
  ball_by_ball.groupby(['date','match_number','innings','batter']).agg({'runs.batter':np.sum,'runs.extras':np.sum,'bowler':'count'})


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,runs.batter,runs.extras,bowler
date,match_number,innings,batter,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-04-18,1,1,BB McCullum,158,11,77
2008-04-18,1,1,DJ Hussey,12,0,12
2008-04-18,1,1,Mohammad Hafeez,5,0,3
2008-04-18,1,1,RT Ponting,20,4,20
2008-04-18,1,1,SC Ganguly,10,2,12
...,...,...,...,...,...,...
2024-05-26,73,1,TM Head,0,0,1
2024-05-26,73,2,Rahmanullah Gurbaz,39,7,35
2024-05-26,73,2,SP Narine,6,0,2
2024-05-26,73,2,SS Iyer,6,0,3


In [171]:
ball_by_ball.groupby(['date','match_number','innings','bowler','runs.batter']).agg({'runs.total':'sum','wicket.kind':'count','batter':'count'})


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,runs.total,wicket.kind,batter
date,match_number,innings,bowler,runs.batter,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2008-04-18,1,1,AA Noffke,0,6,1,8
2008-04-18,1,1,AA Noffke,1,11,0,11
2008-04-18,1,1,AA Noffke,2,4,0,2
2008-04-18,1,1,AA Noffke,4,8,0,2
2008-04-18,1,1,AA Noffke,6,12,0,2
...,...,...,...,...,...,...,...
2024-05-26,73,2,Shahbaz Ahmed,6,12,0,2
2024-05-26,73,2,T Natarajan,0,2,0,4
2024-05-26,73,2,T Natarajan,1,5,0,5
2024-05-26,73,2,T Natarajan,4,16,0,4


In [172]:
batter_total = ball_by_ball.groupby(['batter','season',]).agg({'runs.total':['sum',np.average],'bowler':'max'})


In [173]:
batter_total


Unnamed: 0_level_0,Unnamed: 1_level_0,runs.total,runs.total,bowler
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,average,max
batter,season,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A Ashish Reddy,2012,40,1.333333,SW Tait
A Ashish Reddy,2013,126,1.400000,UT Yadav
A Ashish Reddy,2015,74,1.608696,TG Southee
A Ashish Reddy,2016,48,1.600000,YS Chahal
A Badoni,2022,173,1.244604,YS Chahal
...,...,...,...,...
Z Khan,2011,21,0.875000,SB Jakati
Z Khan,2012,12,0.705882,Shakib Al Hasan
Z Khan,2014,10,1.428571,IK Pathan
Z Khan,2016,6,0.461538,PP Chawla


In [174]:
batter_total.columns


MultiIndex([('runs.total',     'sum'),
            ('runs.total', 'average'),
            (    'bowler',     'max')],
           )

In [175]:
batter_scores_count = ball_by_ball.groupby(['batter','season','runs.batter']).agg({'runs.total':['sum','count']})


In [176]:
batter_scores_count

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,runs.total,runs.total
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,count
batter,season,runs.batter,Unnamed: 3_level_2,Unnamed: 4_level_2
A Ashish Reddy,2012,0,5,13
A Ashish Reddy,2012,1,9,9
A Ashish Reddy,2012,2,8,4
A Ashish Reddy,2012,4,12,3
A Ashish Reddy,2012,6,6,1
...,...,...,...,...
Z Khan,2016,0,0,10
Z Khan,2016,1,2,2
Z Khan,2016,4,4,1
Z Khan,2017,0,0,9


In [177]:
vk1 =batter_scores_count.loc['V Kohli'].reset_index()
vk2 = batter_total.loc['V Kohli'].reset_index()
vk1.columns = ['_'.join(col) for col in vk1.columns]
vk2.columns = ['_'.join(col) for col in vk2.columns]
vk1['season_'] =  vk1.season_.astype(int)
vk2['season_'] =  vk2.season_.astype(int)
vk1.rename(columns={'runs.total_sum':'total_runs_per_score','runs.total_count':'no_of_balls','season_':'season'},inplace=True)
vk2.rename(columns={'season_': 'season'}, inplace=True)
print(vk1.info())
print(vk2.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97 entries, 0 to 96
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   season                97 non-null     int64
 1   runs.batter_          97 non-null     int64
 2   total_runs_per_score  97 non-null     int64
 3   no_of_balls           97 non-null     int64
dtypes: int64(4)
memory usage: 3.2 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   season              17 non-null     int64  
 1   runs.total_sum      17 non-null     int64  
 2   runs.total_average  17 non-null     float64
 3   bowler_max          17 non-null     object 
dtypes: float64(1), int64(2), object(1)
memory usage: 672.0+ bytes
None


In [178]:
print(vk1.dtypes)
print(vk2.dtypes)
# v_kohli = vk1.join(vk2,on='season',lsuffix='_')
v_kohli = pd.merge(vk1, vk2, how="inner", on="season")
# v_kohli.columns = ['_'.join(col) for col in v_kohli.columns]
v_kohli.head(20)
vk2.head()


season                  int64
runs.batter_            int64
total_runs_per_score    int64
no_of_balls             int64
dtype: object
season                  int64
runs.total_sum          int64
runs.total_average    float64
bowler_max             object
dtype: object


Unnamed: 0,season,runs.total_sum,runs.total_average,bowler_max
0,2008,187,1.113095,WPUJC Vaas
1,2009,258,1.146667,Yuvraj Singh
2,2010,320,1.481481,Z Khan
3,2011,584,1.234672,Yuvraj Singh
4,2012,375,1.126126,YK Pathan


In [179]:
vk1.head(10)


Unnamed: 0,season,runs.batter_,total_runs_per_score,no_of_balls
0,2008,0,22,82
1,2008,1,59,59
2,2008,2,10,5
3,2008,4,72,18
4,2008,6,24,4
5,2009,0,12,98
6,2009,1,84,84
7,2009,2,26,13
8,2009,4,88,22
9,2009,6,48,8


In [180]:
v_kohli

Unnamed: 0,season,runs.batter_,total_runs_per_score,no_of_balls,runs.total_sum,runs.total_average,bowler_max
0,2008,0,22,82,187,1.113095,WPUJC Vaas
1,2008,1,59,59,187,1.113095,WPUJC Vaas
2,2008,2,10,5,187,1.113095,WPUJC Vaas
3,2008,4,72,18,187,1.113095,WPUJC Vaas
4,2008,6,24,4,187,1.113095,WPUJC Vaas
...,...,...,...,...,...,...,...
92,2024,1,198,198,770,1.549296,YS Chahal
93,2024,2,64,32,770,1.549296,YS Chahal
94,2024,3,3,1,770,1.549296,YS Chahal
95,2024,4,248,62,770,1.549296,YS Chahal


In [184]:
gapminder(v_kohli, v_kohli['season'], v_kohli['runs.total_sum'],
          v_kohli['runs.total_average']**5, v_kohli["bowler_max"] ,labels={'runs.total_sum':'Total runs','season_':"Season"},title='Virat Batting Perf')


In [185]:
parallelplot(v_kohli,columns=[v_kohli['runs.total_sum'],v_kohli['runs.batter_'],v_kohli['total_runs_per_score'],v_kohli['no_of_balls']],title="Kohli Parallel",color_col=v_kohli['season'])

In [186]:
bowler_perf = ball_by_ball.groupby(['bowler','season','wicket.kind']).agg({'wicket.player_out':['count'],'batter':['max']})
bowler_perf


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,wicket.player_out,batter
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,max
bowler,season,wicket.kind,Unnamed: 3_level_2,Unnamed: 4_level_2
A Ashish Reddy,2012,bowled,2,RV Uthappa
A Ashish Reddy,2012,caught,7,V Kohli
A Ashish Reddy,2012,lbw,2,N Saini
A Ashish Reddy,2013,bowled,2,LRPL Taylor
A Ashish Reddy,2013,caught and bowled,1,V Kohli
...,...,...,...,...
Z Khan,2016,lbw,1,PP Chawla
Z Khan,2016,run out,2,UT Yadav
Z Khan,2017,bowled,1,AM Rahane
Z Khan,2017,caught,9,V Kohli


In [194]:
bowler_match_perf = ball_by_ball.groupby(['bowler','season','match_number']).agg({'wicket.player_out':['count'],'runs.total':['sum'],'batter':'count'})
bowler_match_perf

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,wicket.player_out,runs.total,batter
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,sum,count
bowler,season,match_number,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
A Ashish Reddy,2012,35,2,32,24
A Ashish Reddy,2012,40,1,11,14
A Ashish Reddy,2012,42,1,32,19
A Ashish Reddy,2012,46,1,16,13
A Ashish Reddy,2012,50,1,36,25
...,...,...,...,...,...
Z Khan,2017,32,0,9,7
Z Khan,2017,45,0,30,24
Z Khan,2017,50,0,35,24
Z Khan,2017,52,2,25,25


In [195]:
ys_chahal_wicket = bowler_perf.loc['YS Chahal'].reset_index()
ys_chahal_match = bowler_match_perf.loc['YS Chahal'].reset_index()
ys_chahal_match.drop(columns='match_number',inplace=True)
ys_chahal_match.columns = ['_'.join(col) for col in ys_chahal_match.columns]
ys_chahal_match



dropping on a non-lexsorted multi-index without a level parameter may impact performance.



Unnamed: 0,season_,wicket.player_out_count,runs.total_sum,batter_count
0,2013,0,35,24
1,2014,1,18,25
2,2014,2,17,24
3,2014,1,26,24
4,2014,1,17,18
...,...,...,...,...
154,2024,1,48,24
155,2024,1,22,25
156,2024,2,31,25
157,2024,1,43,24


In [196]:
ys_chahal_match = ys_chahal_match.groupby('season_').agg({'wicket.player_out_count':'sum','runs.total_sum':'sum','batter_count':'sum'})
# ys_chahal_match.columns = ['_'.join(col) for col in ys_chahal_match.columns]
ys_chahal_wicket.columns = ['_'.join(col) for col in ys_chahal_wicket.columns]

In [197]:
ys_chahal_match.reset_index(inplace=True)
ys_chahal_match

Unnamed: 0,season_,wicket.player_out_count,runs.total_sum,batter_count
0,2013,0,35,24
1,2014,12,389,333
2,2015,24,416,283
3,2016,22,409,308
4,2017,14,351,271
5,2018,12,374,313
6,2019,18,394,308
7,2020,22,414,348
8,2021,19,381,323
9,2022,29,536,429


In [198]:
ys_chahal_wicket

Unnamed: 0,season_,wicket.kind_,wicket.player_out_count,batter_max
0,2014,bowled,1,M Vijay
1,2014,caught,10,V Sehwag
2,2014,stumped,1,KP Pietersen
3,2015,bowled,4,SV Samson
4,2015,caught,14,Yuvraj Singh
5,2015,caught and bowled,1,Sandeep Sharma
6,2015,lbw,1,DA Warner
7,2015,run out,1,JP Duminy
8,2015,stumped,3,KS Williamson
9,2016,bowled,2,CA Lynn


In [199]:
gapminder(df=ys_chahal_match,x_col=ys_chahal_match['season_'],y_col=ys_chahal_match['wicket.player_out_count'],size_col=ys_chahal_match['runs.total_sum']**5,color_col=(ys_chahal_match['runs.total_sum']/ys_chahal_match['batter_count'])**5,title="Yuzvendra Wicket Trend")

In [200]:
ys_chahal_wicket['wicket.kind_'].unique()

array(['bowled', 'caught', 'stumped', 'caught and bowled', 'lbw',
       'run out'], dtype=object)

In [201]:
ys_chahal_wicket['wicket_kind_no'] = [['bowled', 'caught', 'stumped', 'caught and bowled', 'lbw',
       'run out'].index(value) for value in ys_chahal_wicket['wicket.kind_']]

In [202]:
label = {'0':'bowled','1':'caught','2':'stumped','3':'caught and bowled','4':'stumped','5':'lbw'}
print(label)
parallelplot(ys_chahal_wicket,columns=[ys_chahal_wicket['season_'],ys_chahal_wicket['wicket.player_out_count'],ys_chahal_wicket['wicket_kind_no']],labels=label,title="Yuzvendra Perf")

{'0': 'bowled', '1': 'caught', '2': 'stumped', '3': 'caught and bowled', '4': 'stumped', '5': 'lbw'}


In [None]:
# master_df.to_csv("output/master_dataframe.csv")