# 1. Imports & load dataset

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from ipywidgets import interactive
import ipywidgets as widgets

import warnings
warnings.filterwarnings('ignore')

# Show frames in full when a cell displays them: no row/column truncation,
# no wrapping of wide frames, and no shortening of long cell values.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# `None` is the supported way to say "unlimited width"; the legacy `-1`
# sentinel has been deprecated since pandas 1.0 and may be rejected by newer releases.
pd.set_option('display.max_colwidth', None)

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# Ball-by-ball IPL delivery data.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after pointing it at a local copy of the CSV.
ball_by_ball_csv = 'C:/Users/neera/Mad About Sports/Advanced Cricket Analytics Masterclass/Datasets/IPL_ball_by_ball_updated.csv'
ipl_df = pd.read_csv(ball_by_ball_csv)

# 2. Examining the data

In [3]:
# Peek at the first two deliveries to see which columns are available.
ipl_df.head(2)

Unnamed: 0,match_id,season,start_date,venue,innings,ball,batting_team,bowling_team,striker,non_striker,bowler,runs_off_bat,extras,wides,noballs,byes,legbyes,penalty,wicket_type,player_dismissed,other_wicket_type,other_player_dismissed
0,335982,2008,2008-04-18,M Chinnaswamy Stadium,2,6.8,Royal Challengers Bangalore,Kolkata Knight Riders,MV Boucher,CL White,AB Agarkar,4,0,,,,,,,,,
1,335982,2008,2008-04-18,M Chinnaswamy Stadium,2,2.7,Royal Challengers Bangalore,Kolkata Knight Riders,W Jaffer,JH Kallis,AB Dinda,1,0,,,,,,,,,


In [4]:
# Team names exactly as they appear in the data. Renamed franchises show up
# under each of their names (e.g. 'Kings XI Punjab' and 'Punjab Kings').
ipl_df['batting_team'].unique()

array(['Royal Challengers Bangalore', 'Kolkata Knight Riders',
       'Rajasthan Royals', 'Delhi Daredevils', 'Kings XI Punjab',
       'Chennai Super Kings', 'Deccan Chargers', 'Mumbai Indians',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals', 'Punjab Kings',
       'Gujarat Titans', 'Lucknow Super Giants'], dtype=object)

In [5]:
# Innings numbers present in the raw data.
ipl_df['innings'].unique()

array([2, 1, 4, 3, 6, 5], dtype=int64)

In [6]:
# Not considering super overs (innings 3 to 6).
# .copy() makes the filtered frame independent of the one it was sliced from,
# so the columns added in later cells are written to a real frame rather than
# to a slice (which pandas flags with SettingWithCopyWarning).
ipl_df = ipl_df[ipl_df['innings'].isin([1, 2])].copy()

In [7]:
# Confirm that only innings 1 and 2 remain after the filter.
ipl_df['innings'].unique()

array([2, 1], dtype=int64)

# 3. Additional columns
## 3.1 Total runs in each ball

In [8]:
# Runs added to the team total on each ball: runs off the bat plus extras.
# Column-wise addition gives the same values as a row-by-row apply without
# calling a Python function once per delivery.
ipl_df['total_runs'] = ipl_df['runs_off_bat'] + ipl_df['extras']

## 3.2 Is there a wicket in that ball

In [9]:
# Values of player_dismissed: NaN on balls without a dismissal, otherwise the
# name of the dismissed player.
ipl_df['player_dismissed'].unique()

array([nan, 'JH Kallis', 'W Jaffer', 'V Kohli', 'R Dravid', 'Z Khan',
       'SB Joshi', 'AA Noffke', 'MV Boucher', 'B Akhil', 'CL White',
       'SC Ganguly', 'DJ Hussey', 'RT Ponting', 'M Kaif', 'DS Lehmann',
       'M Rawat', 'RA Jadeja', 'SR Watson', 'YK Pathan', 'T Kohli',
       'SK Warne', 'V Sehwag', 'KC Sangakkara', 'SK Raina', 'JDP Oram',
       'PA Patel', 'MS Dhoni', 'ML Hayden', 'JR Hopes', 'Yuvraj Singh',
       'K Goel', 'SB Styris', 'SB Bangar', 'WPUJC Vaas', 'A Symonds',
       'AS Yadav', 'Y Venugopal Rao', 'PP Ojha', 'VVS Laxman',
       'RG Sharma', 'AC Gilchrist', 'Mohammad Hafeez', 'WP Saha',
       'BB McCullum', 'PR Shah', 'AM Nayar', 'RV Uthappa', 'DJ Thornely',
       'L Ronchi', 'ST Jayasuriya', 'LRPL Taylor', 'SM Pollock',
       'S Chanderpaul', 'Kamran Akmal', 'DPMD Jayawardene', 'IK Pathan',
       'S Sohal', 'B Lee', 'G Gambhir', 'Shahid Afridi', 'MA Khote',
       'Harbhajan Singh', 'DJ Bravo', 'MEK Hussey', 'GC Smith',
       'D Salunkhe', 'SS Tiwary',

In [10]:
# Python types of the first two unique values (the NaN marker and a player name).
type(ipl_df['player_dismissed'].unique()[0]), type(ipl_df['player_dismissed'].unique()[1])

(float, str)

In [11]:
# isOut is 1 on balls where player_dismissed holds a name and 0 where it is
# NaN (no dismissal on that ball).
# NOTE(review): only player_dismissed is consulted; a dismissal recorded solely
# in other_player_dismissed would not be flagged — confirm that is intended.
ipl_df['isOut'] = ipl_df['player_dismissed'].notna().astype('int64')

In [12]:
# Sanity check: the flag should only ever be 0 or 1.
ipl_df['isOut'].unique()

array([0, 1], dtype=int64)

# 4. Calculating outcome probabilities
## 4.1 Choose teams

In [13]:
# Fixture being modelled: t1 bats first, t2 bats second (the chase).
t1, t2 = 'Punjab Kings', 'Delhi Capitals'

## 4.2 Outcomes

In [14]:
# Percentage of t1's deliveries that produced each per-ball run total.
100 * ipl_df[ipl_df.batting_team == t1].total_runs.value_counts(normalize = True)

1    41.735537
0    35.714286
4    10.537190
6    5.991736 
2    5.637544 
3    0.206612 
5    0.177096 
Name: total_runs, dtype: float64

Totals of 5 or 7 off a single ball are very rare (5 occurs on only about 0.2% of the balls faced by Punjab Kings). So, we shall ignore these in the outcomes.

In [15]:
# Per-ball outcomes the model distinguishes: the run totals 0-4 and 6, plus
# 'w' for a wicket. The order matters: every outcome list built later follows it.
outcomes = [0, 1, 2, 3, 4, 6, 'w']

In [16]:
# How many balls produced each run total, per batting side
# (index: runs on the ball, value: number of balls).
t1_outcomes_count = ipl_df.loc[ipl_df['batting_team'] == t1, 'total_runs'].value_counts()
t2_outcomes_count = ipl_df.loc[ipl_df['batting_team'] == t2, 'total_runs'].value_counts()

In [17]:
# Number of balls on which each side lost a wicket (sum of the 0/1 isOut flag).
t1_outs = ipl_df.loc[ipl_df['batting_team'] == t1, 'isOut'].sum()
t2_outs = ipl_df.loc[ipl_df['batting_team'] == t2, 'isOut'].sum()

In [18]:
# List the outcomes one per line, in the order the later lists will follow.
for outcome in outcomes:
    print(outcome)

0
1
2
3
4
6
w


In [19]:
# Raw per-ball run-total counts for t1.
t1_outcomes_count

1    1414
0    1210
4    357 
6    203 
2    191 
3    7   
5    6   
Name: total_runs, dtype: int64

In [20]:
# Total no. of wickets lost by t1 (sum of the isOut flags over its deliveries)
t1_outs

180

In [21]:
# Count of balls per outcome for each side, in the order given by `outcomes`.
# Run outcomes come from the value_counts tables; 'w' uses the wicket totals.
# .get(..., 0) keeps this working for a side that never recorded one of the
# outcomes (plain indexing would raise KeyError for the missing label).
# NOTE(review): the run-total counts are taken over all of a side's balls,
# including the ones flagged isOut, so a dismissal ball is counted once under
# its run total and once under 'w'.
t1_outcomes = [
    t1_outs if outcome == 'w' else t1_outcomes_count.get(outcome, 0)
    for outcome in outcomes
]
t2_outcomes = [
    t2_outs if outcome == 'w' else t2_outcomes_count.get(outcome, 0)
    for outcome in outcomes
]

In [22]:
# Outcome counts for both sides, ordered like `outcomes`.
t1_outcomes, t2_outcomes

([1210, 1414, 191, 7, 357, 203, 180], [2516, 3277, 477, 23, 921, 346, 371])

## 4.3 Probability of outcomes

In [23]:
# Turn the outcome counts into relative frequencies (each list sums to 1).
t1_total = sum(t1_outcomes)
t2_total = sum(t2_outcomes)

t1_pb_outcomes = [count / t1_total for count in t1_outcomes]
t2_pb_outcomes = [count / t2_total for count in t2_outcomes]

In [24]:
# t1's outcome probabilities, in the same order as `outcomes`.
t1_pb_outcomes

[0.33969679955081417,
 0.3969679955081415,
 0.05362156092083099,
 0.0019651880965749578,
 0.10022459292532285,
 0.05699045480067378,
 0.05053340819764177]

In [25]:
# Running totals of the outcome probabilities. Entry k is the probability of
# getting any of the first k+1 outcomes, so the last entry is 1.
t1_pb_ls = np.asarray(t1_pb_outcomes).cumsum()
t2_pb_ls = np.asarray(t2_pb_outcomes).cumsum()

In [26]:
# Cumulative outcome probabilities for both sides, ordered like `outcomes`.
t1_pb_ls, t2_pb_ls

(array([0.3396968 , 0.7366648 , 0.79028636, 0.79225154, 0.89247614,
        0.94946659, 1.        ]),
 array([0.31723616, 0.73042491, 0.79056865, 0.79346867, 0.90959526,
        0.95322154, 1.        ]))

## 4.4 Using probability values

In [27]:
def get_pbvalues(teamName):
    """Return the cumulative per-ball outcome thresholds for a team.

    The values are the cumulative probabilities computed earlier in the
    notebook (t1_pb_ls for Punjab Kings, t2_pb_ls for Delhi Capitals),
    rounded to five decimals and hard-coded here.

    Parameters
    ----------
    teamName : str
        Team abbreviation: 'PBKS' or 'DC'.

    Returns
    -------
    tuple
        (p_0, p_1, p_2, p_3, p_4, p_6, p_w): cumulative thresholds for the
        outcomes 0, 1, 2, 3, 4, 6 runs and wicket, in that order. The last
        value is always 1.

    Raises
    ------
    ValueError
        If teamName is not one of the supported abbreviations. (Previously an
        unknown name fell through to the return statement and failed with an
        UnboundLocalError.)
    """
    cumulative_thresholds = {
        'PBKS': (0.33970, 0.73666, 0.79029, 0.79225, 0.89248, 0.94947, 1),
        'DC':   (0.31724, 0.73042, 0.79057, 0.79347, 0.90960, 0.95322, 1),
    }

    if teamName not in cumulative_thresholds:
        supported = ', '.join(sorted(cumulative_thresholds))
        raise ValueError(f'Unknown team {teamName!r}; expected one of: {supported}')

    return cumulative_thresholds[teamName]

# 5. Runs prediction model for chasing team

In [28]:
def predict_runs(target, current_score, current_wickets, current_overs, batting_team = 'DC'):
    """Simulate the rest of a 20-over chase ball by ball and return the final score.

    Each remaining ball draws a uniform random number and maps it to an
    outcome (0, 1, 2, 3, 4, 6 runs or a wicket) using the cumulative
    thresholds returned by get_pbvalues for the batting side.

    The simulation stops when the balls run out, when ten wickets are down,
    or as soon as the score exceeds `target`.

    Parameters
    ----------
    target : int
        Score the simulation stops at once it has been exceeded.
    current_score : int
        Runs already scored by the chasing side.
    current_wickets : int
        Wickets already lost. With 10 or more, no ball is simulated.
    current_overs : int
        Completed overs; must be an integer because it sets the loop length
        (120 - 6 * current_overs balls remain).
    batting_team : str, optional
        Abbreviation passed to get_pbvalues. Defaults to 'DC', the side that
        was previously hard-coded.

    Returns
    -------
    int
        The simulated final score.
    """
    # The seventh threshold (wicket) is always 1, so the wicket case is simply
    # "none of the run thresholds matched".
    p_0, p_1, p_2, p_3, p_4, p_6, _ = get_pbvalues(batting_team)
    run_thresholds = ((p_0, 0), (p_1, 1), (p_2, 2), (p_3, 3), (p_4, 4), (p_6, 6))

    pred_runs = current_score
    pred_wks = current_wickets
    leftover_balls = 120 - current_overs*6

    for _ in range(leftover_balls):
        # All out: also covers a caller that already passes ten wickets, which
        # the old `== 10` check (made only after a new wicket) never caught.
        if pred_wks >= 10:
            break

        r_value = np.random.random()

        for threshold, runs in run_thresholds:
            if r_value <= threshold:
                pred_runs += runs
                break
        else:
            # Above every run threshold: the ball is a wicket, no runs added.
            pred_wks += 1

        if pred_runs > target:
            break

    return pred_runs

In [29]:
# predict_runs(target, current_score, current_wickets, current_overs)
# One simulated chase of 167 from the very first ball; the result changes on
# every run because the simulation draws random numbers.
predict_runs(167, 0, 0, 0)

171

# 6. Winner function

In [30]:
def get_win(pred_runs, target):
    """Classify a chase from the chasing side's point of view.

    `target` is the score the chasing side needs in order to win, i.e. the
    first-innings total plus one (the notebook uses target = 167 for a first
    innings of 166). Reaching the target is therefore a win, and finishing
    exactly one run short (level with the first-innings total) is the tie.

    Parameters
    ----------
    pred_runs : int
        Final score of the chasing side.
    target : int
        Score required to win.

    Returns
    -------
    str
        'win', 'tie' or 'lose'.
    """
    if pred_runs >= target:
        return 'win'
    if pred_runs == target - 1:
        return 'tie'
    return 'lose'

In [31]:
# Scenario: PBKS made 166/6 batting first, so the chase is for 167.
target = 167

# Position of the chasing side at the point the simulation starts from.
current_score   = 87
current_wickets = 1
current_overs   = 10

# Number of simulated chases.
iter_count = 100

# Final score and result label of every simulated chase.
runs_ls    = []
results_ls = []

for i in range(iter_count):
    pred_runs = predict_runs(target, current_score, current_wickets, current_overs)
    result_pred = get_win(pred_runs, target)
    runs_ls.append(pred_runs)
    results_ls.append(result_pred)

# Tally the results; anything that is neither a win nor a tie is a loss.
win_count  = results_ls.count('win')
tie_count  = results_ls.count('tie')
lose_count = iter_count - win_count - tie_count

In [32]:
# Wins, ties and losses across the simulated chases (they add up to iter_count).
win_count, tie_count, lose_count

(42, 2, 56)

# 7. Finding runs at a given stage

In [33]:
def find_runs(current_score, target, current_wickets, at_overs, n_sims = 100):
    """Find the lowest score at `at_overs` that wins more than half the simulated chases.

    Candidate scores from current_score up to target are tried in ascending
    order. For each one, n_sims chases are simulated with predict_runs and
    classified with get_win; the first candidate whose win share is above 50%
    is returned, and higher candidates are not simulated.

    Parameters
    ----------
    current_score : int
        Lowest candidate score.
    target : int
        Target passed on to predict_runs / get_win; also the highest candidate.
    current_wickets : int
        Wickets lost, passed on to predict_runs.
    at_overs : int
        Completed overs at the stage being examined, passed on to predict_runs.
    n_sims : int, optional
        Simulated chases per candidate score (default 100, the value that was
        previously hard-coded).

    Returns
    -------
    int
        The first candidate score with a win share above 50%. If no candidate
        reaches that, the largest of target and current_score is returned
        instead of the lowest candidate, which would wrongly suggest that the
        current score is already enough.
    """
    for candidate_score in range(current_score, target + 1):
        win_count = sum(
            get_win(predict_runs(target, candidate_score, current_wickets, at_overs), target) == 'win'
            for _ in range(n_sims)
        )

        # win_count / n_sims > 0.5, written without division.
        if 2 * win_count > n_sims:
            return candidate_score

    return max(target, current_score)

In [34]:
# find_runs(current_score, target, current_wickets, at_overs)
# Lowest score, scanning up from 87, that wins more than half of the simulated
# chases of 167 with 1 wicket down after 11 overs.
score_needed = find_runs(87, 167, 1, 11)
score_needed

100

In [35]:
# The "11 overs" in the message is hard-coded; keep it in step with the
# at_overs value passed to find_runs when score_needed was computed.
print(f'This means that the team should score {score_needed} runs after 11 overs to have a win % of greater than 50.')

This means that the team should score 100 runs after 11 overs to have a win % of greater than 50.


# 8. Finding no. of wickets the chasing team can afford to lose

In [36]:
def find_wickets(current_score, target, current_wickets, at_overs, n_sims = 100):
    """Find the first wicket total at which fewer than half the simulated chases are won.

    Wicket totals from current_wickets up to 9 are tried in ascending order.
    For each one, n_sims chases are simulated with predict_runs (always from
    current_score) and classified with get_win; the first total whose win
    share is below 50% is returned, and higher totals are not simulated.

    The earlier version also called find_runs on entry and discarded its
    result; that call only cost simulation time and has been removed.

    Parameters
    ----------
    current_score : int
        Score the simulations start from.
    target : int
        Target passed on to predict_runs / get_win.
    current_wickets : int
        Lowest wicket total tried.
    at_overs : int
        Completed overs at the stage being examined, passed on to predict_runs.
    n_sims : int, optional
        Simulated chases per wicket total (default 100, the value that was
        previously hard-coded).

    Returns
    -------
    int
        The first wicket total with a win share below 50%.
        NOTE(review): current_wickets is returned both when the win share is
        already below 50% at current_wickets and when it never drops below
        50% for any total up to 9, so the two cases cannot be told apart from
        the return value.
    """
    for wickets_down in range(current_wickets, 10):
        win_count = sum(
            get_win(predict_runs(target, current_score, wickets_down, at_overs), target) == 'win'
            for _ in range(n_sims)
        )

        # win_count / n_sims < 0.5, written without division.
        if 2 * win_count < n_sims:
            return wickets_down

    return current_wickets

In [37]:
# find_wickets(current_score, target, current_wickets, at_overs)
# First wicket total, scanning up from 1, at which fewer than half of the
# simulated chases of 167 from 87 after 11 overs are won.
wickets_to_lose = find_wickets(87, 167, 1, 11)
wickets_to_lose

1

In [38]:
# NOTE(review): find_wickets returns the first wicket total at which the
# simulated win share falls below 50% (or the starting total if none does),
# which is not quite what this sentence says — confirm the intended wording.
# The "11 overs" is hard-coded and must match the at_overs used above.
print(f'This means that the team can lose total {wickets_to_lose} wickets after 11 overs to have a win % of greater than 50.')

This means that the team can lose total 1 wickets after 11 overs to have a win % of greater than 50.


# 9. Interactive Chart

In [39]:
def find_runs_wickets(current_wks, at_overs, target_score):
    """Plot where DC should be (score/wickets) after `at_overs` overs.

    The score comes from find_runs and the wicket figure from find_wickets,
    both started from DC's hard-coded score of 87. For at_overs == 10 the
    known position of 87/1 is plotted directly (current_wks is ignored in
    that case), and no simulation is run; previously both searches ran first
    and their results were then overwritten.

    Parameters
    ----------
    current_wks : int
        Wickets lost, passed on to find_runs / find_wickets.
    at_overs : int
        Completed overs at the stage being examined.
    target_score : int
        Target drawn as a dashed line and passed on to the searches.

    Returns
    -------
    None
        The figure is shown with plt.show().
    """
    plt.figure(figsize = (16, 6))

    # One x tick per over, 0 to 20.
    over_ticks = np.arange(21)

    if at_overs == 10:
        req_value = 87
        req_wk_value = 1
    else:
        req_value = find_runs(87, target_score, current_wks, at_overs)
        req_wk_value = find_wickets(87, target_score, current_wks, at_overs)

    # Big marker at the required position, with the target as a reference line.
    plt.scatter(at_overs, req_value, s = 1200, color = 'red')
    plt.axhline(target_score, ls = '--', color = 'blue')
    plt.text(1, target_score + 10, 'Target Score :' + str(target_score) , color = 'darkblue', fontsize = 13)

    # "runs/wickets" label centred inside the marker.
    plt.text(
        at_overs,
        req_value,
        str(req_value) + '/' + str(req_wk_value),
        color = 'white',
        fontsize = 12,
        horizontalalignment = 'center',
        verticalalignment = 'center'
    )

    # Sentence form of the same information, below the marker.
    plt.text(
        at_overs,
        req_value - 30,
        'DC has to be at ' + str(req_value) + '/' + str(req_wk_value) + ' after ' + str(at_overs) + ' ov',
        horizontalalignment = 'center'
    )

    plt.ylim(50, target_score + 50)
    plt.xticks(over_ticks)
    plt.title('Where should DC be?', fontsize = 20)
    plt.xlabel('Overs')
    plt.ylabel('Score')
    plt.show()

In [40]:
# Header line followed by one blank line.
print('current_score = DC: 87/1 (10 overs)\n')

# One slider per argument of find_runs_wickets.
wickets_slider = widgets.IntSlider(value = 1, min = 1, max = 10, step = 1)
overs_slider = widgets.IntSlider(value = 10, min = 10, max = 20, step = 1)
target_slider = widgets.IntSlider(value = 167, min = 0, max = 250, step = 1)

interactive_plot = interactive(
    find_runs_wickets,
    current_wks = wickets_slider,
    at_overs = overs_slider,
    target_score = target_slider
)

# The last child is the output area; a fixed height is set on it.
output = interactive_plot.children[-1]
output.layout.height = '450px'

interactive_plot

current_score = DC: 87/1 (10 overs)



interactive(children=(IntSlider(value=1, description='current_wks', max=10, min=1), IntSlider(value=10, descri…