# 1. Importing the libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import pickle

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used
# below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Show every column and keep wide frames on a single line instead of wrapping them.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# None means "no limit on cell width". The old sentinel -1 was deprecated in
# pandas 1.0 and is rejected by newer releases, so use the full option key with None.
pd.set_option('display.max_colwidth', None)

from IPython.display import display, HTML
# Inject a CSS rule that forces elements with class "container" to full width.
# NOTE(review): presumably aimed at the classic Notebook page layout — confirm it
# has any effect in the front end actually used.
display(HTML("<style>.container { width:100% !important; }</style>"))

# 2. Loading the data

In [2]:
# Location of the T20I feature file.
# NOTE(review): this is an absolute path on one machine — point it at a
# configurable data directory before sharing the notebook.
csv_path = 'C:/Users/neera/Mad About Sports/Advanced Cricket Analytics Masterclass/Datasets/T20I_data_with_features.csv'
df = pd.read_csv(csv_path)
df.head(2)

Unnamed: 0,id,innings,batting_team,bowling_team,overs,over,ball,total_runs,player_dismissed,total,total_score,total_wickets,prev_30_runs,prev_30_wickets,prev_30_dot_balls,prev_30_boundaries
0,211028,1,England,Australia,0.1,0,1,0,0,179,0,0,0,0,1,0
1,211028,1,England,Australia,0.2,0,2,1,0,179,1,0,1,0,1,0


# 3. Feature encoding

In [3]:
# One-hot encode the two team columns. dtype=int pins the indicators to 0/1
# integers; pandas >= 2.0 would otherwise emit bool columns, which turns the
# later `.values` feature matrix into an object array and changes the outputs.
# NOTE: this cell overwrites `df`, so re-running it without reloading the data
# fails because the original team columns no longer exist.
df = pd.get_dummies(data=df, columns=['batting_team', 'bowling_team'], dtype=int)
df.head(2)

Unnamed: 0,id,innings,overs,over,ball,total_runs,player_dismissed,total,total_score,total_wickets,prev_30_runs,prev_30_wickets,prev_30_dot_balls,prev_30_boundaries,batting_team_Australia,batting_team_Bangladesh,batting_team_England,batting_team_India,batting_team_New Zealand,batting_team_Pakistan,batting_team_South Africa,batting_team_Sri Lanka,batting_team_West Indies,bowling_team_Australia,bowling_team_Bangladesh,bowling_team_England,bowling_team_India,bowling_team_New Zealand,bowling_team_Pakistan,bowling_team_South Africa,bowling_team_Sri Lanka,bowling_team_West Indies
0,211028,1,0.1,0,1,0,0,179,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1,211028,1,0.2,0,2,1,0,179,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [4]:
# List the column labels of the frame after one-hot encoding.
df.columns

Index(['id', 'innings', 'overs', 'over', 'ball', 'total_runs',
       'player_dismissed', 'total', 'total_score', 'total_wickets',
       'prev_30_runs', 'prev_30_wickets', 'prev_30_dot_balls',
       'prev_30_boundaries', 'batting_team_Australia',
       'batting_team_Bangladesh', 'batting_team_England', 'batting_team_India',
       'batting_team_New Zealand', 'batting_team_Pakistan',
       'batting_team_South Africa', 'batting_team_Sri Lanka',
       'batting_team_West Indies', 'bowling_team_Australia',
       'bowling_team_Bangladesh', 'bowling_team_England', 'bowling_team_India',
       'bowling_team_New Zealand', 'bowling_team_Pakistan',
       'bowling_team_South Africa', 'bowling_team_Sri Lanka',
       'bowling_team_West Indies'],
      dtype='object')

# 4. Subsetting the features

In [5]:
# Keep the `id` column, the one-hot team indicators, the in-play state features and
# the `total` target, in this exact order. The position of each team indicator
# matters: score_prediction builds its input vector in the same layout.
batting_cols = [
    'batting_team_Australia', 'batting_team_Bangladesh', 'batting_team_England',
    'batting_team_India', 'batting_team_New Zealand', 'batting_team_Pakistan',
    'batting_team_South Africa', 'batting_team_Sri Lanka', 'batting_team_West Indies',
]
bowling_cols = [
    'bowling_team_Australia', 'bowling_team_Bangladesh', 'bowling_team_England',
    'bowling_team_India', 'bowling_team_New Zealand', 'bowling_team_Pakistan',
    'bowling_team_South Africa', 'bowling_team_Sri Lanka', 'bowling_team_West Indies',
]
state_cols = [
    'overs', 'total_score', 'total_wickets', 'prev_30_runs',
    'prev_30_wickets', 'prev_30_dot_balls', 'prev_30_boundaries',
]
df = df[['id'] + batting_cols + bowling_cols + state_cols + ['total']]

df.head(2)

Unnamed: 0,id,batting_team_Australia,batting_team_Bangladesh,batting_team_England,batting_team_India,batting_team_New Zealand,batting_team_Pakistan,batting_team_South Africa,batting_team_Sri Lanka,batting_team_West Indies,bowling_team_Australia,bowling_team_Bangladesh,bowling_team_England,bowling_team_India,bowling_team_New Zealand,bowling_team_Pakistan,bowling_team_South Africa,bowling_team_Sri Lanka,bowling_team_West Indies,overs,total_score,total_wickets,prev_30_runs,prev_30_wickets,prev_30_dot_balls,prev_30_boundaries,total
0,211028,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.1,0,0,0,0,1,0,179
1,211028,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0.2,1,0,1,0,1,0,179


# 5. train_test_split

In [6]:
# Feature matrix: every column except `id` and the target. Target vector: `total`.
X = df.drop(columns=['id', 'total']).to_numpy()
y = df['total'].to_numpy()

In [7]:
# Echo the feature matrix (NumPy abbreviates the repr of large arrays).
X

array([[ 0.,  0.,  1., ...,  0.,  1.,  0.],
       [ 0.,  0.,  1., ...,  0.,  1.,  0.],
       [ 0.,  0.,  1., ...,  0.,  2.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  2., 14.,  4.],
       [ 0.,  0.,  0., ...,  2., 13.,  5.],
       [ 0.,  0.,  0., ...,  1., 12.,  6.]])

In [8]:
# Echo the target vector taken from the `total` column.
y

array([179, 179, 179, ...,  71,  71,  71], dtype=int64)

The `stratify` parameter makes the split preserve proportions: each value passed to `stratify` appears in the train and test sets in the same proportion as in the full data.

For example, if `y` is a binary categorical variable with 25% zeros and 75% ones, `stratify=y` ensures that both the train and the test set contain 25% zeros and 75% ones. In this notebook `y` is the numeric `total` column rather than a class label, so the split keeps the distribution of `total` values the same in both sets.

In [9]:
# Hold out 25% of the rows for testing, with a fixed seed for reproducibility and
# stratification on the target values.
# NOTE(review): rows are split individually, so rows sharing the same `id` can end
# up in both train and test — consider a grouped split if that counts as leakage.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [10]:
# Cast both feature matrices to 32-bit floats.
X_train, X_test = (
    np.asarray(part).astype(np.float32) for part in (X_train, X_test)
)

In [11]:
# Echo the training feature matrix after the float32 cast.
X_train

array([[ 0.,  0.,  0., ...,  3., 10.,  1.],
       [ 0.,  0.,  0., ...,  0., 13., 12.],
       [ 0.,  0.,  0., ...,  0., 10.,  1.],
       ...,
       [ 0.,  0.,  0., ...,  1.,  6.,  6.],
       [ 0.,  0.,  0., ...,  2.,  6.,  2.],
       [ 0.,  0.,  0., ...,  2., 14.,  2.]], dtype=float32)

In [12]:
# Echo the test feature matrix after the float32 cast.
X_test

array([[ 0.,  1.,  0., ...,  1., 11.,  5.],
       [ 0.,  0.,  0., ...,  0.,  7.,  4.],
       [ 0.,  0.,  0., ...,  2., 16.,  4.],
       ...,
       [ 0.,  0.,  1., ...,  0.,  8.,  1.],
       [ 0.,  0.,  0., ...,  0., 12.,  7.],
       [ 0.,  0.,  1., ...,  0.,  4.,  8.]], dtype=float32)

In [13]:
# Print the shapes of the four split arrays on one line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(179337, 25) (59780, 25) (179337,) (59780,)


# 6. Training LR model

In [14]:
# Fit a linear regression on the training split (fit returns the estimator itself).
LR_model = LinearRegression().fit(X_train, y_train)
LR_model  # echo the fitted estimator as the cell output

LinearRegression()

# 7. Creating a pickle file for the regression model

In [15]:
# Persist the fitted model to disk. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises part-way through.
filename = 'lr-model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(LR_model, model_file)

In [16]:
# Predict on the held-out rows and report the mean absolute error against y_test.
prediction = LR_model.predict(X_test)
mean_absolute_error(y_true=y_test, y_pred=prediction)

18.81745701187149

# 8. Function to predict scores

In [17]:
# Team codes in one-hot slot order. The order mirrors the batting_team_* /
# bowling_team_* column order of the training frame (Australia ... West Indies).
_TEAM_CODES = ('AUS', 'BAN', 'ENG', 'IND', 'NZ', 'PAK', 'SA', 'SL', 'WI')


def _one_hot_team(team_code, role):
    """Return the 9-slot one-hot list for ``team_code``; raise ValueError if it is unknown."""
    if team_code not in _TEAM_CODES:
        raise ValueError(
            f"Unknown {role} {team_code!r}; expected one of {', '.join(_TEAM_CODES)}"
        )
    return [int(team_code == code) for code in _TEAM_CODES]


def score_prediction(bat_team, bowl_team, overs, total_score, total_wickets, prev_30_runs, prev_30_wickets, prev_30_dot_balls, prev_30_boundaries, model=None, margin=19):
    """Print the predicted score and a +/- ``margin`` range for one match situation.

    Parameters
    ----------
    bat_team, bowl_team : str
        Team codes: one of 'AUS', 'BAN', 'ENG', 'IND', 'NZ', 'PAK', 'SA', 'SL', 'WI'.
    overs, total_score, total_wickets : number
        Value for the ``overs`` feature, current score and current wickets.
    prev_30_runs, prev_30_wickets, prev_30_dot_balls, prev_30_boundaries : number
        Runs, wickets, dot balls and boundaries in the last 30 balls.
    model : object with ``predict``, optional
        Estimator used for the prediction. Defaults to the notebook-level
        ``LR_model`` so existing calls keep working.
    margin : int, optional
        Half-width of the printed range. The default of 19 keeps the previous
        hard-coded value.

    Returns
    -------
    None
        The result is printed, not returned.

    Raises
    ------
    ValueError
        If ``bat_team`` or ``bowl_team`` is not a known team code. Previously an
        unknown code silently produced a short feature vector.
    """
    if model is None:
        model = LR_model

    # Layout must match the training columns: 9 batting slots, 9 bowling slots,
    # then the seven in-play state features.
    features = (
        _one_hot_team(bat_team, 'bat_team')
        + _one_hot_team(bowl_team, 'bowl_team')
        + [overs, total_score, total_wickets, prev_30_runs, prev_30_wickets, prev_30_dot_balls, prev_30_boundaries]
    )
    data = np.array([features])

    # predict() returns an array. Take its single element explicitly, because
    # int() on an array with ndim > 0 is deprecated in recent NumPy releases.
    my_prediction = int(np.ravel(model.predict(data))[0])

    print('Predicted score: ', my_prediction)
    print('Predicted score range: ', my_prediction - margin, 'to ', my_prediction + margin)

In [18]:
# Example match situation passed to the predictor.
bat_team, bowl_team = 'AUS', 'IND'
overs = 10.3
total_score = 67         # current score
total_wickets = 2        # current wickets
prev_30_runs = 37        # runs in last 30 balls
prev_30_wickets = 1      # wickets in last 30 balls
prev_30_dot_balls = 8    # dots in last 30 balls
prev_30_boundaries = 5   # boundaries in last 30 balls

score_prediction(
    bat_team=bat_team,
    bowl_team=bowl_team,
    overs=overs,
    total_score=total_score,
    total_wickets=total_wickets,
    prev_30_runs=prev_30_runs,
    prev_30_wickets=prev_30_wickets,
    prev_30_dot_balls=prev_30_dot_balls,
    prev_30_boundaries=prev_30_boundaries,
)

Predicted score:  140
Predicted score range:  121 to  159
