<a href="https://colab.research.google.com/github/SarangGami/IPL-First-Innings-Score-Prediction/blob/main/IPL_first_innings_score_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
# Data manipulation libraries
import pandas as pd
import numpy as np

# Data visualization libraries
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
import seaborn as sns

# Preprocessing libraries
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

# For build pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline


# Machine learning models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.ensemble import VotingRegressor,StackingRegressor


# for plot decision tree
from sklearn import tree

# Model selection libraries
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

# importing XGB regressor
from xgboost import XGBRegressor

# Metrics libraries for model evaluation
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error

# Warnings module handles warnings in Python
import warnings
warnings.filterwarnings('ignore')

### **Data Gathering & accessing**

In [37]:
# Loading the dataset
df = pd.read_csv('ipl.csv')

In [38]:
df.head()

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
0,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,SC Ganguly,P Kumar,1,0,0.1,1,0,0,0,222
1,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,1,0,0.2,1,0,0,0,222
2,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.2,2,0,0,0,222
3,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.3,2,0,0,0,222
4,1,2008-04-18,M Chinnaswamy Stadium,Kolkata Knight Riders,Royal Challengers Bangalore,BB McCullum,P Kumar,2,0,0.4,2,0,0,0,222


In [39]:
df.tail()

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
76009,617,2017-05-21,"Rajiv Gandhi International Stadium, Uppal",Mumbai Indians,Rising Pune Supergiant,KH Pandya,DT Christian,121,7,19.2,40,0,40,12,129
76010,617,2017-05-21,"Rajiv Gandhi International Stadium, Uppal",Mumbai Indians,Rising Pune Supergiant,KH Pandya,DT Christian,127,7,19.3,46,0,46,12,129
76011,617,2017-05-21,"Rajiv Gandhi International Stadium, Uppal",Mumbai Indians,Rising Pune Supergiant,KH Pandya,DT Christian,128,7,19.4,47,0,47,12,129
76012,617,2017-05-21,"Rajiv Gandhi International Stadium, Uppal",Mumbai Indians,Rising Pune Supergiant,MG Johnson,DT Christian,129,7,19.5,48,0,47,13,129
76013,617,2017-05-21,"Rajiv Gandhi International Stadium, Uppal",Mumbai Indians,Rising Pune Supergiant,KH Pandya,DT Christian,129,8,19.6,47,1,47,13,129


In [40]:
df.shape

(76014, 15)

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76014 entries, 0 to 76013
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   mid             76014 non-null  int64  
 1   date            76014 non-null  object 
 2   venue           76014 non-null  object 
 3   bat_team        76014 non-null  object 
 4   bowl_team       76014 non-null  object 
 5   batsman         76014 non-null  object 
 6   bowler          76014 non-null  object 
 7   runs            76014 non-null  int64  
 8   wickets         76014 non-null  int64  
 9   overs           76014 non-null  float64
 10  runs_last_5     76014 non-null  int64  
 11  wickets_last_5  76014 non-null  int64  
 12  striker         76014 non-null  int64  
 13  non-striker     76014 non-null  int64  
 14  total           76014 non-null  int64  
dtypes: float64(1), int64(8), object(6)
memory usage: 8.7+ MB


In [42]:
df.describe()

Unnamed: 0,mid,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
count,76014.0,76014.0,76014.0,76014.0,76014.0,76014.0,76014.0,76014.0,76014.0
mean,308.62774,74.889349,2.415844,9.783068,33.216434,1.120307,24.962283,8.869287,160.901452
std,178.156878,48.823327,2.015207,5.772587,14.914174,1.053343,20.079752,10.795742,29.246231
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0
25%,154.0,34.0,1.0,4.6,24.0,0.0,10.0,1.0,142.0
50%,308.0,70.0,2.0,9.6,34.0,1.0,20.0,5.0,162.0
75%,463.0,111.0,4.0,14.6,43.0,2.0,35.0,13.0,181.0
max,617.0,263.0,10.0,19.6,113.0,7.0,175.0,109.0,263.0


In [43]:
df.duplicated().sum()

np.int64(0)

In [44]:
df.isnull().sum()

mid               0
date              0
venue             0
bat_team          0
bowl_team         0
batsman           0
bowler            0
runs              0
wickets           0
overs             0
runs_last_5       0
wickets_last_5    0
striker           0
non-striker       0
total             0
dtype: int64

### **Data Cleaning**

In [45]:
ipl_df = df.copy()

In [46]:
ipl_df.sample(5)

Unnamed: 0,mid,date,venue,bat_team,bowl_team,batsman,bowler,runs,wickets,overs,runs_last_5,wickets_last_5,striker,non-striker,total
36555,297,2012-05-13,Sawai Mansingh Stadium,Rajasthan Royals,Pune Warriors,R Dravid,M Kartik,8,0,0.6,8,0,6,0,170
34997,284,2012-05-04,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Deccan Chargers,MS Dhoni,Ankit Sharma,85,2,10.2,35,1,30,4,160
141,2,2008-04-19,"Punjab Cricket Association Stadium, Mohali",Chennai Super Kings,Kings XI Punjab,PA Patel,B Lee,26,1,2.4,26,1,15,9,240
34402,279,2012-04-30,"MA Chidambaram Stadium, Chepauk",Chennai Super Kings,Kolkata Knight Riders,RA Jadeja,Iqbal Abdulla,89,4,13.1,24,2,8,1,139
48732,396,2014-04-20,Sharjah Cricket Stadium,Rajasthan Royals,Kings XI Punjab,AM Rahane,AR Patel,1,0,0.4,1,0,1,0,191


In [47]:
# Removing unwanted columns

columns_to_remove = ['mid', 'venue', 'batsman', 'bowler', 'striker', 'non-striker']
ipl_df.drop(labels=columns_to_remove, axis=1, inplace=True)

In [48]:
ipl_df.sample(5)

Unnamed: 0,date,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5,total
1182,2008-04-25,Kings XI Punjab,Mumbai Indians,111,3,12.1,37,2,182
62333,2016-04-12,Royal Challengers Bangalore,Sunrisers Hyderabad,44,1,5.2,43,1,227
30383,2012-04-08,Pune Warriors,Kings XI Punjab,1,0,0.1,1,0,166
41905,2013-04-21,Mumbai Indians,Delhi Daredevils,45,2,8.2,34,1,161
44645,2013-05-05,Mumbai Indians,Chennai Super Kings,47,3,10.1,26,3,139


In [49]:
ipl_df['bat_team'].unique()

array(['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
       'Mumbai Indians', 'Deccan Chargers', 'Kings XI Punjab',
       'Royal Challengers Bangalore', 'Delhi Daredevils',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant'], dtype=object)

In [50]:
# Keeping only consistent teams
consistent_teams = ['Kolkata Knight Riders', 'Chennai Super Kings', 'Rajasthan Royals',
                    'Mumbai Indians', 'Kings XI Punjab', 'Royal Challengers Bangalore',
                    'Delhi Daredevils', 'Sunrisers Hyderabad']

In [51]:
ipl_df = ipl_df[(ipl_df['bat_team'].isin(consistent_teams)) & (ipl_df['bowl_team'].isin(consistent_teams))]

In [52]:
# Removing the first 5 overs data in every match
ipl_df = ipl_df[ipl_df['overs']>=5.0]

In [53]:
ipl_df.shape

(40108, 9)

In [54]:
# Converting the column 'date' from string into datetime object

ipl_df['date']= pd.to_datetime(ipl_df['date'])

In [55]:
ipl_df['crr'] =ipl_df['runs']/ipl_df['overs']

In [56]:
# Split the 'overs' column into its integer and decimal parts
overs_int = ipl_df['overs'].apply(lambda x: int(x))
overs_dec = ipl_df['overs'].apply(lambda x: float('0.' + str(x).split('.')[1]))

# Calculate the number of balls left in each over
balls_left = (20 - overs_int - 1) * 6 + (6 - ((overs_dec * 10) % 10 + 1))

# Add the 'balls left' column to the DataFrame
ipl_df['balls_left'] = balls_left.astype(int)

In [57]:
ipl_df['wickets_left'] = 10-ipl_df['wickets']

In [58]:
ipl_df.reset_index(drop=True, inplace=True)

In [59]:
ipl_df

Unnamed: 0,date,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5,total,crr,balls_left,wickets_left
0,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,61,0,5.1,59,0,222,11.960784,88,10
1,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.2,59,1,222,11.730769,87,9
2,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.3,59,1,222,11.509434,86,9
3,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.4,59,1,222,11.296296,85,9
4,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.5,58,1,222,11.090909,84,9
...,...,...,...,...,...,...,...,...,...,...,...,...
40103,2017-05-19,Kolkata Knight Riders,Mumbai Indians,106,9,18.1,29,4,107,5.856354,10,1
40104,2017-05-19,Kolkata Knight Riders,Mumbai Indians,107,9,18.2,29,4,107,5.879121,9,1
40105,2017-05-19,Kolkata Knight Riders,Mumbai Indians,107,9,18.3,28,4,107,5.846995,8,1
40106,2017-05-19,Kolkata Knight Riders,Mumbai Indians,107,9,18.4,24,4,107,5.815217,7,1


In [60]:
ipl_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40108 entries, 0 to 40107
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            40108 non-null  datetime64[ns]
 1   bat_team        40108 non-null  object        
 2   bowl_team       40108 non-null  object        
 3   runs            40108 non-null  int64         
 4   wickets         40108 non-null  int64         
 5   overs           40108 non-null  float64       
 6   runs_last_5     40108 non-null  int64         
 7   wickets_last_5  40108 non-null  int64         
 8   total           40108 non-null  int64         
 9   crr             40108 non-null  float64       
 10  balls_left      40108 non-null  int64         
 11  wickets_left    40108 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(7), object(2)
memory usage: 3.7+ MB


## **pre-processing and model implementation**

In [61]:
ipl_df.head()

Unnamed: 0,date,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5,total,crr,balls_left,wickets_left
0,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,61,0,5.1,59,0,222,11.960784,88,10
1,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.2,59,1,222,11.730769,87,9
2,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.3,59,1,222,11.509434,86,9
3,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.4,59,1,222,11.296296,85,9
4,2008-04-18,Kolkata Knight Riders,Royal Challengers Bangalore,61,1,5.5,58,1,222,11.090909,84,9


In [62]:
# Splitting the data into train and test set

X_train = ipl_df.drop(labels='total', axis=1)[ipl_df['date'].dt.year <= 2015]
X_test = ipl_df.drop(labels='total', axis=1)[ipl_df['date'].dt.year >= 2016]
y_train = ipl_df[ipl_df['date'].dt.year <= 2015]['total'].values
y_test = ipl_df[ipl_df['date'].dt.year >= 2016]['total'].values

In [63]:
# split the Dataset into independent(x) and dependent(y) Dataset

X = ipl_df.drop(columns=['total'])
y = ipl_df['total']

In [64]:
# calling train_test_split() to get the training and testing data.

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

# split sizes
print(X_train.shape)
print(X_test.shape)

(32086, 11)
(8022, 11)


In [65]:
# Removing the 'date' column

X_train.drop(labels='date', axis=1, inplace=True)
X_test.drop(labels='date', axis=1, inplace=True)

In [66]:
# split sizes
print(X_train.shape)
print(X_test.shape)

(32086, 10)
(8022, 10)


In [67]:
X_train.head()

Unnamed: 0,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5,crr,balls_left,wickets_left
34151,Mumbai Indians,Chennai Super Kings,136,2,15.2,47,2,8.947368,27,8
15132,Mumbai Indians,Rajasthan Royals,164,4,16.6,68,1,9.879518,17,6
13576,Chennai Super Kings,Delhi Daredevils,89,3,12.1,46,1,7.355372,46,7
11954,Rajasthan Royals,Kolkata Knight Riders,124,4,16.4,45,1,7.560976,19,6
2017,Royal Challengers Bangalore,Mumbai Indians,57,3,10.1,22,3,5.643564,58,7


In [68]:
X_test.head()

Unnamed: 0,bat_team,bowl_team,runs,wickets,overs,runs_last_5,wickets_last_5,crr,balls_left,wickets_left
22763,Royal Challengers Bangalore,Kolkata Knight Riders,24,1,6.3,20,1,3.809524,80,9
22840,Royal Challengers Bangalore,Kolkata Knight Riders,111,8,19.2,33,5,5.78125,3,2
12223,Kings XI Punjab,Rajasthan Royals,195,6,19.5,25,4,10.0,0,4
18423,Chennai Super Kings,Delhi Daredevils,64,0,7.2,45,0,8.888889,75,10
151,Chennai Super Kings,Kings XI Punjab,151,4,14.4,63,1,10.486111,31,6


In [69]:
trf = ColumnTransformer([
    ('trf',OneHotEncoder(sparse_output=False,drop='first'),[0,1])
]
,remainder='passthrough')

## **LinearRegression**

In [70]:
pipe1 = Pipeline(steps=[
    ('step1',trf),
    ('step2',StandardScaler()),
    ('step3',LinearRegression())
])


# fit the pipeline on training dataset
pipe1.fit(X_train,y_train)

# predict the train and test dataset 
y_pred_train = pipe1.predict(X_train)
y_pred = pipe1.predict(X_test)

# LinearRegression model all output scores
print('\033[1mTraining data R2 and Adjusted R2 Score\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print('R2 score',r2_score(y_train,y_pred_train))
print('Adjusted R2 score', (1-(1-r2_score(y_train,y_pred_train))*((X_train.shape[0]-1)/(X_train.shape[0]-X_train.shape[1]-1))))

print('\n')
print('\033[1mTesting data R2 and Adjusted R2 Score\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print('R2 score',r2_score(y_test,y_pred))
print('Adjusted R2 score', (1-(1-r2_score(y_test,y_pred))*((X_test.shape[0]-1)/(X_test.shape[0]-X_test.shape[1]-1))))

print('\n')
print('\033[1mThe performance metrics\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print('MAE',mean_absolute_error(y_test,y_pred))
print('MSE',mean_squared_error(y_test,y_pred))
print('RMSE',np.sqrt(mean_squared_error(y_test,y_pred)))

[1mTraining data R2 and Adjusted R2 Score[0m
[1m-----------------------------------------[0m
R2 score 0.6628634929898946
Adjusted R2 score 0.6627583841802265


[1mTesting data R2 and Adjusted R2 Score[0m
[1m-----------------------------------------[0m
R2 score 0.6737098030957681
Adjusted R2 score 0.673302500390857


[1mThe performance metrics[0m
[1m-----------------------------------------[0m
MAE 12.79919552059396
MSE 295.34685451159174
RMSE 17.18565839622072


## **DecisionTreeRegressor with hyperparameter tuning**

In [71]:
# apply DecisionTreeRegressor algorithm with hyperparameter tuning

# giving parameters
parameters = {
    'criterion':['squared_error'],                    # 'friedman_mse', 'absolute_error'
    'splitter' :['best'],                             # random
    'max_depth' :[7],                                 # 4,5,6,7,8,9,None
    'max_features' :[1.0]                             # 0.25,0.50,0.75,0.85
}

# we use gridsearchCV because the dataset is not that big
dtr = GridSearchCV(DecisionTreeRegressor(), param_grid=parameters , cv=5, n_jobs=-1)


pipe2 = Pipeline(steps=[
    ('step1',trf),
    ('step3',dtr)
])


# fit the pipeline on training dataset
pipe2.fit(X_train,y_train)

# predict the train and test dataset 
y_pred_train = pipe2.predict(X_train)
y_pred = pipe2.predict(X_test)


# DecisionTreeRegressor model all output scores
print('\033[1mTraining data R2 and Adjusted R2 Score\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print('R2 score',r2_score(y_train,y_pred_train))
print('Adjusted R2 score', (1-(1-r2_score(y_train,y_pred_train))*((X_train.shape[0]-1)/(X_train.shape[0]-X_train.shape[1]-1))))

print('\n')
print('\033[1mTesting data R2 and Adjusted R2 Score\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print('R2 score',r2_score(y_test,y_pred))
print('Adjusted R2 score', (1-(1-r2_score(y_test,y_pred))*((X_test.shape[0]-1)/(X_test.shape[0]-X_test.shape[1]-1))))

print('\n')
print('\033[1mCross-validation score and best params\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print("The best parameters is", dtr.best_params_)
print('cross-validation score', dtr.best_score_)

print('\n')
print('\033[1mThe performance metrics\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print('MAE',mean_absolute_error(y_test,y_pred))
print('MSE',mean_squared_error(y_test,y_pred))
print('RMSE',np.sqrt(mean_squared_error(y_test,y_pred)))

[1mTraining data R2 and Adjusted R2 Score[0m
[1m-----------------------------------------[0m
R2 score 0.7163238893459634
Adjusted R2 score 0.7162354478461492


[1mTesting data R2 and Adjusted R2 Score[0m
[1m-----------------------------------------[0m
R2 score 0.7212783027325309
Adjusted R2 score 0.7209303790060704


[1mCross-validation score and best params[0m
[1m-----------------------------------------[0m
The best parameters is {'criterion': 'squared_error', 'max_depth': 7, 'max_features': 1.0, 'splitter': 'best'}
cross-validation score 0.6954193047824366


[1mThe performance metrics[0m
[1m-----------------------------------------[0m
MAE 11.747063896702855
MSE 252.28945691016395
RMSE 15.883622285554512


## **RandomForestRegressor**

In [72]:
# apply RandomForestRegressor algorithm


# giving parameters
parameters = {
    'n_estimators':[100],                        
    'max_features' :['sqrt']                   
}
 
# we use gridsearchCV because the dataset is not that big so we use this not RandomizedSearchCV
rfr = GridSearchCV(RandomForestRegressor(), param_grid=parameters , cv=5, n_jobs=-1)

pipe4 = Pipeline(steps=[
    ('step1',trf),
    ('step3',rfr)
])

# fit the pipeline on training dataset
pipe4.fit(X_train,y_train)

# predict the train and test dataset 
y_pred_train = pipe4.predict(X_train)
y_pred = pipe4.predict(X_test)

# RandomForestRegressor model all output scores
print('\033[1mTraining data R2 and Adjusted R2 Score\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print('R2 score',r2_score(y_train,y_pred_train))
print('Adjusted R2 score', (1-(1-r2_score(y_train,y_pred_train))*((X_train.shape[0]-1)/(X_train.shape[0]-X_train.shape[1]-1))))

print('\n')
print('\033[1mTesting data R2 and Adjusted R2 Score\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print('R2 score',r2_score(y_test,y_pred))
print('Adjusted R2 score', (1-(1-r2_score(y_test,y_pred))*((X_test.shape[0]-1)/(X_test.shape[0]-X_test.shape[1]-1))))

print('\n')
print('\033[1mCross-validation score and best params\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print("The best parameters is", rfr.best_params_)
print('cross-validation score', rfr.best_score_)

print('\n')
print('\033[1mThe performance metrics\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print('MAE',mean_absolute_error(y_test,y_pred))
print('MSE',mean_squared_error(y_test,y_pred))
print('RMSE',np.sqrt(mean_squared_error(y_test,y_pred)))

[1mTraining data R2 and Adjusted R2 Score[0m
[1m-----------------------------------------[0m
R2 score 0.9905605797830253
Adjusted R2 score 0.9905576368616793


[1mTesting data R2 and Adjusted R2 Score[0m
[1m-----------------------------------------[0m
R2 score 0.9382502838986867
Adjusted R2 score 0.938173202740153


[1mCross-validation score and best params[0m
[1m-----------------------------------------[0m
The best parameters is {'max_features': 'sqrt', 'n_estimators': 100}
cross-validation score 0.9222385684872713


[1mThe performance metrics[0m
[1m-----------------------------------------[0m
MAE 4.490871928138479
MSE 55.8937552845313
RMSE 7.476212629703044


## **XGBRegressor**

In [73]:
# apply XGBRegressor algorithm


# giving parameters
parameters = {
    'max_depth' :[6]                                                         
}
 
# we use gridsearchCV because the dataset is not that big so we use this not RandomizedSearchCV
xgbr = GridSearchCV(XGBRegressor(objective='reg:squarederror'), param_grid=parameters , cv=5, n_jobs=-1)

pipe5 = Pipeline(steps=[
    ('step1',trf),
    ('step3',xgbr)
])

# fit the pipeline on training dataset
pipe5.fit(X_train,y_train)

# predict the train and test dataset 
y_pred_train = pipe5.predict(X_train)
y_pred = pipe5.predict(X_test)

# GradientBoostingRegressor model all output scores
print('\033[1mTraining data R2 and Adjusted R2 Score\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print('R2 score',r2_score(y_train,y_pred_train))
print('Adjusted R2 score', (1-(1-r2_score(y_train,y_pred_train))*((X_train.shape[0]-1)/(X_train.shape[0]-X_train.shape[1]-1))))

print('\n')
print('\033[1mTesting data R2 and Adjusted R2 Score\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print('R2 score',r2_score(y_test,y_pred))
print('Adjusted R2 score', (1-(1-r2_score(y_test,y_pred))*((X_test.shape[0]-1)/(X_test.shape[0]-X_test.shape[1]-1))))

print('\n')
print('\033[1mCross-validation score and best params\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print("The best parameters is", xgbr.best_params_)
print('cross-validation score', xgbr.best_score_)

print('\n')
print('\033[1mThe performance metrics\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print('MAE',mean_absolute_error(y_test,y_pred))
print('MSE',mean_squared_error(y_test,y_pred))
print('RMSE',np.sqrt(mean_squared_error(y_test,y_pred)))

[1mTraining data R2 and Adjusted R2 Score[0m
[1m-----------------------------------------[0m
R2 score 0.9063966039146877
Adjusted R2 score 0.9063674212502807


[1mTesting data R2 and Adjusted R2 Score[0m
[1m-----------------------------------------[0m
R2 score 0.8736689933815254
Adjusted R2 score 0.8735112964565241


[1mCross-validation score and best params[0m
[1m-----------------------------------------[0m
The best parameters is {'max_depth': 6}
cross-validation score 0.8638022894292995


[1mThe performance metrics[0m
[1m-----------------------------------------[0m
MAE 7.458634366301996
MSE 114.35055599601942
RMSE 10.693481939762156


## **GradientBoostingRegressor**

In [74]:
# apply GradientBoostingRegressor algorithm with hyperparameter tuning as step2


# giving parameters
parameters = {
    'n_estimators' :[100],                                     
    'learning_rate' :[0.1],                                       
    'max_depth' : [3]                                                          
}
 
# we use gridsearchCV because the dataset is not that big so we use this not RandomizedSearchCV
gbr = GridSearchCV(GradientBoostingRegressor(), param_grid=parameters , cv=5, n_jobs=-1)

pipe6 = Pipeline(steps=[
    ('step1',trf),
    ('step2',StandardScaler()),
    ('step3',gbr)
])

# fit the pipeline on training dataset
pipe6.fit(X_train,y_train)

# predict the train and test dataset 
y_pred_train = pipe6.predict(X_train)
y_pred = pipe6.predict(X_test)

# GradientBoostingRegressor model all output scores
print('\033[1mTraining data R2 and Adjusted R2 Score\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print('R2 score',r2_score(y_train,y_pred_train))
print('Adjusted R2 score', (1-(1-r2_score(y_train,y_pred_train))*((X_train.shape[0]-1)/(X_train.shape[0]-X_train.shape[1]-1))))

print('\n')
print('\033[1mTesting data R2 and Adjusted R2 Score\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print('R2 score',r2_score(y_test,y_pred))
print('Adjusted R2 score', (1-(1-r2_score(y_test,y_pred))*((X_test.shape[0]-1)/(X_test.shape[0]-X_test.shape[1]-1))))

print('\n')
print('\033[1mCross-validation score and best params\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print("The best parameters is", gbr.best_params_)
print('cross-validation score', gbr.best_score_)

print('\n')
print('\033[1mThe performance metrics\033[0m')
print('\033[1m' + '-----------------------------------------' + '\033[0m')
print('MAE',mean_absolute_error(y_test,y_pred))
print('MSE',mean_squared_error(y_test,y_pred))
print('RMSE',np.sqrt(mean_squared_error(y_test,y_pred)))

[1mTraining data R2 and Adjusted R2 Score[0m
[1m-----------------------------------------[0m
R2 score 0.7240368749171924
Adjusted R2 score 0.7239508380894191


[1mTesting data R2 and Adjusted R2 Score[0m
[1m-----------------------------------------[0m
R2 score 0.7272505206554479
Adjusted R2 score 0.7269100519507362


[1mCross-validation score and best params[0m
[1m-----------------------------------------[0m
The best parameters is {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
cross-validation score 0.7177284008776925


[1mThe performance metrics[0m
[1m-----------------------------------------[0m
MAE 11.569278396475594
MSE 246.88360716436537
RMSE 15.712530259775647


## Conclusion -

- After conducting a thorough analysis, we have identified XGBRegressor and RandomForestRegressor as the top-performing models in our study. Both models demonstrated high r2 scores during cross-validation.

- However, upon further evaluation, we found that the RandomForestRegressor demonstrated slightly better accuracy than XGBRegressor. While both models performed well, we did observe slight overfitting with the RandomForestRegressor on our training data.

- Given these observations, we have ultimately decided to deploy the XGBRegressor pipeline for our task, as it represents a strong balance between accuracy and generalizability.

- We are confident that the XGBRegressor pipeline will produce reliable and accurate predictions, and we recommend its use for further analysis and decision-making.

In [75]:
import pickle

pickle.dump(pipe5,open('pipe.pkl','wb'))

In [76]:
from joblib import dump, load
dump(pipe5, 'model_new.joblib')

['model_new.joblib']

In [78]:
X_train.columns


Index(['bat_team', 'bowl_team', 'runs', 'wickets', 'overs', 'runs_last_5',
       'wickets_last_5', 'crr', 'balls_left', 'wickets_left'],
      dtype='object')