## Import Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

import time
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams

rcParams["figure.figsize"] = (10, 6)
sns.set()

## Import Datasets

In [2]:
# importing filtered csv 

ipl_data = pd.read_csv("../Datasets/eda_feature_engineering.csv")

In [3]:
ipl_data.head()

Unnamed: 0,id,inning,over,ball,total_runs,is_wicket,batting_team,bowling_team,final_score,wickets,runs,last_5_over_wickets,last_5_over_runs,last_5_over_balls,venue,winner
0,335982,1,0.1,1,1,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,0,1,0,1,1,M Chinnaswamy Stadium,Kolkata Knight Riders
1,335982,1,0.2,2,0,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,0,1,0,1,2,M Chinnaswamy Stadium,Kolkata Knight Riders
2,335982,1,0.3,3,1,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,0,2,0,2,3,M Chinnaswamy Stadium,Kolkata Knight Riders
3,335982,1,0.4,4,0,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,0,2,0,2,4,M Chinnaswamy Stadium,Kolkata Knight Riders
4,335982,1,0.5,5,0,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,0,2,0,2,5,M Chinnaswamy Stadium,Kolkata Knight Riders


In [4]:
ipl_data.shape

(193096, 16)

In [5]:
ipl_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193096 entries, 0 to 193095
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   193096 non-null  int64  
 1   inning               193096 non-null  int64  
 2   over                 193096 non-null  float64
 3   ball                 193096 non-null  int64  
 4   total_runs           193096 non-null  int64  
 5   is_wicket            193096 non-null  int64  
 6   batting_team         193096 non-null  object 
 7   bowling_team         193096 non-null  object 
 8   final_score          193096 non-null  int64  
 9   wickets              193096 non-null  int64  
 10  runs                 193096 non-null  int64  
 11  last_5_over_wickets  193096 non-null  int64  
 12  last_5_over_runs     193096 non-null  int64  
 13  last_5_over_balls    193096 non-null  int64  
 14  venue                193096 non-null  object 
 15  winner           

## Consistent Team Selection

In [6]:
ipl_data.batting_team.unique()

array(['Kolkata Knight Riders', 'Royal Challengers Bangalore',
       'Chennai Super Kings', 'Kings XI Punjab', 'Rajasthan Royals',
       'Delhi Daredevils', 'Mumbai Indians', 'Deccan Chargers',
       'Kochi Tuskers Kerala', 'Pune Warriors', 'Sunrisers Hyderabad',
       'Rising Pune Supergiants', 'Gujarat Lions',
       'Rising Pune Supergiant', 'Delhi Capitals'], dtype=object)

### Consistent IPL teams

In [7]:
# Note that batting team and bowling team have same values
# Choosing consistent playing teams

ipl_teams = ['Kolkata Knight Riders', 'Royal Challengers Bangalore',
       'Chennai Super Kings', 'Punjab Kings', 'Rajasthan Royals',
       'Mumbai Indians','Sunrisers Hyderabad','Delhi Capitals']

# Delhi Daredevils name was changed to Delhi Capitals
# Kings XI Punjab name was changed to  Punjab Kings

# let's rename this teams with current name

ipl_data.replace({"Delhi Daredevils": "Delhi Capitals", "Kings XI Punjab": "Punjab Kings"}, inplace=True)

- [Kings XI name changed in 2021](https://en.wikipedia.org/wiki/Punjab_Kings#:~:text=the%20High%20Court.-,Name%20change,after%2013%20seasons%20of%20IPL.)
- [Delhi Daredevils name changed in 2018](https://en.wikipedia.org/wiki/Delhi_Capitals#:~:text=In%20December%202018%2C%20the%20team,Daredevils%20to%20the%20Delhi%20Capitals.)

### Drop non consistent teams

In [8]:
# Drop team values of unselected teams
non_ipl_teams = list(filter(lambda x: x not in ipl_teams, ipl_data.batting_team.unique()))

In [9]:
non_ipl_teams

['Deccan Chargers',
 'Kochi Tuskers Kerala',
 'Pune Warriors',
 'Rising Pune Supergiants',
 'Gujarat Lions',
 'Rising Pune Supergiant']

In [10]:
# Dropping non_ipl_teams

for i in non_ipl_teams:
    ipl_data.drop(ipl_data.loc[(ipl_data.batting_team == i) | (ipl_data.bowling_team == i)].index, axis=0, inplace=True)
    
ipl_data = ipl_data.reset_index(drop=True)

### Team feature verification

In [11]:
len(ipl_teams)

8

In [12]:
ipl_data.batting_team.nunique()

8

In [13]:
ipl_data.bowling_team.nunique()

8

In [14]:
ipl_data.winner.nunique()

8

In [15]:
ipl_teams.sort()

In [16]:
ipl_teams

['Chennai Super Kings',
 'Delhi Capitals',
 'Kolkata Knight Riders',
 'Mumbai Indians',
 'Punjab Kings',
 'Rajasthan Royals',
 'Royal Challengers Bangalore',
 'Sunrisers Hyderabad']

## Over Selection

In [17]:
# Will drop the first five over to get better consistent score result.
ipl_data = ipl_data.loc[ipl_data.over >= 5]
ipl_data.head(20)

Unnamed: 0,id,inning,over,ball,total_runs,is_wicket,batting_team,bowling_team,final_score,wickets,runs,last_5_over_wickets,last_5_over_runs,last_5_over_balls,venue,winner
32,335982,1,5.1,33,1,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,0,61,0,60,32,M Chinnaswamy Stadium,Kolkata Knight Riders
33,335982,1,5.2,34,0,1,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,61,1,60,33,M Chinnaswamy Stadium,Kolkata Knight Riders
34,335982,1,5.3,35,0,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,61,1,60,34,M Chinnaswamy Stadium,Kolkata Knight Riders
35,335982,1,5.4,36,0,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,61,1,60,35,M Chinnaswamy Stadium,Kolkata Knight Riders
36,335982,1,5.5,37,0,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,61,1,60,36,M Chinnaswamy Stadium,Kolkata Knight Riders
37,335982,1,5.6,38,0,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,61,1,60,37,M Chinnaswamy Stadium,Kolkata Knight Riders
38,335982,1,6.1,39,1,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,62,1,61,38,M Chinnaswamy Stadium,Kolkata Knight Riders
39,335982,1,6.2,40,1,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,63,1,62,39,M Chinnaswamy Stadium,Kolkata Knight Riders
40,335982,1,6.3,41,1,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,64,1,63,40,M Chinnaswamy Stadium,Kolkata Knight Riders
41,335982,1,6.4,42,2,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,66,1,65,41,M Chinnaswamy Stadium,Kolkata Knight Riders


In [18]:
ipl_data.tail(20)

Unnamed: 0,id,inning,over,ball,total_runs,is_wicket,batting_team,bowling_team,final_score,wickets,runs,last_5_over_wickets,last_5_over_runs,last_5_over_balls,venue,winner
149259,1237181,2,15.3,93,4,0,Mumbai Indians,Delhi Capitals,157,2,135,2,135,92,Dubai International Cricket Stadium,Mumbai Indians
149260,1237181,2,15.4,94,1,0,Mumbai Indians,Delhi Capitals,157,2,136,2,136,93,Dubai International Cricket Stadium,Mumbai Indians
149261,1237181,2,15.5,95,0,0,Mumbai Indians,Delhi Capitals,157,2,136,2,136,94,Dubai International Cricket Stadium,Mumbai Indians
149262,1237181,2,15.6,96,1,0,Mumbai Indians,Delhi Capitals,157,2,137,2,137,95,Dubai International Cricket Stadium,Mumbai Indians
149263,1237181,2,16.1,97,0,0,Mumbai Indians,Delhi Capitals,157,2,137,2,137,96,Dubai International Cricket Stadium,Mumbai Indians
149264,1237181,2,16.2,98,0,1,Mumbai Indians,Delhi Capitals,157,3,137,3,137,97,Dubai International Cricket Stadium,Mumbai Indians
149265,1237181,2,16.3,99,1,0,Mumbai Indians,Delhi Capitals,157,3,138,3,138,98,Dubai International Cricket Stadium,Mumbai Indians
149266,1237181,2,16.4,100,4,0,Mumbai Indians,Delhi Capitals,157,3,142,3,142,99,Dubai International Cricket Stadium,Mumbai Indians
149267,1237181,2,16.5,101,4,0,Mumbai Indians,Delhi Capitals,157,3,146,3,146,100,Dubai International Cricket Stadium,Mumbai Indians
149268,1237181,2,16.6,102,1,0,Mumbai Indians,Delhi Capitals,157,3,147,3,147,101,Dubai International Cricket Stadium,Mumbai Indians


## Feature Encoding

In [19]:
ipl_data.head()

Unnamed: 0,id,inning,over,ball,total_runs,is_wicket,batting_team,bowling_team,final_score,wickets,runs,last_5_over_wickets,last_5_over_runs,last_5_over_balls,venue,winner
32,335982,1,5.1,33,1,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,0,61,0,60,32,M Chinnaswamy Stadium,Kolkata Knight Riders
33,335982,1,5.2,34,0,1,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,61,1,60,33,M Chinnaswamy Stadium,Kolkata Knight Riders
34,335982,1,5.3,35,0,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,61,1,60,34,M Chinnaswamy Stadium,Kolkata Knight Riders
35,335982,1,5.4,36,0,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,61,1,60,35,M Chinnaswamy Stadium,Kolkata Knight Riders
36,335982,1,5.5,37,0,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,61,1,60,36,M Chinnaswamy Stadium,Kolkata Knight Riders


### Convert Categorical Data to Numeric Data

In [20]:
# Convert 'batting_team' and 'bowling_team' from categorical to numeric data.
ipl_data.winner.value_counts()

Mumbai Indians                 18652
Chennai Super Kings            16981
Kolkata Knight Riders          14554
Punjab Kings                   12726
Royal Challengers Bangalore    12505
Rajasthan Royals               12226
Delhi Capitals                 12208
Sunrisers Hyderabad            10206
Name: winner, dtype: int64

In [21]:
# Will numer the team as per the winning counts Mumbai being the highest as 7 and 0 Sunrisers being lowest winning count.

encoded_teams = {k:v for v, k in enumerate(ipl_data.winner.value_counts().sort_values().index, 0)}
encoded_teams

{'Sunrisers Hyderabad': 0,
 'Delhi Capitals': 1,
 'Rajasthan Royals': 2,
 'Royal Challengers Bangalore': 3,
 'Punjab Kings': 4,
 'Kolkata Knight Riders': 5,
 'Chennai Super Kings': 6,
 'Mumbai Indians': 7}

In [22]:
# Mapping encoded values to new data frame

df1 = ipl_data.copy()


df1.batting_team = df1.batting_team.map(encoded_teams)
df1.bowling_team = df1.bowling_team.map(encoded_teams)

In [23]:
df1.head()

Unnamed: 0,id,inning,over,ball,total_runs,is_wicket,batting_team,bowling_team,final_score,wickets,runs,last_5_over_wickets,last_5_over_runs,last_5_over_balls,venue,winner
32,335982,1,5.1,33,1,0,5,3,222,0,61,0,60,32,M Chinnaswamy Stadium,Kolkata Knight Riders
33,335982,1,5.2,34,0,1,5,3,222,1,61,1,60,33,M Chinnaswamy Stadium,Kolkata Knight Riders
34,335982,1,5.3,35,0,0,5,3,222,1,61,1,60,34,M Chinnaswamy Stadium,Kolkata Knight Riders
35,335982,1,5.4,36,0,0,5,3,222,1,61,1,60,35,M Chinnaswamy Stadium,Kolkata Knight Riders
36,335982,1,5.5,37,0,0,5,3,222,1,61,1,60,36,M Chinnaswamy Stadium,Kolkata Knight Riders


In [24]:
df1.dtypes

id                       int64
inning                   int64
over                   float64
ball                     int64
total_runs               int64
is_wicket                int64
batting_team             int64
bowling_team             int64
final_score              int64
wickets                  int64
runs                     int64
last_5_over_wickets      int64
last_5_over_runs         int64
last_5_over_balls        int64
venue                   object
winner                  object
dtype: object

### One Hot Encoding (Venue)

In [25]:
df1.venue.sort_values()

70685     Barabati Stadium
71680     Barabati Stadium
71679     Barabati Stadium
71678     Barabati Stadium
71677     Barabati Stadium
                ...       
94325     Wankhede Stadium
94326     Wankhede Stadium
94327     Wankhede Stadium
94321     Wankhede Stadium
111589    Wankhede Stadium
Name: venue, Length: 110058, dtype: object

In [26]:
# One hot encoding of venue column

encoded_venue = pd.get_dummies(df1.venue, drop_first=True)

# 0 being Barbati Stadium when performing get_dummies 

encoded_venue.head()

Unnamed: 0,Brabourne Stadium,Buffalo Park,De Beers Diamond Oval,Dr DY Patil Sports Academy,Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium,Dubai International Cricket Stadium,Eden Gardens,Feroz Shah Kotla,Himachal Pradesh Cricket Association Stadium,Holkar Cricket Stadium,...,"Rajiv Gandhi International Stadium, Uppal","Sardar Patel Stadium, Motera",Sawai Mansingh Stadium,Shaheed Veer Narayan Singh International Stadium,Sharjah Cricket Stadium,Sheikh Zayed Stadium,St George's Park,Subrata Roy Sahara Stadium,SuperSport Park,Wankhede Stadium
32,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Feature Selection

In [27]:
df1.columns

Index(['id', 'inning', 'over', 'ball', 'total_runs', 'is_wicket',
       'batting_team', 'bowling_team', 'final_score', 'wickets', 'runs',
       'last_5_over_wickets', 'last_5_over_runs', 'last_5_over_balls', 'venue',
       'winner'],
      dtype='object')

In [28]:
# over, batting_team, bowling_team, wickets, runs, last_5_over_wickets, last_5_over_runs, last_5_over_balls,venue.
# final_score is our target variable

df2 = df1[["over", "wickets","runs", "last_5_over_wickets", "last_5_over_runs","batting_team", "bowling_team", "final_score"]].copy()

In [29]:
df2.head()

Unnamed: 0,over,wickets,runs,last_5_over_wickets,last_5_over_runs,batting_team,bowling_team,final_score
32,5.1,0,61,0,60,5,3,222
33,5.2,1,61,1,60,5,3,222
34,5.3,1,61,1,60,5,3,222
35,5.4,1,61,1,60,5,3,222
36,5.5,1,61,1,60,5,3,222


In [30]:
df2.reset_index(drop='index',inplace=True)

In [31]:
df2.head()

Unnamed: 0,over,wickets,runs,last_5_over_wickets,last_5_over_runs,batting_team,bowling_team,final_score
0,5.1,0,61,0,60,5,3,222
1,5.2,1,61,1,60,5,3,222
2,5.3,1,61,1,60,5,3,222
3,5.4,1,61,1,60,5,3,222
4,5.5,1,61,1,60,5,3,222


In [32]:
encoded_venue

Unnamed: 0,Brabourne Stadium,Buffalo Park,De Beers Diamond Oval,Dr DY Patil Sports Academy,Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium,Dubai International Cricket Stadium,Eden Gardens,Feroz Shah Kotla,Himachal Pradesh Cricket Association Stadium,Holkar Cricket Stadium,...,"Rajiv Gandhi International Stadium, Uppal","Sardar Patel Stadium, Motera",Sawai Mansingh Stadium,Shaheed Veer Narayan Singh International Stadium,Sharjah Cricket Stadium,Sheikh Zayed Stadium,St George's Park,Subrata Roy Sahara Stadium,SuperSport Park,Wankhede Stadium
32,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149274,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
149275,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
149276,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
149277,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
encoded_venue.reset_index(drop='index',inplace=True)

In [34]:
encoded_venue

Unnamed: 0,Brabourne Stadium,Buffalo Park,De Beers Diamond Oval,Dr DY Patil Sports Academy,Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium,Dubai International Cricket Stadium,Eden Gardens,Feroz Shah Kotla,Himachal Pradesh Cricket Association Stadium,Holkar Cricket Stadium,...,"Rajiv Gandhi International Stadium, Uppal","Sardar Patel Stadium, Motera",Sawai Mansingh Stadium,Shaheed Veer Narayan Singh International Stadium,Sharjah Cricket Stadium,Sheikh Zayed Stadium,St George's Park,Subrata Roy Sahara Stadium,SuperSport Park,Wankhede Stadium
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110053,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
110054,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
110055,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
110056,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Concatenate venue column to df2

df2 = pd.concat([df2, encoded_venue], axis=1).copy()
df2.head()

Unnamed: 0,over,wickets,runs,last_5_over_wickets,last_5_over_runs,batting_team,bowling_team,final_score,Brabourne Stadium,Buffalo Park,...,"Rajiv Gandhi International Stadium, Uppal","Sardar Patel Stadium, Motera",Sawai Mansingh Stadium,Shaheed Veer Narayan Singh International Stadium,Sharjah Cricket Stadium,Sheikh Zayed Stadium,St George's Park,Subrata Roy Sahara Stadium,SuperSport Park,Wankhede Stadium
0,5.1,0,61,0,60,5,3,222,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5.2,1,61,1,60,5,3,222,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5.3,1,61,1,60,5,3,222,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5.4,1,61,1,60,5,3,222,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5.5,1,61,1,60,5,3,222,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
df2.tail()

Unnamed: 0,over,wickets,runs,last_5_over_wickets,last_5_over_runs,batting_team,bowling_team,final_score,Brabourne Stadium,Buffalo Park,...,"Rajiv Gandhi International Stadium, Uppal","Sardar Patel Stadium, Motera",Sawai Mansingh Stadium,Shaheed Veer Narayan Singh International Stadium,Sharjah Cricket Stadium,Sheikh Zayed Stadium,St George's Park,Subrata Roy Sahara Stadium,SuperSport Park,Wankhede Stadium
110053,17.6,4,154,4,154,7,1,157,0,0,...,0,0,0,0,0,0,0,0,0,0
110054,18.1,4,155,4,155,7,1,157,0,0,...,0,0,0,0,0,0,0,0,0,0
110055,18.2,4,156,4,156,7,1,157,0,0,...,0,0,0,0,0,0,0,0,0,0
110056,18.3,5,156,5,156,7,1,157,0,0,...,0,0,0,0,0,0,0,0,0,0
110057,18.4,5,157,5,157,7,1,157,0,0,...,0,0,0,0,0,0,0,0,0,0


## Model Building

In [37]:
df2.columns.nunique()

37

### Train Test Split

In [38]:
# X is the feature matrix and y being the target variable

X = df2.drop(columns="final_score") 
y = df2.final_score

In [39]:
X.shape

(110058, 36)

In [40]:
y.shape

(110058,)

In [41]:
# Train test split the dataset


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

(X_train.shape, y_train.shape), (X_test.shape, y_test.shape)

(((77040, 36), (77040,)), ((33018, 36), (33018,)))

### Scaling 

In [42]:
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### Modeling

- Linear Regression
- Ridge
- Lasso
- Decision Tree
- Random Forest
- Ada Boost
- Gradient Boost 

In [43]:
models = {
    "Linear Regression": {
        "model": LinearRegression(),
        "params": {}
    },
    "Ridge": {
        "model": Ridge(),
        "params": {
            "alpha":np.arange(0.1, 1, 0.01)
        }
    },
    "Lasso": {
        "model": Lasso(),
        "params": {
            "alpha": np.arange(0.1, 1, 0.01)
        }
    },
    "Decision Tree": {
        "model": DecisionTreeRegressor(),
        "params": {
            "criterion": ["mse", "friedman_mse"],
            "splitter": ["best", "random"],
            "max_depth": [1, 3, 5, 7, 9, 10, 11, 12, 14, 15, 18, 20, 25, 28, 30, 33, 38, 40],
            "min_samples_split": [2, 4, 6, 8, 10, 15, 20],
            "min_samples_leaf": [i for i in range(1, 11)],
            "max_leaf_nodes": [None] + [i for i in range(10, 91, 10)],
            "max_features": ["auto", "log2", "sqrt", None]            
        }
    },
    "Random Forest": {
        "model": RandomForestRegressor(),
        "params": {
            "n_estimators": [100, 200, 300],
            "criterion": ["mse", "friedman_mse"],
            "max_depth": [1, 3, 5, 7, 9, 10, 11, 12, 14, 15, 18, 20, 25, 28, 30, 33, 38, 40],
            "min_samples_split": [2, 4, 6, 8, 10, 15, 20],
            "min_samples_leaf": [i for i in range(1, 11)],
            "max_leaf_nodes": [None] + [i for i in range(10, 91, 10)],
            "max_features": ["auto", "log2", "sqrt", None]
        }
    },
    "Ada Boost": {
        "model": AdaBoostRegressor(),
        "params": {
            "n_estimators": [100, 200, 300],
            "learning_rate": np.arange(0.1, 1, 0.01),
            "loss": ['linear', 'square', 'exponential']
        }
    },
    "Gradient Boost": {
        "model": GradientBoostingRegressor(),
        "params": {
            "learning_rate": np.arange(0.1, 1, 0.01),
            "n_estimators": [100, 200, 300],
            "criterion": ['friedman_mse', 'mse'],
            "min_samples_split": [2, 4, 6, 8, 10, 15, 20],
            "min_samples_leaf": [i for i in range(1, 11)],
            "max_depth": [1, 3, 5, 7, 9, 10, 11, 12, 14, 15, 18, 20, 25, 28, 30, 33, 38, 40],
            "max_features": ["auto", "log2", "sqrt", None],
            "max_leaf_nodes": [None] + [i for i in range(10, 91, 10)],
            "alpha": np.arange(0.1, 1, 0.01)
        }
    }
}

### Best Model

In [44]:
start_time = time.time()
best_model = {}
best_model_details = []

for model_name, values in models.items():
    rscv = RandomizedSearchCV(values["model"], values["params"], cv=5, n_iter=15, n_jobs=-1, verbose=2, random_state=4)
    rscv.fit(X_train, y_train)
    print("---fitted---")
    best_model[model_name] = rscv
    best_model_details.append({"Model Name": model_name, "Best Score": rscv.best_score_, "Best Parameters": rscv.best_params_})
    print(model_name)


print("--------------------------------------------------------")
print(f"it takes {(time.time() - start_time) / 60} minutes")
print("--------------------------------------------------------")

Fitting 5 folds for each of 1 candidates, totalling 5 fits




---fitted---
Linear Regression
Fitting 5 folds for each of 15 candidates, totalling 75 fits
---fitted---
Ridge
Fitting 5 folds for each of 15 candidates, totalling 75 fits
---fitted---
Lasso
Fitting 5 folds for each of 15 candidates, totalling 75 fits




---fitted---
Decision Tree
Fitting 5 folds for each of 15 candidates, totalling 75 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


---fitted---
Random Forest
Fitting 5 folds for each of 15 candidates, totalling 75 fits
---fitted---
Ada Boost
Fitting 5 folds for each of 15 candidates, totalling 75 fits
































































































































































































































































































































---fitted---
Gradient Boost
--------------------------------------------------------
it takes 19.69133191506068 minutes
--------------------------------------------------------




In [45]:
pd.set_option('display.max_colwidth', None)
pd.DataFrame(best_model_details)

Unnamed: 0,Model Name,Best Score,Best Parameters
0,Linear Regression,0.52503,{}
1,Ridge,0.52503,{'alpha': 0.19999999999999996}
2,Lasso,0.497843,{'alpha': 0.19999999999999996}
3,Decision Tree,0.496933,"{'splitter': 'best', 'min_samples_split': 2, 'min_samples_leaf': 3, 'max_leaf_nodes': 90, 'max_features': 'auto', 'max_depth': 20, 'criterion': 'friedman_mse'}"
4,Random Forest,0.661075,"{'n_estimators': 300, 'min_samples_split': 15, 'min_samples_leaf': 5, 'max_leaf_nodes': None, 'max_features': 'log2', 'max_depth': 20, 'criterion': 'mse'}"
5,Ada Boost,0.402423,"{'n_estimators': 300, 'loss': 'linear', 'learning_rate': 0.17999999999999997}"
6,Gradient Boost,0.832477,"{'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 5, 'max_leaf_nodes': None, 'max_features': 'log2', 'max_depth': 14, 'learning_rate': 0.6799999999999997, 'criterion': 'mse', 'alpha': 0.7099999999999996}"


### Test Score

In [46]:
test_model = []

for model_name, model in best_model.items():
    test_model.append({"Model Name": model_name, "Test Score": model.score(X_test, y_test)})

pd.DataFrame(test_model)

Unnamed: 0,Model Name,Test Score
0,Linear Regression,0.529161
1,Ridge,0.52917
2,Lasso,0.5033
3,Decision Tree,0.505106
4,Random Forest,0.683246
5,Ada Boost,0.410373
6,Gradient Boost,0.861848


### MSE,MAE,RMSE Values

In [47]:
# let's calculate error using MSE, RMSE and MAE for Gradient Boost algorithm

train_model_error = []
test_model_error = []

for model_name, model in best_model.items():
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    train_mae = mean_absolute_error(y_train, y_pred_train)
    train_mse = mean_squared_error(y_train, y_pred_train)
    train_rmse = np.sqrt(train_mse)

    test_mae = mean_absolute_error(y_test, y_pred_test)
    test_mse = mean_squared_error(y_test, y_pred_test)
    test_rmse = np.sqrt(test_mse)
    
    train_model_error.append({"Model Name": model_name, "Mean Absolute Error": train_mae, "Mean Squared Error": train_mse, "Root Mean Squared Error": train_rmse})
    test_model_error.append({"Model Name": model_name, "Mean Absolute Error": test_mae, "Mean Squared Error": test_mse, "Root Mean Squared Error": test_rmse})
    

train_model_error = pd.DataFrame(train_model_error)
test_model_error = pd.DataFrame(test_model_error)

In [48]:
print("-------- Training Data Error -------")
train_model_error

-------- Training Data Error -------


Unnamed: 0,Model Name,Mean Absolute Error,Mean Squared Error,Root Mean Squared Error
0,Linear Regression,14.93999,409.602129,20.23863
1,Ridge,14.940166,409.603182,20.238656
2,Lasso,15.527418,433.341759,20.816862
3,Decision Tree,15.501726,422.262332,20.549023
4,Random Forest,11.419714,242.500067,15.572414
5,Ada Boost,17.61751,514.087121,22.673489
6,Gradient Boost,1.603841,7.332912,2.707935


In [49]:
print("-------- Test Data Error -------")
test_model_error

-------- Test Data Error -------


Unnamed: 0,Model Name,Mean Absolute Error,Mean Squared Error,Root Mean Squared Error
0,Linear Regression,14.806188,402.628817,20.065613
1,Ridge,14.806251,402.621463,20.06543
2,Lasso,15.370086,424.743787,20.609313
3,Decision Tree,15.507938,423.199466,20.571812
4,Random Forest,12.099161,270.866555,16.458024
5,Ada Boost,17.496106,504.208386,22.454585
6,Gradient Boost,6.731762,118.137958,10.869129


[CV] END ...........................alpha=0.8599999999999995; total time=   0.0s
[CV] END ..........................alpha=0.19999999999999996; total time=   7.0s
[CV] END ..........................alpha=0.46999999999999986; total time=   4.2s
[CV] END ..........................alpha=0.29999999999999993; total time=   6.6s
[CV] END ...........................alpha=0.3699999999999999; total time=   5.2s
[CV] END criterion=friedman_mse, max_depth=5, max_features=sqrt, max_leaf_nodes=40, min_samples_leaf=3, min_samples_split=10, splitter=random; total time=   0.1s
[CV] END criterion=friedman_mse, max_depth=12, max_features=sqrt, max_leaf_nodes=None, min_samples_leaf=10, min_samples_split=10, splitter=random; total time=   0.3s
[CV] END criterion=friedman_mse, max_depth=5, max_features=sqrt, max_leaf_nodes=20, min_samples_leaf=5, min_samples_split=10, splitter=best; total time=   0.2s
[CV] END criterion=friedman_mse, max_depth=3, max_features=auto, max_leaf_nodes=80, min_samples_leaf=5, min

[CV] END .................................................... total time=   0.3s
[CV] END ..........................alpha=0.19999999999999996; total time=   0.1s
[CV] END ...........................alpha=0.7499999999999997; total time=   0.1s
[CV] END ..........................alpha=0.46999999999999986; total time=   0.1s
[CV] END ...........................alpha=0.7099999999999996; total time=   0.1s
[CV] END ...........................alpha=0.7099999999999996; total time=   0.1s
[CV] END ...........................alpha=0.5099999999999998; total time=   0.1s
[CV] END ...........................alpha=0.5099999999999998; total time=   0.1s
[CV] END ...........................alpha=0.8399999999999996; total time=   0.1s
[CV] END ...........................alpha=0.8399999999999996; total time=   0.1s
[CV] END ..........................alpha=0.22999999999999995; total time=   0.1s
[CV] END ..........................alpha=0.22999999999999995; total time=   0.1s
[CV] END ...................

[CV] END ..........................alpha=0.23999999999999994; total time=   0.1s
[CV] END ..........................alpha=0.23999999999999994; total time=   7.3s
[CV] END ...........................alpha=0.7099999999999996; total time=   2.5s
[CV] END ...........................alpha=0.5299999999999998; total time=   3.3s
[CV] END ..........................alpha=0.29999999999999993; total time=   5.4s
[CV] END ...........................alpha=0.7799999999999997; total time=   2.2s
[CV] END ...........................alpha=0.7799999999999997; total time=   1.9s
[CV] END ..........................alpha=0.48999999999999977; total time=   1.3s
[CV] END criterion=friedman_mse, max_depth=7, max_features=sqrt, max_leaf_nodes=20, min_samples_leaf=2, min_samples_split=15, splitter=random; total time=   0.2s
[CV] END criterion=friedman_mse, max_depth=12, max_features=sqrt, max_leaf_nodes=None, min_samples_leaf=10, min_samples_split=10, splitter=random; total time=   0.3s
[CV] END criterion=fried

## Model Selection

### Random Forest model
#### Train Data
- r2_score = 0.661075
- MAE (Mean Absolute Error) = 11.419714

#### Test Data
- r2_score = 0.683246
- MAE (Mean Absolute Error) = 12.099161


### Gradient Boost model
#### Train Data
- r2_score = 0.832477
- MAE (Mean Absolute Error) = 1.603841

#### Test Data
- r2_score = 0.861848
- MAE (Mean Absolute Error) = 6.731762

## Conclusion

1. We can observe minimal Mean Absolute Error difference for Random Forest as compare to Gradient Boost model.
2. R2 score tends to be on higher side for gradient boost model with slight chances of overfit.
3. R2 score for Random forest tends to be on lower side as compared to gradient boost model.
4. But keeping overfitting in mind I will perform predictions based on Random Forest model.

## Predictions

In [52]:
# Saving RF model and scalar in form of pickle file
# encoded_team and feature columns as JSON file

import pickle
import json
with open("../observations/model.pickle", "wb") as f:
    pickle.dump(best_model["Random Forest"], f)
with open("../observations/scaler.pickle", "wb") as f:
    pickle.dump(scaler, f)
with open("../observations/encodedteams.json", "w") as f:
    json.dump(encoded_teams, f)
with open("../observations/columns.json", "w") as f:
    json.dump({"columns": list(X.columns)}, f)

In [55]:
import numpy as np

In [56]:
scaler = None
model = None
encoded_teams = None
columns = None
with open("../observations/model.pickle", "rb") as f:
    model = pickle.load(f)
with open("../observations/scaler.pickle", "rb") as f:
    scaler = pickle.load(f)
with open("../observations/encodedteams.json", "r") as f:
    encoded_teams = json.load(f)
with open("../observations/columns.json", "r") as f:
    columns = np.array(json.load(f)["columns"])

In [57]:
def prediction(over, wickets, runs, last_5_over_wickets, last_5_over_runs, batting_team, bowling_team, venue):
    X_pred = np.zeros(columns.size)
    X_pred[0] = over
    X_pred[1] = wickets
    X_pred[2] = runs
    X_pred[3] = last_5_over_wickets
    X_pred[4] = last_5_over_runs
    X_pred[5] = encoded_teams[batting_team]
    X_pred[6] = encoded_teams[bowling_team]
    if venue != "Barabati Stadium":
        venue_pos = np.where(venue == columns)[0][0]
        X_pred[venue_pos] = 1
    X_pred = scaler.transform([X_pred])
    prediction = model.predict(X_pred)
    return prediction

In [58]:
prediction(7, 0, 52, 0, 24, "Sunrisers Hyderabad", "Delhi Capitals", "Sheikh Zayed Stadium")



array([149.61706453])

In [59]:
columns

array(['over', 'wickets', 'runs', 'last_5_over_wickets',
       'last_5_over_runs', 'batting_team', 'bowling_team',
       'Brabourne Stadium', 'Buffalo Park', 'De Beers Diamond Oval',
       'Dr DY Patil Sports Academy',
       'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium',
       'Dubai International Cricket Stadium', 'Eden Gardens',
       'Feroz Shah Kotla', 'Himachal Pradesh Cricket Association Stadium',
       'Holkar Cricket Stadium', 'JSCA International Stadium Complex',
       'Kingsmead', 'M Chinnaswamy Stadium',
       'MA Chidambaram Stadium, Chepauk',
       'Maharashtra Cricket Association Stadium', 'New Wanderers Stadium',
       'Newlands', 'OUTsurance Oval',
       'Punjab Cricket Association Stadium, Mohali',
       'Rajiv Gandhi International Stadium, Uppal',
       'Sardar Patel Stadium, Motera', 'Sawai Mansingh Stadium',
       'Shaheed Veer Narayan Singh International Stadium',
       'Sharjah Cricket Stadium', 'Sheikh Zayed Stadium',
       "St George's P

In [62]:
encoded_teams

{'Sunrisers Hyderabad': 0,
 'Delhi Capitals': 1,
 'Rajasthan Royals': 2,
 'Royal Challengers Bangalore': 3,
 'Punjab Kings': 4,
 'Kolkata Knight Riders': 5,
 'Chennai Super Kings': 6,
 'Mumbai Indians': 7}

In [63]:
prediction(12.6, 4, 87, 2, 28, "Sunrisers Hyderabad", "Punjab Kings", "Wankhede Stadium")



array([142.8843623])