In [1]:
import tpot



In [2]:
from tpot import TPOTRegressor
from sklearn.model_selection import RepeatedKFold
import pandas as pd 

In [3]:
# Load the ball-level IPL dataset (judging by its file name, the output of the
# EDA / feature-engineering step). The path is relative to the working directory.
ipl_data = pd.read_csv(filepath_or_buffer="../Datasets/eda_feature_engineering.csv")

In [4]:
# Teams kept for modelling: the franchises that play consistently.
# (Author's note: batting_team and bowling_team hold the same set of values.)
ipl_teams = [
    'Kolkata Knight Riders',
    'Royal Challengers Bangalore',
    'Chennai Super Kings',
    'Punjab Kings',
    'Rajasthan Royals',
    'Mumbai Indians',
    'Sunrisers Hyderabad',
    'Delhi Capitals',
]

# Two franchises were renamed, so map the old names onto the current ones:
#   Delhi Daredevils -> Delhi Capitals
#   Kings XI Punjab  -> Punjab Kings
current_team_names = {"Delhi Daredevils": "Delhi Capitals", "Kings XI Punjab": "Punjab Kings"}

# A dict passed to replace() is applied to every column of the frame,
# not only to the two team columns.
ipl_data.replace(current_team_names, inplace=True)

In [5]:
# Teams to discard: every team name present in the data that is not in ipl_teams.
# Both team columns are scanned so that a team which only ever appears as the
# bowling side is still picked up (the drop step filters on both columns).
teams_in_data = pd.concat([ipl_data.batting_team, ipl_data.bowling_team]).unique()
non_ipl_teams = [team for team in teams_in_data if team not in ipl_teams]

In [6]:
# Remove every row in which either side is one of non_ipl_teams.
# A single vectorised mask replaces one in-place drop() per team, so the frame
# is scanned once and the cell no longer mutates ipl_data in place.
# Note: unlike `==`, isin() also matches missing values if NaN is present in
# non_ipl_teams, so rows without a team name would be removed as well.
involves_dropped_team = (
    ipl_data.batting_team.isin(non_ipl_teams)
    | ipl_data.bowling_team.isin(non_ipl_teams)
)

# Renumber the remaining rows 0..n-1, discarding the old index.
ipl_data = ipl_data.loc[~involves_dropped_team].reset_index(drop=True)

In [7]:
# Discard the first five overs (rows with over < 5) to get more consistent
# score results; only rows from over 5 onwards are kept.
past_first_five_overs = ipl_data["over"] >= 5
ipl_data = ipl_data[past_first_five_overs]
ipl_data.head(20)

Unnamed: 0,id,inning,over,ball,total_runs,is_wicket,batting_team,bowling_team,final_score,wickets,runs,last_5_over_wickets,last_5_over_runs,last_5_over_balls,venue,winner
32,335982,1,5.1,33,1,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,0,61,0,60,32,M Chinnaswamy Stadium,Kolkata Knight Riders
33,335982,1,5.2,34,0,1,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,61,1,60,33,M Chinnaswamy Stadium,Kolkata Knight Riders
34,335982,1,5.3,35,0,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,61,1,60,34,M Chinnaswamy Stadium,Kolkata Knight Riders
35,335982,1,5.4,36,0,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,61,1,60,35,M Chinnaswamy Stadium,Kolkata Knight Riders
36,335982,1,5.5,37,0,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,61,1,60,36,M Chinnaswamy Stadium,Kolkata Knight Riders
37,335982,1,5.6,38,0,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,61,1,60,37,M Chinnaswamy Stadium,Kolkata Knight Riders
38,335982,1,6.1,39,1,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,62,1,61,38,M Chinnaswamy Stadium,Kolkata Knight Riders
39,335982,1,6.2,40,1,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,63,1,62,39,M Chinnaswamy Stadium,Kolkata Knight Riders
40,335982,1,6.3,41,1,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,64,1,63,40,M Chinnaswamy Stadium,Kolkata Knight Riders
41,335982,1,6.4,42,2,0,Kolkata Knight Riders,Royal Challengers Bangalore,222,1,66,1,65,41,M Chinnaswamy Stadium,Kolkata Knight Riders


In [8]:
# Number the teams by how often they appear in the `winner` column:
# the least frequent winner gets 0 and the most frequent gets the highest code
# (Sunrisers Hyderabad = 0 ... Mumbai Indians = 7 in the recorded output).
# NOTE(review): value_counts() counts rows of ipl_data, so if each row is one
# delivery this ranks by balls played in won matches rather than by matches
# won — confirm this is intended.
teams_by_ascending_wins = ipl_data.winner.value_counts().sort_values().index
encoded_teams = {team: code for code, team in enumerate(teams_by_ascending_wins)}
encoded_teams

{'Sunrisers Hyderabad': 0,
 'Delhi Capitals': 1,
 'Rajasthan Royals': 2,
 'Royal Challengers Bangalore': 3,
 'Punjab Kings': 4,
 'Kolkata Knight Riders': 5,
 'Chennai Super Kings': 6,
 'Mumbai Indians': 7}

In [9]:
# Replace the team names in both team columns with their integer codes.
# Series.map() yields NaN for any name that is not a key of encoded_teams.
ipl_data = ipl_data.assign(
    batting_team=ipl_data.batting_team.map(encoded_teams),
    bowling_team=ipl_data.bowling_team.map(encoded_teams),
)

In [11]:
# One-hot encode the venue column.
# drop_first=True omits the dummy column of the first venue category, so that
# venue is represented by a row of all zeros (the original note names it as
# "Barbati Stadium" — presumably Barabati Stadium; verify against the data).
encoded_venue = pd.get_dummies(data=ipl_data["venue"], drop_first=True)

encoded_venue.head()

Unnamed: 0,Brabourne Stadium,Buffalo Park,De Beers Diamond Oval,Dr DY Patil Sports Academy,Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium,Dubai International Cricket Stadium,Eden Gardens,Feroz Shah Kotla,Himachal Pradesh Cricket Association Stadium,Holkar Cricket Stadium,...,"Rajiv Gandhi International Stadium, Uppal","Sardar Patel Stadium, Motera",Sawai Mansingh Stadium,Shaheed Veer Narayan Singh International Stadium,Sharjah Cricket Stadium,Sheikh Zayed Stadium,St George's Park,Subrata Roy Sahara Stadium,SuperSport Park,Wankhede Stadium
32,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Columns kept for modelling: over, wickets, runs, last_5_over_wickets,
# last_5_over_runs, batting_team and bowling_team as features, plus
# final_score, which is our target variable.
# The one-hot venue columns are held separately in encoded_venue.
# NOTE(review): the original comment also listed last_5_over_balls as a
# feature, but it is not selected here — confirm which is intended.
model_columns = [
    "over",
    "wickets",
    "runs",
    "last_5_over_wickets",
    "last_5_over_runs",
    "batting_team",
    "bowling_team",
    "final_score",
]
ipl_data = ipl_data.loc[:, model_columns]
ipl_data.head()

Unnamed: 0,over,wickets,runs,last_5_over_wickets,last_5_over_runs,batting_team,bowling_team,final_score
32,5.1,0,61,0,60,5,3,222
33,5.2,1,61,1,60,5,3,222
34,5.3,1,61,1,60,5,3,222
35,5.4,1,61,1,60,5,3,222
36,5.5,1,61,1,60,5,3,222


In [13]:
# Append the one-hot venue columns to the selected columns; concat along the
# column axis aligns the two frames on their row index.
ipl_data = pd.concat(objs=[ipl_data, encoded_venue], axis="columns").copy()
ipl_data.head()

Unnamed: 0,over,wickets,runs,last_5_over_wickets,last_5_over_runs,batting_team,bowling_team,final_score,Brabourne Stadium,Buffalo Park,...,"Rajiv Gandhi International Stadium, Uppal","Sardar Patel Stadium, Motera",Sawai Mansingh Stadium,Shaheed Veer Narayan Singh International Stadium,Sharjah Cricket Stadium,Sheikh Zayed Stadium,St George's Park,Subrata Roy Sahara Stadium,SuperSport Park,Wankhede Stadium
32,5.1,0,61,0,60,5,3,222,0,0,...,0,0,0,0,0,0,0,0,0,0
33,5.2,1,61,1,60,5,3,222,0,0,...,0,0,0,0,0,0,0,0,0,0
34,5.3,1,61,1,60,5,3,222,0,0,...,0,0,0,0,0,0,0,0,0,0
35,5.4,1,61,1,60,5,3,222,0,0,...,0,0,0,0,0,0,0,0,0,0
36,5.5,1,61,1,60,5,3,222,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Separate the target from the predictors:
# y is the final_score column, X is every remaining column.
y = ipl_data["final_score"]
X = ipl_data.drop("final_score", axis=1)

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the rows as the test set.
# random_state is fixed so that Restart & Run All reproduces the same split
# (and therefore the same downstream scores).
# NOTE(review): rows are shuffled individually; if each row is one delivery,
# balls from the same innings (sharing one final_score) can land on both sides
# of the split and inflate test scores — consider a group-wise split by match; confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity check: feature/target shapes of the train and test partitions.
(X_train.shape, y_train.shape), (X_test.shape, y_test.shape)

(((88046, 36), (88046,)), ((22012, 36), (22012,)))

In [18]:
from sklearn.preprocessing import MinMaxScaler

In [19]:
# Min-max scale the features. The per-column minimum and maximum are learned
# from the training split only and then applied unchanged to both splits,
# so no information from the test rows enters the scaler.
scaler = MinMaxScaler().fit(X_train)

# transform() returns NumPy arrays, so X_train / X_test lose their column labels here.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)