In [1]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, RobustScaler, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

In [2]:
# Read the pre-built modelling table from disk, then list the columns it provides.
csv_path = "../raw_data/final_to_model_df.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)
df.columns

Index(['ID', 'City', 'Date', 'Season', 'MatchNumber', 'Team1', 'Team2',
       'Venue', 'TossWinner', 'TossDecision', 'WinningTeam', 'innings_total',
       'TeamA_batting_average', 'TeamB_batting_average', 'TeamA_innings_total',
       'TeamB_innings_total'],
      dtype='object')

In [3]:
# Discard the scheduling columns; none of them is selected as a model feature below.
unused_cols = ['Date', 'Season', 'MatchNumber']
df = df.drop(labels=unused_cols, axis='columns')

In [4]:
# Remove every row that contains a missing value.
# The result is reassigned (no in-place mutation) and the rows are renumbered
# 0..n-1: the encoded feature frames built further down get a fresh RangeIndex,
# so keeping df on the same index means the targets sliced from df stay
# label-aligned with those frames even when rows were dropped here.
df = df.dropna(axis=0).reset_index(drop=True)

In [35]:
# Preview the first rows of the cleaned frame
df.head()

Unnamed: 0,ID,City,Team1,Team2,Venue,TossWinner,TossDecision,WinningTeam,innings_total,TeamA_batting_average,TeamB_batting_average,TeamA_innings_total,TeamB_innings_total
0,1312200,ahmedabad,rajasthan royals,gujarat titans,"narendra modi stadium, ahmedabad",rajasthan royals,bat,gujarat titans,130,155.397906,166.4375,130.0,133.0
1,1312199,ahmedabad,royal challengers bangalore,rajasthan royals,"narendra modi stadium, ahmedabad",rajasthan royals,field,rajasthan royals,157,155.707965,155.397906,157.0,161.0
2,1312198,kolkata,royal challengers bangalore,lucknow super giants,eden gardens,lucknow super giants,field,royal challengers bangalore,207,155.707965,169.866667,207.0,193.0
3,1312197,kolkata,rajasthan royals,gujarat titans,eden gardens,gujarat titans,field,gujarat titans,188,155.397906,166.4375,188.0,191.0
4,1304116,mumbai,sunrisers hyderabad,punjab kings,wankhede stadium,sunrisers hyderabad,bat,punjab kings,157,155.546053,158.518349,157.0,160.0


In [36]:
# Binary flag column: 1 when Team1 is the side recorded in WinningTeam, otherwise 0
team1_won = df['Team1'].eq(df['WinningTeam'])
df['Team_1_Win'] = team1_won.astype(int)
df.head()

Unnamed: 0,ID,City,Team1,Team2,Venue,TossWinner,TossDecision,WinningTeam,innings_total,TeamA_batting_average,TeamB_batting_average,TeamA_innings_total,TeamB_innings_total,Team_1_Win
0,1312200,ahmedabad,rajasthan royals,gujarat titans,"narendra modi stadium, ahmedabad",rajasthan royals,bat,gujarat titans,130,155.397906,166.4375,130.0,133.0,0
1,1312199,ahmedabad,royal challengers bangalore,rajasthan royals,"narendra modi stadium, ahmedabad",rajasthan royals,field,rajasthan royals,157,155.707965,155.397906,157.0,161.0,0
2,1312198,kolkata,royal challengers bangalore,lucknow super giants,eden gardens,lucknow super giants,field,royal challengers bangalore,207,155.707965,169.866667,207.0,193.0,1
3,1312197,kolkata,rajasthan royals,gujarat titans,eden gardens,gujarat titans,field,gujarat titans,188,155.397906,166.4375,188.0,191.0,0
4,1304116,mumbai,sunrisers hyderabad,punjab kings,wankhede stadium,sunrisers hyderabad,bat,punjab kings,157,155.546053,158.518349,157.0,160.0,0


In [None]:
# CREATE A SEPARATE X and y to try an XGBoost model

In [6]:
# Feature matrix and regression target for the innings-total model
feature_cols = [
    'City', 'Venue', 'TossWinner', 'TossDecision', 'Team1', 'Team2',
    'TeamA_batting_average', 'TeamB_batting_average',
]
target_col = 'TeamA_innings_total'
X = df.loc[:, feature_cols]
y = df[target_col]
X.columns

Index(['City', 'Venue', 'TossWinner', 'TossDecision', 'Team1', 'Team2',
       'TeamA_batting_average', 'TeamB_batting_average'],
      dtype='object')

In [31]:
# Split the feature names by how they must be prepared for a model

categorical_cols = ['City', 'Venue', 'TossWinner', 'TossDecision', 'Team1', 'Team2']
numerical_cols = ['TeamA_batting_average', 'TeamB_batting_average']

# One-hot encode the text columns and min-max scale the numeric columns
one_hot = OneHotEncoder(drop='if_binary', handle_unknown='ignore')
min_max = MinMaxScaler()
preprocessor = ColumnTransformer(transformers=[
    ('cat', one_hot, categorical_cols),
    ('num', min_max, numerical_cols),
])

# Preprocessing-only pipeline (no estimator is attached at this stage)
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

encoded_matrix = pipeline.fit_transform(X)

# Wrap the sparse result in a DataFrame labelled with the generated feature names
X_encoded = pd.DataFrame.sparse.from_spmatrix(
    encoded_matrix,
    columns=preprocessor.get_feature_names_out(),
)

X_encoded

Unnamed: 0,cat__City_abu dhabi,cat__City_ahmedabad,cat__City_bangalore,cat__City_bengaluru,cat__City_bloemfontein,cat__City_cape town,cat__City_centurion,cat__City_chandigarh,cat__City_chennai,cat__City_cuttack,...,cat__Team2_lucknow super giants,cat__Team2_mumbai indians,cat__Team2_pune warriors,cat__Team2_punjab kings,cat__Team2_rajasthan royals,cat__Team2_rising pune supergiant,cat__Team2_royal challengers bangalore,cat__Team2_sunrisers hyderabad,num__TeamA_batting_average,num__TeamB_batting_average
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.575459,0.899382
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.584557,0.575459
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.584557,1.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.575459,0.899382
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.579806,0.667019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500405,0.515889
944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.672768,0.584557
945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.575459,0.527237
946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.726434,0.667019


In [32]:
# Summarise the encoded frame: row/column counts, dtypes and memory footprint
X_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 948 entries, 0 to 947
Columns: 117 entries, cat__City_abu dhabi to num__TeamB_batting_average
dtypes: Sparse[float64, 0](117)
memory usage: 84.7 KB


In [33]:
# Split the data into training and testing sets (20% held out).
# NOTE(review): X_encoded was produced by fitting the preprocessor on every
# row, so the encoder and scaler have already seen the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Initialize the gradient boosting regressor; the fixed seed makes the
# metrics printed below repeatable from run to run
model = GradientBoostingRegressor(random_state=42)

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")

Mean Squared Error: 746.90
Root Mean Squared Error: 27.33
R-squared (R2) Score: -0.06


In [22]:
# Preview the first rows of the un-encoded feature frame
X.head()

Unnamed: 0,City,Venue,TossWinner,TossDecision,Team1,Team2,TeamA_batting_average,TeamB_batting_average
0,ahmedabad,"narendra modi stadium, ahmedabad",rajasthan royals,bat,rajasthan royals,gujarat titans,155.397906,166.4375
1,ahmedabad,"narendra modi stadium, ahmedabad",rajasthan royals,field,royal challengers bangalore,rajasthan royals,155.707965,155.397906
2,kolkata,eden gardens,lucknow super giants,field,royal challengers bangalore,lucknow super giants,155.707965,169.866667
3,kolkata,eden gardens,gujarat titans,field,rajasthan royals,gujarat titans,155.397906,166.4375
4,mumbai,wankhede stadium,sunrisers hyderabad,bat,sunrisers hyderabad,punjab kings,155.546053,158.518349


In [29]:
# Compare several regressors with an exhaustive grid search

# The only parameter searched is which estimator fills the 'model' step.
# The stochastic estimators are seeded so the comparison is repeatable.
param_grid = {
    'model': [
        RandomForestRegressor(n_jobs=-1, random_state=42),
        GradientBoostingRegressor(random_state=42),
        LinearRegression(),
    ]
}

# Single-step pipeline; its 'model' step is replaced by each grid candidate
pipeline2 = Pipeline([
    ('model', RandomForestRegressor(n_jobs=-1, random_state=42))
])

# Build the grid search. The variable keeps its existing name `random_search`
# even though this is GridSearchCV (every candidate is tried), not a
# randomized search.
random_search = GridSearchCV(
    estimator=pipeline2,
    param_grid=param_grid,
    scoring='r2',
    n_jobs=-1,
    cv=5
)

# Fit the grid search on the encoded features and the regression target
random_search.fit(X_encoded, y)

# Keep the pipeline refit with the best-scoring estimator
best_model = random_search.best_estimator_

# Print the best hyperparameters and score
print("Best Hyperparameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Best Hyperparameters: {'model': GradientBoostingRegressor()}
Best Score: -0.11212634489556979


In [38]:
# Create a separate X2 and y2 for the classification models (Team 1 wins or not)
X2 = df[['City', 'Venue', 'TossWinner', 'TossDecision', 'Team1', 'Team2', 'TeamA_batting_average',
       'TeamB_batting_average']]
y2 = df['Team_1_Win']

# Encode the new X2 data
# Choose which columns are one-hot encoded and which are scaled

categorical_cols = ['City', 'Venue', 'TossWinner', 'TossDecision', 'Team1', 'Team2']
numerical_cols = ['TeamA_batting_average', 'TeamB_batting_average']

# Create a column transformer to normalise the different types of data
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(drop='if_binary', handle_unknown='ignore'), categorical_cols),
    ('num', MinMaxScaler(), numerical_cols)
])

# Create the pipeline to run the data through
pipeline3 = Pipeline([
    ('preprocessor', preprocessor)
])

# Encode X2, the frame built in this cell (the original passed the regression
# frame X here, which made this cell silently depend on an earlier one)
X2_encoded = pipeline3.fit_transform(X2)

# Get the column names from the preprocessor
X2_encoded = pd.DataFrame.sparse.from_spmatrix(X2_encoded, columns=preprocessor.get_feature_names_out())

X2_encoded

Unnamed: 0,cat__City_abu dhabi,cat__City_ahmedabad,cat__City_bangalore,cat__City_bengaluru,cat__City_bloemfontein,cat__City_cape town,cat__City_centurion,cat__City_chandigarh,cat__City_chennai,cat__City_cuttack,...,cat__Team2_lucknow super giants,cat__Team2_mumbai indians,cat__Team2_pune warriors,cat__Team2_punjab kings,cat__Team2_rajasthan royals,cat__Team2_rising pune supergiant,cat__Team2_royal challengers bangalore,cat__Team2_sunrisers hyderabad,num__TeamA_batting_average,num__TeamB_batting_average
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.575459,0.899382
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.584557,0.575459
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.584557,1.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.575459,0.899382
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.579806,0.667019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500405,0.515889
944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.672768,0.584557
945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.575459,0.527237
946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.726434,0.667019


In [40]:
from sklearn.metrics import accuracy_score
# No trailing comma here: an unparenthesised `from ... import a,` is a
# SyntaxError and would stop the whole cell from running.
from sklearn.linear_model import LogisticRegression

# Try with a simple Logistic Regression
# Split data into training and testing sets (20% held out)
X_train, X_test, y_train, y_test = train_test_split(X2_encoded, y2, test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model: share of held-out rows predicted correctly, to 2 decimals
accuracy = round(accuracy_score(y_test, y_pred), 2)

print(f'Accuracy: {accuracy}')

Accuracy: 0.52


In [43]:
import xgboost as xgb

# Gradient-boosted tree classifier, trained on the X_train / y_train split
# left in scope by the previous cell
xgb_params = {
    'objective': 'binary:logistic',  # binary classification
    'n_estimators': 1000,            # number of boosting rounds
    'max_depth': 5,                  # maximum depth of each tree
    'learning_rate': 0.01,           # learning rate
}
clf = xgb.XGBClassifier(**xgb_params)
clf.fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = clf.predict(X_test)

# Share of held-out rows predicted correctly, rounded to two decimals
test_accuracy = accuracy_score(y_test, y_pred)
accuracy = round(test_accuracy, 2)

print(f'Accuracy: {accuracy}')

Accuracy: 0.45
