In [18]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, RobustScaler, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

In [19]:
df = pd.read_csv("../raw_data/final_to_model_df.csv")
df.columns

Index(['ID', 'City', 'Date', 'Season', 'MatchNumber', 'Team1', 'Team2',
       'Venue', 'TossWinner', 'TossDecision', 'WinningTeam', 'innings_total',
       'Team1Players', 'Team2Players', 'TeamA_batting_average',
       'TeamB_batting_average', 'Player_of_Match', 'TeamA_innings_total',
       'TeamB_innings_total'],
      dtype='object')

In [20]:
df = df.drop(columns=['Date', 'Season'])

In [21]:
df.dropna(axis=0, inplace=True)

In [22]:
df.head()

Unnamed: 0,ID,City,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,WinningTeam,innings_total,Team1Players,Team2Players,TeamA_batting_average,TeamB_batting_average,Player_of_Match,TeamA_innings_total,TeamB_innings_total
0,1312200,ahmedabad,final,rajasthan royals,gujarat titans,"narendra modi stadium, ahmedabad",rajasthan royals,bat,gujarat titans,130,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",155.397906,166.4375,hh pandya,130.0,133.0
1,1312199,ahmedabad,qualifier 2,royal challengers bangalore,rajasthan royals,"narendra modi stadium, ahmedabad",rajasthan royals,field,rajasthan royals,157,"['v kohli', 'f du plessis', 'rm patidar', 'gj ...","['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...",155.707965,155.397906,jc buttler,157.0,161.0
2,1312198,kolkata,eliminator,royal challengers bangalore,lucknow super giants,eden gardens,lucknow super giants,field,royal challengers bangalore,207,"['v kohli', 'f du plessis', 'rm patidar', 'gj ...","['q de kock', 'kl rahul', 'm vohra', 'dj hooda...",155.707965,169.866667,rm patidar,207.0,193.0
3,1312197,kolkata,qualifier 1,rajasthan royals,gujarat titans,eden gardens,gujarat titans,field,gujarat titans,188,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",155.397906,166.4375,da miller,188.0,191.0
4,1304116,mumbai,70,sunrisers hyderabad,punjab kings,wankhede stadium,sunrisers hyderabad,bat,punjab kings,157,"['pk garg', 'abhishek sharma', 'ra tripathi', ...","['jm bairstow', 's dhawan', 'm shahrukh khan',...",155.546053,158.518349,harpreet brar,157.0,160.0


In [23]:
# Create a new column "team_x win". 1 will indicate that team_x won the match
df['Team_1_Win'] = (df['Team1'] == df['WinningTeam']).astype(int)
df.head()

Unnamed: 0,ID,City,MatchNumber,Team1,Team2,Venue,TossWinner,TossDecision,WinningTeam,innings_total,Team1Players,Team2Players,TeamA_batting_average,TeamB_batting_average,Player_of_Match,TeamA_innings_total,TeamB_innings_total,Team_1_Win
0,1312200,ahmedabad,final,rajasthan royals,gujarat titans,"narendra modi stadium, ahmedabad",rajasthan royals,bat,gujarat titans,130,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",155.397906,166.4375,hh pandya,130.0,133.0,0
1,1312199,ahmedabad,qualifier 2,royal challengers bangalore,rajasthan royals,"narendra modi stadium, ahmedabad",rajasthan royals,field,rajasthan royals,157,"['v kohli', 'f du plessis', 'rm patidar', 'gj ...","['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...",155.707965,155.397906,jc buttler,157.0,161.0,0
2,1312198,kolkata,eliminator,royal challengers bangalore,lucknow super giants,eden gardens,lucknow super giants,field,royal challengers bangalore,207,"['v kohli', 'f du plessis', 'rm patidar', 'gj ...","['q de kock', 'kl rahul', 'm vohra', 'dj hooda...",155.707965,169.866667,rm patidar,207.0,193.0,1
3,1312197,kolkata,qualifier 1,rajasthan royals,gujarat titans,eden gardens,gujarat titans,field,gujarat titans,188,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",155.397906,166.4375,da miller,188.0,191.0,0
4,1304116,mumbai,70,sunrisers hyderabad,punjab kings,wankhede stadium,sunrisers hyderabad,bat,punjab kings,157,"['pk garg', 'abhishek sharma', 'ra tripathi', ...","['jm bairstow', 's dhawan', 'm shahrukh khan',...",155.546053,158.518349,harpreet brar,157.0,160.0,0


In [24]:
# Create a new Team1 win toss feature
df['team_1_toss_winner'] = (df['Team1'] == df['TossWinner']).astype(int)

In [25]:
df.MatchNumber.unique()

array(['final', 'qualifier 2', 'eliminator', 'qualifier 1', '70', '69',
       '68', '67', '66', '65', '64', '63', '62', '61', '60', '59', '58',
       '57', '56', '55', '54', '53', '52', '51', '50', '49', '48', '47',
       '46', '45', '44', '43', '42', '41', '40', '39', '38', '37', '36',
       '35', '34', '33', '32', '31', '30', '29', '28', '27', '26', '25',
       '24', '23', '22', '21', '20', '19', '18', '17', '16', '15', '14',
       '13', '12', '11', '10', '9', '8', '7', '6', '5', '4', '3', '2',
       '1', 'qualifier', 'elimination final', '72', '71',
       '3rd place play-off', 'semi final'], dtype=object)

In [26]:
# function
def map_match_number(value):
    if isinstance(value, int) or value.isnumeric():
        return 1
    elif value in ['qualifier 2', 'eliminator', 'qualifier 1', 'qualifier', 'elimination final', '3rd place play-off', 'semi final']:
        return 2
    elif value == 'final':
        return 3
    else:
        return 0

In [27]:
# Apply the function
df['MatchNumber'] = df['MatchNumber'].apply(map_match_number)

# Rename it to something more descriptive 
df.rename(columns={'MatchNumber': 'MatchImportance'}, inplace=True)

In [28]:
df.head()

Unnamed: 0,ID,City,MatchImportance,Team1,Team2,Venue,TossWinner,TossDecision,WinningTeam,innings_total,Team1Players,Team2Players,TeamA_batting_average,TeamB_batting_average,Player_of_Match,TeamA_innings_total,TeamB_innings_total,Team_1_Win,team_1_toss_winner
0,1312200,ahmedabad,3,rajasthan royals,gujarat titans,"narendra modi stadium, ahmedabad",rajasthan royals,bat,gujarat titans,130,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",155.397906,166.4375,hh pandya,130.0,133.0,0,1
1,1312199,ahmedabad,2,royal challengers bangalore,rajasthan royals,"narendra modi stadium, ahmedabad",rajasthan royals,field,rajasthan royals,157,"['v kohli', 'f du plessis', 'rm patidar', 'gj ...","['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...",155.707965,155.397906,jc buttler,157.0,161.0,0,0
2,1312198,kolkata,2,royal challengers bangalore,lucknow super giants,eden gardens,lucknow super giants,field,royal challengers bangalore,207,"['v kohli', 'f du plessis', 'rm patidar', 'gj ...","['q de kock', 'kl rahul', 'm vohra', 'dj hooda...",155.707965,169.866667,rm patidar,207.0,193.0,1,0
3,1312197,kolkata,2,rajasthan royals,gujarat titans,eden gardens,gujarat titans,field,gujarat titans,188,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...","['wp saha', 'shubman gill', 'ms wade', 'hh pan...",155.397906,166.4375,da miller,188.0,191.0,0,0
4,1304116,mumbai,1,sunrisers hyderabad,punjab kings,wankhede stadium,sunrisers hyderabad,bat,punjab kings,157,"['pk garg', 'abhishek sharma', 'ra tripathi', ...","['jm bairstow', 's dhawan', 'm shahrukh khan',...",155.546053,158.518349,harpreet brar,157.0,160.0,0,1


## Add Matheus's new features

In [29]:
# Calculate the average points scored against a each team
df['Team1_points_against_avg'] = df.groupby('Team1')['TeamB_innings_total'].transform('mean')
df['Team2_points_against_avg'] = df.groupby('Team2')['TeamA_innings_total'].transform('mean')
df.head()

Unnamed: 0,ID,City,MatchImportance,Team1,Team2,Venue,TossWinner,TossDecision,WinningTeam,innings_total,...,Team2Players,TeamA_batting_average,TeamB_batting_average,Player_of_Match,TeamA_innings_total,TeamB_innings_total,Team_1_Win,team_1_toss_winner,Team1_points_against_avg,Team2_points_against_avg
0,1312200,ahmedabad,3,rajasthan royals,gujarat titans,"narendra modi stadium, ahmedabad",rajasthan royals,bat,gujarat titans,130,...,"['wp saha', 'shubman gill', 'ms wade', 'hh pan...",155.397906,166.4375,hh pandya,130.0,133.0,0,1,151.561798,167.666667
1,1312199,ahmedabad,2,royal challengers bangalore,rajasthan royals,"narendra modi stadium, ahmedabad",rajasthan royals,field,rajasthan royals,157,...,"['ybk jaiswal', 'jc buttler', 'sv samson', 'd ...",155.707965,155.397906,jc buttler,157.0,161.0,0,0,146.976,161.813725
2,1312198,kolkata,2,royal challengers bangalore,lucknow super giants,eden gardens,lucknow super giants,field,royal challengers bangalore,207,...,"['q de kock', 'kl rahul', 'm vohra', 'dj hooda...",155.707965,169.866667,rm patidar,207.0,193.0,1,0,146.976,176.285714
3,1312197,kolkata,2,rajasthan royals,gujarat titans,eden gardens,gujarat titans,field,gujarat titans,188,...,"['wp saha', 'shubman gill', 'ms wade', 'hh pan...",155.397906,166.4375,da miller,188.0,191.0,0,0,151.561798,167.666667
4,1304116,mumbai,1,sunrisers hyderabad,punjab kings,wankhede stadium,sunrisers hyderabad,bat,punjab kings,157,...,"['jm bairstow', 's dhawan', 'm shahrukh khan',...",155.546053,158.518349,harpreet brar,157.0,160.0,0,1,145.594203,164.537037


In [30]:
# Calculate the average number of times that team_1 has MVP
def get_match_winner(player_of_match, team1_players, team2_players):
    if pd.isna(player_of_match):
        return 'N/A'  
    
    if pd.isna(team1_players):
        team1_players = []  
    
    if pd.isna(team2_players):
        team2_players = [] 
    
    if player_of_match in team1_players:
        return 'Team1'
    elif player_of_match in team2_players:
        return 'Team2'
    else:
        return 'N/A'
    
df['Team_MVP'] = df.apply(lambda row: get_match_winner(row['Player_of_Match'], row['Team1Players'], row['Team2Players']), axis=1)

def replace_team_mvp_with_name(row):
    if row['Team_MVP'] == 'Team1':
        return row['Team1']
    elif row['Team_MVP'] == 'Team2':
        return row['Team2']
    else:
        return row['Team_MVP']
    
df['Team_MVP'] = df.apply(replace_team_mvp_with_name, axis=1)
team_mvp_counts = df['Team_MVP'].value_counts().reset_index()

team_mvp_counts.columns = ['Team_MVP', 'MVP_Count']

# Create the columns Team1_MVP_appearances e Team2_MVP_appearances
df = df.merge(team_mvp_counts, left_on='Team1', right_on='Team_MVP', how='left').fillna(0)
df.rename(columns={'MVP_Count': 'Team1_MVP_appearances'}, inplace=True)

df = df.merge(team_mvp_counts, left_on='Team2', right_on='Team_MVP', how='left').fillna(0)
df.rename(columns={'MVP_Count': 'Team2_MVP_appearances'}, inplace=True)

# Drop columns
df.drop(['Team_MVP_x', 'Team_MVP_y', 'Team_MVP'], axis=1, inplace=True)

# Now, let's create the average of MVP appearences
total_games_team1 = df['Team1'].value_counts()
total_games_team2 = df['Team2'].value_counts()

total_games = total_games_team2 + total_games_team1

df['Team1_MVP_average'] = df['Team1_MVP_appearances'] / total_games[df['Team1']].values
df['Team2_MVP_average'] = df['Team2_MVP_appearances'] / total_games[df['Team2']].values
df.head()

Unnamed: 0,ID,City,MatchImportance,Team1,Team2,Venue,TossWinner,TossDecision,WinningTeam,innings_total,...,TeamA_innings_total,TeamB_innings_total,Team_1_Win,team_1_toss_winner,Team1_points_against_avg,Team2_points_against_avg,Team1_MVP_appearances,Team2_MVP_appearances,Team1_MVP_average,Team2_MVP_average
0,1312200,ahmedabad,3,rajasthan royals,gujarat titans,"narendra modi stadium, ahmedabad",rajasthan royals,bat,gujarat titans,130,...,130.0,133.0,0,1,151.561798,167.666667,96,11,0.502618,0.6875
1,1312199,ahmedabad,2,royal challengers bangalore,rajasthan royals,"narendra modi stadium, ahmedabad",rajasthan royals,field,rajasthan royals,157,...,157.0,161.0,0,0,146.976,161.813725,111,96,0.493333,0.502618
2,1312198,kolkata,2,royal challengers bangalore,lucknow super giants,eden gardens,lucknow super giants,field,royal challengers bangalore,207,...,207.0,193.0,1,0,146.976,176.285714,111,9,0.493333,0.6
3,1312197,kolkata,2,rajasthan royals,gujarat titans,eden gardens,gujarat titans,field,gujarat titans,188,...,188.0,191.0,0,0,151.561798,167.666667,96,11,0.502618,0.6875
4,1304116,mumbai,1,sunrisers hyderabad,punjab kings,wankhede stadium,sunrisers hyderabad,bat,punjab kings,157,...,157.0,160.0,0,1,145.594203,164.537037,74,101,0.486842,0.463303


In [35]:
# Create your X and y for the model
X = df[['City', 'Venue', 'MatchImportance', 'TossDecision', 'Team1', 'Team2', 'TeamA_batting_average',
       'TeamB_batting_average', 'team_1_toss_winner', 'Team1_points_against_avg', 'Team2_points_against_avg', 'Team1_MVP_average', 'Team2_MVP_average']]
y = df['TeamA_innings_total']
X.MatchImportance.isna().sum()

0

In [36]:
# Choose which columns to be OneHotEncoded or standardised

categorical_cols = ['City', 'Venue', 'TossDecision', 'Team1', 'Team2']
numerical_cols = ['TeamA_batting_average', 'TeamB_batting_average', 'MatchImportance', 'team_1_toss_winner', 'Team1_points_against_avg', 'Team2_points_against_avg', 'Team1_MVP_average', 'Team2_MVP_average']

# Create a column transformer to normalise the different types of data
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(drop='if_binary', handle_unknown='ignore'), categorical_cols),
    ('num', MinMaxScaler(), numerical_cols)
])

# Create the pipeline to run the data through
pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

X_encoded = pipeline.fit_transform(X)

# Get the column names from the preprocessor
X_encoded = pd.DataFrame.sparse.from_spmatrix(X_encoded, columns=preprocessor.get_feature_names_out())


X_encoded

Unnamed: 0,cat__City_abu dhabi,cat__City_ahmedabad,cat__City_bangalore,cat__City_bengaluru,cat__City_bloemfontein,cat__City_cape town,cat__City_centurion,cat__City_chandigarh,cat__City_chennai,cat__City_cuttack,...,cat__Team2_royal challengers bangalore,cat__Team2_sunrisers hyderabad,num__TeamA_batting_average,num__TeamB_batting_average,num__MatchImportance,num__team_1_toss_winner,num__Team1_points_against_avg,num__Team2_points_against_avg,num__Team1_MVP_average,num__Team2_MVP_average
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.575459,0.899382,1.0,1.0,0.663842,0.754743,0.508800,1.000000
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.584557,0.575459,0.5,0.0,0.524501,0.588195,0.484133,0.508800
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.584557,1.000000,0.5,0.0,0.524501,1.000000,0.484133,0.767528
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.575459,0.899382,0.5,0.0,0.663842,0.754743,0.508800,1.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.579806,0.667019,0.0,1.0,0.482515,0.665688,0.466887,0.404347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.500405,0.515889,0.0,0.0,0.500054,0.524729,0.531630,0.200738
944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.672768,0.584557,0.0,1.0,0.665491,0.702764,0.657114,0.484133
945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.575459,0.527237,0.0,0.0,0.562876,0.588195,0.388662,0.508800
946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.726434,0.667019,0.0,0.0,0.697874,0.567094,0.404347,0.744536


In [37]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Initialize the Linear Regression model
model = GradientBoostingRegressor()

# Fit the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")

Mean Squared Error: 749.07
Root Mean Squared Error: 27.37
R-squared (R2) Score: -0.06


In [None]:
# Initialize the GradientBoostingRegressor model
model = GradientBoostingRegressor()

# Define the hyperparameter grid for the random search
param_grid = {
    'n_estimators': np.arange(50, 251, 10),        
    'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.5],  
    'max_depth': np.arange(3, 11),                
    'min_samples_split': np.arange(2, 11),        
    'min_samples_leaf': np.arange(1, 11),        
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0]      
}

# Initialize the RandomizedSearchCV object
random_search = RandomizedSearchCV(
    model, 
    param_distributions=param_grid, 
    n_iter=50, 
    scoring='r2', 
    cv=5, 
    random_state=42, 
    n_jobs=-1
)

# Fit the random search to the training data
random_search.fit(X_train, y_train)

# Get the best model from the random search
best_model = random_search.best_estimator_

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test)

# Evaluate the best model
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print("Best Model Hyperparameters:")
print(random_search.best_params_)
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")

In [38]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# CREATE A SEPARATE X and y to try a XGBoost model
X2 = df[['City', 'Venue', 'MatchImportance', 'TossDecision', 'Team1', 'Team2', 'TeamA_batting_average',
       'TeamB_batting_average', 'team_1_toss_winner', 'Team1_points_against_avg', 'Team2_points_against_avg', 'Team1_MVP_average', 'Team2_MVP_average']]
y2 = df['Team_1_Win']

# encode the new X2 data
# Choose which columns to be OneHotEncoded or LabelEncoded
categorical_cols = ['City', 'Venue', 'TossDecision', 'Team1', 'Team2' ]
numerical_cols = ['TeamA_batting_average', 'TeamB_batting_average', 'team_1_toss_winner', 'MatchImportance', 'Team1_points_against_avg', 'Team2_points_against_avg']

# Create a column transformer to normalise the different types of data
preprocessor2 = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(drop='if_binary', handle_unknown='ignore'), categorical_cols),
    ('num', RobustScaler(), numerical_cols)
])

# Create the pipeline to run the data through
pipeline3 = Pipeline([
    ('preprocessor', preprocessor2)
])

X2_encoded = pipeline3.fit_transform(X)

# Get the column names from the preprocessor
X2_encoded = pd.DataFrame.sparse.from_spmatrix(X2_encoded, columns=preprocessor2.get_feature_names_out())
X2_encoded



# Try with a simple Logistic Regression
# Split data into training and testing sets
X2_train, X2_test, y2_train, y2_test = train_test_split(X2_encoded, y2, test_size=0.2, random_state=42, shuffle=True)

# Initialize and train the logistic regression model
model = LogisticRegression(penalty='l2', C=5, random_state=42)
model.fit(X2_train, y2_train)

# Make predictions on the testing set
y2_pred = model.predict(X2_test)

# Evaluate the model
accuracy = round(accuracy_score(y2_test, y2_pred), 4)

print(f'Accuracy: {accuracy}')

Accuracy: 0.5053


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
# Do a random grid search to optimize the model

# Define the hyperparameter grid you want to search over
param_grid = {
    'penalty': ['l1', 'l2'],  
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  
    'solver': ['liblinear', 'saga'] 
}

# Create the Logistic Regression model
logistic_regression = LogisticRegression()

# Create the gridsearch
grid_search = GridSearchCV(logistic_regression, 
                           param_grid, 
                           cv=5,
                           n_jobs=-1,
                           scoring='accuracy')

# Fit the random search to the data
grid_search.fit(X2_train, y2_train)

# Get the best hyperparameters from the random search
best_params = grid_search.best_params_

# Get the best estimator (model) from the random search
best_model = grid_search.best_estimator_

# Make predictions on the testing set using the best model
y2_pred = best_model.predict(X2_test)

# Evaluate the best model
accuracy = round(accuracy_score(y2_test, y2_pred), 4)

print(f'Best Hyperparameters: {best_params}')
print(f'Best Model Accuracy: {accuracy}')






Best Hyperparameters: {'C': 1, 'penalty': 'l1', 'solver': 'saga'}
Best Model Accuracy: 0.5316




In [40]:
import xgboost as xgb

# Initialize and train the model
# Create an XGBoost classifier
clf = xgb.XGBClassifier(
    objective='binary:logistic',  
    n_estimators=1000,            
    max_depth=10,                 
    learning_rate=0.01            
)
clf.fit(X2_train, y2_train)

# Make predictions on the testing set
y2_pred = clf.predict(X2_test)

# Evaluate the model
accuracy = round(accuracy_score(y2_test, y2_pred), 2)

print(f'Accuracy: {accuracy}')

Accuracy: 0.51


In [41]:
from scipy.stats import uniform, randint
# RandomSearch the XGBoost

# Define hyperparameter values
param_dist = {
    'n_estimators': randint(300, 500),  
    'max_depth': randint(8,12),  
    'learning_rate': uniform(0.1, 1),  
    'subsample': uniform(0.2, 0.9),  
    'colsample_bytree': uniform(0.1, 0.5),
    'alpha': uniform(0, 2),      
    'lambda': uniform(0, 2),     
    'min_child_weight': randint(1, 10),
    'gamma': uniform(0, 1)
}

# Create the XGBoost classifier
xgb_classifier = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

# Create the RandomizedSearchCV object with 5-fold cross-validation
random_search = RandomizedSearchCV(xgb_classifier, 
                                   param_distributions=param_dist, 
                                   n_iter=500, 
                                   cv=5,
                                   n_jobs=-1,
                                   scoring='accuracy', 
                                   random_state=42)

# Fit the random search to the data
random_search.fit(X2_train, y2_train)

# Get the best hyperparameters from the random search
best_params = random_search.best_params_

# Get the best estimator (model) from the random search
best_model = random_search.best_estimator_

# Make predictions on the testing set using the best model
y2_pred = best_model.predict(X2_test)

# Evaluate the best model
accuracy = round(accuracy_score(y2_test, y2_pred), 2)

print(f'Best Hyperparameters: {best_params}')
print(f'Best Model Accuracy: {accuracy}')

255 fits failed out of a total of 2500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/patrickevans29/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/patrickevans29/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/xgboost/core.py", line 575, in inner_f
    return f(**kwargs)
  File "/Users/patrickevans29/.pyenv/versions/3.10.6/envs/lewagon/lib/python3.10/site-packages/xgboost/sklearn.py", line 1400, in fit
    self._Booster = train(
  File "/Users/patrickevans29/.pyenv/versions/3.10.6

Best Hyperparameters: {'alpha': 1.1299687153505071, 'colsample_bytree': 0.5332739908745434, 'gamma': 0.5718091636071537, 'lambda': 1.8143996431212446, 'learning_rate': 1.0429838168160426, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 497, 'subsample': 0.5395862485668736}
Best Model Accuracy: 0.51


In [42]:
# Build Thibault's best XGBoost

t_clf = xgb.XGBClassifier(colsample_bytree=0.8, 
                        objective='binary:logistic',
                        learning_rate=0.01, 
                        max_depth=, 
                        n_estimators=1000, 
                        subsample=0.8, 
                        alpha=1.1299687153505071,
                        gamma=0.571809163607,
                         )

t_clf.fit(X2_train, y2_train)

# Make predictions on the testing set
y2_pred = t_clf.predict(X2_test)

# Evaluate the model
accuracy = round(accuracy_score(y2_test, y2_pred), 2)

print(f'Accuracy: {accuracy}')

Accuracy: 0.49


In [None]:
df.Venue.unique()