In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the training CSV and discard its 'id' column in a single step.
file_path = 'train_data.csv'
df = pd.read_csv(file_path).drop(columns='id')

In [None]:
# Load the test set. Its 'id' column is kept here because the submission
# cell at the end of the notebook reads test_data['id'].
test_data = pd.read_csv('same_season_test_data.csv')

In [None]:
# List the float64/int64 columns of the training frame.
# Note: numeric_columns is reassigned in the outlier-handling cell below,
# so this list is only an initial inspection value.
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns.tolist()

In [None]:
# List the float64/int64 columns of the test frame and print them for
# comparison with the training columns ('id' is still present in test_data).
numeric_columns_test = test_data.select_dtypes(include=['float64', 'int64']).columns.tolist()
print(numeric_columns_test)

['id', 'home_team_rest', 'away_team_rest', 'home_pitcher_rest', 'away_pitcher_rest', 'season', 'home_batting_batting_avg_10RA', 'home_batting_onbase_perc_10RA', 'home_batting_onbase_plus_slugging_10RA', 'home_batting_leverage_index_avg_10RA', 'home_batting_RBI_10RA', 'away_batting_batting_avg_10RA', 'away_batting_onbase_perc_10RA', 'away_batting_onbase_plus_slugging_10RA', 'away_batting_leverage_index_avg_10RA', 'away_batting_RBI_10RA', 'home_pitching_earned_run_avg_10RA', 'home_pitching_SO_batters_faced_10RA', 'home_pitching_H_batters_faced_10RA', 'home_pitching_BB_batters_faced_10RA', 'away_pitching_earned_run_avg_10RA', 'away_pitching_SO_batters_faced_10RA', 'away_pitching_H_batters_faced_10RA', 'away_pitching_BB_batters_faced_10RA', 'home_pitcher_earned_run_avg_10RA', 'home_pitcher_SO_batters_faced_10RA', 'home_pitcher_H_batters_faced_10RA', 'home_pitcher_BB_batters_faced_10RA', 'away_pitcher_earned_run_avg_10RA', 'away_pitcher_SO_batters_faced_10RA', 'away_pitcher_H_batters_faced_

In [None]:
def handle_outliers_iqr(df, numeric_columns):
    """Cap outliers in the given columns to the 1.5 * IQR fences.

    For every column, values below ``Q1 - 1.5 * IQR`` are replaced by that
    lower fence and values above ``Q3 + 1.5 * IQR`` by the upper fence.
    Missing values are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified; the capping is done on a copy so the
        caller keeps access to the raw data and re-running the cell is safe.
    numeric_columns : iterable of str
        Names of the columns to cap.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the listed columns have been capped.
    """
    capped = df.copy()

    for column in numeric_columns:
        values = df[column]

        # Quartiles and inter-quartile range of the original values
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        # Outlier fences
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Comparisons with NaN are False, so missing values fall through
        # both conditions and stay NaN.
        capped[column] = np.where(
            values < lower_bound,
            lower_bound,
            np.where(values > upper_bound, upper_bound, values),
        )

    return capped

# Select every column with a numeric dtype (this replaces the earlier
# float64/int64 list stored under the same name).
numeric_columns = df.select_dtypes(include=[np.number]).columns

# Cap the outliers of those columns with the IQR rule defined above.
df_cleaned = handle_outliers_iqr(df, numeric_columns)

# Display the capped frame as the cell output.
df_cleaned

Unnamed: 0,home_team_abbr,away_team_abbr,date,is_night_game,home_team_win,home_pitcher,away_pitcher,home_team_rest,away_team_rest,home_pitcher_rest,...,away_pitcher_H_batters_faced_skew,away_pitcher_BB_batters_faced_mean,away_pitcher_BB_batters_faced_std,away_pitcher_BB_batters_faced_skew,away_pitcher_leverage_index_avg_mean,away_pitcher_leverage_index_avg_std,away_pitcher_leverage_index_avg_skew,away_pitcher_wpa_def_mean,away_pitcher_wpa_def_std,away_pitcher_wpa_def_skew
0,KFH,KJP,2021-05-16,False,True,juradar01,carraca01,1.0,1.0,7.5,...,1.133350,-0.974559,-0.900633,-1.093425,0.896974,-0.611051,-0.398111,0.949021,1.007072,0.340438
1,VJV,HXK,2019-05-04,True,False,ramirer02,rodrich01,1.0,,7.5,...,-0.044641,-0.878649,-1.079528,-1.719608,0.050448,-0.851738,-0.202878,0.489511,-0.876286,1.416154
2,VJV,JEM,2019-06-10,True,True,jarvibr01,tropeni01,1.0,1.0,6.0,...,-0.100180,-1.702937,-0.867762,1.992552,-0.404961,-0.132717,-0.106344,2.481020,-0.200110,-0.026083
3,BPH,FBW,2018-06-26,True,True,diazyi01,johnsji04,1.0,1.0,5.0,...,-1.385079,-1.549095,-1.008470,0.116080,-1.236753,-0.119898,0.005985,1.646317,-0.764309,
4,RLJ,DPS,2016-07-05,True,False,willibr02,armstsh01,1.0,1.0,6.0,...,0.356122,0.663967,-0.123547,0.361822,-0.035276,-0.285671,-2.563819,0.527432,-0.911987,-1.109533
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11062,VQC,MOO,2016-07-08,True,True,lodolni01,danieda01,1.0,1.0,5.0,...,0.446414,0.498714,-0.016668,-0.360635,0.155712,0.054024,,1.031686,0.045731,-0.175099
11063,GKO,VQC,2023-04-19,False,False,gearrco01,ginnjt01,1.0,1.0,5.0,...,0.714723,-0.473958,0.793842,1.039308,1.475612,-0.929588,-0.903482,1.208667,2.000674,-1.396761
11064,ECN,QPO,2022-05-27,True,True,avilape01,rodrije01,1.0,1.0,5.0,...,2.539210,0.307611,-1.203648,0.436108,-0.429071,-0.005314,0.091690,0.673315,-0.254586,-2.032732
11065,QDH,HXK,2021-05-19,True,False,alvarjo02,rodriya01,1.0,1.0,5.0,...,0.595529,-0.715971,0.515236,1.147914,-0.038848,-0.328605,0.058396,0.807748,,1.090447


In [None]:
# Mean-impute the numeric columns: each missing value is replaced by the
# mean of its own column, computed for all columns at once.
numeric_columns = df_cleaned.select_dtypes(include=[np.number]).columns
column_means = df_cleaned[numeric_columns].mean()
df_cleaned[numeric_columns] = df_cleaned[numeric_columns].fillna(column_means)

In [None]:
# Count rows that are exact duplicates of an earlier row and report the total.
duplicate_count = df_cleaned.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 0


In [None]:
# List the object/category columns of the cleaned TRAINING data
categorical_columns = df_cleaned.select_dtypes(include=['object', 'category']).columns.tolist()

# Print the list of categorical columns
print("Categorical Columns in Train Data:", categorical_columns)

Categorical Columns in Train Data: ['home_team_abbr', 'away_team_abbr', 'date', 'is_night_game', 'home_pitcher', 'away_pitcher', 'home_team_season', 'away_team_season']


In [None]:
# List the object/category columns of the test data so they can be compared
# with the training list printed above.
categorical_columns_test = test_data.select_dtypes(include=['object', 'category']).columns.tolist()

# Print the list of categorical columns
print("Categorical Columns in Test Data:", categorical_columns_test)


Categorical Columns in Test Data: ['home_team_abbr', 'away_team_abbr', 'is_night_game', 'home_pitcher', 'away_pitcher', 'home_team_season', 'away_team_season']


In [None]:
# Categorical columns to target-encode. 'date' is left out on purpose: it is
# not present among the test data's categorical columns and is handled later.
categorical_columns = ['home_team_abbr', 'away_team_abbr', 'is_night_game',
                       'home_pitcher', 'away_pitcher', 'home_team_season', 'away_team_season']

# Define the target column
target_column = 'home_team_win'

# Work on copies so the cleaned training frame and the raw test frame stay intact
df_target_encoded = df_cleaned.copy()
test_data_encoded = test_data.copy()

# K-fold setup: each training row is encoded with statistics computed on the
# other folds only, so a row never sees its own target value.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Fallback for categories that have no statistics available
global_mean = df_target_encoded[target_column].mean()

for col in categorical_columns:
    print(f"Performing Target Encoding for column: {col}")
    encoded_col = f"{col}_target_encoded"

    # Out-of-fold encoding of the training rows. Positional (iloc) access is
    # used throughout so the result does not depend on the frame's index labels.
    oof_encoding = pd.Series(np.nan, index=df_target_encoded.index, dtype=float)
    for train_idx, valid_idx in kf.split(df_target_encoded):
        fold_train = df_target_encoded.iloc[train_idx]
        fold_valid = df_target_encoded.iloc[valid_idx]

        # Mean target per category, computed on the training part of the fold
        fold_means = fold_train.groupby(col)[target_column].mean()
        oof_encoding.iloc[valid_idx] = fold_valid[col].map(fold_means).to_numpy(dtype=float)

    df_target_encoded[encoded_col] = oof_encoding.fillna(global_mean)

    # The test rows are encoded once, with the category means of the FULL
    # training set. Previously the test column was overwritten inside the
    # fold loop, so only the statistics of the last fold were actually used.
    full_means = df_target_encoded.groupby(col)[target_column].mean()
    test_data_encoded[encoded_col] = (
        test_data_encoded[col].map(full_means).astype(float).fillna(global_mean)
    )

# Drop the original categorical columns since they are encoded
df_target_encoded = df_target_encoded.drop(columns=categorical_columns)
test_data_encoded = test_data_encoded.drop(columns=categorical_columns)

# Check the updated DataFrames
print("Updated training data after target encoding:\n", df_target_encoded.head())
print("Updated test data after target encoding:\n", test_data_encoded.head())


Performing Target Encoding for column: home_team_abbr
Performing Target Encoding for column: away_team_abbr


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_target_encoded[f"{col}_target_encoded"].fillna(global_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data_encoded[f"{col}_target_encoded"].fillna(global_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work be

Performing Target Encoding for column: is_night_game
Performing Target Encoding for column: home_pitcher


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_target_encoded[f"{col}_target_encoded"].fillna(global_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data_encoded[f"{col}_target_encoded"].fillna(global_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work be

Performing Target Encoding for column: away_pitcher


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_target_encoded[f"{col}_target_encoded"].fillna(global_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data_encoded[f"{col}_target_encoded"].fillna(global_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work be

Performing Target Encoding for column: home_team_season
Performing Target Encoding for column: away_team_season
Updated training data after target encoding:
          date  home_team_win  home_team_rest  away_team_rest  \
0  2021-05-16           True             1.0             1.0   
1  2019-05-04          False             1.0             1.0   
2  2019-06-10           True             1.0             1.0   
3  2018-06-26           True             1.0             1.0   
4  2016-07-05          False             1.0             1.0   

   home_pitcher_rest  away_pitcher_rest  season  \
0                7.5                5.0  2021.0   
1                7.5                6.0  2019.0   
2                6.0                6.0  2019.0   
3                5.0                6.0  2018.0   
4                6.0                5.0  2016.0   

   home_batting_batting_avg_10RA  home_batting_onbase_perc_10RA  \
0                      -1.225891                      -1.043317   
1               

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_target_encoded[f"{col}_target_encoded"].fillna(global_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data_encoded[f"{col}_target_encoded"].fillna(global_mean, inplace=True)


In [None]:
# Parse the 'date' strings; anything that cannot be parsed becomes NaT
parsed_dates = pd.to_datetime(df_target_encoded['date'], errors='coerce')
df_target_encoded['date'] = parsed_dates

# Report the resulting dtype and a few sample values
date_column = df_target_encoded['date']
print("Date column dtype after conversion:", date_column.dtype)
print("Sample of the 'date' column:", date_column.head())

Date column dtype after conversion: datetime64[ns]
Sample of the 'date' column: 0   2021-05-16
1   2019-05-04
2   2019-06-10
3   2018-06-26
4   2016-07-05
Name: date, dtype: datetime64[ns]


In [None]:
# Rows whose date failed to parse (NaT after the coercion above)
has_invalid_date = df_target_encoded['date'].isna()
invalid_dates = df_target_encoded.loc[has_invalid_date]
print(invalid_dates)

Empty DataFrame
Columns: [date, home_team_win, home_team_rest, away_team_rest, home_pitcher_rest, away_pitcher_rest, season, home_batting_batting_avg_10RA, home_batting_onbase_perc_10RA, home_batting_onbase_plus_slugging_10RA, home_batting_leverage_index_avg_10RA, home_batting_RBI_10RA, away_batting_batting_avg_10RA, away_batting_onbase_perc_10RA, away_batting_onbase_plus_slugging_10RA, away_batting_leverage_index_avg_10RA, away_batting_RBI_10RA, home_pitching_earned_run_avg_10RA, home_pitching_SO_batters_faced_10RA, home_pitching_H_batters_faced_10RA, home_pitching_BB_batters_faced_10RA, away_pitching_earned_run_avg_10RA, away_pitching_SO_batters_faced_10RA, away_pitching_H_batters_faced_10RA, away_pitching_BB_batters_faced_10RA, home_pitcher_earned_run_avg_10RA, home_pitcher_SO_batters_faced_10RA, home_pitcher_H_batters_faced_10RA, home_pitcher_BB_batters_faced_10RA, away_pitcher_earned_run_avg_10RA, away_pitcher_SO_batters_faced_10RA, away_pitcher_H_batters_faced_10RA, away_pitcher_

In [None]:
# Keep the target and the date aside; neither is used as a model feature
y = df_target_encoded['home_team_win']  # Target column
z = df_target_encoded['date']           # Re-attached to the selected features later
X = df_target_encoded.drop(columns=['home_team_win', 'date'])

# Fit a random forest purely to rank the features
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X, y)

# One row per feature, most important first
feature_importances = (
    pd.DataFrame({'Feature': X.columns, 'Importance': rf_model.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)


NameError: name 'df_imp_features' is not defined

In [None]:
# Step 2: Filter features based on importance (threshold adjustment)
threshold = 0.004  # Only keep features with importance greater than this

important_features = feature_importances[feature_importances['Importance'] > threshold]

# Get the list of important features
important_columns = important_features['Feature'].tolist()

# Keep only the important features in X. The explicit copy makes this an
# independent frame, so adding columns below no longer writes to a slice of X
# (which raised SettingWithCopyWarning).
df_imp_features = X[important_columns].copy()

# Append the target column 'home_team_win' (aligned on the index of y)
df_imp_features['home_team_win'] = y

# Re-attach the 'date' column (z has the same row order as X)
df_imp_features['date'] = z.values

# Check that the columns were added correctly
print(df_imp_features)

       home_team_season_target_encoded  away_pitcher_target_encoded  \
0                             0.583333                     0.437500   
1                             0.511111                     0.500000   
2                             0.590909                     0.428571   
3                             0.487179                     0.512821   
4                             0.644444                     0.400000   
...                                ...                          ...   
11062                         0.540541                     0.543478   
11063                         0.476190                     0.666667   
11064                         0.459459                     0.615385   
11065                         0.600000                     0.425532   
11066                         0.571429                     0.529412   

       home_team_abbr_target_encoded  away_team_season_target_encoded  \
0                           0.584459                         0.515152   
1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_imp_features['home_team_win'] = y  # Append target column to the DataFrame
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_imp_features['date'] = z.values  # Re-add the 'date' column to the final DataFrame


In [None]:
# Features whose importance is strictly below the threshold, shown for inspection.
# Note: the "important" set uses a strict '>' and this one a strict '<', so a
# feature with importance exactly equal to the threshold is in neither set.
non_imp_features = feature_importances[feature_importances['Importance'] < threshold]
non_imp_features

Unnamed: 0,Feature,Importance
4,season,0.002969
2,home_pitcher_rest,0.001802
3,away_pitcher_rest,0.001767
1,away_team_rest,0.0
0,home_team_rest,0.0


In [None]:
# Drop from the test data the same low-importance features that were removed
# from the training features. The list is derived from the importance table
# (same rule as non_imp_features) instead of being hard-coded, so it stays
# correct when the threshold or the data changes.
low_importance_columns = feature_importances.loc[
    feature_importances['Importance'] < threshold, 'Feature'
].tolist()
test_data_imp_features = test_data_encoded.drop(columns=low_importance_columns)

In [None]:
# Rows dated from 2016-08-01 through 2023-12-31 (both bounds inclusive).
# NOTE(review): this range overlaps the training filter in the next cell and
# valid_data is not used by any later cell visible in this notebook - confirm
# whether a per-season hold-out (e.g. August onwards) was intended.
valid_data = df_imp_features[(df_imp_features['date'] >= '2016-08-01') & (df_imp_features['date'] <= '2023-12-31')]
valid_data

Unnamed: 0,home_team_season_target_encoded,away_pitcher_target_encoded,home_team_abbr_target_encoded,away_team_season_target_encoded,away_pitching_SO_batters_faced_10RA,away_pitching_SO_batters_faced_mean,home_pitcher_target_encoded,away_team_abbr_target_encoded,away_batting_leverage_index_avg_10RA,home_pitcher_wpa_def_mean,...,home_batting_onbase_perc_mean,away_batting_onbase_plus_slugging_std,away_pitcher_leverage_index_avg_std,home_batting_RBI_10RA,away_batting_RBI_10RA,away_team_wins_std,home_team_wins_std,is_night_game_target_encoded,home_team_win,date
0,0.583333,0.437500,0.584459,0.515152,-0.208237,1.951345,0.200000,0.501706,0.736905,0.266738,...,-0.682960,-1.061603,-0.611051,-0.117454,-0.293145,0.234694,0.269419,0.546199,True,2021-05-16
1,0.511111,0.500000,0.519608,0.435897,0.922561,1.519616,0.642857,0.405844,0.345131,-0.713193,...,0.324222,0.969078,-0.851738,0.858849,-0.115893,0.226817,0.275837,0.526013,False,2019-05-04
2,0.590909,0.428571,0.518152,0.461538,0.834443,0.580312,0.764706,0.432343,-0.362811,0.675709,...,0.975731,0.567774,-0.132717,1.036358,-0.204519,-0.304426,0.276870,0.529980,True,2019-06-10
3,0.487179,0.512821,0.535836,0.525000,0.665862,0.799526,0.480000,0.496753,-2.644722,-0.007993,...,-0.147080,0.582897,-0.119898,-0.383718,1.213491,0.235484,0.274461,0.518926,True,2018-06-26
6,0.485714,0.461538,0.500000,0.589744,0.675813,0.334970,0.676471,0.546075,1.190538,0.617588,...,-0.573253,2.342471,-0.953373,-1.005002,0.327235,0.287269,0.114405,0.529980,False,2019-04-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11061,0.700000,1.000000,0.617747,0.700000,0.218074,-0.541665,0.709091,0.606061,-0.211600,0.406028,...,-0.005400,-0.786023,0.219244,-0.472473,-1.179401,0.089273,-0.382057,0.520209,True,2022-06-03
11063,0.476190,0.666667,0.540816,0.605263,-0.249441,-0.399764,0.583333,0.584775,-0.211600,-2.309079,...,-0.288156,0.626991,-0.929588,-0.472473,-0.204519,0.076419,-0.382057,0.543307,False,2023-04-19
11064,0.459459,0.615385,0.461794,0.567568,-1.601572,-0.140705,0.272727,0.514754,-0.603374,-0.881181,...,-0.005400,0.441618,-0.005314,0.148810,1.390742,0.236106,-0.312460,0.526013,True,2022-05-27
11065,0.600000,0.425532,0.493243,0.421053,0.675784,0.567707,0.375000,0.410596,0.235160,0.011260,...,-0.280767,0.374036,-0.328605,-0.472473,1.213491,0.205666,0.097200,0.518926,False,2021-05-19


In [None]:
# Training rows: every game dated on or before 2023-07-31.
# NOTE(review): the filter is a single upper bound on the date, so it does NOT
# restrict each season to January-July; games from August-December of earlier
# years are included as well. Confirm which split was intended.
train_data = df_imp_features[df_imp_features['date'] <= '2023-07-31']

# Separate features and target ('date' is only used for the split above)
X_train = train_data.drop(['home_team_win', 'date'], axis=1)
y_train = train_data['home_team_win']


NameError: name 'df_imp_features' is not defined

In [None]:
# The row identifier is not a feature; remove it before prediction
X_test = test_data_imp_features.drop(columns='id')
X_test

Unnamed: 0,home_batting_batting_avg_10RA,home_batting_onbase_perc_10RA,home_batting_onbase_plus_slugging_10RA,home_batting_leverage_index_avg_10RA,home_batting_RBI_10RA,away_batting_batting_avg_10RA,away_batting_onbase_perc_10RA,away_batting_onbase_plus_slugging_10RA,away_batting_leverage_index_avg_10RA,away_batting_RBI_10RA,...,away_pitcher_wpa_def_mean,away_pitcher_wpa_def_std,away_pitcher_wpa_def_skew,home_team_abbr_target_encoded,away_team_abbr_target_encoded,is_night_game_target_encoded,home_pitcher_target_encoded,away_pitcher_target_encoded,home_team_season_target_encoded,away_team_season_target_encoded
0,0.815178,-0.139214,0.860108,-0.685997,0.592584,0.423601,0.720581,1.307045,0.716285,1.213491,...,0.710790,-0.305795,-1.103229,0.609428,0.534014,0.541571,0.648649,0.583333,0.595238,0.611111
1,-0.819056,-1.043317,-0.501758,0.266741,-0.827492,0.099930,0.548661,0.850646,-1.895540,1.213491,...,-0.129330,0.467740,0.873353,0.519608,0.577049,0.526013,0.411765,0.578947,0.575000,0.525000
2,0.566940,0.133967,1.167550,-0.119877,0.681339,0.974529,0.720581,,-0.582754,,...,-0.486104,0.380234,0.944937,0.515358,0.533557,0.526013,0.500000,0.285714,0.425000,0.531038
3,0.842760,,0.494216,0.384107,-0.028699,0.213559,-0.388785,-0.707848,,-0.470396,...,-0.583598,0.654581,,0.470199,0.602007,0.541571,0.250000,0.580645,0.275000,0.642857
4,1.849503,1.038070,0.939598,0.722399,0.326320,0.520013,0.412424,0.027269,0.888116,-0.027268,...,-0.262628,0.444447,-0.113465,0.532203,0.500000,0.541571,0.531038,0.576923,0.275000,0.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6180,0.329045,-0.334344,-0.410577,0.252933,-0.827492,-1.563185,-2.250703,-2.327891,2.022198,-1.268026,...,-0.515166,-1.832176,-2.189157,0.491468,0.617747,0.526013,0.333333,0.625000,0.371429,0.702703
6181,0.225612,0.576262,-0.217695,0.867381,-0.028699,0.265209,0.470811,1.287303,-1.008894,,...,-0.849379,-1.228204,0.145295,0.590909,0.533557,0.541571,0.694444,0.666667,0.743590,0.531038
6182,-0.367401,-0.669318,-0.354466,0.356492,0.148810,,-0.012509,0.021462,-0.967655,0.770363,...,0.444184,0.262688,0.417515,0.519608,0.577049,0.526013,1.000000,0.437500,0.567568,0.605263
6183,1.487489,1.691755,1.822181,-1.680159,1.213868,0.003518,0.269698,-0.001764,-0.012276,0.149983,...,0.690523,0.815152,-0.113742,0.560976,0.460526,0.541571,0.627907,0.414634,0.648649,0.500000


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric features that survived the importance filter.
# Only columns already present in df_imp_features are scaled: assigning every
# entry of numeric_columns would silently re-add the low-importance columns
# that were filtered out earlier.
# NOTE(review): X_train was split off df_imp_features in an earlier cell and
# X_test is never passed through this scaler, so in top-to-bottom execution
# this scaling does not reach the model inputs - confirm whether it is needed.
scaler = StandardScaler()
scaled_columns = [name for name in numeric_columns if name in df_imp_features.columns]

# Work on an explicit copy so the assignment below does not write to a slice
df_imp_features = df_imp_features.copy()
if scaled_columns:
    # Fitted on the unscaled values in df_cleaned so that re-running the cell
    # gives the same result.
    df_imp_features[scaled_columns] = scaler.fit_transform(df_cleaned[scaled_columns])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_imp_features[numeric_columns] = scaler.fit_transform(df_cleaned[numeric_columns])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_imp_features[numeric_columns] = scaler.fit_transform(df_cleaned[numeric_columns])


In [None]:
from sklearn.ensemble import GradientBoostingClassifier


In [None]:
# Hyper-parameter values explored for the gradient boosting model.
# Three parameters have two options each and the rest are fixed, giving
# 2 x 2 x 2 = 8 distinct combinations.
param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 5],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'subsample': [0.8]
}

In [None]:
# Base estimator for the search; the seed is fixed for reproducibility.
gb_model = GradientBoostingClassifier(random_state=42)

In [None]:
# Number of distinct parameter combinations in the grid
n_candidates = 1
for candidate_values in param_grid.values():
    n_candidates *= len(candidate_values)

# Randomised hyper-parameter search over param_grid.
# n_iter is capped at the grid size: asking for more iterations than there
# are combinations only makes scikit-learn emit a warning and evaluate the
# whole grid anyway.
random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_grid,
    n_iter=min(10, n_candidates),  # At most 10 sampled combinations
    scoring='accuracy',    # Optimize for accuracy
    cv=3,                  # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1
)

In [None]:
# Run the cross-validated search on the training features and target.
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits




In [None]:
# Parameter combination selected by the search.
best_params = random_search.best_params_
print("Best Parameters:", best_params)

Best Parameters: {'subsample': 0.8, 'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 3, 'learning_rate': 0.05}


In [None]:
# Estimator exposed by the search for its best parameter combination.
best_gb_model = random_search.best_estimator_

In [None]:
# Select the training feature columns from the test data, in training order
# (raises KeyError if a training column is absent from X_test)
training_feature_order = list(X_train.columns)
X_test = X_test.loc[:, training_feature_order]

# Show the aligned column index
print("X_test columns aligned with X_train:")
print(X_test.columns)

X_test columns aligned with X_train:
Index(['home_team_season_target_encoded', 'away_pitcher_target_encoded',
       'home_team_abbr_target_encoded', 'away_team_season_target_encoded',
       'away_pitching_SO_batters_faced_10RA',
       'away_pitching_SO_batters_faced_mean', 'home_pitcher_target_encoded',
       'away_team_abbr_target_encoded', 'away_batting_leverage_index_avg_10RA',
       'home_pitcher_wpa_def_mean',
       ...
       'home_team_wins_skew', 'away_batting_onbase_plus_slugging_mean',
       'home_batting_onbase_perc_mean',
       'away_batting_onbase_plus_slugging_std',
       'away_pitcher_leverage_index_avg_std', 'home_batting_RBI_10RA',
       'away_batting_RBI_10RA', 'away_team_wins_std', 'home_team_wins_std',
       'is_night_game_target_encoded'],
      dtype='object', length=159)


In [None]:
# Make X_test carry exactly the training columns, in training order:
# columns missing from X_test are created and filled with 0, and columns
# that do not exist in X_train are dropped.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Impute the missing test values with a model learned from the TRAINING
# features only. Fitting the imputer on X_test itself would make the
# preprocessing depend on the data being predicted.
imputer = IterativeImputer(random_state=42)
imputer.fit(X_train)

# Keep the original row index so X_test stays aligned with test_data
X_test = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)



In [None]:
# Predict the outcome for every test row with the tuned model.
# Note: despite the name, these are predictions on X_test (the submission
# data), not on a validation split.
y_valid_pred_gb = best_gb_model.predict(X_test)

In [None]:
# Turn each prediction into the text label 'True' or 'False'
prediction_labels = [str(bool(pred == 1)) for pred in y_valid_pred_gb]

# Two-column submission table: test row id and predicted label
submission = pd.DataFrame({'id': test_data['id'], 'home_team_win': prediction_labels})

In [None]:
# Display the submission table as the cell output.
submission

Unnamed: 0,id,home_team_win
0,0,True
1,1,False
2,2,True
3,3,True
4,4,False
...,...,...
6180,6180,True
6181,6181,True
6182,6182,True
6183,6183,True


In [None]:
# Write the submission file. The path is kept in one variable so the
# confirmation message always names the file that was actually written
# (the message previously reported a different file name).
submission_path = 'submission2.3_stage1.csv'
submission.to_csv(submission_path, index=False)

print(f"Submission file '{submission_path}' created successfully!")

Submission file 'submission_stage1.csv' created successfully!
