# Importing Libraries and Data Sets

In [28]:
import pandas as pd
from flaml import AutoML
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import joblib

# Load both Elo CSV exports and stack them row-wise (the "latest" file first),
# renumbering the index so it runs 0..n-1 across the combined frame.
# NOTE(review): if the two files overlap, duplicate games are kept — confirm.
df_elo_latest = pd.read_csv('mlb_elo_latest.csv')
df_elo = pd.read_csv('mlb_elo.csv')
df_combined = pd.concat(
    (df_elo_latest, df_elo),
    axis=0,
    ignore_index=True,
)

# Feature Engineering

In [29]:
def _fill_with_group_mean(frame, group_col, value_col):
    """Return `frame[value_col]` with NaNs replaced by the mean of its group.

    Groups are formed on `frame[group_col]`; each missing value is filled with
    the mean of the non-missing values in the same group. A group whose values
    are all NaN stays NaN.
    """
    return frame.groupby(group_col)[value_col].transform(lambda s: s.fillna(s.mean()))


# Fill missing post-game Elo values with the per-team mean.
# NOTE(review): 'elo1_post' is later used as the prediction target, so imputed
# rows end up with a synthetic label — confirm this is intended.
df_combined['elo1_post'] = _fill_with_group_mean(df_combined, 'team1', 'elo1_post')
df_combined['elo2_post'] = _fill_with_group_mean(df_combined, 'team2', 'elo2_post')

# Rows without a named pitcher on either side are discarded.
df_combined = df_combined.dropna(subset=['pitcher1', 'pitcher2'])

# Fill remaining gaps in the pitcher rgs columns with the column-wide mean.
# The result is assigned back instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form acts on a
# temporary Series, triggers a warning on pandas 2.x and leaves the frame
# unchanged once Copy-on-Write is enabled.
for rgs_col in ('pitcher1_rgs', 'pitcher2_rgs'):
    df_combined[rgs_col] = df_combined[rgs_col].fillna(df_combined[rgs_col].mean())

# Pitcher adjustments: use the per-team mean first, then fall back to the
# overall column mean for teams that have no observed value at all.
for team_col, adj_col in (('team1', 'pitcher1_adj'), ('team2', 'pitcher2_adj')):
    df_combined[adj_col] = _fill_with_group_mean(df_combined, team_col, adj_col)
    df_combined[adj_col] = df_combined[adj_col].fillna(df_combined[adj_col].mean())

# Rows missing any post-game rating or final score are discarded.
df_combined = df_combined.dropna(subset=['rating1_post', 'rating2_post', 'score1', 'score2'])

# Keep the identifying columns in a side table before removing them, together
# with 'playoff' and 'date', from the modelling frame.
team_and_pitchers = df_combined[['team1', 'team2', 'pitcher1', 'pitcher2']]
df_combined = df_combined.drop(
    columns=['playoff', 'date', 'team1', 'team2', 'pitcher1', 'pitcher2']
)

# Train-Test Split and L1 (Lasso) Feature Selection

In [40]:
# Predict 'elo1_post' from every other column left in the frame.
# NOTE(review): the remaining columns still include other post-game values
# (e.g. elo2_post, rating1_post, score1, score2) — confirm that using them as
# predictors is intended.
target = 'elo1_post'
features = df_combined.columns[df_combined.columns != target].tolist()

X = df_combined[features]
y = df_combined[target]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [41]:
# Standardise the features, then fit a Lasso (alpha=0.01) on the training split.
scaler = StandardScaler()
lasso = Lasso(alpha=0.01)
pipeline = Pipeline(steps=[('scaler', scaler), ('lasso', lasso)])
pipeline.fit(X_train, y_train)

# A feature counts as "selected" when its fitted Lasso coefficient is non-zero.
coef = pipeline.named_steps['lasso'].coef_
selected_features_lasso = [
    name for name, weight in zip(X.columns, coef) if weight != 0
]
print("Selected Features by Lasso:", selected_features_lasso)

Selected Features by Lasso: ['season', 'elo1_pre', 'elo2_pre', 'rating2_pre', 'pitcher1_rgs', 'pitcher2_rgs', 'rating1_post', 'score1', 'score2']


# Using AutoML on the Lasso-Selected Features to Find the Best Model

In [43]:
# Restrict the data to the Lasso-selected columns and split it with the same
# test_size and random_state as the earlier split.
X_selected = df_combined.loc[:, selected_features_lasso]
X_train_sel, X_test_sel, y_train_sel, y_test_sel = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
)

# Run the FLAML regression search with a 3600-second time budget.
automl_settings = {
    'task': 'regression',
    'time_budget': 3600,
}
automl_sel = AutoML()
automl_sel.fit(X_train_sel, y_train_sel, **automl_settings)

# Predictions of the best model found, on the held-out rows.
predictions_sel = automl_sel.predict(X_test_sel)

[flaml.automl.logger: 06-16 18:20:29] {1680} INFO - task = regression
[flaml.automl.logger: 06-16 18:20:29] {1691} INFO - Evaluation method: holdout
[flaml.automl.logger: 06-16 18:20:29] {1789} INFO - Minimizing error metric: 1-r2
[flaml.automl.logger: 06-16 18:20:29] {1901} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth']
[flaml.automl.logger: 06-16 18:20:29] {2219} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 06-16 18:20:29] {2345} INFO - Estimated sufficient time budget=5923s. Estimated necessary time budget=42s.
[flaml.automl.logger: 06-16 18:20:29] {2392} INFO -  at 0.5s,	estimator lgbm's best error=0.4871,	best estimator lgbm's best error=0.4871
[flaml.automl.logger: 06-16 18:20:29] {2219} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 06-16 18:20:29] {2392} INFO -  at 0.5s,	estimator lgbm's best error=0.4871,	best estimator lgbm's best error=0.4871
[flaml.automl.logger: 06-16 18:20:29] {2219} IN

[flaml.automl.logger: 06-16 18:20:32] {2219} INFO - iteration 34, current learner lgbm
[flaml.automl.logger: 06-16 18:20:32] {2392} INFO -  at 3.6s,	estimator lgbm's best error=0.0005,	best estimator lgbm's best error=0.0005
[flaml.automl.logger: 06-16 18:20:32] {2219} INFO - iteration 35, current learner xgboost
[flaml.automl.logger: 06-16 18:20:32] {2392} INFO -  at 3.7s,	estimator xgboost's best error=0.0099,	best estimator lgbm's best error=0.0005
[flaml.automl.logger: 06-16 18:20:32] {2219} INFO - iteration 36, current learner extra_tree
[flaml.automl.logger: 06-16 18:20:32] {2392} INFO -  at 3.8s,	estimator extra_tree's best error=0.0015,	best estimator lgbm's best error=0.0005
[flaml.automl.logger: 06-16 18:20:32] {2219} INFO - iteration 37, current learner xgboost
[flaml.automl.logger: 06-16 18:20:32] {2392} INFO -  at 3.8s,	estimator xgboost's best error=0.0057,	best estimator lgbm's best error=0.0005
[flaml.automl.logger: 06-16 18:20:32] {2219} INFO - iteration 38, current le

[flaml.automl.logger: 06-16 18:20:41] {2219} INFO - iteration 70, current learner xgb_limitdepth
[flaml.automl.logger: 06-16 18:20:41] {2392} INFO -  at 12.9s,	estimator xgb_limitdepth's best error=0.0025,	best estimator lgbm's best error=0.0002
[flaml.automl.logger: 06-16 18:20:41] {2219} INFO - iteration 71, current learner xgb_limitdepth
[flaml.automl.logger: 06-16 18:20:41] {2392} INFO -  at 13.0s,	estimator xgb_limitdepth's best error=0.0019,	best estimator lgbm's best error=0.0002
[flaml.automl.logger: 06-16 18:20:41] {2219} INFO - iteration 72, current learner xgb_limitdepth
[flaml.automl.logger: 06-16 18:20:41] {2392} INFO -  at 13.1s,	estimator xgb_limitdepth's best error=0.0019,	best estimator lgbm's best error=0.0002
[flaml.automl.logger: 06-16 18:20:41] {2219} INFO - iteration 73, current learner xgb_limitdepth
[flaml.automl.logger: 06-16 18:20:41] {2392} INFO -  at 13.1s,	estimator xgb_limitdepth's best error=0.0019,	best estimator lgbm's best error=0.0002
[flaml.automl.lo

[flaml.automl.logger: 06-16 18:21:00] {2219} INFO - iteration 105, current learner rf
[flaml.automl.logger: 06-16 18:21:00] {2392} INFO -  at 31.9s,	estimator rf's best error=0.0010,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 18:21:00] {2219} INFO - iteration 106, current learner xgboost
[flaml.automl.logger: 06-16 18:21:01] {2392} INFO -  at 33.2s,	estimator xgboost's best error=0.0008,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 18:21:01] {2219} INFO - iteration 107, current learner xgb_limitdepth
[flaml.automl.logger: 06-16 18:21:02] {2392} INFO -  at 33.5s,	estimator xgb_limitdepth's best error=0.0005,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 18:21:02] {2219} INFO - iteration 108, current learner lgbm
[flaml.automl.logger: 06-16 18:21:04] {2392} INFO -  at 36.2s,	estimator lgbm's best error=0.0001,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 18:21:04] {2219} INFO - iteration 109, cur

[flaml.automl.logger: 06-16 18:25:02] {2392} INFO -  at 274.1s,	estimator lgbm's best error=0.0001,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 18:25:02] {2219} INFO - iteration 141, current learner extra_tree
[flaml.automl.logger: 06-16 18:25:03] {2392} INFO -  at 275.0s,	estimator extra_tree's best error=0.0004,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 18:25:03] {2219} INFO - iteration 142, current learner extra_tree
[flaml.automl.logger: 06-16 18:25:05] {2392} INFO -  at 276.4s,	estimator extra_tree's best error=0.0003,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 18:25:05] {2219} INFO - iteration 143, current learner extra_tree
[flaml.automl.logger: 06-16 18:25:06] {2392} INFO -  at 277.4s,	estimator extra_tree's best error=0.0003,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 18:25:06] {2219} INFO - iteration 144, current learner extra_tree
[flaml.automl.logger: 06-16 18:25:09] {2392} I

[flaml.automl.logger: 06-16 18:35:13] {2219} INFO - iteration 176, current learner xgb_limitdepth
[flaml.automl.logger: 06-16 18:35:15] {2392} INFO -  at 886.3s,	estimator xgb_limitdepth's best error=0.0005,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 18:35:15] {2219} INFO - iteration 177, current learner rf
[flaml.automl.logger: 06-16 18:35:20] {2392} INFO -  at 891.3s,	estimator rf's best error=0.0004,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 18:35:20] {2219} INFO - iteration 178, current learner rf
[flaml.automl.logger: 06-16 18:35:23] {2392} INFO -  at 895.0s,	estimator rf's best error=0.0004,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 18:35:23] {2219} INFO - iteration 179, current learner rf
[flaml.automl.logger: 06-16 18:35:33] {2392} INFO -  at 904.5s,	estimator rf's best error=0.0004,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 18:35:33] {2219} INFO - iteration 180, current learn

[flaml.automl.logger: 06-16 18:51:21] {2219} INFO - iteration 211, current learner lgbm
[flaml.automl.logger: 06-16 18:52:29] {2392} INFO -  at 1920.3s,	estimator lgbm's best error=0.0001,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 18:52:29] {2219} INFO - iteration 212, current learner lgbm
[flaml.automl.logger: 06-16 18:53:15] {2392} INFO -  at 1966.4s,	estimator lgbm's best error=0.0001,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 18:53:15] {2219} INFO - iteration 213, current learner xgb_limitdepth
[flaml.automl.logger: 06-16 18:53:25] {2392} INFO -  at 1976.6s,	estimator xgb_limitdepth's best error=0.0003,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 18:53:25] {2219} INFO - iteration 214, current learner extra_tree
[flaml.automl.logger: 06-16 18:53:29] {2392} INFO -  at 1980.3s,	estimator extra_tree's best error=0.0003,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 18:53:29] {2219} INFO - 

[flaml.automl.logger: 06-16 19:07:31] {2219} INFO - iteration 246, current learner extra_tree
[flaml.automl.logger: 06-16 19:07:41] {2392} INFO -  at 2833.2s,	estimator extra_tree's best error=0.0001,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 19:07:41] {2219} INFO - iteration 247, current learner extra_tree
[flaml.automl.logger: 06-16 19:08:20] {2392} INFO -  at 2871.8s,	estimator extra_tree's best error=0.0001,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 19:08:20] {2219} INFO - iteration 248, current learner lgbm
[flaml.automl.logger: 06-16 19:09:09] {2392} INFO -  at 2920.5s,	estimator lgbm's best error=0.0001,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 19:09:09] {2219} INFO - iteration 249, current learner extra_tree
[flaml.automl.logger: 06-16 19:09:22] {2392} INFO -  at 2934.0s,	estimator extra_tree's best error=0.0001,	best estimator lgbm's best error=0.0001
[flaml.automl.logger: 06-16 19:09:22] {2219} INF

NameError: name 'mean_squared_error' is not defined

In [44]:
# Held-out mean squared error of the AutoML predictions.
# This computation was previously commented out even though the recorded cell
# output reports an MSE; `mean_squared_error` is imported in the first cell, so
# it is restored here to make the cell reproduce its output on a fresh run.
mse_sel = mean_squared_error(y_test_sel, predictions_sel)
print(f"MSE with Selected Features: {mse_sel}")

# Name of the learner that AutoML ranked best.
print("\nBest Model Found by AutoML with Selected Features")
print(automl_sel.best_estimator)

MSE with Selected Features: 0.10704020887447696

Best Model Found by AutoML with Selected Features
lgbm


# Saving the model in a pickle file

In [45]:
# Persist the fitted AutoML object to disk with joblib
# (in this run AutoML chose lgbm as the best model).
joblib_file = "automl_model.pkl"
joblib.dump(automl_sel, filename=joblib_file)

['automl_model.pkl']