In [37]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Train and evaluate a logistic-regression classifier that predicts whether the
# home team wins, using rolling 10-game and season-average statistics.

# Load data
game_stats_df = pd.read_csv('game_stats.csv')

# Prepare the target variable: 1 when the recorded winner is 'HOME', else 0.
game_stats_df['target'] = (game_stats_df['game_winner'] == 'HOME').astype(int)

# A missing `game_winner` compares unequal to 'HOME', so such rows receive
# target 0 above and would be learned/scored as visitor wins. Keep the column
# on `game_stats_df` unchanged, but exclude those rows from modelling.
labelled_games = game_stats_df[game_stats_df['game_winner'].notna()]
n_unlabelled = len(game_stats_df) - len(labelled_games)
if n_unlabelled:
    print(f"Excluded {n_unlabelled} game(s) with no recorded winner.")

# Select relevant features.
# The pipeline is fitted on exactly these columns in exactly this order; code
# that later feeds the fitted pipeline should supply the same columns in the
# same order.
# NOTE(review): visitor season averages are included but the matching
# `*_home_season_avg` columns are not -- confirm this asymmetry is intentional.
feature_columns = [
    'ast_home_10game_avg', 'blk_home_10game_avg', 'dreb_home_10game_avg',
    'fg3_pct_home_10game_avg', 'fg_pct_home_10game_avg', 'ft_pct_home_10game_avg',
    'oreb_home_10game_avg', 'pf_home_10game_avg', 'pts_home_10game_avg',
    'reb_home_10game_avg', 'stl_home_10game_avg', 'turnover_home_10game_avg',
    'ast_visitor_10game_avg', 'blk_visitor_10game_avg', 'dreb_visitor_10game_avg',
    'fg3_pct_visitor_10game_avg', 'fg_pct_visitor_10game_avg', 'ft_pct_visitor_10game_avg',
    'oreb_visitor_10game_avg', 'pf_visitor_10game_avg', 'pts_visitor_10game_avg',
    'reb_visitor_10game_avg', 'stl_visitor_10game_avg', 'turnover_visitor_10game_avg',
    'ast_visitor_season_avg', 'blk_visitor_season_avg', 'dreb_visitor_season_avg',
    'fg3_pct_visitor_season_avg', 'fg_pct_visitor_season_avg', 'ft_pct_visitor_season_avg',
    'oreb_visitor_season_avg', 'pf_visitor_season_avg', 'pts_visitor_season_avg',
    'reb_visitor_season_avg', 'stl_visitor_season_avg', 'turnover_visitor_season_avg'
]

X = labelled_games[feature_columns]

# Target variable
y = labelled_games['target']

# Split data into training and testing sets (80/20, fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Imputation and scaling. Both steps live inside the pipeline, so their
# statistics are learned from the training split only when `fit` is called.
imputer = SimpleImputer(strategy='mean')  # Replace missing values with the mean of the column
scaler = StandardScaler()

# Create a pipeline: impute -> standardize -> logistic regression.
model_pipeline = make_pipeline(imputer, scaler, LogisticRegression(max_iter=1000))

# Train the model
model_pipeline.fit(X_train, y_train)

# Predict and evaluate the model on the held-out split
y_pred = model_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6290322580645161
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.70      0.58        23
           1       0.77      0.59      0.67        39

    accuracy                           0.63        62
   macro avg       0.63      0.64      0.62        62
weighted avg       0.67      0.63      0.64        62



In [36]:
from joblib import dump

# Serialize the fitted pipeline to disk with joblib. The file holds whatever
# `model_pipeline` refers to when this cell runs, so re-run it after retraining.
dump(value=model_pipeline, filename='nba_game_predictor_model.joblib')

['nba_game_predictor_model.joblib']