In [7]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
import xgboost as xgb
import joblib


In [11]:
# Load the dataset
df = pd.read_csv('../model/masterdata.csv')

#Remove first 250 rows (because many custom stat values are empty)
df = df.tail(len(df)-255)

# Convert 'game_date' column to datetime objects
df['game_date'] = pd.to_datetime(df['game_date'])

# Get today's date
today = datetime.today().strftime('%Y-%m-%d')

# Filter out rows with today's date
df = df[df['game_date'] != today]

# Drop the columns containing 'Name', 'ID', or '_P_'
columns_to_drop = [col for col in df.columns if 'Name' in col or 'ID' in col or '_P_' in col or 'bbrefID' in col]
df = df.drop(columns=columns_to_drop)

# Drop rows with missing values in the target variable
df = df.dropna(subset=['over_under_runline'])

# Define features and target variable
X = df.drop(columns=['over_under_target', 'runs_total', 'game_date', 'runs_home', 'runs_away', 'game_id', 'home_name', 'away_name'])
y = df['over_under_target']

# You can now proceed to train your model using X and y
# Example: Train a RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


  df = pd.read_csv('../model/masterdata.csv')


In [12]:
# Define a pipeline with imputer, scaler, and model
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('var_thresh', VarianceThreshold(threshold=0.1)),  # Remove low-variance features
    ('scaler', StandardScaler()),
    ('model', xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
])

# Define a parameter grid for XGBoost
param_grid1 = {
    'model__n_estimators': [50, 100, 200],
    'model__learning_rate': [0.01, 0.1],
    'model__max_depth': [3, 5, 9],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0],
    'model__gamma': [0, 0.1],
    'model__min_child_weight': [1, 3, 7]
}

param_grid2 = {
    'model__n_estimators': [25, 75, 150, 250],
    'model__learning_rate': [0.05, 0.15],
    'model__max_depth': [6, 12, 15],
    'model__subsample': [0.25, 0.5, 0.9],
    'model__colsample_bytree': [0.2, 0.4, 0.6],
    'model__gamma': [0.05, 0.2, 0.3],
    'model__min_child_weight': [4, 8, 12]
}

# Grid search for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid1, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1, verbose=1)

# Fit the model on the training data
grid_search.fit(X_train, y_train)

# Best model from grid search
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:\n', conf_matrix)
print('Classification Report:\n', class_report)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Accuracy: 0.6278153153153153
Confusion Matrix:
 [[475 386]
 [275 640]]
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.55      0.59       861
           1       0.62      0.70      0.66       915

    accuracy                           0.63      1776
   macro avg       0.63      0.63      0.62      1776
weighted avg       0.63      0.63      0.63      1776



In [13]:
print(len(y_test), len(y_train))

1776 7101


In [14]:
# Save the best model
joblib.dump(best_model, 'xgb_model.pkl')

['xgb_model.pkl']

## Train 0 Model

In [101]:
# Load the dataset
df0 = pd.read_csv('masterdata.csv')

# Convert 'game_date' column to datetime objects
df0['game_date'] = pd.to_datetime(df0['game_date'])

# Change the 'push' games to equal 0.
df0.loc[df0['over_under_runline'] == df0['runs_total'], 'over_under_target'] = 0

# Get today's date
today = datetime.today().strftime('%Y-%m-%d')

# Filter out rows with today's date
df0 = df0[df0['game_date'] != today]

# Drop the columns containing 'Name', 'ID', or '_P_'
columns_to_drop = [col for col in df0.columns if 'Name' in col or 'ID' in col or '_P_' in col or 'bbrefID' in col]
df0 = df0.drop(columns=columns_to_drop)

# Drop rows with missing values in the target variable
df0 = df0.dropna(subset=['over_under_runline'])


# Define features and target variable
X = df0.drop(columns=['over_under_target', 'runs_total', 'game_date', 'runs_home', 'runs_away', 'game_id', 'home_name', 'away_name'])
y = df0['over_under_target']

# You can now proceed to train your model using X and y
# Example: Train a RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Define a pipeline with imputer, scaler, and model
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('var_thresh', VarianceThreshold(threshold=0.1)),  # Remove low-variance features
    ('scaler', StandardScaler()),
    ('model', xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
])

# Define a more granular parameter grid for XGBoost
param_grid = {
    'model__n_estimators': [50, 100, 150, 200],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0],
    'model__gamma': [0, 0.1],
    'model__min_child_weight': [1, 3, 5]
}

# Grid search for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1, verbose=1)

# Fit the model on the training data
grid_search.fit(X_train, y_train)

# Best model from grid search
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:\n', conf_matrix)
print('Classification Report:\n', class_report)

# Save the best model
joblib.dump(best_model, 'xgb_model_0.pkl')

  df0 = pd.read_csv('masterdata.csv')


Fitting 5 folds for each of 864 candidates, totalling 4320 fits
Accuracy: 0.639030612244898
Confusion Matrix:
 [[325  97]
 [186 176]]
Classification Report:
               precision    recall  f1-score   support

           0       0.64      0.77      0.70       422
           1       0.64      0.49      0.55       362

    accuracy                           0.64       784
   macro avg       0.64      0.63      0.63       784
weighted avg       0.64      0.64      0.63       784



['xgb_model_0.pkl']

## Create 0-Trained Predictions

In [133]:
# Load the dataset
df = pd.read_csv('masterdata.csv')

# Convert 'game_date' column to datetime objects
df['game_date'] = pd.to_datetime(df['game_date'])

# Change the 'push' games to equal 0.
df0.loc[df0['over_under_runline'] == df0['runs_total'], 'over_under_target'] = 0

# Get today's date
today = datetime.today().strftime('%Y-%m-%d')

# Separate the data for today's games
todays_games = df[df['game_date'] == today]

# Check if there are any games today
if todays_games.empty:
    print("No games found for today.")
else:
    # Define the columns to drop
    columns_to_drop = [col for col in df.columns if 'Name' in col or 'ID' in col or '_P_' in col or '12' in col or '13' in col or '14' in col or '15'in col ]
    columns_to_drop.extend(['over_under_target', 'runs_total', 'game_date', 'runs_home', 'runs_away', 'game_id', 'home_name', 'away_name']) 

    # Drop the unnecessary columns
    X_todays_games = todays_games.drop(columns=columns_to_drop)

    # Load the trained model (assuming it's saved as 'model.pkl')
    xgb_model = joblib.load('xgb_model_0.pkl')

# Make predictions
predictions = xgb_model.predict(X_todays_games)

# Interpret and display the predictions
todays_games['prediction'] = predictions
for i, row in todays_games.iterrows():
    result = 'Over' if row['prediction'] == 1 else 'Under'
    print(f"Game {i + 2}: {result} the runline")

  df = pd.read_csv('masterdata.csv')


Game 8088: Under the runline
Game 8089: Under the runline
Game 8090: Under the runline
Game 8091: Under the runline
Game 8092: Under the runline
Game 8093: Under the runline
Game 8094: Under the runline
Game 8095: Over the runline
Game 8096: Under the runline
Game 8097: Over the runline
Game 8098: Under the runline
Game 8099: Under the runline


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  todays_games['prediction'] = predictions
