In [None]:
import pandas as pd
import numpy as np
import json
import requests
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn import metrics
import seaborn as sn

import os
import sys

# Directory containing the project's `app` module. It can be overridden with the
# BTS_ADVISOR_PATH environment variable so the notebook is not tied to one
# machine; the default is the original hard-coded location.
BTS_ADVISOR_PATH = os.environ.get('BTS_ADVISOR_PATH', '/Users/peterberryman/Desktop/bts_advisor')
# Position 1 keeps sys.path[0] ahead of the project directory on the search path.
sys.path.insert(1, BTS_ADVISOR_PATH)
import app

### Predictors

In [None]:
# Fetch the 2021 schedule through the project's `app` module and display it.
# NOTE(review): presumably a DataFrame with 'game_time' and 'game_date' columns
# (later cells index it that way) — confirm against app.get_schedule.
schedule = app.get_schedule(year=2021)
schedule

In [None]:
# Keep only the rows whose 'game_time' is 'Final'.
schedule = schedule.loc[schedule['game_time'] == 'Final']

# Distinct game dates, minus the first 30 entries and the very last entry.
# NOTE(review): the slice also drops the final date, which the original comment
# does not mention — confirm that is intended.
unique_dates = schedule['game_date'].unique()
game_dates = unique_dates[30:-1]  # Not interested in first month of season (not enough data)
game_dates

In [None]:
# Download one table per game date from the service on localhost:5000 and stack
# them into a single frame with a fresh 0..n-1 index.
# NOTE(review): only game_dates[-11:-1] is used, i.e. ten dates that exclude the
# last entry of game_dates — confirm this window is intended.
dfs = []
for date in game_dates[-11:-1]:
    print(date)
    url = f'http://localhost:5000/loadTableData?hitMin=10&date={date}'
    dfs.append(pd.read_json(url))
df = pd.concat(dfs, ignore_index=True)

In [None]:
# Drop the columns that are not wanted as predictors. Names missing from the
# frame are filtered out first, so re-running the cell does not raise.
extraneous = ['batter', 'game_pk', 'probability', 'B', 'name', 'team', 'G_weighted', 'H_weighted']
df.drop(columns=[col for col in extraneous if col in df.columns], inplace=True)

# Collapse 'hit' to a binary target: 1 when the value is positive, otherwise 0.
df['hit'] = df['hit'].apply(lambda hits: int(hits > 0))
df.columns

In [None]:
df

### Model

In [None]:
# The target is the binary 'hit' column; every other column is a predictor.
y = df['hit']
X = df.drop(columns='hit')

# Hold out 30% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,  # fixed seed so the split is reproducible
)

In [None]:
# Class balance of the training target: number of rows for each value of 'hit'.
y_train.value_counts()

In [None]:
# Show the training rows that contain at least one missing value.
# NOTE(review): the rows are only displayed, not removed — confirm the fit
# further down never receives NaNs.
X_train[X_train.isna().any(axis=1)]

In [None]:
# Fit a logistic regression of 'hit' on all predictor columns, cast to float.
# NOTE(review): there is no sm.add_constant(...) call, so unless X_train already
# contains a constant column the model appears to be fitted without an
# intercept — confirm this is intended.
model = sm.Logit(y_train, X_train.astype(float)).fit()
# Display the fitted model's summary table.
model.summary()

In [None]:
# Predicted values for the held-out rows, using the same float cast as the fit.
y_pred = model.predict(X_test.astype(float))

# Copy of the test predictors with the prediction attached as 'probability';
# X_test itself is left untouched.
X_test_copy = X_test.assign(probability=y_pred)
X_test_copy

In [None]:
# Cutoff applied to the predicted values: predictions >= threshold are labelled 1,
# everything else 0.
threshold = 0.75

In [None]:
# Turn the predictions into 0/1 labels at `threshold` (>= threshold -> 1).
y_pred_label = (y_pred >= threshold).astype(int)

confusion_matrix = (
    pd.crosstab(y_test, y_pred_label, rownames=['Actual'], colnames=['Predicted'])
    # Force a full 2x2 table: with a high cutoff one predicted class may never
    # occur, and crosstab would otherwise omit that row/column entirely.
    .reindex(index=[0, 1], columns=[0, 1], fill_value=0)
)

# fmt='d' prints the cells as plain integers; the default annotation format
# ('.2g') shows counts of 100 or more in scientific notation.
sn.heatmap(confusion_matrix, annot=True, fmt='d');

In [None]:
# Persist the fitted model to 'log_reg_model.pickle' (a relative path, so it is
# written to the current working directory).
# NOTE(review): presumably loaded later by the bts_advisor app — confirm; pickle
# files should only be loaded from trusted sources.
model.save('log_reg_model.pickle')