In [None]:
import getpass
import json
import sys

import numpy as np
import pandas as pd
import pymongo
import requests
import seaborn as sn
import statsmodels.api as sm
from sklearn import metrics
from sklearn.model_selection import train_test_split

sys.path.insert(1, '/Users/peterberryman/Desktop/bts_advisor')
import app

### Past Results

In [None]:
# Database
# Prompt for the MongoDB connection string without echoing it: input() would
# write the typed URI (including any embedded credentials) into the saved cell output.
client = pymongo.MongoClient(getpass.getpass('MongoDB URI: '))

In [None]:
# Handles to the 'statcast' database and its 'statcastEvents' collection.
database = client.get_database('statcast')
collection = database.get_collection('statcastEvents')

In [None]:
# Read every document in the collection (no filter) and load them into a DataFrame.
statcast_df = pd.DataFrame(list(collection.find()))
statcast_df

In [None]:
# Flag each event row: 1 when the event is a single, double, triple or home run, otherwise 0.
HIT_EVENTS = ['home_run', 'triple', 'double', 'single']
statcast_df['hit'] = statcast_df['events'].isin(HIT_EVENTS).astype('int64')
statcast_df

In [None]:
# One row per (game_pk, batter): sum the hit flags, then cap the total at 1 so
# 'hit' records whether the batter had at least one hit in that game.
hits_per_game = statcast_df.groupby(['game_pk', 'batter'])['hit'].sum()
outcomes_df = hits_per_game.clip(upper=1).reset_index()
outcomes_df

In [None]:
# Count how many batter-games ended with a hit (1) versus without one (0).
outcomes_df['hit'].value_counts()

### Predictors

In [None]:
# Load the 2021 schedule through the project-local `app` module.
# NOTE(review): presumably a DataFrame with 'game_time' and 'game_date' columns,
# since the following cell indexes it that way — confirm against app.get_schedule.
schedule = app.get_schedule(year=2021)

In [None]:
# Keep only games whose 'game_time' is 'Final', then take the distinct game dates,
# skipping the first ten and the last one.
is_final = schedule['game_time'].eq('Final')
schedule = schedule.loc[is_final]
unique_dates = schedule['game_date'].unique()
game_dates = unique_dates[10:-1]
game_dates

In [None]:
# Fetch the table data for each selected date from the local service and stack
# the per-date 'rows', 'games' and 'metrics' payloads into three DataFrames.
# Frames are collected in lists and concatenated once: DataFrame.append was
# removed in pandas 2.0 and re-copies the accumulated frame on every iteration.
row_frames, game_frames, metric_frames = [], [], []
for date in game_dates[-4:-1]:  # NOTE(review): only three dates are requested — confirm this slice is intended
    url = f'http://localhost:5000/loadTableData?hitMin=10&date={date}'
    print(url)
    response = requests.get(url)
    response.raise_for_status()  # stop on an HTTP error instead of trying to parse an error page
    response_json = response.json()
    row_frames.append(pd.DataFrame(response_json['rows']))
    # 'games' and 'metrics' are transposed so each top-level key becomes a row;
    # the former index is kept as the 'game_pk' / 'batter' column.
    game_frames.append(pd.DataFrame(response_json['games']).T.reset_index().rename({'index': 'game_pk'}, axis=1))
    metric_frames.append(pd.DataFrame(response_json['metrics']).T.reset_index().rename({'index': 'batter'}, axis=1))

# pd.concat raises on an empty list, so fall back to an empty frame when no dates were requested.
rows_df = pd.concat(row_frames, ignore_index=True) if row_frames else pd.DataFrame()
games_df = pd.concat(game_frames, ignore_index=True) if game_frames else pd.DataFrame()
metrics_df = pd.concat(metric_frames, ignore_index=True) if metric_frames else pd.DataFrame()

In [None]:
# Base
# Display the stacked 'rows' records; the lineup cell below merges game data onto this table.
rows_df

In [None]:
# Lineups
def _batting_order(batter, away_lineup, home_lineup):
    """Return the batter's 1-based position in whichever lineup lists them, or 0 if in neither.

    The away lineup is checked before the home lineup.
    """
    for lineup in (away_lineup, home_lineup):
        if batter in lineup:
            return lineup.index(batter) + 1
    return 0


# Merge keys must share a dtype; 'game_pk' in games_df comes from a reset index.
games_df['game_pk'] = games_df['game_pk'].astype(int)
df = pd.merge(rows_df, games_df[['game_pk', 'away_lineup', 'home_lineup']], on='game_pk')
# Built from zipped columns rather than apply(axis=1): apply returns a DataFrame
# (not a Series) when the merged frame is empty, which breaks the column assignment.
df['order'] = [
    _batting_order(batter, away, home)
    for batter, away, home in zip(df['batter'], df['away_lineup'], df['home_lineup'])
]
# Keep only batters found in a lineup; the lineup lists are no longer needed afterwards.
df = df[df['order'] > 0].drop(['away_lineup', 'home_lineup'], axis=1)
df

In [None]:
# Outcomes
# Attach the per-game hit flag; the default inner join keeps only rows whose
# (game_pk, batter) pair also appears in outcomes_df.
df = df.merge(outcomes_df, on=['game_pk', 'batter'])
df

In [None]:
# Metrics
metrics_df['batter'] = metrics_df['batter'].astype(int)
# Discard every '*_color' column before joining.
color_cols = [col for col in metrics_df.columns if col.endswith('_color')]
metrics_df = metrics_df.drop(columns=color_cols)
# NOTE(review): metrics_df is stacked across several dates, so a batter may appear more
# than once; a left join on 'batter' alone would then duplicate rows in df — confirm and,
# if so, join on a date key as well.
df = df.merge(metrics_df, how='left', on='batter')
df

In [None]:
# Remove extraneous columns
# errors='ignore' skips any listed column that is not present in df.
extraneous_cols = ['batter', 'game_pk', 'probability', 'B', 'name', 'team', 'G_weighted', 'H_weighted']
df = df.drop(columns=extraneous_cols, errors='ignore')
df

### Model

In [None]:
# Separate the target ('hit') from the predictors and hold out 30% of the rows
# for testing; the fixed random_state makes the split repeatable.
target_col = 'hit'
y = df[target_col]
X = df.drop(columns=[target_col])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Class balance of the training target.
y_train.value_counts()

In [None]:
# Show the training rows that contain at least one missing value.
X_train[X_train.isna().any(axis=1)]

In [None]:
# Fit a logistic regression of the hit flag on the predictors (cast to float).
# NOTE(review): no constant column is added to the design matrix, so the model is
# fitted without an intercept — confirm this is intended.
logit = sm.Logit(endog=y_train, exog=X_train.astype(float))
model = logit.fit()
model.summary()

In [None]:
# Score the held-out rows and show the test features alongside the model's
# prediction in a new 'probability' column (X_test itself is left untouched).
y_pred = model.predict(X_test.astype(float))
X_test_copy = X_test.assign(probability=y_pred)
X_test_copy

In [None]:
# y_pred holds continuous model outputs, so cross-tabulating it directly yields one
# column per distinct value rather than a confusion matrix. Convert the predictions
# to class labels first (1 when the predicted probability is at least 0.5).
y_pred_label = (y_pred >= 0.5).astype(int)
confusion_matrix = (
    pd.crosstab(y_test, y_pred_label, rownames=['Actual'], colnames=['Predicted'])
    # Force a 2x2 table even when one class is never observed or never predicted.
    .reindex(index=[0, 1], columns=[0, 1], fill_value=0)
)
# fmt='d' prints the cell counts as plain integers.
sn.heatmap(confusion_matrix, annot=True, fmt='d')