In [1]:
# The intent of the notebook is to begin modelling the baseball data. This will likely not be the final model, 
#     however,z it will act as a building block to more advanced models.
# Version: 1.0

In [10]:
import import_ipynb
from sklearn import svm
from pymysql import connect
from sys import path
from pathlib import Path
path.append('../../../') 
from BaseballAnalytics.bin.app_utils.queries import Queries
from BaseballAnalytics.bin.app_utils.common_help import Log_Helper
from pickle import dump, load

In [3]:
# Connect to the database.
conn = connect(host="localhost", user="root", passwd="praquplDop#odlg73h?c", db="baseball_stats_db")
qu = Queries(conn)

In [4]:
# Get all the game ids. The information is returned as 
#    (Game_ID, year, day, month, Home_Score, Vis_Score, Home_Team, Visitng_Team, Home_Win) where a winning home team is flagged at 0.
game_outcomes = qu.get_game_outcomes()         
game_outcomes[0:5]

(('BOS199004090', 1990, 9, 4, 5, 2, 'BOS', 'DET', 0),
 ('CAL199004090', 1990, 9, 4, 4, 7, 'CAL', 'SEA', 1),
 ('CHA199004090', 1990, 9, 4, 2, 1, 'CHA', 'MIL', 0),
 ('HOU199004090', 1990, 9, 4, 4, 8, 'HOU', 'CIN', 1),
 ('KCA199004090', 1990, 9, 4, 6, 7, 'KCA', 'BAL', 1))

In [5]:
# Filter the games past May 15 (Approximentally one week) to acquire some data.
game_outcomes_cpy = list(game_outcomes)
filter_games = []
for game in game_outcomes_cpy:
    if (game[2] < 15 and game[3] < 5):        # Remove all games prior to April 15.
        pass
    else:
        filter_games.append(game)
game_outcomes_cpy = None                      # Send the data to the ether to reserve space.
filter_games[0:10]

[('ATL199004150', 1990, 15, 4, 6, 13, 'ATL', 'CIN', 1),
 ('CAL199004150', 1990, 15, 4, 4, 1, 'CAL', 'MIN', 0),
 ('CHA199004150', 1990, 15, 4, 4, 1, 'CHA', 'CLE', 0),
 ('DET199004150', 1990, 15, 4, 6, 4, 'DET', 'BAL', 0),
 ('HOU199004150', 1990, 15, 4, 4, 5, 'HOU', 'LAN', 1),
 ('KCA199004150', 1990, 15, 4, 4, 5, 'KCA', 'TOR', 1),
 ('MON199004150', 1990, 15, 4, 3, 1, 'MON', 'NYN', 0),
 ('NYA199004150', 1990, 15, 4, 3, 1, 'NYA', 'TEX', 0),
 ('PHI199004150', 1990, 15, 4, 4, 0, 'PHI', 'SLN', 0),
 ('PIT199004150', 1990, 15, 4, 3, 3, 'PIT', 'CHN', 0)]

In [6]:
# Retrieve only the game ids and their outcomes.
game_ids = [[game_id[0], game_id[8]] for game_id in filter_games]
game_ids = game_ids[200:]                                   # Cut off the first 200 games.
print("The number of game ids: {}".format(len(game_ids)))
game_ids[0:5]

The number of game ids: 65734


[['LAN199005020', 0],
 ['MON199005020', 1],
 ['NYA199005020', 1],
 ['NYN199005020', 0],
 ['PHI199005020', 0]]

In [7]:
# Fetch the players and pitchers from the game.
data_locs = Path(r'C:\Users\micha\Documents\Baseball_Analytics_Source_Data\model_v1')
pitchers_pickle = 'game_pitchers.pickle'
batters_pickle = 'game_players.pickle'
offensive_feat_pickle = 'raw_offensive_features.pickle'
offensive_features = qu.get_all_offensive_features(data_locs / offensive_feat_pickle)
all_pitcher = qu.get_pitchers_in_all_games_vOne(data_locs / pitchers_pickle)
all_batters = qu.get_batters_in_all_games_vOne(data_locs / batters_pickle)

In [8]:
# Gather all the features from every game.
features_pickle = 'game_features.pickle'
lh = Log_Helper()
all_features = {}
num_games = len(game_ids)
lh.print_progress_bar(0, num_games, prefix = 'Progress:', suffix = 'Complete', length = 50)    # Initial call to print 0% progress
for num, game_id in enumerate(game_ids):
    all_features[game_id[0]] = qu.get_game_features(all_batters, all_pitcher, offensive_features, game_id[0])
    lh.print_progress_bar(num + 1, num_games, prefix = 'Progress:', suffix = 'Complete', length = 50)
with open(data_locs / features_pickle, 'wb') as f:
    dump(all_features, f)

0507659999984753
8.0.10531849999961196
1.0.00014520000013362733
2.0.00028830000019297586
3.0.011289300000044022
4.0.011521899999934249
5.0.011658299999908195
6.0.01183549999996103
7.0.13407210000013947
8.0.13468210000019099
1.0.0007507000000259723
2.0.0010079999997287814
3.0.013399299999946379
4.0.01372140000012223
5.0.013970299999982672
6.0.014199400000052265
7.0.031125900000006368
8.0.03140309999980673
1.0.0016372000000046683
2.0.002578900000116846
3.0.04316560000006575
4.0.04394570000022213
5.0.044089099999837345
6.0.0442137999998522
7.0.065385200000037
8.0.06564710000020568
1.0.0002104000000144879
2.0.000387499999760621
3.0.011846800000057556
4.0.012529300000096555
5.0.01279530000010709
6.0.013495199999852048
7.0.025247199999739678
8.0.025574499999947875
1.0.0007184000000961532
2.0.0008858000001055188
3.0.009566299999733019
4.0.01024939999979324
5.0.010447900000144728
6.0.01059979999990901
7.0.09476420000009966
8.0.0950751999998829
1.0.00023700000019744039
2.0.00039640000022700406


KeyboardInterrupt: 

60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
60
