In [58]:
# imports
import sqlite3
import operator
import re
import os

In [59]:
# create connection to sql db
sql_connect = sqlite3.connect('../data/lahmansbaseballdb.sqlite')
cursor = sql_connect.cursor()

# switches between retrosheet ID's and Lahman database ID's
def retro_to_lahman(retro_id):
    query = 'SELECT playerID FROM people WHERE retroID = "' + retro_id + '"'
    result = cursor.execute(query).fetchall()
    return result[0][0] if result else ''

In [84]:
# aggregates all career data from all players up to, but not including, year
def get_career_stats(year):
    query = """
        SELECT playerID, (sum(H) + sum(BB) + sum(HBP) + sum(IBB))
        / CAST((sum(AB) + sum(BB) + sum(IBB) + sum(HBP) + sum(SF)) as REAL),
        (sum(H) + sum("2B") + 2*sum("3B") + 3*sum(HR)) / CAST(sum(AB) as REAL), 
        sum(HR), sum(BB), sum(RBI), sum(SO), sum(AB)
        FROM batting WHERE {y1} <= yearID < {y2} GROUP BY playerID;""".format(y1 = int(year) - 30, y2 = int(year))
    # query goes thirty years back for long-time veterans
    result = list(cursor.execute(query).fetchall())
    
    # convert list of tuples into dictionary
    career_stats = {}
    for p in result:
        career_stats[p[0]] = dict([('OBP', p[1]), ('SLG', p[2]), ('HR', p[3]),
                             ('BB', p[4]), ('RBI', p[5]), ('K', p[6]), ('AB', p[7])])
    return career_stats

In [85]:
# updates the season_stats dict based on an input series of plays, until the end date has been reached.
# the season_stats dict consists of pairs of a playerID key with a value of a dict containing that player's stats
def update_season_stats(season_stats, plays, end_date):
    for play in plays:
        # reached the end date
        if play.startswith('id') and (re.search(".*" + end_date + ".", play) or int(play[-6:-2]) > int(end_date)):
            return season_stats # technically don't need return val, as season_stats param persists outside of function
        elif play.startswith('play'):
            fields = play.split(',')
            # create empty dict if new player
            if fields[3] not in season_stats:
                season_stats[fields[3]] = {}
            player = season_stats[fields[3]]
            
            # parse through specific play and increment specific values based on the play.
            # see https://www.retrosheet.org/datause.txt for reference
            to_increment = []
            if fields[6].startswith('S') and not fields[6].startswith('SB'): # single
                to_increment += [('H', 1), ('TB', 1), ('AB', 1), ('PA', 1)]
            elif fields[6].startswith('D'): # double
                to_increment += [('H', 1), ('TB', 2), ('AB', 1), ('PA', 1)]
            elif fields[6].startswith('T'): # triple
                to_increment += [('H', 1), ('TB', 3), ('AB', 1), ('PA', 1)]
            elif fields[6].startswith('HR'): # home run
                to_increment += [('H', 1), ('TB', 4), ('HR', 1), ('AB', 1), ('PA', 1)]
                if not fields[6].count('-H'): # sometimes HR's don't record base advancement
                    to_increment += [('RBI', 1)]
            elif fields[6].startswith('W') or fields[6].startswith('IW'): # walk or intentional walk
                to_increment += [('BB', 1), ('PA', 1)]
            elif fields[6].startswith('K'): # strikeout
                to_increment += [('K', 1), ('AB', 1), ('PA', 1)]
            elif 'SF' in fields[6]: # sac fly
                to_increment += [('PA', 1)]
            elif not fields[6].startswith('NP') and not fields[6].startswith('HP') and not fields[6].startswith('CS'):
                to_increment += [('AB', 1), ('PA', 1)]
            # for RBI's, each play dictates the advancement of baserunners, i.e. 2-H indicates going from second to home.
            # since each play doesn't explicitly track the RBI's, we can count the number of advancements to home instead
            to_increment += [('RBI', fields[6].count('-H'))]
            
            for key_value_pair in to_increment:
                dict_add(player, key_value_pair)
            
# one function for creation of dict entry and incrementing of dict entry
def dict_add(dictionary, key_value_pair):
    key, value = key_value_pair
    if key in dictionary:
        dictionary[key] += value
    else:
        dictionary[key] = value

In [170]:
# creates examples for a given year in time
def season(year, verbose=True):
    games = open('../data/retrosheet_game_logs/GL' + str(year) + '.TXT').readlines()
    oldest_date = 0
    career_stats = get_career_stats(year) # career stats don't change
    season_stats = {} # season stats change as the season progresses
    print_counter = 1000
    for game in games[0:400]:
        game = game.replace('"', '')
        date = game[4:8]
        
        print_counter += 1
        if print_counter > 350 and verbose:
            print_counter = 0
            print('Evaluating game ' + date + " " +  str(year))
        if int(date) > oldest_date: # update stats to new date
            oldest_date = int(date)
            season_stats = get_season_stats(year, date)
    
# gets season stats of year until end_date
def get_season_stats(year, end_date):
    season_stats = {}
    for file in os.listdir('../data/retrosheet_event_logs'):
        if re.search(str(year) + '.*EV.', file):
            update_season_stats(season_stats, open('../data/retrosheet_event_logs/' + file), end_date)
    return season_stats
        
dubbo, bedward = season(2016)
i = 0
for idx in home_batters_idx:
    print(dubbo.split(',')[idx])
    print(bedward[0][2 + 14*i:16 + 14*i])
    i += 1
print('done')

Evaluating game 0403 2016
utlec001
[0.36302100301603896, 0.4649920600548578, 259, 729, 1032, 1205, 6927, 0.36, 0.45714285714285713, 0, 5, 7, 12, 70]
seagc001
[0.36462218698949406, 0.48732540093119503, 75, 198, 277, 409, 1933, 0.3068181818181818, 0.4050632911392405, 2, 8, 11, 11, 79]
turnj001
[0.3684804444683327, 0.4678124089717448, 120, 338, 477, 569, 3433, 0.3088235294117647, 0.3548387096774194, 0, 5, 5, 12, 62]
gonza003
[0.3706929656201955, 0.48530634876628775, 320, 791, 1216, 1417, 7214, 0.4117647058823529, 0.52, 3, 9, 14, 16, 75]
puigy001
[0.3522361373551561, 0.47457078069322967, 134, 305, 424, 697, 3087, 0.3625, 0.4722222222222222, 2, 8, 9, 16, 72]
grany001
[0.35101665697719947, 0.4454674220963173, 141, 466, 420, 775, 2824, 0.425, 0.3870967741935484, 0, 9, 4, 2, 31]
crawc002
[0.332587106555406, 0.43474347434743477, 136, 377, 768, 1070, 6666, 0.2727272727272727, 0.36363636363636365, 0, 0, 2, 3, 11]
pedej001
[0.34477670494648593, 0.475071907957814, 126, 303, 295, 596, 2086, 0.390625

In [148]:
home_batters_idx = [132, 135, 138, 141, 144, 147, 150, 153, 156]
away_batters_idx = [105, 108, 111, 114, 117, 120, 123, 126, 129]

# generates the features and label of a given game
def example(game, career_stats, season_stats):
    fields = game.split(',')
    year = fields[0][0:4]
    date = fields[0][4:8]
    home_team = fields[6]
    away_team = fields[3]
    home_pitcher = fields[103]
    away_pitcher = fields[101]
    label = 1 if int(fields[10]) - int(fields[9]) > 0 else 0 # 1 if home team won
    
    # add features
    features = []
    features += team_stats(home_team, year, date)
    features += team_stats(away_team, year, date)
    for idx in home_batters_idx:
        features += batter_stats(fields[idx], year, date, career_stats, season_stats)
    features += pitcher_stats(home_pitcher, year, date, career_stats, season_stats)
    for idx in away_batters_idx:
        features += batter_stats(fields[idx], year, date, career_stats, season_stats)
    features += pitcher_stats(away_pitcher, year, date, career_stats, season_stats)
    
    return features, label

In [169]:
# return's a given batter's stats at the current date and year in the following format:
# first seven features: career OBP, SLG, HR, BB, RBI, K, AB
# next seven features: current season OBP, SLG, HR, BB, RBI, K, AB
def batter_stats(player, year, date, career_stats, season_stats):
    
    lahman_id = retro_to_lahman(player)
    
    # season stats
    if player not in season_stats or season_stats[player]['AB'] == 0: # first player appearance
        season = [0.2, 0.3, 0, 0, 0, 0, 0] # approx replacement level OBP and SLG
    else:
        s = season_stats[player]
        season = [(s.get('H', 0) + s.get('BB', 0)) / float(s.get('PA', 1)), # OBP
                  s.get('TB', 0) / float(s.get('AB', 1)), # SLG
                  s.get('HR', 0), s.get('BB', 0), s.get('RBI', 0), s.get('K', 0), s.get('AB', 0)]
        
    # career stats
    if lahman_id not in career_stats or career_stats[lahman_id]['AB'] == 0: # rookie season
        career = [0, 0, 0, 0, 0, 0, 0]
    else:
        c = career_stats[lahman_id]
        career = [c['OBP'], c['SLG'], c['HR'], c['BB'], c['RBI'], c['K'], c['AB']]
            
    # incorporate current season into career stats
    
    # weighted average of OBP and SLG
    total_AB = season[6] + career[6]
    total_PA = total_AB + season[3] + career[3]
    if total_AB == 0 or total_PA == 0:
        career[0] = 0.2
        career_stats[1] = 0.3
    else:
        career[0] = (season[0] * (season[6] + season[3]) + career[0] * (career[6] + career[3])) / total_PA
        career[1] = (season[1] * season[6] + career[1] * career[6]) / total_AB
    # add all of the counting stats
    for i in [2, 3, 4, 5, 6]:
        career[i] += season[i]
    
    return career + season

In [127]:
def pitcher_stats(player, year, date, career_stats, season_stats):
    return [player + ' pitching stats']

In [128]:
def team_stats(team, year, date):
    return [team + ' team stats']