In [58]:
# imports
import sqlite3
import operator
import re
import os

In [59]:
# create connection to sql db
sql_connect = sqlite3.connect('../data/lahmansbaseballdb.sqlite')
cursor = sql_connect.cursor()

# switches between retrosheet ID's and Lahman database ID's
def retro_to_lahman(retro_id):
    query = 'SELECT playerID FROM people WHERE retroID = "' + retro_id + '"'
    result = cursor.execute(query).fetchall()
    return result[0][0] if result else ''

In [60]:
home_batters_idx = [132, 135, 138, 141, 144, 147, 150, 153, 156]
away_batters_idx = [105, 108, 111, 114, 117, 120, 123, 126, 129]

# generates the features and label of a given game
def example(game):
    fields = game.split(',')
    print(fields[0])
    year = fields[0][0:4]
    date = fields[0][4:8]
    features = []
    for idx in [105]: # away_batters_idx:
        features += player_stats(fields[idx], year, date, None)

In [84]:
# aggregates all career data from all players up to, but not including, year
def get_career_stats(year):
    query = """
        SELECT playerID, (sum(H) + sum(BB) + sum(HBP) + sum(IBB))
        / CAST((sum(AB) + sum(BB) + sum(IBB) + sum(HBP) + sum(SF)) as REAL),
        (sum(H) + sum("2B") + 2*sum("3B") + 3*sum(HR)) / CAST(sum(AB) as REAL), 
        sum(HR), sum(BB), sum(RBI), sum(SO), sum(AB)
        FROM batting WHERE {y1} <= yearID < {y2} GROUP BY playerID;""".format(y1 = int(year) - 30, y2 = int(year))
    # query goes thirty years back for long-time veterans
    result = list(cursor.execute(query).fetchall())
    
    # convert list of tuples into dictionary
    career_stats = {}
    for p in result:
        career_stats[p[0]] = dict([('OBP', p[1]), ('SLG', p[2]), ('HR', p[3]),
                             ('BB', p[4]), ('RBI', p[5]), ('K', p[6]), ('AB', p[7])])
    return career_stats

In [85]:
# updates the season_stats dict based on an input series of plays, until the end date has been reached.
# the season_stats dict consists of pairs of a playerID key with a value of a dict containing that player's stats
def update_season_stats(season_stats, plays, end_date):
    for play in plays:
        # reached the end date
        if play.startswith('id') and (re.search(".*" + end_date + ".", play) or int(play[-6:-2]) > int(end_date)):
            return season_stats # technically don't need return val, as season_stats param persists outside of function
        elif play.startswith('play'):
            fields = play.split(',')
            # create empty dict if new player
            if fields[3] not in season_stats:
                season_stats[fields[3]] = {}
            player = season_stats[fields[3]]
            
            # parse through specific play and increment specific values based on the play.
            # see https://www.retrosheet.org/datause.txt for reference
            to_increment = []
            if fields[6].startswith('S') and not fields[6].startswith('SB'): # single
                to_increment += [('H', 1), ('TB', 1), ('AB', 1), ('PA', 1)]
            elif fields[6].startswith('D'): # double
                to_increment += [('H', 1), ('TB', 2), ('AB', 1), ('PA', 1)]
            elif fields[6].startswith('T'): # triple
                to_increment += [('H', 1), ('TB', 3), ('AB', 1), ('PA', 1)]
            elif fields[6].startswith('HR'): # home run
                to_increment += [('H', 1), ('TB', 4), ('HR', 1), ('AB', 1), ('PA', 1)]
                if not fields[6].count('-H'): # sometimes HR's don't record base advancement
                    to_increment += [('RBI', 1)]
            elif fields[6].startswith('W') or fields[6].startswith('IW'): # walk or intentional walk
                to_increment += [('BB', 1), ('PA', 1)]
            elif fields[6].startswith('K'): # strikeout
                to_increment += [('K', 1), ('AB', 1), ('PA', 1)]
            elif 'SF' in fields[6]: # sac fly
                to_increment += [('PA', 1)]
            elif not fields[6].startswith('NP') and not fields[6].startswith('HP') and not fields[6].startswith('CS'):
                to_increment += [('AB', 1), ('PA', 1)]
            # for RBI's, each play dictates the advancement of baserunners, i.e. 2-H indicates going from second to home.
            # since each play doesn't explicitly track the RBI's, we can count the number of advancements to home instead
            to_increment += [('RBI', fields[6].count('-H'))]
            
            for key_value_pair in to_increment:
                dict_add(player, key_value_pair)
            
# one function for creation of dict entry and incrementing of dict entry
def dict_add(dictionary, key_value_pair):
    key, value = key_value_pair
    if key in dictionary:
        dictionary[key] += value
    else:
        dictionary[key] = value

In [91]:
# creates examples for a given year in time
def season(year, verbose=True):
    games = open('../data/retrosheet_game_logs/GL' + str(year) + '.TXT').readlines()
    oldest_date = 0
    career_stats = get_career_stats(year) # career stats don't change
    season_stats = {} # season stats change as the season progresses
    print_counter = 1000
    for game in games:
        game = game.replace('"', '')
        date = game[4:8]
        
        print_counter += 1
        if print_counter > 350 and verbose:
            print_counter = 0
            print('Evaluating game ' + date + " " +  str(year))
            season_stats = get_season_stats(year, date)
        if int(date) > oldest_date: # update stats to new date
            oldest_date = int(date)
            
        pass# example(game)
    example(games[0].replace('"',''))
    
# gets season stats of year until end_date
def get_season_stats(year, end_date):
    season_stats = {}
    for file in os.listdir('../data/retrosheet_event_logs'):
        if re.search(str(year) + '.*EV.', file):
            update_season_stats(season_stats, open('../data/retrosheet_event_logs/' + file), date)
    return season_stats
        
season(2016)
# season_stats = {}

# print(season_stats.keys())
print(season_stats['milte001'])
print('done')

Evaluating game 0403 2016
Evaluating game 0430 2016
Evaluating game 0526 2016
Evaluating game 0621 2016
Evaluating game 0720 2016
Evaluating game 0815 2016
Evaluating game 0910 2016
20160403
[0, 0, 263, 684, 742, 1459, 5582]
{'K': 1, 'AB': 2, 'PA': 2, 'RBI': 0}
done


In [61]:
# return's a given player's stats at the current date and year in the following format:
# first seven features: career OBP, SLG, HR, BB, RBI, K, AB
# next seven features: current season OBP, SLG, HR, BB, RBI, K, AB
def player_stats(player, year, date, season_stats):
    stats = []
    
    # season stats
    
    season_stats = [0.2, 0.3, 0, 0, 0, 0, 0, 0] # approx replacement level OBP and SLG
    
    # career stats
    
    query = """
        SELECT (sum(H) + sum(BB) + sum(HBP) + sum(IBB)) / (sum(AB) + sum(BB) + sum(IBB) + sum(HBP) + sum(SF)) as OBP,
        (sum(H) + sum("2B") + 2*sum("3B") + 3*sum(HR)) / sum(AB) as SLG, sum(HR), sum(BB), sum(RBI), sum(SO), sum(AB)
        FROM batting WHERE playerID = "{p}" and yearID < {y};""".format(p=retro_to_lahman(player), y=year)
    career_stats = list(cursor.execute(query).fetchall()[0])
    print(career_stats)
    if None in career_stats: # rookie player
        career_stats = season_stats
    else:
        
        career_stats = list(map(operator.add, career_stats, season_stats))
    
    stats += career_stats
    
    # season stats
    
    return stats