In [None]:
# First season to collect; the player lookup iterates range(START_YEAR, CURRENT_SEASON + 1).
START_YEAR = 2005
# sportId values sent to the player-stats endpoint, one request per (year, sportId) pair.
# NOTE(review): presumably 1 = MLB and 11-16 = minor-league levels — confirm against the Stats API.
SPORT_IDS = [1,11,12,13,14,15,16]

In [None]:
# Last season included in the player lookup (inclusive upper bound of the year range).
# The worker function also compares each player-year against it to choose the
# month-limited download path.
CURRENT_SEASON = 2024
# When True, the player lookup skips every year except CURRENT_SEASON.
ONLY_UPDATE_CURRENT_SEASON = False

In [None]:
# Passed as endMonth for current-season downloads: games whose month is >= this
# value are skipped, i.e. it is an exclusive upper bound.
STOP_MONTH = 8

In [None]:
import requests
import sqlite3

In [None]:
from tqdm import tqdm
import threading

Generate Game Log Data

In [None]:
# Shared lock the worker threads hold while writing to the database
# (game-log inserts and the journal_mode PRAGMA).
dbWriteLock = threading.Lock()

In [None]:
def GenerateHitterYearGameLogs(db, mlbId, year, startMonth=0, endMonth=13):
    """Download one hitter's game logs for a season and store them in Player_Hitter_GameLog.

    The function is a no-op when the table already holds at least one row for
    this player and year with Month >= startMonth. Otherwise it requests the
    season's game log from statsapi.mlb.com, keeps the games whose month lies
    in [startMonth, endMonth), and inserts one row per game in a single
    transaction while holding dbWriteLock.

    Args:
        db: Open sqlite3 connection used for both the existence check and the insert.
        mlbId: Player id (int or numeric string).
        year: Season (int or numeric string).
        startMonth: First month to keep (inclusive).
        endMonth: First month to drop (exclusive).

    Returns:
        None. Non-200 responses and parsing/insert errors are printed, not raised.

    Raises:
        ValueError: If mlbId, year, startMonth or endMonth is not numeric.
        requests.RequestException: If the HTTP request itself fails or times out.
    """
    playerId = int(mlbId)
    season = int(year)
    firstMonth = int(startMonth)
    stopMonth = int(endMonth)

    cursor = db.cursor()
    # Check if data already exists. Values are bound as parameters instead of
    # being formatted into the SQL text.
    existingRows = cursor.execute(
        "SELECT COUNT(*) FROM Player_Hitter_GameLog WHERE mlbId=? AND Year=? AND Month>=?",
        (playerId, season, firstMonth),
    ).fetchone()[0]
    if existingRows > 0:
        return

    # A timeout keeps one stalled connection from blocking the calling thread forever.
    response = requests.get(
        f"https://statsapi.mlb.com/api/v1/people/{mlbId}/stats?stats=gameLog&leagueListId=mlb_milb&group=hitting&gameType=R&sitCodes=1,2,3,4,5,6,7,8,9,10,11,12&hydrate=team&language=en&season={year}",
        timeout=30,
    )
    if response.status_code != 200:
        print(f"Status code {response.status_code} for id={mlbId} year={year}")
        return

    try:
        # An empty "stats" list means there is nothing to store for this season.
        statGroups = response.json().get("stats") or []
        games = statGroups[0]["splits"] if statGroups else []

        gameLogs = []
        for game in games:
            _, month, day = game["date"].split("-")
            month = int(month)
            if month < firstMonth or month >= stopMonth:
                continue

            gameId = int(game["game"]["gamePk"])
            # Use the first listed position; fall back to 10 when none is listed.
            # NOTE(review): 10 presumably stands for DH — confirm against the position codes.
            positions = game.get("positionsPlayed") or []
            if len(positions) > 0:
                pos = int(positions[0]["code"])
            else:
                pos = 10

            # The home team is the player's team for home games, otherwise the opponent.
            isHomeGame = int(game["isHome"])
            if isHomeGame:
                homeTeamId = int(game["team"]["id"])
            else:
                homeTeamId = int(game["opponent"]["id"])

            level = int(game["sport"]["id"])
            stats = game["stat"]
            gameLogs.append((
                gameId, playerId, int(day), month, season,
                int(stats["atBats"]),
                int(stats["hits"]),
                int(stats["doubles"]),
                int(stats["triples"]),
                int(stats["homeRuns"]),
                int(stats["strikeOuts"]),
                int(stats["baseOnBalls"]),
                int(stats["stolenBases"]),
                int(stats["caughtStealing"]),
                int(stats["hitByPitch"]),
                pos, level, homeTeamId,
            ))

        if not gameLogs:
            return

        # `with db` commits on success and rolls back on error, so a failed
        # insert cannot leave a transaction open on this connection.
        with dbWriteLock, db:
            cursor.executemany("INSERT INTO Player_Hitter_GameLog('gameId', 'mlbId', 'Day', 'Month','Year','AB','H','2B','3B','HR','K','BB','SB','CS','HBP','Position','Level','HomeTeamId') VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", gameLogs)

    except Exception as e:
        print(f"Exception On Id={mlbId} year={year}: {e}")
        return

In [None]:
# Create dictionary of player ids-year combination
# Don't need any data, just a lookup table: keys are "<playerId>,<year>" strings
# and every value is None, so a player listed under several sportIds in the
# same year is stored only once.
playerYearDict = {}
for year in range(START_YEAR, CURRENT_SEASON + 1):
    if ONLY_UPDATE_CURRENT_SEASON and year != CURRENT_SEASON:
        continue

    for sportId in SPORT_IDS:
        print(f"Getting players for Year={year} SportId={sportId}")
        # NOTE(review): one page of up to 5000 rows is requested (limit=5000, offset=0)
        # with no pagination — confirm no year/sportId combination exceeds that.
        response = requests.get(f"https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season={year}&sportId={sportId}&stats=season&group=hitting&gameType=R&limit=5000&offset=0&sortStat=homeRuns&order=desc")
        if response.status_code != 200:
            # The old message also formatted an undefined `position` variable, which
            # raised NameError here instead of logging the failure and moving on.
            print(f"Code {response.status_code} for Year={year} and sportId={sportId}")
            continue

        responseJson = response.json()
        jsonPlayers = responseJson['stats']
        for player in jsonPlayers:
            playerString = str(player["playerId"]) + "," + str(year)
            playerYearDict[playerString] = None

Getting players for Year=2005 SportId=1
Getting players for Year=2005 SportId=11
Getting players for Year=2005 SportId=12
Getting players for Year=2005 SportId=13
Getting players for Year=2005 SportId=14
Getting players for Year=2005 SportId=15
Getting players for Year=2005 SportId=16
Getting players for Year=2006 SportId=1
Getting players for Year=2006 SportId=11
Getting players for Year=2006 SportId=12
Getting players for Year=2006 SportId=13
Getting players for Year=2006 SportId=14
Getting players for Year=2006 SportId=15
Getting players for Year=2006 SportId=16
Getting players for Year=2007 SportId=1
Getting players for Year=2007 SportId=11
Getting players for Year=2007 SportId=12
Getting players for Year=2007 SportId=13
Getting players for Year=2007 SportId=14
Getting players for Year=2007 SportId=15
Getting players for Year=2007 SportId=16
Getting players for Year=2008 SportId=1
Getting players for Year=2008 SportId=11
Getting players for Year=2008 SportId=12
Getting players for 

In [None]:
# Number of worker threads; the player-year list is cut into this many
# contiguous slices, one per thread.
NUM_THREADS = 16

In [None]:
def HitterGameLogThreadFunction(data, threadIdx):
    """Worker: download and store hitter game logs for a slice of player-years.

    Opens its own connection to BaseballStats.db, switches the database to WAL
    journaling, and processes every entry of `data`. Entries for
    CURRENT_SEASON are downloaded incrementally (only months after the latest
    one already stored, up to but excluding STOP_MONTH); all other seasons are
    downloaded in full. Per-entry errors are printed and do not stop the loop.

    Args:
        data: Iterable of "<mlbId>,<year>" strings.
        threadIdx: Index into the global threadCompleteCounts list; that slot
            is incremented once per processed entry, whether it succeeded or not.
    """
    global threadCompleteCounts
    db = sqlite3.connect("BaseballStats.db")
    try:
        cursor = db.cursor()
        with dbWriteLock:
            cursor.execute("PRAGMA journal_mode = WAL")
            db.commit()

        for d in data:
            # Pre-bind so the error message below works even if the split fails.
            mlbId = year = None
            try:
                mlbId, year = d.split(",")
                # `year` is a string here; compare as int so this branch is reachable.
                if int(year) == CURRENT_SEASON:
                    months = cursor.execute(
                        "SELECT DISTINCT Month FROM Player_Hitter_GameLog WHERE mlbId=? AND Year=? ORDER BY Month DESC",
                        (int(mlbId), int(year)),
                    ).fetchall()
                    if len(months) == 0:
                        startMonth = 0
                    else:
                        # Resume with the month after the latest stored one. Starting at
                        # the stored month itself makes GenerateHitterYearGameLogs see
                        # existing rows with Month >= startMonth and return immediately.
                        startMonth = int(months[0][0]) + 1
                    # Nothing to fetch once everything before STOP_MONTH is stored.
                    if startMonth < STOP_MONTH:
                        GenerateHitterYearGameLogs(db, mlbId, year, startMonth, STOP_MONTH)
                else:
                    GenerateHitterYearGameLogs(db, mlbId, year)
            except Exception as e:
                print(f"Error for id={mlbId} year={year}: {e}")
            finally:
                threadCompleteCounts[threadIdx] += 1
    finally:
        # Release the per-thread connection even if setup or the loop fails.
        db.close()
    
        

In [None]:
# Fan the player-year keys out over NUM_THREADS workers and show overall progress.
playerYears = list(playerYearDict.keys())

# One counter per worker; each worker only ever touches its own slot.
threadCompleteCounts = [0] * NUM_THREADS
progressBar = tqdm(total=len(playerYears))

threads = []
for i in range(NUM_THREADS):
    # Contiguous slice i of NUM_THREADS; integer division spreads the remainder.
    sliceStart = len(playerYears) * i // NUM_THREADS
    sliceEnd = len(playerYears) * (i + 1) // NUM_THREADS
    thread = threading.Thread(target=HitterGameLogThreadFunction, args=[playerYears[sliceStart:sliceEnd], i])
    threads.append(thread)
    thread.start()

# Start progress bar
keepTimerRunning = True
progressTimer = None
progressTimerLock = threading.Lock()

def UpdateTimer():
    """Refresh the progress bar from the per-thread counters.

    While keepTimerRunning is True the function re-arms itself to run again in
    5 seconds; when it is False the call only refreshes the bar once.
    """
    global progressTimer
    # The lock makes "check flag + arm timer" atomic with respect to the
    # shutdown code below, so no timer can be armed after it was cancelled.
    with progressTimerLock:
        if keepTimerRunning:
            progressTimer = threading.Timer(5.0, UpdateTimer)
            progressTimer.daemon = True
            progressTimer.start()

    progressBar.n = sum(threadCompleteCounts)
    progressBar.last_print_n = progressBar.n
    progressBar.refresh()

UpdateTimer()

for thread in threads:
    thread.join()

# Stop the periodic refresh, then draw the final state synchronously so it is
# rendered in this cell rather than by a leftover timer up to 5 seconds later.
with progressTimerLock:
    keepTimerRunning = False
    if progressTimer is not None:
        progressTimer.cancel()
UpdateTimer()
progressBar.close()

In [None]:
# Open a connection and a cursor on BaseballStats.db (the same database file
# the worker threads write to).
db = sqlite3.connect("BaseballStats.db")
cursor = db.cursor()