Constant Parameters

In [4]:
START_YEAR = 2005
SPORT_IDS = [1,11,12,13,14,15,16]

Year Parameters to update

In [5]:
CURRENT_SEASON = 2024
ONLY_UPDATE_CURRENT_SEASON = True

Get List of Players to add to database

In [6]:
import requests
import sqlite3

In [7]:
db = sqlite3.connect("BaseballStats.db")
cursor = db.cursor()

In [None]:
db.rollback()
cursor = db.cursor()

for year in range(START_YEAR, CURRENT_SEASON + 1):
    if ONLY_UPDATE_CURRENT_SEASON and year != CURRENT_SEASON:
        continue
    
    for sportId in SPORT_IDS:
        playersToInsert = []
        for position in ["hitting","pitching"]:
            print(f"Getting players for Year={year} SportId={sportId} Position={position}")
            response = requests.get(f"https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season={year}&sportId={sportId}&stats=season&group={position}&gameType=R&limit=5000&offset=0&sortStat=homeRuns&order=desc")
            if response.status_code != 200:
                print(f"Code {response.status_code} for Year={year} and sportId={sportId} and position={position}")
                continue
            
            responseJson = response.json()
            jsonPlayers = responseJson['stats']
            for player in jsonPlayers:
                try:
                    if cursor.execute(f"SELECT COUNT(*) FROM Player WHERE mlbId='{player['playerId']}' AND position='{position}'").fetchone()[0] > 0:
                        continue
                    playersToInsert.append((player["playerId"], position))
                except:
                    pass
                
        cursor.execute("BEGIN TRANSACTION")
        cursor.executemany("INSERT INTO Player('mlbId','position') VALUES(?,?)", playersToInsert)
        cursor.execute("END TRANSACTION")
        db.commit()
        cursor = db.cursor()

Get Player Data for all players

In [8]:
from tqdm import tqdm
import threading

In [None]:
db.rollback()
cursor = db.cursor()

NUM_THREADS = 16
threadOutputs = [[]] * NUM_THREADS
threadCompleteCounts = [0] * NUM_THREADS

unsetPlayers = cursor.execute("SELECT mlbId, position FROM Player WHERE birthYear IS NULL").fetchall()


In [None]:
def ReadPlayer(mlbId, position, threadIdx):
    response = requests.get(f"https://statsapi.mlb.com/api/v1/people/{mlbId}?hydrate=currentTeam,team,stats(type=[yearByYear](team(league)),leagueListId=mlb_milb)&site=en")
    if response.status_code != 200:
        print(f"Status code {response.status_code} for {mlbId} {position}")
        return
    
    try:
        response = response.json()
        person = response["people"][0]
        useFirstName = person["useName"]
        useLastName = person["useLastName"]
        bats = person["batSide"]["code"]
        throws = person["pitchHand"]["code"]
        birthdateFormatted = person["birthDate"]
        birthYear, birthMonth, birthDate = birthdateFormatted.split("-")
        global threadOutputs
        threadOutputs[threadIdx].append((mlbId, position, useFirstName, useLastName, bats, throws, birthYear, birthMonth, birthDate))
        
    except Exception as e:
        print(f"Exception {e} for {mlbId}")
        return
    
def ReadPlayers(threadIdx):
    global unsetPlayers
    global threadCompleteCounts
    for mlbId, position in unsetPlayers[threadIdx * len(unsetPlayers) // NUM_THREADS : (threadIdx + 1) * len(unsetPlayers) // NUM_THREADS]:
        ReadPlayer(mlbId, position, threadIdx)
        threadCompleteCounts[threadIdx] += 1

In [None]:
threads = []
for i in range(NUM_THREADS):
    thread = threading.Thread(target=ReadPlayers, args=[i])
    threads.append(thread)
    thread.start()
    
progressBar = tqdm(total=len(unsetPlayers))

# Start progress bar
keepTimerRunning = True
def UpdateTimer():
    if keepTimerRunning:
        threading.Timer(5.0, UpdateTimer).start()
    
    count = 0
    global threadCompleteCounts
    for i in range(NUM_THREADS):
        count += threadCompleteCounts[i]
    
    global progressBar
    progressBar.n = count
    progressBar.last_print_n = progressBar.n
    progressBar.refresh()
    
UpdateTimer()

for thread in threads:
    thread.join()
    
keepTimerRunning = False

In [None]:
db.rollback()
cursor = db.cursor()
cursor.execute("BEGIN TRANSACTION")

for threadOutput in threadOutputs:
    for mlbId, position, useFirstName, useLastName, bats, throws, birthYear, birthMonth, birthDate in threadOutput:
    # for data in threadOutput:
        # cursor.execute(f"UPDATE Player SET birthYear='{birthYear}', birthMonth='{birthMonth}', birthDate='{birthDate}', bats='{bats}', throws='{throws}', useFirstName='{useFirstName}', useLastName='{useLastName}' WHERE mlbId='{mlbId}' AND position='{position}'")
        cursor.execute("UPDATE Player SET birthYear=?, birthMonth=?, birthDate=?, bats=?, throws=?, useFirstName=?,  useLastName=? WHERE mlbId=? AND position=?", (birthYear, birthMonth, birthDate, bats, throws, useFirstName, useLastName, mlbId, position))
cursor.execute("END TRANSACTION")
db.commit()

Calculate Draft Position

In [None]:
db.rollback()
cursor = db.cursor()
cursor.execute("BEGIN TRANSACTION")

for year in tqdm(range(2001, CURRENT_SEASON + 1), desc="Drafts"):
    response = requests.get(f"https://statsapi.mlb.com/api/v1/draft/{year}")
    json = response.json()
    for rounds in json["drafts"]["rounds"]:
        for pick in rounds["picks"]:
            try:
                draftPick = pick["pickNumber"]
                mlbId = pick["person"]["id"]
                cursor.execute("UPDATE Player SET draftPick=?, signingYear=?, signingMonth='7', signingDate='1' WHERE mlbId=?", (draftPick, year, mlbId))
            except:
                continue
    
cursor.execute("END TRANSACTION")
db.commit()

Generate Game Log Data

In [9]:
dbWriteLock = threading.Lock()

In [18]:
def GenerateHitterYearGameLogs(db, mlbId, year, startMonth=0, endMonth=13):
    cursor = db.cursor()
    # Check if data already exists
    if cursor.execute(f"SELECT COUNT(*) FROM Player_Hitter_GameLog WHERE mlbId='{mlbId}' AND Year='{year}' AND Month >='{startMonth}'").fetchone()[0] > 0:
        return
    
    gameLogs = []
    response = requests.get(f"https://statsapi.mlb.com/api/v1/people/{mlbId}/stats?stats=gameLog&leagueListId=mlb_milb&group=hitting&gameType=R&sitCodes=1,2,3,4,5,6,7,8,9,10,11,12&hydrate=team&language=en&season={year}")
    if response.status_code != 200:
        print(f"Status code {response.status_code} for id={mlbId} year={year}")
        return
    try:
        games = response.json()["stats"][0]["splits"]
        for game in games:
            _, month, day = game["date"].split("-")
            if int(month) < int(startMonth) or int(month) >= int(endMonth):
                continue
            
            gameId = int(game["game"]["gamePk"])
            if len(game["positionsPlayed"] > 0):
                pos = int(game["positionsPlayed"][0]["code"])
            else:
                pos = 10
            isHomeGame = int(game["isHome"])
            if isHomeGame:
                homeTeamId = int(game["team"]["id"])
            else:
                homeTeamId = int(game["opponent"]["id"])
            level = int(game["sport"]["id"])
            stats = game["stat"]
            ab = int(stats["atBats"])
            h = int(stats["hits"])
            double = int(stats["doubles"])
            triple = int(stats["triples"])
            hr = int(stats["homeRuns"])
            k = int(stats["strikeOuts"])
            bb = int(stats["baseOnBalls"])
            hbp = int(stats["hitByPitch"])
            sb = int(stats["stolenBases"])
            cs = int(stats["caughtStealing"])
            gameLogs.append((gameId, int(mlbId), int(day), int(month), int(year), ab, h, double, triple, hr, k, bb, sb, cs, hbp, pos, level, homeTeamId))
            
        with dbWriteLock:
            cursor.execute("BEGIN TRANSACTION")
            cursor.executemany("INSERT INTO Player_Hitter_GameLog('gameId', 'mlbId', 'Day', 'Month','Year','AB','H','2B','3B','HR','K','BB','SB','CS','HBP','Position','Level','HomeTeamId') VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)", gameLogs)
            cursor.execute("END TRANSACTION")
            db.commit()
            
    except Exception as e:
        print(f"Exception On Id={mlbId} year={year}: {e}")
        return

In [11]:
ONLY_UPDATE_CURRENT_SEASON = False
STOP_MONTH = 8

In [13]:
# Create dictionary of player ids-year combination
# Don't need any data, just a lookup table
playerYearDict = {}
for year in range(START_YEAR, CURRENT_SEASON + 1):
    if ONLY_UPDATE_CURRENT_SEASON and year != CURRENT_SEASON:
            continue

    for sportId in SPORT_IDS:
        print(f"Getting players for Year={year} SportId={sportId}")
        response = requests.get(f"https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season={year}&sportId={sportId}&stats=season&group=hitting&gameType=R&limit=5000&offset=0&sortStat=homeRuns&order=desc")
        if response.status_code != 200:
            print(f"Code {response.status_code} for Year={year} and sportId={sportId} and position={position}")
            continue

        responseJson = response.json()
        jsonPlayers = responseJson['stats']
        for player in jsonPlayers:
            playerString = str(player["playerId"]) + "," + str(year)
            playerYearDict[playerString] = None

Getting players for Year=2005 SportId=1
Getting players for Year=2005 SportId=11
Getting players for Year=2005 SportId=12
Getting players for Year=2005 SportId=13
Getting players for Year=2005 SportId=14
Getting players for Year=2005 SportId=15
Getting players for Year=2005 SportId=16
Getting players for Year=2006 SportId=1
Getting players for Year=2006 SportId=11
Getting players for Year=2006 SportId=12
Getting players for Year=2006 SportId=13
Getting players for Year=2006 SportId=14
Getting players for Year=2006 SportId=15
Getting players for Year=2006 SportId=16
Getting players for Year=2007 SportId=1
Getting players for Year=2007 SportId=11
Getting players for Year=2007 SportId=12
Getting players for Year=2007 SportId=13
Getting players for Year=2007 SportId=14
Getting players for Year=2007 SportId=15
Getting players for Year=2007 SportId=16
Getting players for Year=2008 SportId=1
Getting players for Year=2008 SportId=11
Getting players for Year=2008 SportId=12
Getting players for 

In [14]:
NUM_THREADS = 16

In [17]:
def HitterGameLogThreadFunction(data, threadIdx):
    db = sqlite3.connect("BaseballStats.db")
    cursor = db.cursor()
    with dbWriteLock:
        cursor.execute("PRAGMA journal_mode = WAL")
        db.commit()
    cursor = db.cursor()
    global threadCompleteCounts
    for d in data:
        try:
            mlbId, year = d.split(",")
            if year == CURRENT_SEASON:
                months = cursor.execute(f"SELECT DISTINCT Month FROM Player_Hitter_GameLog WHERE mlbId='{mlbId}' AND Year='{year}' ORDER BY Month DESC").fetchall()
                if len(months) == 0:
                    startMonth = 0
                else:
                    startMonth = months[0][0]
                GenerateHitterYearGameLogs(db, mlbId, year, startMonth, STOP_MONTH)
            else:
                GenerateHitterYearGameLogs(db, mlbId, year)
        except Exception as e:
            print(f"Error for id={mlbId} year={year}: {e}")
        finally:
            threadCompleteCounts[threadIdx] += 1
    
        

In [16]:
playerYears = list(playerYearDict.keys())

threads = []
threadCompleteCounts = [0] * NUM_THREADS
for i in range(NUM_THREADS):
    thread = threading.Thread(target=HitterGameLogThreadFunction, args=[playerYears[len(playerYears) * i // NUM_THREADS : len(playerYears) * (i + 1) // NUM_THREADS], i])
    threads.append(thread)
    thread.start()
    
progressBar = tqdm(total=len(playerYears))

# Start progress bar
keepTimerRunning = True
def UpdateTimer():
    if keepTimerRunning:
        threading.Timer(5.0, UpdateTimer).start()
    
    count = 0
    global threadCompleteCounts
    for i in range(NUM_THREADS):
        count += threadCompleteCounts[i]
    
    global progressBar
    progressBar.n = count
    progressBar.last_print_n = progressBar.n
    progressBar.refresh()
    
UpdateTimer()

for thread in threads:
    thread.join()
    
keepTimerRunning = False

  0%|          | 0/26260 [00:00<?, ?it/s]Exception in thread Thread-19 (HitterGameLogThreadFunction):
Traceback (most recent call last):
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\threading.py", line 1045, in _bootstrap_inner
  0%|          | 0/26260 [00:00<?, ?it/s]    self.run()
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\nitzr\AppData\Local\Temp\ipykernel_27616\2342525845.py", line 4, in HitterGameLogThreadFunction
sqlite3.OperationalError: database is locked
 15%|█▍        | 3828/26260 [02:55<17:07, 21.84it/s] 

Exception On Id=517388: list index out of range


 24%|██▍       | 6248/26260 [06:00<19:14, 17.33it/s]

Exception On Id=593755: list index out of range


 29%|██▉       | 7573/26260 [06:45<16:40, 18.67it/s]

Exception On Id=452034: list index out of range
Exception On Id=450240: list index out of range


 40%|████      | 10634/26260 [10:40<15:41, 16.59it/s]

Exception On Id=516755: list index out of range


 42%|████▏     | 11003/26260 [11:11<15:30, 16.40it/s]

Exception On Id=593700: list index out of range


 43%|████▎     | 11372/26260 [11:41<15:17, 16.22it/s]

Exception On Id=594025: list index out of range


 44%|████▍     | 11494/26260 [11:51<15:13, 16.16it/s]

Exception On Id=593696: list index out of range


 51%|█████     | 13361/26260 [13:26<12:58, 16.57it/s]

Exception On Id=593757: list index out of range


 53%|█████▎    | 13890/26260 [14:11<12:38, 16.32it/s]

Exception On Id=594024: list index out of range


 58%|█████▊    | 15218/26260 [16:01<11:37, 15.83it/s]

Exception On Id=593698: list index out of range


 60%|█████▉    | 15689/26260 [16:41<11:14, 15.66it/s]

Exception On Id=594196: list index out of range


 65%|██████▍   | 17036/26260 [18:36<10:04, 15.25it/s]

Exception On Id=516692: list index out of range
Exception On Id=593699: list index out of range


 69%|██████▉   | 18236/26260 [19:51<08:44, 15.30it/s]

Exception On Id=457858: list index out of range


 90%|█████████ | 23756/26260 [30:02<03:10, 13.18it/s]

Exception On Id=450648: list index out of range


 94%|█████████▎| 24613/26260 [33:23<02:14, 12.29it/s]