In [26]:
import requests

In [27]:
# Upper bound of the season ranges built with range(2005, END_YEAR) further down;
# because range() is exclusive, END_YEAR itself is not included in those loops.
END_YEAR = 2024
# Gate for the threaded scraping cell: the worker threads are only started when this is True.
GENERATE_PARK_RUN_DATA = False

In [28]:
# Module-level accumulator dict.
# NOTE(review): LogYearLevel builds its own local dict of the same name and the
# helpers receive it as a parameter, so this global looks unused in the visible code.
teamRunCounts = {}

In [29]:
import time

In [30]:
def CheckForKeysAndAddIfNecessary(t, l, year, level, teamRunCounts):
    """Make sure ``teamRunCounts[l][t][year]`` exists, creating it if needed.

    Missing league and team dictionaries are created on the way down. A newly
    created season entry starts with zeroed ``home`` and ``away`` counters and
    records the level, league and team it belongs to. Existing entries are
    left untouched.

    Args:
        t: Team identifier (second-level key).
        l: League identifier (top-level key).
        year: Season key (third-level key).
        level: Level identifier stored on a newly created entry.
        teamRunCounts: Nested dict that is mutated in place.
    """
    seasonsForTeam = teamRunCounts.setdefault(l, {}).setdefault(t, {})
    if year in seasonsForTeam:
        return
    seasonsForTeam[year] = {
        "home": {"innings": 0, "runs": 0, "hrs": 0},
        "away": {"innings": 0, "runs": 0, "hrs": 0},
        "level": level,
        "league": l,
        "team": t,
    }


def GetGameRequest(id, year, level, teamRunCounts, session, repeat=True):
    """Fetch one game's live feed and fold its totals into ``teamRunCounts``.

    The game's combined runs, combined home runs and ``currentInning`` value
    are added to the home team's ``"home"`` bucket and to the away team's
    ``"away"`` bucket for ``year``. Both teams are filed under the *home*
    team's league id.

    Args:
        id: Game primary key used in the feed URL (the name shadows the builtin
            but is kept because it is part of the call signature).
        year: Key under which the season is stored in ``teamRunCounts``.
        level: Level identifier recorded on newly created entries.
        teamRunCounts: Nested ``league -> team -> year`` dict, mutated in place.
        session: Object exposing ``get(url)`` (e.g. a ``requests.Session``).
        repeat: When true, a non-200 response is retried once after a one
            second pause.

    Returns:
        None. Games without linescore data, or whose payload cannot be parsed,
        are skipped silently.
    """
    req = session.get(f'https://ws.statsapi.mlb.com/api/v1.1/game/{id}/feed/live')
    if req.status_code != 200:
        #print(f"Incorrect Code: {req.status_code} : L{level} Y{year} {req.content}")
        time.sleep(1)
        if repeat:
            # Single retry: the nested call passes repeat=False so it cannot recurse further.
            GetGameRequest(id, year, level, teamRunCounts, session, False)
        return

    try:
        data = req.json()
        htId = data["gameData"]["teams"]["home"]["id"]
        atId = data["gameData"]["teams"]["away"]["id"]
        leagueId = data["gameData"]["teams"]["home"]["league"]["id"]

        linescore = data["liveData"]["linescore"]
        if "currentInning" not in linescore:
            return
        innings = linescore["currentInning"]

        # Check if any data on home exists.  Will be empty if game is cancelled
        lineTeams = linescore["teams"]
        if "runs" not in lineTeams["home"] or "runs" not in lineTeams["away"]:
            return
        runs = lineTeams["home"]["runs"] + lineTeams["away"]["runs"]

        boxTeams = data["liveData"]["boxscore"]["teams"]
        hrs = (boxTeams["away"]["teamStats"]["batting"]["homeRuns"]
               + boxTeams["home"]["teamStats"]["batting"]["homeRuns"])

        # Everything needed has been read before any counter is touched, so a
        # malformed payload cannot leave a half-recorded game behind.
        CheckForKeysAndAddIfNecessary(htId, leagueId, year, level, teamRunCounts)
        homeBucket = teamRunCounts[leagueId][htId][year]["home"]
        homeBucket["innings"] += innings
        homeBucket["runs"] += runs
        homeBucket["hrs"] += hrs

        CheckForKeysAndAddIfNecessary(atId, leagueId, year, level, teamRunCounts)
        awayBucket = teamRunCounts[leagueId][atId][year]["away"]
        awayBucket["innings"] += innings
        awayBucket["runs"] += runs
        awayBucket["hrs"] += hrs
    except Exception:
        # Best effort: a game with an unexpected payload is skipped. Unlike a
        # bare ``except``, this lets KeyboardInterrupt/SystemExit propagate.
        return

In [31]:
def GetGames(leagueId, year, maxRetries=None, retryDelay=1.0):
    """Return every game listed in the schedule for one sport id and year.

    The schedule endpoint is queried for ``{year}-01-01`` through
    ``{year}-12-30`` and the ``games`` lists of all returned dates are
    flattened into one list.

    A non-200 response is reported with ``print`` and the request is retried.
    Retrying is done in a loop with a pause between attempts rather than by
    recursion, so a long outage neither floods the server nor ends in
    ``RecursionError``.

    Args:
        leagueId: Value sent as the ``sportId`` query parameter.
        year: Season whose schedule is requested.
        maxRetries: Maximum number of retries after the first attempt, or
            ``None`` (default) to keep retrying until a 200 response arrives.
        retryDelay: Seconds to wait between attempts.

    Returns:
        list: The game dictionaries from the schedule response.

    Raises:
        RuntimeError: If ``maxRetries`` is set and every attempt failed.
    """
    # NOTE(review): the end date is 12-30, so 31 December is not queried — confirm this is intended.
    url = f"https://ws.statsapi.mlb.com/api/v1/schedule/games/?sportId={leagueId}&startDate={year}-01-01&endDate={year}-12-30"
    retries = 0
    while True:
        req = requests.get(url)
        if req.status_code == 200:
            data = req.json()
            return [g for date in data["dates"] for g in date["games"]]

        print(f"Incorrect Code: {req.status_code} : L{leagueId} Y{year} {req.content}")
        if maxRetries is not None and retries >= maxRetries:
            raise RuntimeError(
                f"Schedule request failed after {retries + 1} attempts: L{leagueId} Y{year}"
            )
        retries += 1
        time.sleep(retryDelay)

In [32]:
from tqdm import tqdm

In [33]:
import sqlite3

In [34]:
import threading

In [35]:
# Held by UpdateDatabase for the whole of its database write.
dbWriteLock = threading.Lock()
# Held by CompleteThread while it resets a thread's slot in threadProcess.
UpdateProcessLock = threading.Lock()
# Held by CompleteThread while it increments completedThreads.
CompleteProcessLock = threading.Lock()
# Held by GetNextYearLevel while it reads and advances the year/level cursor.
GetNewDataLock = threading.Lock()

In [36]:
def UpdateDatabase(db, teamRunCounts):
    """Insert the accumulated run counts into ``Park_Run_Factors``.

    One row is written per ``(league, team, year)`` entry of ``teamRunCounts``.
    An entry is skipped when the table already holds a row for that team and
    year at the time the data is collected. The whole write happens under
    ``dbWriteLock`` inside a single explicit transaction.

    Args:
        db: Open ``sqlite3`` connection.
        teamRunCounts: Nested ``league -> team -> year`` dict whose leaves have
            ``"home"``/``"away"`` counters and a ``"level"`` value.

    Raises:
        Exception: Any error raised while reading or writing is re-raised
            after the open transaction has been rolled back.
    """
    with dbWriteLock:
        # Presumably clears any transaction still open on this connection so the
        # explicit BEGIN below cannot fail with a nested-transaction error.
        db.rollback()
        cursor = db.cursor()
        cursor.execute("BEGIN TRANSACTION")
        try:
            insertionData = []
            for leagueId, teamData in teamRunCounts.items():
                for teamid, yearData in teamData.items():
                    for year, parkData in yearData.items():
                        # Bound as text so SQLite compares exactly as it did with
                        # the previous quoted literals, without string-built SQL.
                        rowCount = cursor.execute(
                            "SELECT COUNT(*) FROM Park_Run_Factors WHERE TeamId=? AND Year=?",
                            (str(teamid), str(year)),
                        ).fetchone()[0]
                        if rowCount > 0:
                            continue
                        home = parkData["home"]
                        away = parkData["away"]
                        insertionData.append([
                            teamid, year, parkData["level"], leagueId,
                            home["innings"], home["runs"], home["hrs"],
                            away["innings"], away["runs"], away["hrs"],
                        ])
            cursor.executemany("INSERT INTO Park_Run_Factors('TeamId','Year','LevelId','LeagueId','HomeInnings','HomeRuns','HomeHRs','AwayInnings','AwayRuns','AwayHRs') VALUES (?,?,?,?,?,?,?,?,?,?)", insertionData)
            cursor.execute("END TRANSACTION")
        except Exception:
            # Do not leave a half-finished transaction behind on a shared connection.
            db.rollback()
            raise
        db.commit()

In [37]:
# Number of worker threads started by the scraping cell.
NUM_THREADS = 16
# One slot per thread id; LogYearLevel stores the fraction of its game list processed so far.
threadProcess = [0] * NUM_THREADS
# Incremented by CompleteThread each time a job is reported finished.
completedThreads = 0
# Thread objects created by the scraping cell, kept so they can be joined.
threads = []

In [38]:
# Seasons to scrape; range() excludes END_YEAR itself.
years = range(2005, END_YEAR)
# Level identifiers to scrape; GetGames sends each one as the sportId query parameter.
levels = [1,11,12,13,14,15,16]
# Cursor into years/levels, advanced by GetNextYearLevel.
# nextYearIdx is set to -1 once every (year, level) pair has been handed out.
nextYearIdx = 0
nextLevelIdx = 0

In [39]:
# Resolution of the progress bar: UpdateProgressBar maps overall progress onto 0..totalSteps.
totalSteps = 10000
progressBar = tqdm(total=totalSteps)

  0%|          | 0/10000 [07:40<?, ?it/s]


In [40]:
def UpdateProgressBar():
    """Redraw the progress bar from the shared progress counters.

    Progress is the number of completed jobs plus each thread's partial
    progress, divided by the number of (year, level) combinations, and is
    then scaled to ``totalSteps``.
    """
    unitsDone = completedThreads
    for partial in threadProcess:
        unitsDone = unitsDone + partial
    unitsTotal = len(years) * len(levels)
    position = int(unitsDone / unitsTotal * totalSteps)
    # Setting last_print_n alongside n mirrors the manual-update pattern of the original code.
    progressBar.n = position
    progressBar.last_print_n = position
    progressBar.refresh()

In [41]:
def GetNextYearLevel():
    """Hand out the next ``(year, level)`` pair to scrape.

    Pairs are produced year by year, cycling through every level before
    moving on to the next year. The shared cursor is read and advanced while
    holding ``GetNewDataLock``.

    Returns:
        tuple: ``(year, level)``, or ``(-1, -1)`` once all pairs are used up.
    """
    global nextYearIdx, nextLevelIdx
    with GetNewDataLock:
        if nextYearIdx == -1:
            # Data has been exhausted
            return (-1, -1)

        pair = (years[nextYearIdx], levels[nextLevelIdx])

        nextLevelIdx += 1
        if nextLevelIdx < len(levels):
            return pair

        # Wrapped past the last level: move on to the next year.
        nextLevelIdx = 0
        nextYearIdx += 1
        if nextYearIdx >= len(years):
            nextYearIdx = -1
        return pair

In [42]:
def CompleteThread(threadId):
    """Record that the job running on ``threadId`` has finished.

    The thread's partial-progress slot is reset to 0 and the shared count of
    completed jobs is incremented, each under its own lock.

    Args:
        threadId: Index of the thread's slot in ``threadProcess``.
    """
    global completedThreads

    # Item assignment mutates the shared list in place.
    with UpdateProcessLock:
        threadProcess[threadId] = 0

    with CompleteProcessLock:
        completedThreads = completedThreads + 1
    

def LogYearLevel(db, threadId, year, level):
    """Scrape one (year, level) pair and store its run counts.

    If ``Park_Run_Factors`` already has rows for the year and level, the job
    is reported complete through ``CompleteThread`` and nothing is fetched.
    Otherwise every game returned by ``GetGames`` is fetched with
    ``GetGameRequest`` and the totals are written with ``UpdateDatabase``.

    Args:
        db: Open ``sqlite3`` connection, or ``None`` to open (and afterwards
            close) a connection to ``BaseballStats.db`` for this call.
        threadId: Index of this worker's slot in ``threadProcess``.
        year: Season to scrape.
        level: Level identifier, passed to ``GetGames`` and stored with the data.
    """
    ownsConnection = db is None
    if ownsConnection:
        # Create DB Connection
        db = sqlite3.connect("BaseballStats.db")
        cursor = db.cursor()
        cursor.execute("PRAGMA journal_mode = WAL")
        db.commit()

    try:
        # Check if data already exists. Values are bound as text so SQLite
        # compares them exactly as it did with the previous quoted literals.
        cursor = db.cursor()
        existingRows = cursor.execute(
            "SELECT COUNT(*) FROM Park_Run_Factors WHERE Year=? AND LevelId=?",
            (str(year), str(level)),
        ).fetchone()[0]
        if existingRows > 0:
            CompleteThread(threadId)
            return

        teamRunCounts = {}
        games = GetGames(level, year)
        yearString = str(year)
        # The session is closed once all games have been requested.
        with requests.Session() as session:
            for k, game in enumerate(games):
                try:
                    GetGameRequest(game["gamePk"], yearString, level, teamRunCounts, session)
                finally:
                    # Progress is updated even when a request raises.
                    threadProcess[threadId] = k / len(games)
        print(f"Completed Year: {year} Level: {level}")
        UpdateDatabase(db, teamRunCounts)

        # NOTE: chaining on to the next pair is currently disabled, so each
        # call handles exactly one (year, level):
        # CompleteThread(threadId)
        # year, level = GetNextYearLevel()
        # if year != -1:
        #     LogYearLevel(db, threadId, year, level)
    finally:
        # Only close a connection this call opened; a caller-supplied one stays open.
        if ownsConnection:
            db.close()

In [43]:
# Threaded scrape: runs only when GENERATE_PARK_RUN_DATA is True.
if GENERATE_PARK_RUN_DATA:
    # Create Threads: one worker per thread id, each started on the next (year, level) pair.
    # NOTE(review): the chaining at the end of LogYearLevel is commented out, so each
    # worker processes only the single pair it is started with.
    for i in range(NUM_THREADS):
        year, level = GetNextYearLevel()
        thread = threading.Thread(target=LogYearLevel, args=[None, i, year, level])
        threads.append(thread)
        thread.start()
        
    # Start progress bar
    keepTimerRunning = True
    def UpdateTimer():
        # Re-arm a 5 second timer while the flag is set, then redraw the bar.
        if keepTimerRunning:
            threading.Timer(5.0, UpdateTimer).start()
        UpdateProgressBar()
        
    UpdateTimer()

    # Wait for every worker, then stop the timer from re-arming itself.
    for thread in threads:
        thread.join()
    keepTimerRunning = False

In [51]:
# Manual single-threaded runs for individual (year, level) pairs. Passing None
# as db makes LogYearLevel open its own connection; thread slot 0 is used.
# LogYearLevel(None, 0, 2008, 14)
LogYearLevel(None, 0, 2014, 1)
LogYearLevel(None, 0, 2016, 1)

Completed Year: 2014 Level: 1
Completed Year: 2016 Level: 1


Calculate Park Factors

In [56]:
# Number of seasons (the current one included) summed for each park factor.
ROLLING_PERIOD = 3
# Minimum summed home and away innings required before a factor is computed.
INNING_CUTOFF = 100

db = sqlite3.connect("BaseballStats.db")
cursor = db.cursor()
teams = cursor.execute("SELECT DISTINCT TeamId FROM Park_Run_Factors").fetchall()

for year in range(2005, END_YEAR):
    for team in teams:
        team = team[0]
        # Check if data already exists
        if cursor.execute(f"SELECT COUNT(*) FROM Park_Factors WHERE year='{year}' AND TeamId='{team}'").fetchone()[0] > 0:
            continue

        # Columns are selected by name instead of SELECT * so the positions used
        # below cannot drift from the table definition. The INSERT in
        # UpdateDatabase lists LevelId before LeagueId, so positional access to
        # "league at [2], level at [3]" was not safe.
        # Row layout: 0 Year, 1 LeagueId, 2 LevelId, 3 HomeInnings, 4 HomeRuns,
        #             5 HomeHRs, 6 AwayInnings, 7 AwayRuns, 8 AwayHRs
        parkRunData = cursor.execute(f"SELECT Year, LeagueId, LevelId, HomeInnings, HomeRuns, HomeHRs, AwayInnings, AwayRuns, AwayHRs FROM Park_Run_Factors WHERE TeamId='{team}' AND year > '{year - ROLLING_PERIOD}' AND year <= '{year}' ORDER BY Year DESC, HomeInnings DESC").fetchall()
        # Check if data exists
        if len(parkRunData) == 0:
            continue

        # Make sure there is some data from the current year
        if parkRunData[0][0] != year:
            continue

        # Get current league from first entry (most recent year, most home innings)
        leagueId = parkRunData[0][1]
        levelId = parkRunData[0][2]

        awayInnings = 0
        awayRuns = 0
        awayHRs = 0
        homeInnings = 0
        homeRuns = 0
        homeHRs = 0
        for data in parkRunData:
            homeInnings += data[3]
            homeRuns += data[4]
            homeHRs += data[5]
            awayInnings += data[6]
            awayRuns += data[7]
            awayHRs += data[8]

        # Ensure enough data to actually calculate park factors
        if awayInnings < INNING_CUTOFF or homeInnings < INNING_CUTOFF:
            continue

        # The away rates are the denominators; a zero there cannot produce a factor.
        if awayRuns == 0 or awayHRs == 0:
            continue

        runFactor = (homeRuns / homeInnings) / (awayRuns / awayInnings)
        hrFactor = (homeHRs / homeInnings) / (awayHRs / awayInnings)
        cursor.execute(
            "INSERT INTO Park_Factors('TeamId','LeagueId','LevelId','Year','RunFactor','HRFactor') VALUES(?,?,?,?,?,?)",
            (team, leagueId, levelId, year, runFactor, hrFactor),
        )
        db.commit()

Calculate Level Factors

In [58]:
# For every year present in Park_Factors, compare each level's home run-scoring
# and home-run rates with the combined rate of all included levels and store
# the ratios in Level_Factors.
db.rollback()
cursor = db.cursor()

for year in cursor.execute("SELECT DISTINCT Year FROM Park_Factors").fetchall():
    year = year[0]
    data = []
    totalInnings = 0
    totalRuns = 0
    totalHrs = 0
    for level in cursor.execute(f"SELECT DISTINCT LevelId FROM Park_Factors WHERE Year='{year}'").fetchall():
        level = level[0]
        # Covid check: in 2020 only level 1 is used
        if year == 2020 and level != 1:
            continue
        # Level 15 is excluded after 2020 (presumably no longer played — confirm)
        if year > 2020 and level == 15:
            continue
        innings, runs, hrs = cursor.execute(f"SELECT SUM(HomeInnings), SUM(HomeRuns), SUM(HomeHRs) FROM Park_Run_Factors WHERE Year='{year}' AND LevelId='{level}'").fetchone()

        # SUM() yields NULL when no rows match, and zero innings cannot form a rate.
        if not innings:
            continue

        data.append((level, runs/innings, hrs/innings))
        totalInnings += innings
        totalRuns += runs
        totalHrs += hrs

    # Nothing usable for this year (e.g. every level was skipped), or a zero
    # baseline that would make the ratios below undefined.
    if totalInnings == 0 or totalRuns == 0 or totalHrs == 0:
        continue

    baseRunFactor = totalRuns / totalInnings
    baseHRFactor = totalHrs / totalInnings
    cursor.execute("BEGIN TRANSACTION")
    for d in data:
        # Only insert levels that do not have a row for this year yet.
        if cursor.execute(f"SELECT COUNT(*) FROM Level_Factors WHERE LevelId='{d[0]}' AND Year='{year}'").fetchone()[0] == 0:
            cursor.execute("INSERT INTO Level_Factors('LevelId','Year','RunFactor','HRFactor') VALUES(?,?,?,?)", [d[0], year, d[1] / baseRunFactor, d[2] / baseHRFactor])
    cursor.execute("END TRANSACTION")
    db.commit()
    cursor = db.cursor()

Calculate League Factors

In [59]:
# For every year, compare each league's home run-scoring and home-run rates
# with the all-league rate, divide out the league's level factor, and store
# the result in League_Factors.
leagues = cursor.execute("SELECT DISTINCT LeagueId FROM Park_Factors").fetchall()
db.rollback()
cursor = db.cursor()

for year in range(2005, END_YEAR):
    # Check if data already exists for year
    if cursor.execute(f"SELECT COUNT(*) FROM League_Factors WHERE Year='{year}'").fetchone()[0] > 0:
        continue

    # Get data for each league
    yearlyLeagueData = []
    for league in leagues:
        league = league[0]

        data = cursor.execute(f"SELECT HomeInnings, HomeRuns, HomeHRs, LevelId FROM Park_Run_Factors WHERE LeagueId='{league}' AND Year='{year}'").fetchall()
        homeInnings = 0
        homeRuns = 0
        homeHRs = 0
        for innings, runs, hrs, _ in data:
            homeInnings += innings
            homeRuns += runs
            homeHRs += hrs

        # The league's level is taken from its first row.
        if len(data) > 0:
            yearlyLeagueData.append((league, homeInnings, homeRuns, homeHRs, data[0][3]))

    # Get Average of all leagues
    innings = 0
    runs = 0
    hrs = 0
    for _, inn, r, hr, _ in yearlyLeagueData:
        innings += inn
        runs += r
        hrs += hr

    # Normalize each league to the average
    cursor.execute("BEGIN TRANSACTION")
    dbData = []
    for leagueId, leagueInnings, leagueRuns, leagueHRs, levelId in yearlyLeagueData:
        if leagueId == 120:
            print(f"Innings={leagueInnings} Year={year}")

        # Too little data: store neutral factors. No level factor is needed for this.
        if leagueInnings <= INNING_CUTOFF:
            dbData.append((leagueId, year, 1, 1))
            continue

        levelFactors = cursor.execute(f"SELECT RunFactor, HRFactor FROM Level_Factors WHERE LevelId='{levelId}' AND Year='{year}'").fetchone()
        if levelFactors is None:
            # No Level_Factors row for this level and year, so the league cannot
            # be normalised; skip it instead of failing inside the transaction.
            continue
        levelRunFactor, levelHRFactor = levelFactors
        dbData.append((leagueId, year, (leagueRuns / leagueInnings)/(runs / innings)/levelRunFactor, (leagueHRs / leagueInnings)/(hrs / innings)/levelHRFactor))

    cursor.executemany("INSERT INTO League_Factors('LeagueId','Year','RunFactor','HRFactor') VALUES(?,?,?,?)", dbData)
    cursor.execute("END TRANSACTION")
    db.commit()