In [20]:
import requests

In [53]:
# Exclusive upper bound of the seasons processed: every loop in this notebook
# iterates range(2005, END_YEAR), so END_YEAR itself is not included.
END_YEAR = 2024

In [21]:
# Module-level run-count accumulator.
# NOTE(review): LogYearLevel builds its own local dict with the same name and
# the helper functions receive it as a parameter, so this global looks unused
# by the visible cells - confirm before relying on it.
teamRunCounts = {}

In [22]:
import time

In [23]:
def CheckForKeysAndAddIfNecessary(t, l, year, level, teamRunCounts):
    """Make sure teamRunCounts[l][t][year] exists, creating it if needed.

    Args:
        t: Team id (second-level key).
        l: League id (top-level key).
        year: Season key (third-level key).
        level: Value stored under "level" when a new entry is created.
        teamRunCounts: Nested dict that is updated in place.

    Existing entries are left untouched; a new entry starts with zeroed
    "home" and "away" counters for innings, runs and hrs.
    """
    seasons = teamRunCounts.setdefault(l, {}).setdefault(t, {})
    if year in seasons:
        return
    seasons[year] = {
        "home": {"innings": 0, "runs": 0, "hrs": 0},
        "away": {"innings": 0, "runs": 0, "hrs": 0},
        "level": level,
    }


def GetGameRequest(id, year, level, teamRunCounts, session, repeat=True):
    """Fetch one game's live feed and add its totals to teamRunCounts.

    Args:
        id: Game identifier interpolated into the feed URL.
        year: Season key used inside teamRunCounts.
        level: Level value recorded when a new team/year entry is created.
        teamRunCounts: Nested dict updated in place.
        session: Object exposing ``get(url)`` (a requests session in this file).
        repeat: When True, a failed request is retried once after a pause.

    The combined runs and home runs of both teams, plus the game's current
    inning count, are added to the home team's "home" bucket and to the away
    team's "away" bucket. Both teams are filed under the home team's league
    id. Games without a current inning or without run totals are skipped.
    """
    response = session.get(f'https://ws.statsapi.mlb.com/api/v1.1/game/{id}/feed/live')
    if response.status_code != 200:
        # Back off briefly, then retry at most once.
        time.sleep(1)
        if repeat:
            GetGameRequest(id, year, level, teamRunCounts, session, False)
        return

    feed = response.json()
    gameTeams = feed["gameData"]["teams"]
    homeId = gameTeams["home"]["id"]
    awayId = gameTeams["away"]["id"]
    leagueId = gameTeams["home"]["league"]["id"]

    linescore = feed["liveData"]["linescore"]
    if "currentInning" not in linescore:
        return
    innings = linescore["currentInning"]

    # Run totals are absent when the game was cancelled.
    lineTeams = linescore["teams"]
    if "runs" not in lineTeams["home"] or "runs" not in lineTeams["away"]:
        return
    runs = lineTeams["home"]["runs"] + lineTeams["away"]["runs"]

    boxTeams = feed["liveData"]["boxscore"]["teams"]
    hrs = (boxTeams["away"]["teamStats"]["batting"]["homeRuns"]
           + boxTeams["home"]["teamStats"]["batting"]["homeRuns"])

    for teamId, side in ((homeId, "home"), (awayId, "away")):
        CheckForKeysAndAddIfNecessary(teamId, leagueId, year, level, teamRunCounts)
        bucket = teamRunCounts[leagueId][teamId][year][side]
        bucket["innings"] += innings
        bucket["runs"] += runs
        bucket["hrs"] += hrs

In [24]:
def GetGames(leagueId, year, maxRetries=5, retryDelay=1.0):
    """Return every game on the schedule for one sportId and season.

    Args:
        leagueId: Value sent as the ``sportId`` query parameter.
        year: Season; the query spans {year}-01-01 through {year}-12-30.
        maxRetries: How many times a non-200 response is retried before
            giving up.
        retryDelay: Seconds to wait between attempts.

    Returns:
        A flat list of the game dicts found under each entry of "dates".

    Raises:
        RuntimeError: If the endpoint still answers with a non-200 status
            after ``maxRetries`` retries.
    """
    # NOTE(review): endDate stops at 12-30 rather than 12-31; kept as in the
    # original since it only matters for games scheduled on Dec 31 - confirm.
    url = f"https://ws.statsapi.mlb.com/api/v1/schedule/games/?sportId={leagueId}&startDate={year}-01-01&endDate={year}-12-30"
    attempt = 0
    while True:
        req = requests.get(url)
        if req.status_code == 200:
            games = []
            for date in req.json()["dates"]:
                games.extend(date["games"])
            return games

        print(f"Incorrect Code: {req.status_code} : L{leagueId} Y{year} {req.content}")
        attempt += 1
        if attempt > maxRetries:
            # Unbounded immediate recursion previously ended in RecursionError
            # after hammering the API; fail explicitly instead.
            raise RuntimeError(
                f"Schedule request failed with status {req.status_code} "
                f"for sportId={leagueId} year={year} after {maxRetries} retries"
            )
        time.sleep(retryDelay)

In [25]:
from tqdm import tqdm

In [26]:
import sqlite3

In [27]:
import threading

In [28]:
# Held by UpdateDatabase for the whole of its write transaction.
dbWriteLock = threading.Lock()
# Held by CompleteThread while it resets a thread's slot in threadProcess.
UpdateProcessLock = threading.Lock()
# Held by CompleteThread while it increments completedThreads.
CompleteProcessLock = threading.Lock()
# Held by GetNextYearLevel while it reads and advances nextYearIdx/nextLevelIdx.
GetNewDataLock = threading.Lock()

In [29]:
def UpdateDatabase(db, teamRunCounts):
    """Insert the collected park run data into the ParkRunData table.

    Args:
        db: Open sqlite3 connection.
        teamRunCounts: Nested dict {leagueId: {teamId: {year: parkData}}} where
            parkData holds "home"/"away" counters and a "level" value.

    Team/year pairs that already have a row in ParkRunData are skipped. All
    inserts happen in one transaction while dbWriteLock is held; if anything
    fails the transaction is rolled back and the exception is re-raised.
    """
    insertSql = "INSERT INTO ParkRunData('TeamId','Year','LevelId','LeagueId','HomeInnings','HomeRuns','HomeHRs','AwayInnings','AwayRuns','AwayHRs') VALUES (?,?,?,?,?,?,?,?,?,?)"
    with dbWriteLock:
        # Discard any transaction left open on this connection.
        db.rollback()
        cursor = db.cursor()
        cursor.execute("BEGIN TRANSACTION")
        try:
            insertionData = []
            for leagueId, teamData in teamRunCounts.items():
                for teamid, yearData in teamData.items():
                    for year, parkData in yearData.items():
                        # Bound as integers so the comparison behaves like the
                        # unquoted numeric literals the query used before.
                        alreadyStored = cursor.execute(
                            "SELECT COUNT(*) FROM ParkRunData WHERE TeamId=? AND Year=?",
                            (int(teamid), int(year)),
                        ).fetchone()[0] > 0
                        if alreadyStored:
                            continue
                        home = parkData["home"]
                        away = parkData["away"]
                        insertionData.append([
                            teamid, year, parkData["level"], leagueId,
                            home["innings"], home["runs"], home["hrs"],
                            away["innings"], away["runs"], away["hrs"],
                        ])
            cursor.executemany(insertSql, insertionData)
            cursor.execute("END TRANSACTION")
            db.commit()
        except Exception:
            # Do not leave a half-finished transaction open on the connection.
            db.rollback()
            raise

In [30]:
# Number of worker threads started by the thread-creation cell.
NUM_THREADS = 16
# One slot per thread: fraction of that thread's current (year, level) job done.
threadProcess = [0] * NUM_THREADS
# Incremented by CompleteThread each time a (year, level) job finishes.
completedThreads = 0
# Worker Thread objects, kept so they can be joined later.
threads = []

In [31]:
# Seasons to process; END_YEAR itself is excluded by range().
years = range(2005, END_YEAR)
# Values sent as sportId by GetGames and stored as LevelId.
# NOTE(review): presumably MLB plus the minor-league levels - confirm.
levels = [1,11,12,13,14,15,16]
# Position of the next (year, level) job handed out by GetNextYearLevel;
# nextYearIdx is set to -1 once every combination has been handed out.
nextYearIdx = 0
nextLevelIdx = 0

In [32]:
# Resolution of the progress bar: UpdateProgressBar scales the overall
# completion fraction to an integer in 0..totalSteps.
totalSteps = 10000
progressBar = tqdm(total=totalSteps)

 98%|█████████▊| 9838/10000 [5:39:22<05:35,  2.07s/it]


In [33]:
def UpdateProgressBar():
    """Redraw the progress bar from the shared thread bookkeeping.

    Completed jobs plus each thread's partial progress are divided by the
    total number of (year, level) jobs and scaled to totalSteps.
    """
    finishedJobs = completedThreads
    for partial in threadProcess:
        finishedJobs += partial
    totalJobs = len(years) * len(levels)
    fraction = finishedJobs / totalJobs
    position = int(fraction * totalSteps)
    # Set the position directly and keep tqdm's bookkeeping in sync with it.
    progressBar.n = position
    progressBar.last_print_n = progressBar.n
    progressBar.refresh()

In [34]:
def GetNextYearLevel():
    """Hand out the next (year, level) job, or (-1, -1) when none remain.

    Levels are cycled fastest; once the last level of a year has been handed
    out the year index advances. The shared indices are read and updated
    while GetNewDataLock is held.
    """
    global nextYearIdx, nextLevelIdx
    with GetNewDataLock:
        # -1 marks that every combination has already been handed out.
        if nextYearIdx == -1:
            return (-1, -1)

        job = (years[nextYearIdx], levels[nextLevelIdx])

        nextLevelIdx += 1
        if nextLevelIdx < len(levels):
            return job

        # Wrapped past the last level: move on to the next year.
        nextLevelIdx = 0
        nextYearIdx += 1
        if nextYearIdx >= len(years):
            nextYearIdx = -1
        return job

In [35]:
def CompleteThread(threadId):
    """Record that the job run by thread ``threadId`` has finished.

    The thread's partial-progress slot is reset to 0 and the shared count of
    finished jobs is incremented, each under its own lock.
    """
    global completedThreads

    with UpdateProcessLock:
        threadProcess[threadId] = 0

    with CompleteProcessLock:
        completedThreads += 1
    

def LogYearLevel(db, threadId, year, level):
    """Worker loop: download and store park run data, one (year, level) at a time.

    Args:
        db: Open sqlite3 connection, or None to open "playerData.db" here
            (the connection is then closed before returning).
        threadId: Index of this worker's slot in threadProcess.
        year: First season to process; -1 means there is nothing to do.
        level: Level (sportId) to process together with ``year``.

    After each job the worker asks GetNextYearLevel for another one and keeps
    going until it receives the (-1, -1) sentinel. Jobs that already have rows
    in ParkRunData are counted as complete without downloading anything.
    """
    # The sentinel means the work queue was already exhausted.
    if year == -1:
        return

    ownsConnection = db is None
    if ownsConnection:
        # Create DB Connection
        db = sqlite3.connect("playerData.db")
        cursor = db.cursor()
        cursor.execute("PRAGMA journal_mode = WAL")
        db.commit()

    try:
        # Iterate instead of recursing so a long run of jobs does not grow the stack.
        while year != -1:
            cursor = db.cursor()
            alreadyLogged = cursor.execute(
                "SELECT COUNT(*) FROM ParkRunData WHERE Year=? AND LevelId=?",
                (year, level),
            ).fetchone()[0] > 0

            if not alreadyLogged:
                teamRunCounts = {}
                games = GetGames(level, year)
                yearString = str(year)
                # The context manager closes the session's pooled connections.
                with requests.Session() as session:
                    for k, game in enumerate(games):
                        try:
                            GetGameRequest(game["gamePk"], yearString, level, teamRunCounts, session)
                        finally:
                            # Progress is recorded even if the request raised;
                            # the exception still propagates.
                            threadProcess[threadId] = k / len(games)
                print(f"Completed Year: {year} Level: {level}")
                UpdateDatabase(db, teamRunCounts)

            CompleteThread(threadId)
            year, level = GetNextYearLevel()
    finally:
        # Only close a connection this call opened itself.
        if ownsConnection:
            db.close()

In [36]:
# Create Threads
# Each worker is seeded with its first (year, level) job and then pulls further
# jobs itself via GetNextYearLevel.
for i in range(NUM_THREADS):
    year, level = GetNextYearLevel()
    if year == -1:
        # Fewer jobs than threads: do not start a worker on the (-1, -1)
        # sentinel, which would otherwise be sent to the API as a real request.
        break
    thread = threading.Thread(target=LogYearLevel, args=[None, i, year, level])
    threads.append(thread)
    thread.start()

In [37]:
# Start progress bar
# Flag read by UpdateTimer on every tick; once it is False no new timer is armed.
keepTimerRunning = True
def UpdateTimer():
    """Refresh the progress bar, re-arming a 5-second timer while keepTimerRunning is set."""
    if keepTimerRunning:
        # Schedule the next tick before doing this tick's refresh.
        threading.Timer(5.0, UpdateTimer).start()
    UpdateProgressBar()
    
UpdateTimer()



In [39]:
# Wait for every worker thread to finish, then tell UpdateTimer to stop
# re-arming itself (a timer that is already scheduled still fires once more).
for thread in threads:
    thread.join()
keepTimerRunning = False

In [52]:
# Number of seasons (the current one plus the preceding ones) pooled per park factor.
ROLLING_PERIOD = 3
# Minimum pooled innings, home and away each, required to compute a factor.
INNING_CUTOFF = 150

db = sqlite3.connect("playerData.db")
cursor = db.cursor()
teams = cursor.execute("SELECT DISTINCT TeamId FROM ParkRunData").fetchall()

# Values are bound as strings so comparisons behave exactly like the quoted
# literals the queries used before, whatever the column types are.
for year in range(2005, END_YEAR):
    for team in teams:
        team = team[0]
        # Check if data already exists
        alreadyComputed = cursor.execute(
            "SELECT COUNT(*) FROM ParkFactors WHERE year=? AND TeamId=?",
            (str(year), str(team)),
        ).fetchone()[0] > 0
        if alreadyComputed:
            continue

        # Columns are selected by name so the result does not depend on the
        # table's physical column order. The previous positional read of
        # index 2 from SELECT * lines up with LevelId, not LeagueId, in the
        # column order used by the ParkRunData INSERT.
        parkRunData = cursor.execute(
            "SELECT LeagueId, HomeInnings, HomeRuns, HomeHRs, AwayInnings, AwayRuns, AwayHRs "
            "FROM ParkRunData WHERE TeamId=? AND year > ? AND year <= ? "
            "ORDER BY Year DESC, HomeInnings DESC",
            (str(team), str(year - ROLLING_PERIOD), str(year)),
        ).fetchall()
        # Check if data exists
        if not parkRunData:
            continue
        # Get current league from first (most recent) entry
        leagueId = parkRunData[0][0]

        homeInnings = 0
        homeRuns = 0
        homeHRs = 0
        awayInnings = 0
        awayRuns = 0
        awayHRs = 0
        for _, hInn, hRuns, hHRs, aInn, aRuns, aHRs in parkRunData:
            homeInnings += hInn
            homeRuns += hRuns
            homeHRs += hHRs
            awayInnings += aInn
            awayRuns += aRuns
            awayHRs += aHRs

        # Ensure enough data to actually calculate park factors
        if awayInnings < INNING_CUTOFF or homeInnings < INNING_CUTOFF:
            continue
        # A zero away rate would divide by zero below; treat it like
        # insufficient data instead of aborting the whole run.
        if awayRuns == 0 or awayHRs == 0:
            continue

        runFactor = (homeRuns / homeInnings) / (awayRuns / awayInnings)
        hrFactor = (homeHRs / homeInnings) / (awayHRs / awayInnings)
        cursor.execute(
            "INSERT INTO ParkFactors('TeamId','LeagueId','Year','RunFactor','HRFactor') VALUES(?,?,?,?,?)",
            (team, leagueId, year, runFactor, hrFactor),
        )
        db.commit()

In [60]:
# Build per-league run and HR factors for each season, relative to the
# combined rate of all leagues found in ParkFactors.
leagues = cursor.execute("SELECT DISTINCT LeagueId FROM ParkFactors").fetchall()
db.rollback()
cursor = db.cursor()

for year in range(2005, END_YEAR):
    # Seasons that already have LeagueFactors rows are left alone.
    if cursor.execute(f"SELECT COUNT(*) FROM LeagueFactors WHERE Year='{year}'").fetchone()[0] > 0:
        continue

    # Sum the home-side totals of every team in each league.
    yearlyLeagueData = []
    for (league,) in leagues:
        rows = cursor.execute(f"SELECT HomeInnings, HomeRuns, HomeHRs FROM ParkRunData WHERE LeagueId='{league}' AND Year='{year}'").fetchall()
        yearlyLeagueData.append((
            league,
            sum(row[0] for row in rows),
            sum(row[1] for row in rows),
            sum(row[2] for row in rows),
        ))

    # Totals across all leagues; these define the baseline rates.
    totalInnings = sum(entry[1] for entry in yearlyLeagueData)
    totalRuns = sum(entry[2] for entry in yearlyLeagueData)
    totalHRs = sum(entry[3] for entry in yearlyLeagueData)

    # Each league's rate divided by the baseline rate; leagues at or below
    # the inning cutoff get neutral factors of 1.
    cursor.execute("BEGIN TRANSACTION")
    dbData = [
        (
            leagueId,
            year,
            (leagueRuns / leagueInnings)/(totalRuns / totalInnings),
            (leagueHRs / leagueInnings)/(totalHRs / totalInnings),
        )
        if leagueInnings > INNING_CUTOFF
        else (leagueId, year, 1, 1)
        for leagueId, leagueInnings, leagueRuns, leagueHRs in yearlyLeagueData
    ]
    cursor.executemany("INSERT INTO LeagueFactors('LeagueId','Year','RunFactor','HRFactor') VALUES(?,?,?,?)", dbData)
    cursor.execute("END TRANSACTION")
    db.commit()