In [1]:
# Earliest career start year considered; used as the lower bound on careerStartYear in the queries below
START_YEAR = 2005
# Last year covered by the per-year eligibility report (upper end of its year range, inclusive)
CURRENT_YEAR = 2024

In [2]:
import sqlite3
from tqdm import tqdm

In [3]:
# Single SQLite connection used by every cell in this notebook.
# The path is relative, so it is resolved against the kernel's working directory.
db = sqlite3.connect("BaseballStats.db")

In [4]:
db.rollback()  # drop any transaction left open by an earlier failed run
cursor = db.cursor()

# Rebuild Model_Players from scratch for every primary-position player whose
# career started in START_YEAR or later and who has a known career end
# (service end year, aged-out year, or service lapse year).
#
# `with db` commits when the block finishes and rolls back if anything raises,
# so a failure part-way through undoes the DELETE instead of leaving the
# transaction open with the table emptied or half filled.
with db:
    cursor.execute("DELETE FROM Model_Players")

    # The year is bound as a parameter rather than formatted into the SQL text.
    # NOTE: DISTINCT applies to the whole selected row, not only to mlbId, so a
    # player that matches several rows in the LEFT JOINed tables can still be
    # returned (and inserted) more than once.
    data = cursor.execute('''
                          SELECT DISTINCT pcs.mlbId, pcs.position, pcs.agedOut, pcs.serviceEndYear, pre.year, pre.month, psl.year, p.birthYear, p.birthMonth, p.birthDate, p.signingYear
                          FROM Player_CareerStatus AS pcs
                          LEFT JOIN Player_RookieEligibility AS pre ON pcs.mlbId = pre.mlbId
                          LEFT JOIN Player_ServiceLapse AS psl ON pcs.mlbId = psl.mlbId
                          INNER JOIN Player AS p ON pcs.mlbId = p.mlbId
                          WHERE pcs.isPrimaryPosition='1'
                          AND pcs.careerStartYear>=?
                          AND (
                              pcs.serviceEndYear IS NOT NULL
                              OR pcs.agedOut>'0'
                              OR psl.year IS NOT NULL
                          )
                          AND pcs.ignorePlayer IS NULL
                          AND p.birthYear IS NOT NULL
                          AND p.birthMonth IS NOT NULL
                          AND p.birthDate IS NOT NULL
                          ''', (START_YEAR,)).fetchall()

    for mlbId, position, agedOut, serviceEndYear, rookieYear, rookieMonth, serviceLapseYear, birthYear, birthMonth, birthDate, signingYear in tqdm(data):
        # Positions: anything that is not "hitting" is stored as a pitcher
        isHitter = 1 if position == "hitting" else 0
        isPitcher = 1 - isHitter

        hasAgedOut = agedOut is not None and agedOut != 0

        # Last MLB season: aged-out year first, then service lapse, then service end
        if hasAgedOut:
            lastMLBSeason = agedOut
        elif serviceLapseYear is not None:
            lastMLBSeason = serviceLapseYear
        elif serviceEndYear is not None:
            lastMLBSeason = serviceEndYear
        else:
            raise Exception(f"No valid last MLB season for id={mlbId}")

        # Last prospect year/month. Month 13 is stored when only a year is known
        # (presumably meaning "through the end of that year" - confirm against
        # whatever reads Model_Players).
        if hasAgedOut:
            lastProspectYear = agedOut
            lastProspectMonth = 13
        elif rookieYear is not None and rookieMonth is not None:
            lastProspectYear = rookieYear
            lastProspectMonth = rookieMonth
        elif serviceLapseYear is not None:
            lastProspectYear = serviceLapseYear
            lastProspectMonth = 13
        else:
            raise Exception(f"No valid last prospect season for id={mlbId}")

        # Age at signing
        # Use 07/01/SigningYear for signing date.  Should try to get better data for this
        # The query does not filter out NULL signing years, so fail with a clear
        # message instead of a TypeError from the arithmetic below.
        if signingYear is None:
            raise Exception(f"No signing year for id={mlbId}")
        signingDate = signingYear + 0.5
        signingAge = signingDate - birthYear - (birthMonth - 1) / 12 - (birthDate - 1) / 365
        if signingAge >= 27: # Player will immediately be ineligible, so discard
            continue

        cursor.execute("INSERT INTO Model_Players VALUES(?,?,?,?,?,?,?)", (mlbId, isHitter, isPitcher, lastProspectYear, lastProspectMonth, lastMLBSeason, signingAge))

100%|██████████| 23716/23716 [00:00<00:00, 696920.18it/s]


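For each career start year, check how many players, and what percentage of that year's players, have had usable careers (i.e. ended up in Model_Players). Note that the check below only counts hitters.

Use this to decide which cutoff year to use for the model.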

21AUG2024: Chose 2013 as the cutoff, with 95.5% of players eligible in that year and 8183 eligible players overall.

In [5]:
# Per-year report for hitters: how many primary-position, non-ignored players
# started their career in each year, and what fraction of them are present in
# Model_Players. Cumulative is the running count of players found so far.
cursor = db.cursor()
cumulativePlayers = 0
for year in range(START_YEAR, CURRENT_YEAR + 1):
    # One aggregate query per year instead of one COUNT query per player.
    # EXISTS yields 0/1 per row, so its SUM is the number of players found;
    # COALESCE turns the NULL that SUM returns for an empty year into 0.
    totalPlayers, foundPlayers = cursor.execute('''
                         SELECT COUNT(*),
                                COALESCE(SUM(EXISTS(
                                    SELECT 1 FROM Model_Players AS mp WHERE mp.mlbId = pcs.mlbId
                                )), 0)
                         FROM Player_CareerStatus AS pcs
                         WHERE pcs.position='hitting'
                         AND pcs.isPrimaryPosition='1'
                         AND pcs.careerStartYear=?
                         AND pcs.ignorePlayer IS NULL
                         ''', (year,)).fetchone()

    cumulativePlayers += foundPlayers
    # Guard only the division: a year with no players must still report Total=0
    # rather than a made-up total of 1.
    percentage = foundPlayers / totalPlayers if totalPlayers else 0.0
    print(f'Year={year:5d} Cumulative={cumulativePlayers:6d} Total={totalPlayers:5d} Percentage={percentage:.3f}')

Year= 2005 Cumulative=   819 Total=  841 Percentage=0.974
Year= 2006 Cumulative=  1953 Total= 1147 Percentage=0.989
Year= 2007 Cumulative=  2945 Total=  994 Percentage=0.998
Year= 2008 Cumulative=  3875 Total=  937 Percentage=0.993
Year= 2009 Cumulative=  4777 Total=  903 Percentage=0.999
Year= 2010 Cumulative=  5689 Total=  916 Percentage=0.996
Year= 2011 Cumulative=  6526 Total=  849 Percentage=0.986
Year= 2012 Cumulative=  7360 Total=  856 Percentage=0.974
Year= 2013 Cumulative=  8183 Total=  862 Percentage=0.955
Year= 2014 Cumulative=  8976 Total=  885 Percentage=0.896
Year= 2015 Cumulative=  9557 Total=  880 Percentage=0.660
Year= 2016 Cumulative= 10017 Total=  928 Percentage=0.496
Year= 2017 Cumulative= 10422 Total=  916 Percentage=0.442
Year= 2018 Cumulative= 10734 Total= 1033 Percentage=0.302
Year= 2019 Cumulative= 10865 Total=  893 Percentage=0.147
Year= 2020 Cumulative= 10865 Total=    1 Percentage=0.000
Year= 2021 Cumulative= 10892 Total= 1277 Percentage=0.021
Year= 2022 Cum

In [6]:
# Latest career start year (inclusive) kept when Model_Players is rebuilt in the next cell
CUTOFF_YEAR = 2013

In [7]:
db.rollback()  # drop any transaction left open by an earlier failed run
cursor = db.cursor()

# Rebuild Model_Players from scratch, this time restricted to primary-position
# players whose career started between START_YEAR and CUTOFF_YEAR (inclusive)
# and who have a known career end (service end year, aged-out year, or
# service lapse year).
#
# `with db` commits when the block finishes and rolls back if anything raises,
# so a failure part-way through undoes the DELETE instead of leaving the
# transaction open with the table emptied or half filled.
with db:
    cursor.execute("DELETE FROM Model_Players")

    # NOTE: DISTINCT applies to the whole selected row, not only to mlbId, so a
    # player that matches several rows in the LEFT JOINed tables can still be
    # returned (and inserted) more than once.
    data = cursor.execute('''
                          SELECT DISTINCT pcs.mlbId, pcs.position, pcs.agedOut, pcs.serviceEndYear, pre.year, pre.month, psl.year, p.birthYear, p.birthMonth, p.birthDate, p.signingYear
                          FROM Player_CareerStatus AS pcs
                          LEFT JOIN Player_RookieEligibility AS pre ON pcs.mlbId = pre.mlbId
                          LEFT JOIN Player_ServiceLapse AS psl ON pcs.mlbId = psl.mlbId
                          INNER JOIN Player AS p ON pcs.mlbId = p.mlbId
                          WHERE pcs.isPrimaryPosition='1'
                          AND pcs.careerStartYear>=?
                          AND (
                              pcs.serviceEndYear IS NOT NULL
                              OR pcs.agedOut>'0'
                              OR psl.year IS NOT NULL
                          )
                          AND pcs.ignorePlayer IS NULL
                          AND p.birthYear IS NOT NULL
                          AND p.birthMonth IS NOT NULL
                          AND p.birthDate IS NOT NULL
                          AND pcs.careerStartYear<=?
                          ''', (START_YEAR, CUTOFF_YEAR)).fetchall()

    for mlbId, position, agedOut, serviceEndYear, rookieYear, rookieMonth, serviceLapseYear, birthYear, birthMonth, birthDate, signingYear in tqdm(data):
        # Positions: anything that is not "hitting" is stored as a pitcher
        isHitter = 1 if position == "hitting" else 0
        isPitcher = 1 - isHitter

        hasAgedOut = agedOut is not None and agedOut != 0

        # Last MLB season: aged-out year first, then service lapse, then service end
        if hasAgedOut:
            lastMLBSeason = agedOut
        elif serviceLapseYear is not None:
            lastMLBSeason = serviceLapseYear
        elif serviceEndYear is not None:
            lastMLBSeason = serviceEndYear
        else:
            raise Exception(f"No valid last MLB season for id={mlbId}")

        # Last prospect year/month. Month 13 is stored when only a year is known
        # (presumably meaning "through the end of that year" - confirm against
        # whatever reads Model_Players).
        if hasAgedOut:
            lastProspectYear = agedOut
            lastProspectMonth = 13
        elif rookieYear is not None and rookieMonth is not None:
            lastProspectYear = rookieYear
            lastProspectMonth = rookieMonth
        elif serviceLapseYear is not None:
            lastProspectYear = serviceLapseYear
            lastProspectMonth = 13
        else:
            raise Exception(f"No valid last prospect season for id={mlbId}")

        # Age at signing
        # Use 07/01/SigningYear for signing date.  Should try to get better data for this
        # The query does not filter out NULL signing years, so fail with a clear
        # message instead of a TypeError from the arithmetic below.
        if signingYear is None:
            raise Exception(f"No signing year for id={mlbId}")
        signingDate = signingYear + 0.5
        signingAge = signingDate - birthYear - (birthMonth - 1) / 12 - (birthDate - 1) / 365
        if signingAge >= 27: # Player will immediately be ineligible, so discard
            continue

        cursor.execute("INSERT INTO Model_Players VALUES(?,?,?,?,?,?,?)", (mlbId, isHitter, isPitcher, lastProspectYear, lastProspectMonth, lastMLBSeason, signingAge))

100%|██████████| 17350/17350 [00:00<00:00, 663383.45it/s]
