In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
import re


# AAA

In [35]:
# https://www.milb.com/stats/team

def read_clipboard_aaa():
    """Build a team hitting DataFrame from text copied to the clipboard.

    The clipboard is read one line per row by using chr(1) as the separator
    (assumed never to occur in the copied text). Every three consecutive
    lines are folded into one record and the third line is split on tabs;
    the resulting 19 fields are labelled rank, team, league and 16 hitting
    stats.

    Returns:
        pandas.DataFrame with one row per record. No numeric conversion is
        done here.

    Raises:
        ValueError: if the number of clipboard lines is not a multiple of 3,
            or the split does not yield exactly 19 columns.
    """
    # Use ``sep`` rather than ``delimiter``: read_clipboard always forwards its
    # own ``sep`` to read_csv, and read_csv rejects a call that supplies both.
    lines = pd.read_clipboard(sep=chr(1), header=None)
    df = pd.DataFrame(lines.values.reshape((-1, 3)))
    # Column 2 holds the tab-joined league and stats: expand it, then drop the
    # packed original.
    df = pd.concat([df, df.loc[:, 2].str.split('\t', expand=True)], axis=1, ignore_index=True)
    df = df.drop(2, axis=1)
    df.columns = ['rank','team','league','g','ab','r','h','2b','3b','hr',
                  'rbi','bb','so','sb','cs','avg','obp','slg','ops']
    return df

In [79]:
def read_clipboard_aaa_pitching():
    """Build a team pitching DataFrame from text copied to the clipboard.

    The clipboard is read one line per row by using chr(1) as the separator
    (assumed never to occur in the copied text). Every three consecutive
    lines are folded into one record and the third line is split on tabs;
    the resulting 22 fields are labelled rank, team, league and 19 pitching
    stats.

    Returns:
        pandas.DataFrame with one row per record. No numeric conversion is
        done here.

    Raises:
        ValueError: if the number of clipboard lines is not a multiple of 3,
            or the split does not yield exactly 22 columns.
    """
    # Use ``sep`` rather than ``delimiter``: read_clipboard always forwards its
    # own ``sep`` to read_csv, and read_csv rejects a call that supplies both.
    lines = pd.read_clipboard(sep=chr(1), header=None)
    df = pd.DataFrame(lines.values.reshape((-1, 3)))
    # Column 2 holds the tab-joined league and stats: expand it, then drop the
    # packed original.
    df = pd.concat([df, df.loc[:, 2].str.split('\t', expand=True)], axis=1, ignore_index=True)
    df = df.drop(2, axis=1)
    df.columns = ['rank','team','league','w','l','era','g','gs','cg','sho','sv','svo','ip','h',
                  'r','er','hr','hb','bb','so','whip','avg']
    return df

In [43]:
# Parses whatever hitting table is currently on the clipboard. Presumably the
# International League home split (inferred from the name; copy it before running).
il_h = read_clipboard_aaa()

In [44]:
# Parses whatever hitting table is currently on the clipboard. Presumably the
# International League away split (inferred from the name; copy it before running).
il_a = read_clipboard_aaa()

In [41]:
# Parses whatever hitting table is currently on the clipboard. Presumably the
# Pacific Coast League away split (inferred from the name; copy it before running).
pcl_a = read_clipboard_aaa()

In [42]:
# Parses whatever hitting table is currently on the clipboard. Presumably the
# Pacific Coast League home split (inferred from the name; copy it before running).
pcl_h = read_clipboard_aaa()

In [80]:
# Parses whatever pitching table is currently on the clipboard. Presumably the
# International League home split (inferred from the name; copy it before running).
il_hp = read_clipboard_aaa_pitching()

In [81]:
# Parses whatever pitching table is currently on the clipboard. Presumably the
# International League away split (inferred from the name; copy it before running).
il_ap = read_clipboard_aaa_pitching()

In [82]:
# Parses whatever pitching table is currently on the clipboard. Presumably the
# Pacific Coast League home split (inferred from the name; copy it before running).
pcl_hp = read_clipboard_aaa_pitching()

In [84]:
# Parses whatever pitching table is currently on the clipboard. Presumably the
# Pacific Coast League away split (inferred from the name; copy it before running).
pcl_ap = read_clipboard_aaa_pitching()

In [91]:
# Pair each team's home and away hitting splits, one league at a time,
# then stack the two leagues into a single frame.
pcl = pcl_h.merge(pcl_a, on=['team', 'league'], suffixes=('_home', '_away'))
il = il_h.merge(il_a, on=['team', 'league'], suffixes=('_home', '_away'))
aaa = pd.concat((pcl, il), ignore_index=True)

In [92]:
# Same pairing as for hitting, applied to the pitching splits.
pclp = pcl_hp.merge(pcl_ap, on=['team', 'league'], suffixes=('_home', '_away'))
ilp = il_hp.merge(il_ap, on=['team', 'league'], suffixes=('_home', '_away'))
aaap = pd.concat((pclp, ilp), ignore_index=True)

In [93]:
# Attach the pitching splits. Column names shared with hitting (e.g. r_home)
# keep their name on the hitting side and gain '_against' on the pitching side.
aaa = aaa.merge(aaap, on=['team', 'league'], suffixes=('', '_against'))

In [94]:
# Convert every column except the two label columns to a numeric dtype.
for col in aaa.columns:
    if col in ('team', 'league'):
        continue
    aaa[col] = pd.to_numeric(aaa[col])

In [99]:

# Total runs per game (scored plus allowed) for each team, by home/away split.
aaa['r/g_home'] = aaa['r_home'].add(aaa['r_home_against']).div(aaa['g_home'])
aaa['r/g_away'] = aaa['r_away'].add(aaa['r_away_against']).div(aaa['g_away'])

In [100]:
# Park factor: home run environment relative to the same team's road games.
aaa['park_factor'] = aaa['r/g_home'].div(aaa['r/g_away'])

In [101]:
# Teams ordered from lowest to highest park factor.
aaa.loc[:, ['team', 'park_factor']].sort_values(by='park_factor')

Unnamed: 0,team,park_factor
6,Sacramento,0.637
5,Okla. City,0.777652
7,Sugar Land,0.799716
22,Norfolk,0.805587
29,Nashville,0.837736
24,Rochester,0.849461
25,Buffalo,0.895487
27,Toledo,0.895758
12,Durham,0.899272
28,Gwinnett,0.913972


In [65]:
# Team label (as it appears in the stats tables) -> ballpark key.
aaa_parks = {
    'Jacksonville': 'vystar_ballpark',
    'Memphis': 'autozone_park',
    'St. Paul': 'chs_field',
    'Lehigh Valley': 'coca_cola_park',
    'Gwinnett': 'coolray_field',
    'Durham': 'durham_bulls_athletic_park',
    'Toledo': 'fifth_third_field',
    'Nashville': 'first_horizon_park',
    'Norfolk': 'harbor_park',
    'Columbus': 'huntington_park',
    'Rochester': 'innovative_field',
    'Louisville': 'louisville_slugger_field',
    'Syracuse': 'nbt_bank_stadium',
    'Scranton/WB': 'pnc_field',
    'Worcester': 'polar_park',
    'Iowa': 'principal_park',
    'Buffalo': 'sahlen_field',
    'Charlotte': 'truist_field',
    'Indianapolis': 'victory_field',
    'Omaha': 'werner_park',
    'Tacoma': 'cheney_stadium',
    'Okla. City': 'chickasaw_bricktown_ballpark',
    'Sugar Land': 'constellation_field',
    'Salt Lake': 'smiths_ballpark',
    'Round Rock': 'dell_diamond',
    'Reno': 'greater_nevada_field',
    'Las Vegas': 'las_vegas_ballpark',
    'Albuquerque': 'rio_grande_credit_union_field_at_isotopes_park',
    'El Paso': 'southwest_university_park',
    'Sacramento': 'sutter_health_park',
}

In [10]:
import json

# Ballpark metadata; the later cells in this notebook only test whether a
# ballpark key is contained in it.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('../data_collection/metadata.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)

In [66]:
# Sanity check: report mapped teams that are absent from the stats frame and
# mapped parks that are absent from the metadata.
for team_name, park_key in aaa_parks.items():
    team_known = (aaa['team'].values == team_name).any()
    if not team_known:
        print('Team:', team_name)
    if park_key not in metadata:
        print('Park:', park_key)

In [170]:
# Look up each team's ballpark key; teams without a mapping get None.
aaa['ballpark'] = aaa['team'].map(aaa_parks.get)

In [171]:
# Persist only the columns shared with the other leagues' park-factor files.
aaa.to_csv('aaa_park_factors.csv', columns=['ballpark', 'park_factor', 'g_home'], index=False)

# NPB

In [3]:
# One dict per played game, collected from every team's monthly calendar page.
# Each record carries the game_id taken from the box-score link.
games = []
for team in ['t','c','db','g','s','d','b','m','h','e','l','f']:
    for month in range(4,11):

        # A timeout keeps one stalled request from hanging the whole scrape.
        response = requests.get(f"https://npb.jp/bis/eng/teams/calendar_{team}_{str(month).zfill(2)}.html",
                                timeout=30)
        soup = BeautifulSoup(response.text, features='lxml')
        calendar = soup.find('table', {'class':'tetblmain'})
        if calendar is None:
            # The page has no calendar table (e.g. an error page): nothing to parse.
            continue
        # The first row is skipped (presumably the weekday header).
        for tr in calendar.find_all('tr')[1:]:
            for td in tr.find_all('td'):
                # Skip postseason series and dates on which no game was played.
                skip_cell = any(
                    title in td.text
                    for title in ['CS First Stage','CS Final Stage','Nippon Series','Postponed','No Game']
                )
                if skip_cell:
                    continue
                # Only the first game listed in a calendar cell is used.
                game = td.find('div',{'class':'tevsteam'})
                if game is None:
                    continue
                score = game.find('div',{'class':'tescore'})
                # Drop the first and last character of the venue text (presumably brackets).
                stadium = game.find('div',{'class':'testdm'}).text[1:-1]
                home_team, home_score, _, away_score, away_team = score.text.split(' ')
                game_id = score.find('a')['href'].split('/')[-1].split('.')[0]
                games.append({'home_team':home_team, 'home_score':int(home_score), 'away_team':away_team,
                              'away_score':int(away_score), 'stadium':stadium, 'game_id':game_id})


In [6]:
# Keep one row per game_id (first occurrence) and index the frame by it.
npb_df = (
    pd.DataFrame(games)
    .drop_duplicates(subset=['game_id'], keep='first')
    .set_index('game_id')
)
npb_df.head()

Unnamed: 0_level_0,home_team,home_score,away_team,away_score,stadium
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
s2024032901088,G,4,T,0,Tokyo Dome
s2024033001091,G,5,T,0,Tokyo Dome
s2024033101094,G,0,T,5,Tokyo Dome
s2024040201098,T,3,DB,5,Kyocera Dome
s2024040301101,T,5,DB,2,Kyocera Dome


In [7]:
# Per-game helper columns so that a later groupby-sum yields games and runs.
npb_df = npb_df.assign(
    game_count=1,
    total_score=npb_df['home_score'].add(npb_df['away_score']),
)

In [8]:
npb_df.head()

Unnamed: 0_level_0,home_team,home_score,away_team,away_score,stadium,game_count,total_score
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
s2024032901088,G,4,T,0,Tokyo Dome,1,4
s2024033001091,G,5,T,0,Tokyo Dome,1,5
s2024033101094,G,0,T,5,Tokyo Dome,1,5
s2024040201098,T,3,DB,5,Kyocera Dome,1,8
s2024040301101,T,5,DB,2,Kyocera Dome,1,7


In [9]:
# Runs and games per (home team, stadium), and per team across all road games.
npb_home = (
    npb_df
    .groupby(['home_team', 'stadium'])[['total_score', 'game_count']]
    .sum()
    .reset_index()
)
npb_away = (
    npb_df
    .groupby(['away_team'])[['total_score', 'game_count']]
    .sum()
    .reset_index()
)
# Put each team's road totals next to every stadium it hosted games in.
npb = (
    npb_home
    .merge(npb_away, left_on='home_team', right_on='away_team', suffixes=('_home', '_away'))
    .drop(columns='away_team')
    .rename(columns={'home_team': 'team'})
)
npb

Unnamed: 0,team,stadium,total_score_home,game_count_home,total_score_away,game_count_away
0,B,Hotto Kobe,51,8,443,72
1,B,Kyocera Dome,349,61,443,72
2,B,Naha,7,2,443,72
3,C,Mazda Stadium,430,71,404,72
4,D,Gifu,8,1,475,72
5,D,Toyohashi,6,1,475,72
6,D,Vantelin Dome,362,69,475,72
7,DB,HARD OFF Niigata,5,1,488,71
8,DB,Yokohama,532,71,488,71
9,E,Akita,5,1,594,72


In [10]:
# Stadiums that appear for more than one home team.
npb.loc[npb['stadium'].duplicated(keep=False)].sort_values(by='stadium')

Unnamed: 0,team,stadium,total_score_home,game_count_home,total_score_away,game_count_away
4,D,Gifu,8,1,475,72
17,G,Gifu,3,1,426,71
1,B,Kyocera Dome,349,61,443,72
19,G,Kyocera Dome,10,2,426,71
26,H,Kyocera Dome,12,1,488,71
41,T,Kyocera Dome,71,9,484,71
20,G,Maebashi,7,1,426,71
33,L,Maebashi,1,1,414,71
13,E,Tokyo Dome,7,1,594,72
22,G,Tokyo Dome,385,64,426,71


In [11]:
# Pool all home teams that used a stadium. Only the four numeric columns are
# summed: on pandas >= 2.0 a bare .sum() would also aggregate the string 'team'
# column, concatenating the team codes into an extra column.
npb_stadiums = (
    npb
    .groupby('stadium')[['total_score_home', 'game_count_home', 'total_score_away', 'game_count_away']]
    .sum()
    .reset_index()
)
npb_stadiums.head()

Unnamed: 0,stadium,total_score_home,game_count_home,total_score_away,game_count_away
0,Akita,5,1,594,72
1,Belluna Dome,411,69,414,71
2,ES CON FIELD,566,72,451,71
3,Fukui,4,1,426,71
4,Fukushima,8,1,594,72


In [12]:
# Runs per game at the stadium divided by runs per game in the hosts' road games.
npb_stadiums['park_factor'] = (
    npb_stadiums['total_score_home'].div(npb_stadiums['game_count_home'])
    .div(npb_stadiums['total_score_away'].div(npb_stadiums['game_count_away']))
)

In [168]:
# Align column names with the other leagues' files before writing the raw table.
npb_pf = npb_stadiums.rename(columns={'stadium': 'ballpark', 'game_count_home': 'g_home'})
#TODO fix stadium names
npb_pf.to_csv('npb_park_factors_raw.csv', columns=['ballpark', 'park_factor', 'g_home'], index=False)

In [16]:
# Inspect the rows for a single venue.
npb.loc[npb['stadium'] == "Hotto Kobe"]


Unnamed: 0,team,stadium,total_score_home,game_count_home,total_score_away,game_count_away
0,B,Hotto Kobe,51,8,443,72
1,B,Kyocera Dome,349,61,443,72
2,B,Naha,7,2,443,72
3,C,Mazda Stadium,430,71,404,72
4,D,Gifu,8,1,475,72
5,D,Toyohashi,6,1,475,72
6,D,Vantelin Dome,362,69,475,72
7,DB,HARD OFF Niigata,5,1,488,71
8,DB,Yokohama,532,71,488,71
9,E,Akita,5,1,594,72


# KBO

In [6]:
# Fetch the 2024 team-splits page and collect its splits tables.
# `tables` is not used by the cells that follow in this notebook.
response = requests.get('https://mykbostats.com/stats/team_splits/2024')
# dfs = pd.read_html(response.text)
soup = BeautifulSoup(response.text, 'lxml')
tables = soup.find_all(
    'table',
    class_='team-splits ui compact very basic sortable unstackable hsticky table',
)

In [49]:
def get_stats_clipboard_kbo():
    """Compute KBO park factors from a team-splits table held on the clipboard.

    The first 10 clipboard rows are treated as the home splits and the last
    10 as the away splits. For each split, total runs per game is
    (R + -R) / G; a club's park factor is its home value divided by its away
    value.

    Returns:
        DataFrame with the columns ballpark, park_factor and g_home. Rows
        with any missing value (e.g. a club without a ballpark mapping) are
        dropped.
    """
    park_by_club = {
        'LG Twins': 'mokdong_baseball_stadium',
        'Doosan Bears': 'mokdong_baseball_stadium',
        'Lotte Giants': 'sajik_baseball_stadium',
        'Samsung Lions': 'daegu_samsung_lions_park',
        'SSG Landers': 'incheon_ssg_landers_field',
        'NC Dinos': 'changwon_nc_park',
        'Kia Tigers': 'kia_champions_field',
        'KT Wiz': 'suwon_kt_wiz_park',
        'Hanwha Eagles': 'cheongju_baseball_stadium',
        'Kiwoom Heroes': 'gocheok_sky_dome',
    }

    clipboard = pd.read_clipboard()
    home = clipboard.iloc[:10, :].reset_index()
    away = clipboard.iloc[-10:, :].reset_index()

    for split in (home, away):
        for stat in ('R', '-R', 'G'):
            split[stat] = pd.to_numeric(split[stat])
        split['total_runs_per_game'] = (split['R'] + split['-R']) / split['G']

    # The first two columns (the former index and the one after it) are
    # relabelled as the club's location and team name.
    home.columns = ['Location', 'Team'] + list(home.columns[2:])
    away.columns = home.columns

    merged = home.merge(away, on=['Location', 'Team'], suffixes=('_home', '_away'))
    merged['park_factor'] = merged['total_runs_per_game_home'] / merged['total_runs_per_game_away']
    club_names = merged['Location'].str.cat(merged['Team'], sep=' ')
    merged['ballpark'] = club_names.map(park_by_club.get)
    merged = merged.rename(columns={'G_home': 'g_home'})
    return merged[['ballpark', 'park_factor', 'g_home']].dropna()

# The function reads the clipboard, so the team-splits table must be copied
# before this cell is run.
kbo = get_stats_clipboard_kbo()
kbo

Unnamed: 0,ballpark,park_factor,g_home
0,kia_champions_field,1.023923,76
1,daegu_samsung_lions_park,1.150833,77
2,mokdong_baseball_stadium,0.933258,78
3,mokdong_baseball_stadium,0.885086,73
4,suwon_kt_wiz_park,1.028408,74
5,incheon_ssg_landers_field,1.182258,71
6,sajik_baseball_stadium,1.13833,71
7,cheongju_baseball_stadium,0.976892,71
8,changwon_nc_park,0.903472,73
9,gocheok_sky_dome,0.827877,73


In [50]:
# Write the KBO table without the index; the combine step reads this file by name.
kbo.to_csv('kbo_park_factors.csv', index=False)

In [None]:
# Scratch copy of the club -> ballpark mapping: this cell has no execution count
# and the expression is not assigned to anything. It differs from the mapping
# inside get_stats_clipboard_kbo: the key is 'KIA Tigers' here (vs 'Kia Tigers')
# and 'Kiwoom Heroes' is absent.
{'LG Twins':'mokdong_baseball_stadium','Doosan Bears':'mokdong_baseball_stadium','Lotte Giants':'sajik_baseball_stadium',
 'Samsung Lions':'daegu_samsung_lions_park', 'SSG Landers':'incheon_ssg_landers_field', 'NC Dinos':'changwon_nc_park',
 'KIA Tigers':'kia_champions_field', 'KT Wiz':'suwon_kt_wiz_park','Hanwha Eagles':'cheongju_baseball_stadium'}

# MLB

In [51]:
# Parses whatever table is currently on the clipboard; later cells expect the
# columns 'Venue', 'Park Factor' and 'PA' to be present.
mlb = pd.read_clipboard()
mlb

Unnamed: 0,Rk.,Team,Venue,Year,Park Factor,wOBACon,xwOBACon,BACON,xBACON,HardHit,R,OBP,H,1B,2B,3B,HR,BB,SO,PA
0,1,Rockies,Coors Field,2022-2024,112,111,102,111,103,103,125,110,116,115,118,198,109,99,89,56537
1,2,Red Sox,Fenway Park,2022-2024,107,107,102,108,101,103,114,106,109,107,123,118,98,96,97,57350
2,3,Reds,Great American Ball Park,2022-2024,105,106,99,103,99,96,110,103,102,98,100,79,128,105,102,56113
3,4,Royals,Kauffman Stadium,2022-2024,104,100,103,102,102,105,108,105,106,107,114,168,85,102,87,56788
4,5,Twins,Target Field,2022-2024,102,104,102,103,100,101,104,102,101,98,107,90,105,105,105,56874
5,6,D-backs,Chase Field,2022-2024,101,100,99,101,100,101,102,102,104,104,113,168,86,99,94,57491
6,7,Pirates,PNC Park,2022-2024,101,99,101,101,101,103,102,103,102,104,110,80,84,102,95,56717
7,8,Marlins,loanDepot park,2022-2024,101,102,101,102,101,100,102,102,103,104,109,117,92,96,100,56832
8,9,Phillies,Citizens Bank Park,2022-2024,101,103,100,101,99,100,102,99,100,98,98,107,115,96,102,59030
9,10,Rangers,Globe Life Field,2022-2024,101,101,101,99,100,102,102,100,100,98,96,90,115,103,100,57150


In [78]:
# Turn venue names into snake_case keys (spaces and hyphens become underscores),
# then patch the few keys that are spelled differently.
mlb['ballpark'] = (
    mlb['Venue']
    .map(lambda venue: venue.lower().translate(str.maketrans(' -', '__')))
    .replace({
        'great_american_ball_park': 'great_american_ballpark',
        'angel_stadium': 'angels_stadium',
        'oriole_park_at_camden_yards': 'camden_yards',
    })
)

In [79]:
mlb

Unnamed: 0,Rk.,Team,Venue,Year,Park Factor,wOBACon,xwOBACon,BACON,xBACON,HardHit,...,OBP,H,1B,2B,3B,HR,BB,SO,PA,ballpark
0,1,Rockies,Coors Field,2022-2024,112,111,102,111,103,103,...,110,116,115,118,198,109,99,89,56537,coors_field
1,2,Red Sox,Fenway Park,2022-2024,107,107,102,108,101,103,...,106,109,107,123,118,98,96,97,57350,fenway_park
2,3,Reds,Great American Ball Park,2022-2024,105,106,99,103,99,96,...,103,102,98,100,79,128,105,102,56113,great_american_ballpark
3,4,Royals,Kauffman Stadium,2022-2024,104,100,103,102,102,105,...,105,106,107,114,168,85,102,87,56788,kauffman_stadium
4,5,Twins,Target Field,2022-2024,102,104,102,103,100,101,...,102,101,98,107,90,105,105,105,56874,target_field
5,6,D-backs,Chase Field,2022-2024,101,100,99,101,100,101,...,102,104,104,113,168,86,99,94,57491,chase_field
6,7,Pirates,PNC Park,2022-2024,101,99,101,101,101,103,...,103,102,104,110,80,84,102,95,56717,pnc_park
7,8,Marlins,loanDepot park,2022-2024,101,102,101,102,101,100,...,102,103,104,109,117,92,96,100,56832,loandepot_park
8,9,Phillies,Citizens Bank Park,2022-2024,101,103,100,101,99,100,...,99,100,98,98,107,115,96,102,59030,citizens_bank_park
9,10,Rangers,Globe Life Field,2022-2024,101,101,101,99,100,102,...,100,100,98,96,90,115,103,100,57150,globe_life_field


In [76]:
# List ballpark keys that have no entry in the metadata.
for park in mlb['ballpark'].values.tolist():
    if park in metadata:
        continue
    print(park)

chase_field
loandepot_park
globe_life_field
rogers_centre
minute_maid_park
tropicana_field
tmobile_park


In [87]:
mlb['Park Factor']

0     112
1     107
2     105
3     104
4     102
5     101
6     101
7     101
8     101
9     101
10    101
11    100
12    100
13    100
14    100
15    100
16    100
17    100
18     99
19     99
20     98
21     97
22     97
23     97
24     97
25     97
26     97
27     96
28     96
29     91
Name: Park Factor, dtype: int64

In [90]:
# NOTE(review): 'Park Factor' is on a 100-based scale here, whereas the AAA,
# NPB and KBO factors computed in this notebook are ratios around 1.0.
# The value is copied unchanged; confirm how downstream consumers expect it.
mlb['park_factor'] = mlb['Park Factor']
# Cast to str first: if the clipboard parser already read PA as integers, the
# .str accessor would raise AttributeError. Thousands separators are removed
# literally (regex=False) before the numeric conversion.
mlb['pa'] = pd.to_numeric(mlb['PA'].astype(str).str.replace(',', '', regex=False))

In [92]:
# MLB carries plate appearances ('pa') instead of home games as its sample size.
mlb.to_csv('mlb_park_factors.csv', columns=['ballpark', 'park_factor', 'pa'], index=False)

# Combine all leagues

In [119]:
# Stack the per-league files and tag every row with its league code.
# Note: this reads 'npb_park_factors.csv', while the NPB section of this
# notebook writes 'npb_park_factors_raw.csv'.
combined = pd.concat(
    [
        pd.read_csv(f'{league}_park_factors.csv').assign(lg=league)
        for league in ['aaa', 'kbo', 'npb', 'mlb']
    ]
)

In [94]:
combined

Unnamed: 0,ballpark,park_factor,g_home,lg,pa
0,las_vegas_ballpark,1.105246,75.0,aaa,
1,greater_nevada_field,1.266567,74.0,aaa,
2,cheney_stadium,1.052443,75.0,aaa,
3,rio_grande_credit_union_field_at_isotopes_park,1.293567,75.0,aaa,
4,southwest_university_park,1.221795,75.0,aaa,
...,...,...,...,...,...
25,oracle_park,97.000000,,mlb,55227.0
26,citi_field,97.000000,,mlb,58441.0
27,tropicana_field,96.000000,,mlb,55802.0
28,petco_park,96.000000,,mlb,58146.0


In [120]:
# Keep only ballparks that have an entry in the metadata.
combined = combined[combined['ballpark'].map(lambda park: park in metadata)]

In [98]:
combined

Unnamed: 0,ballpark,park_factor,g_home,lg,pa
0,las_vegas_ballpark,1.105246,75.0,aaa,
1,greater_nevada_field,1.266567,74.0,aaa,
2,cheney_stadium,1.052443,75.0,aaa,
3,rio_grande_credit_union_field_at_isotopes_park,1.293567,75.0,aaa,
4,southwest_university_park,1.221795,75.0,aaa,
...,...,...,...,...,...
23,wrigley_field,97.000000,,mlb,55217.0
24,american_family_field,97.000000,,mlb,57190.0
25,oracle_park,97.000000,,mlb,55227.0
26,citi_field,97.000000,,mlb,58441.0


In [115]:

obstructions = pd.read_csv('../data_collection/obstructed_fields.txt')
# The rename expects a leading space in the ' obstruction_type' header;
# 'name' becomes the shared 'ballpark' key.
obstructions = obstructions.rename(
    columns={'name': 'ballpark', ' obstruction_type': 'obstruction_type'}
)

In [116]:
# Remove leading/trailing whitespace from the obstruction labels.
obstructions['obstruction_type'] = obstructions['obstruction_type'].str.strip()

In [121]:
# Left join: every park is kept, and parks without an obstruction entry get NaN.
combined = combined.merge(obstructions, on='ballpark', how='left')

In [122]:
combined

Unnamed: 0,ballpark,park_factor,g_home,lg,pa,obstruction_type
0,las_vegas_ballpark,1.105246,75.0,aaa,,
1,greater_nevada_field,1.266567,74.0,aaa,,
2,cheney_stadium,1.052443,75.0,aaa,,
3,rio_grande_credit_union_field_at_isotopes_park,1.293567,75.0,aaa,,
4,southwest_university_park,1.221795,75.0,aaa,,structural_shadow
...,...,...,...,...,...,...
64,wrigley_field,97.000000,,mlb,55217.0,
65,american_family_field,97.000000,,mlb,57190.0,structural
66,oracle_park,97.000000,,mlb,55227.0,
67,citi_field,97.000000,,mlb,58441.0,


In [111]:
# Write the combined table without the index.
# NOTE(review): this cell's execution count (111) predates the filter and merge
# cells above it, so confirm the saved file reflects the current `combined`.
combined.to_csv('all_park_factors.csv', index=False)

In [123]:
# Number of parks with no obstruction entry.
combined['obstruction_type'].isna().sum()

53

: 