In [1]:
import pandas as pd
import pickle
import datetime as dt
import re

In [2]:
todays_game_dfs = pd.read_html('https://www.baseball-reference.com/previews/')

In [3]:
postponed_list = []
incomplete_list = []

for i in range(len(todays_game_dfs)):
    if len(todays_game_dfs[i].columns) == 3:
        if todays_game_dfs[i].iloc[0,2] == 'Postponed':
            postponed_list.append(i)
for i in postponed_list:
    todays_game_dfs.pop(i)

for i in range(len(todays_game_dfs)):
    if len(todays_game_dfs[i]) == 1:
        incomplete_list.append(i)
for i in incomplete_list:
    todays_game_dfs.pop(i)

In [4]:
game_dfs_clean = []

for i in range(1,len(todays_game_dfs),2):
    
    # Create a df for the specific game
    game_df = todays_game_dfs[i]
    
    # Rename columns
    game_df = game_df.rename(columns={0:'Team',1:'Pitcher'})
    
    #Add the time from previous df
    time_string = todays_game_dfs[i-1].iloc[1,2]
#     print("time: "+str(type(time_string)))
    date_string = dt.date.today().strftime("%Y-%m-%d")
#     print("date: "+str(type(date_string)))
    datetime_string = date_string+" "+time_string+' EST'
    game_df["Datetime"] = datetime_string

    for i in range(len(game_df)):
        # Create 'Next Opp Arm_R' column
        pitcher = game_df.loc[i,'Pitcher']
        throws = re.findall("[LR]HP", pitcher)[0]
        if throws == 'RHP':
            throws_binary = 1
        else:
            throws_binary = 0

        game_df.loc[i,'Next Opp Arm_R'] = throws_binary

        # Create 'Next H/A_H' 
        if i == 0:
            game_df.loc[i,'Next H/A_H'] = 0
        elif i == 1:
            game_df.loc[i,'Next H/A_H'] = 1

    game_df=game_df.drop(columns='Pitcher')

    game_dfs_clean.append(game_df)

game_dfs_clean

[  Team               Datetime  Next Opp Arm_R  Next H/A_H
 0  SEA  2023-08-17 2:10PM EST             1.0         0.0
 1  KCR  2023-08-17 2:10PM EST             0.0         1.0,
   Team               Datetime  Next Opp Arm_R  Next H/A_H
 0  BOS  2023-08-17 4:05PM EST             0.0         0.0
 1  WSN  2023-08-17 4:05PM EST             0.0         1.0,
   Team               Datetime  Next Opp Arm_R  Next H/A_H
 0  NYM  2023-08-17 7:15PM EST             0.0         0.0
 1  STL  2023-08-17 7:15PM EST             1.0         1.0,
   Team               Datetime  Next Opp Arm_R  Next H/A_H
 0  ARI  2023-08-17 9:40PM EST             1.0         0.0
 1  SDP  2023-08-17 9:40PM EST             0.0         1.0,
   Team                Datetime  Next Opp Arm_R  Next H/A_H
 0  MIL  2023-08-17 10:10PM EST             1.0         0.0
 1  LAD  2023-08-17 10:10PM EST             1.0         1.0]

In [5]:
# loop through each df and add remaing features to the df
mlb_teams = list(pd.read_csv('Home Run Classification/mlb_teams.csv')['Abbreviation'])

for df in game_dfs_clean:

    for i in range(len(df)):
        url_1 = "https://www.baseball-reference.com/teams/tgl.cgi?team="
        url_b = "&t=b&year=2023"
        url = f"{url_1}{df.loc[i,'Team']}{url_b}"
        team_df = pd.read_html(url)[0]
        team_batting = team_df.iloc[-1:]

        # Extract team batting data
        batting_cols = ['HR','BA','OBP','SLG','OPS']
        team_batting_features = team_batting[batting_cols].reset_index(drop=True)

        df.loc[i,'HRs Hit'] = team_batting_features.loc[0,'HR']
        df.loc[i,'BA'] = team_batting_features.loc[0,'BA']
        df.loc[i,'OBP'] = team_batting_features.loc[0,'OBP']
        df.loc[i,'SLG'] = team_batting_features.loc[0,'SLG']
        df.loc[i,'OPS'] = team_batting_features.loc[0,'OPS']


        # PITCHING
        url_p = "&t=p&year=2023"

        if i == 0:
            opp_team = df.loc[1,'Team']
            df.loc[i,'Venue'] = opp_team
        if i == 1:
            opp_team = df.loc[0,'Team']

        url = f"{url_1}{opp_team}{url_p}"
        opp_team_df = pd.read_html(url)[1]
        opp_team_pitching = opp_team_df.iloc[-1:]

        pitching_cols = ['ERA','Pitchers Used (Rest-GameScore-Dec)']
        opp_team_pitching_features = opp_team_pitching[pitching_cols].reset_index(drop=True)
        #print(len(opp_team_pitching_features.loc[0,'Pitchers Used (Rest-GameScore-Dec)'].split(',')))

        df.loc[i, 'Opp ERA'] = opp_team_pitching_features.loc[0,'ERA']
        df.loc[i, 'Num Pitchers Used'] = len(opp_team_pitching_features.loc[0,'Pitchers Used (Rest-GameScore-Dec)'].split(','))

    
    df['Next Opp Arm_R']=df['Next Opp Arm_R'].astype(int)
    df['Next H/A_H']=df['Next H/A_H'].astype(int)
    df['Num Pitchers Used']=df['Num Pitchers Used'].astype(int)
    df['HRs Hit']=df['HRs Hit'].astype(int)
    df['BA']=df['BA'].astype(float)
    df['OBP']=df['OBP'].astype(float)
    df['SLG']=df['SLG'].astype(float)
    df['OPS']=df['OPS'].astype(float)
    df['Opp ERA']=df['Opp ERA'].astype(float)
    
    for team in mlb_teams:
        col_name = "Next Venue_"+team
        df[col_name] = 0
    home_team = df.loc[1,'Team']
    venue = "Next Venue_"+home_team
    df[venue] = 1
    

In [6]:
for game in game_dfs_clean:
    print(game.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 42 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Team               2 non-null      object 
 1   Datetime           2 non-null      object 
 2   Next Opp Arm_R     2 non-null      int64  
 3   Next H/A_H         2 non-null      int64  
 4   HRs Hit            2 non-null      int64  
 5   BA                 2 non-null      float64
 6   OBP                2 non-null      float64
 7   SLG                2 non-null      float64
 8   OPS                2 non-null      float64
 9   Venue              1 non-null      object 
 10  Opp ERA            2 non-null      float64
 11  Num Pitchers Used  2 non-null      int64  
 12  Next Venue_ARI     2 non-null      int64  
 13  Next Venue_ATL     2 non-null      int64  
 14  Next Venue_BAL     2 non-null      int64  
 15  Next Venue_BOS     2 non-null      int64  
 16  Next Venue_CHC     2 non-null 

In [7]:
# bring in pickle model 
model = pickle.load(open('Home Run Classification/model-all.pkl', 'rb'))

In [11]:
features = "HRs Hit,BA,OBP,SLG,OPS,Opp ERA,Num Pitchers Used,Next H/A_H,Next Venue_ARI,Next Venue_ATL,Next Venue_BAL,Next Venue_BOS,Next Venue_CHC,Next Venue_CHW,Next Venue_CIN,Next Venue_CLE,Next Venue_COL,Next Venue_DET,Next Venue_HOU,Next Venue_KCR,Next Venue_LAA,Next Venue_LAD,Next Venue_MIA,Next Venue_MIL,Next Venue_MIN,Next Venue_NYM,Next Venue_NYY,Next Venue_OAK,Next Venue_PHI,Next Venue_PIT,Next Venue_SDP,Next Venue_SEA,Next Venue_SFG,Next Venue_STL,Next Venue_TBR,Next Venue_TEX,Next Venue_TOR,Next Venue_WSN,Next Opp Arm_R"
features_order = features.split(',')
features_order

['HRs Hit',
 'BA',
 'OBP',
 'SLG',
 'OPS',
 'Opp ERA',
 'Num Pitchers Used',
 'Next H/A_H',
 'Next Venue_ARI',
 'Next Venue_ATL',
 'Next Venue_BAL',
 'Next Venue_BOS',
 'Next Venue_CHC',
 'Next Venue_CHW',
 'Next Venue_CIN',
 'Next Venue_CLE',
 'Next Venue_COL',
 'Next Venue_DET',
 'Next Venue_HOU',
 'Next Venue_KCR',
 'Next Venue_LAA',
 'Next Venue_LAD',
 'Next Venue_MIA',
 'Next Venue_MIL',
 'Next Venue_MIN',
 'Next Venue_NYM',
 'Next Venue_NYY',
 'Next Venue_OAK',
 'Next Venue_PHI',
 'Next Venue_PIT',
 'Next Venue_SDP',
 'Next Venue_SEA',
 'Next Venue_SFG',
 'Next Venue_STL',
 'Next Venue_TBR',
 'Next Venue_TEX',
 'Next Venue_TOR',
 'Next Venue_WSN',
 'Next Opp Arm_R']

In [15]:
for game in game_dfs_clean:
    X = game[features_order]
    game_prediction = model.predict(X)
    
    print(f"{game['Team'][0]} @ {game['Team'][1]} ({game['Datetime'][0]})")
    print(f"{game['Team'][0]} HR prediction: {game_prediction[0]}")
    print(f"{game['Team'][1]} HR prediction: {game_prediction[1]}")
    print('-'*35)
    

SEA @ KCR (2023-08-17 2:10PM EST)
SEA HR prediction: 1
KCR HR prediction: 1
-----------------------------------
BOS @ WSN (2023-08-17 4:05PM EST)
BOS HR prediction: 1
WSN HR prediction: 1
-----------------------------------
NYM @ STL (2023-08-17 7:15PM EST)
NYM HR prediction: 1
STL HR prediction: 1
-----------------------------------
ARI @ SDP (2023-08-17 9:40PM EST)
ARI HR prediction: 1
SDP HR prediction: 1
-----------------------------------
MIL @ LAD (2023-08-17 10:10PM EST)
MIL HR prediction: 1
LAD HR prediction: 1
-----------------------------------


In [24]:
hr_value_counts_2022 = pd.read_csv('Home Run Classification/mlb_2022_training_data.csv')['Target HR'].value_counts()
print('Percentage of games with HR in 2022')
print(hr_value_counts_2022/hr_value_counts_2022.sum()*100)

Percentage of games with HR in 2022
1    63.00207
0    36.99793
Name: Target HR, dtype: float64


In [27]:
model.coef_

array([[ 0.03287128, -0.30444638,  0.09170976,  0.35147688,  0.4696914 ,
         0.01907701, -0.00347851,  0.04798156,  0.0262023 ,  0.22497264,
        -0.35558995, -0.1047708 ,  0.11185947,  0.04466369,  0.266508  ,
        -0.30112219,  0.25663662, -0.535246  ,  0.12927645, -0.3493866 ,
        -0.0806111 ,  0.2551981 , -0.15168863,  0.56070311, -0.41439091,
         0.0430237 ,  0.31715336, -0.33396458,  0.11077827, -0.16043377,
        -0.04468372,  0.15139082, -0.38653844, -0.07213044, -0.03393734,
         0.4086336 ,  0.09327076,  0.32415084,  0.11847343]])