In [195]:
import pandas as pd
from time import sleep

In [196]:
# Read the team lookup CSV and keep its "Abbreviation" column as a plain list
# (these abbreviations are the ones Baseball-Reference uses in its URLs).
team_abr = pd.read_csv("mlb_teams.csv")["Abbreviation"].tolist()

# 2022 Batting

In [197]:
# Scrape each team's 2022 batting game log from Baseball-Reference and add the
# look-ahead columns ('Target HR', 'Next Throw') describing the team's next game.
url_1 = "https://www.baseball-reference.com/teams/tgl.cgi?team="
url_2 = "&t=b&year=2022"
batting_dfs = []

for team in team_abr:
    print(team)
    url = f"{url_1}{team}{url_2}"
    # The batting game log is taken from the first table parsed from the page
    team_df = pd.read_html(url)[0]

    # Keep only rows whose 'Gtm' is numeric. Non-numeric rows are presumably
    # repeated header/separator rows embedded in the HTML table (TODO confirm).
    # Coercing explicitly replaces a bare `except:` that silently swallowed
    # every exception, including KeyboardInterrupt.
    gtm_numeric = pd.to_numeric(team_df["Gtm"], errors="coerce")
    team_df = team_df.loc[gtm_numeric.notna()].reset_index(drop=True)

    # Update data types
    team_df['Gtm'] = team_df['Gtm'].astype(int)
    team_df['HR'] = team_df['HR'].astype(int)

    # Tag every row with the team it belongs to
    team_df['Team'] = team

    # 'Target HR' is 1.0 when the team hit at least one HR in its *next* game,
    # 0.0 otherwise; the last game has no next game and stays NaN.
    next_hr = team_df['HR'].shift(-1)
    team_df['Target HR'] = next_hr.gt(0).astype(float).where(next_hr.notna())

    # 'Next Throw' is the 'Thr' value of the next game (NaN for the last game)
    team_df['Next Throw'] = team_df['Thr'].shift(-1)

    # Append team_df to df list
    batting_dfs.append(team_df)
    # Pause between requests so the site is not hit in rapid succession
    sleep(5)

print(len(batting_dfs))

ARI
ATL
BAL
BOS
CHC
CHW
CIN
CLE
COL
DET
MIA
HOU
KCR
LAA
LAD
MIL
MIN
NYM
NYY
OAK
PHI
PIT
SDP
SFG
SEA
STL
TBR
TEX
TOR
WSN
30


In [223]:
# Stack the per-team batting logs into a single frame with a fresh 0..n-1 index
batting_2022_df = pd.concat(batting_dfs, ignore_index=True)
batting_2022_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4860 entries, 0 to 4859
Data columns (total 35 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Rk                    4860 non-null   object 
 1   Gtm                   4860 non-null   int64  
 2   Date                  4860 non-null   object 
 3   Unnamed: 3            2430 non-null   object 
 4   Opp                   4860 non-null   object 
 5   Rslt                  4860 non-null   object 
 6   PA                    4860 non-null   object 
 7   AB                    4860 non-null   object 
 8   R                     4860 non-null   object 
 9   H                     4860 non-null   object 
 10  2B                    4860 non-null   object 
 11  3B                    4860 non-null   object 
 12  HR                    4860 non-null   int64  
 13  RBI                   4860 non-null   object 
 14  BB                    4860 non-null   object 
 15  IBB                  

In [224]:
# Tidy the combined batting log:
#   - remove the 'Rk' column
#   - rename the unnamed column to 'Home/Away' and fill its missing values with "H"
#   - make sure 'Gtm' is an integer column
batting_2022_df = (
    batting_2022_df
    .drop(columns="Rk")
    .rename(columns={"Unnamed: 3": "Home/Away"})
    .fillna({"Home/Away": "H"})
    .astype({"Gtm": int})
)

batting_2022_df.head()

Unnamed: 0,Gtm,Date,Home/Away,Opp,Rslt,PA,AB,R,H,2B,...,OBP,SLG,OPS,LOB,#,Thr,Opp. Starter (GmeSc),Team,Target HR,Next Throw
0,1,Apr 7,H,SDP,"W,4-2",34,26,4,3,0,...,0.324,0.231,0.554,6,12,R,Y.Darvish(71),ARI,0.0,L
1,2,Apr 8,H,SDP,"L,0-3",30,29,0,2,0,...,0.219,0.145,0.364,3,11,L,S.Manaea(83),ARI,1.0,R
2,3,Apr 9,H,SDP,"L,2-5",33,31,2,5,2,...,0.216,0.209,0.426,4,10,R,J.Musgrove(62),ARI,1.0,R
3,4,Apr 10,H,SDP,"L,5-10",39,30,5,5,1,...,0.257,0.259,0.516,7,10,R,N.Crismatt(58),ARI,1.0,R
4,5,Apr 12,H,HOU,"L,1-2",36,30,1,4,1,...,0.257,0.26,0.518,8,13,R,L.Garcia(57),ARI,0.0,L


In [225]:
# Create 'Next' columns
# Look one game ahead *within each team* (rows are assumed to be in game order
# per team, as produced by concatenating the per-team logs). Grouping by 'Team'
# means a team's final game gets NaN instead of reading the next team's first
# row, and no hard-coded "game 162" sentinel is needed.
next_game = batting_2022_df.groupby('Team', sort=False)[['Home/Away', 'Opp']].shift(-1)

# Home/away flag of the next game
batting_2022_df['Next H/A'] = next_game['Home/Away']

# Venue of the next game: the team itself when the next game is at home ("H"),
# otherwise the next opponent. Stays NaN when there is no next game.
batting_2022_df['Next Venue'] = next_game['Opp'].where(
    next_game['Home/Away'].ne('H'), batting_2022_df['Team']
)

batting_2022_df.head(20)

Unnamed: 0,Gtm,Date,Home/Away,Opp,Rslt,PA,AB,R,H,2B,...,OPS,LOB,#,Thr,Opp. Starter (GmeSc),Team,Target HR,Next Throw,Next H/A,Next Venue
0,1,Apr 7,H,SDP,"W,4-2",34,26,4,3,0,...,0.554,6,12,R,Y.Darvish(71),ARI,0.0,L,H,ARI
1,2,Apr 8,H,SDP,"L,0-3",30,29,0,2,0,...,0.364,3,11,L,S.Manaea(83),ARI,1.0,R,H,ARI
2,3,Apr 9,H,SDP,"L,2-5",33,31,2,5,2,...,0.426,4,10,R,J.Musgrove(62),ARI,1.0,R,H,ARI
3,4,Apr 10,H,SDP,"L,5-10",39,30,5,5,1,...,0.516,7,10,R,N.Crismatt(58),ARI,1.0,R,H,ARI
4,5,Apr 12,H,HOU,"L,1-2",36,30,1,4,1,...,0.518,8,13,R,L.Garcia(57),ARI,0.0,L,H,ARI
5,6,Apr 13,H,HOU,"W,3-2",47,33,3,6,0,...,0.532,17,13,L,F.Valdez(49),ARI,1.0,R,@,NYM
6,7,Apr 15,@,NYM,"L,3-10",35,29,3,3,2,...,0.532,5,12,R,C.Bassitt(68),ARI,1.0,R,@,NYM
7,8,Apr 16,@,NYM,"W,3-2",40,36,3,9,2,...,0.558,10,10,R,C.Carrasco(67),ARI,0.0,L,@,NYM
8,9,Apr 17,@,NYM,"L,0-5",36,32,0,5,2,...,0.548,9,11,L,D.Peterson(59),ARI,1.0,R,@,WSN
9,10,Apr 19 (1),@,WSN,"L,1-6",34,31,1,6,2,...,0.555,6,11,R,J.Gray(64),ARI,0.0,R,@,WSN


In [226]:
# Cast the rate statistics to float in a single astype call
batting_floats = ['BA','OBP','SLG','OPS']
batting_2022_df = batting_2022_df.astype({stat: float for stat in batting_floats})

batting_2022_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4860 entries, 0 to 4859
Data columns (total 36 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Gtm                   4860 non-null   int64  
 1   Date                  4860 non-null   object 
 2   Home/Away             4860 non-null   object 
 3   Opp                   4860 non-null   object 
 4   Rslt                  4860 non-null   object 
 5   PA                    4860 non-null   object 
 6   AB                    4860 non-null   object 
 7   R                     4860 non-null   object 
 8   H                     4860 non-null   object 
 9   2B                    4860 non-null   object 
 10  3B                    4860 non-null   object 
 11  HR                    4860 non-null   int64  
 12  RBI                   4860 non-null   object 
 13  BB                    4860 non-null   object 
 14  IBB                   4860 non-null   object 
 15  SO                   

# 2022 Pitching

In [202]:
# Scrape each team's 2022 pitching game log from Baseball-Reference.
url_1 = "https://www.baseball-reference.com/teams/tgl.cgi?team="
url_2 = "&t=p&year=2022"
pitching_dfs = []

for team in team_abr:
    print(team)
    url = f"{url_1}{team}{url_2}"
    # The pitching game log is taken from the second table parsed from the page
    team_df = pd.read_html(url)[1]

    # Keep only rows whose 'Gtm' is numeric. Non-numeric rows are presumably
    # repeated header/separator rows embedded in the HTML table (TODO confirm).
    # Coercing explicitly replaces a bare `except:` that silently swallowed
    # every exception, including KeyboardInterrupt.
    gtm_numeric = pd.to_numeric(team_df["Gtm"], errors="coerce")
    team_df = team_df.loc[gtm_numeric.notna()].reset_index(drop=True)

    # Update data types
    team_df['Gtm'] = team_df['Gtm'].astype(int)

    # Tag every row with the team it belongs to
    team_df['Team'] = team

    # Append team_df to df list
    pitching_dfs.append(team_df)
    # Pause between requests so the site is not hit in rapid succession
    sleep(5)

print(len(pitching_dfs))

ARI
ATL
BAL
BOS
CHC
CHW
CIN
CLE
COL
DET
MIA
HOU
KCR
LAA
LAD
MIL
MIN
NYM
NYY
OAK
PHI
PIT
SDP
SFG
SEA
STL
TBR
TEX
TOR
WSN
30


In [227]:
# Stack the per-team pitching logs into a single frame with a fresh 0..n-1 index
pitching_2022_df = pd.concat(pitching_dfs, ignore_index=True)
pitching_2022_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4860 entries, 0 to 4859
Data columns (total 35 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Rk                                  4860 non-null   object
 1   Gtm                                 4860 non-null   int64 
 2   Date                                4860 non-null   object
 3   Unnamed: 3                          2430 non-null   object
 4   Opp                                 4860 non-null   object
 5   Rslt                                4860 non-null   object
 6   IP                                  4860 non-null   object
 7   H                                   4860 non-null   object
 8   R                                   4860 non-null   object
 9   ER                                  4860 non-null   object
 10  UER                                 4860 non-null   object
 11  BB                                  4860 non-null   obje

In [228]:
# Tidy the combined pitching log:
#   - remove the 'Rk' column
#   - rename the unnamed column to 'Home/Away' and fill its missing values with "H"
#   - cast 'ERA' to float
pitching_2022_df = (
    pitching_2022_df
    .drop(columns="Rk")
    .rename(columns={"Unnamed: 3": "Home/Away"})
    .fillna({"Home/Away": "H"})
    .astype({"ERA": float})
)
pitching_2022_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4860 entries, 0 to 4859
Data columns (total 34 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Gtm                                 4860 non-null   int64  
 1   Date                                4860 non-null   object 
 2   Home/Away                           4860 non-null   object 
 3   Opp                                 4860 non-null   object 
 4   Rslt                                4860 non-null   object 
 5   IP                                  4860 non-null   object 
 6   H                                   4860 non-null   object 
 7   R                                   4860 non-null   object 
 8   ER                                  4860 non-null   object 
 9   UER                                 4860 non-null   object 
 10  BB                                  4860 non-null   object 
 11  SO                                  4860 no

In [229]:
# Number of pitchers used
# Split the comma-separated pitcher string into a list per game, then count the
# list entries with the vectorised .str.len() instead of a row-by-row .loc loop.
# A missing value yields NaN rather than raising TypeError. The count is kept
# as float to match the dtype the row-wise assignment used to produce.
pitchers_col = 'Pitchers Used (Rest-GameScore-Dec)'
pitching_2022_df[pitchers_col] = pitching_2022_df[pitchers_col].str.split(',')
pitching_2022_df['Num Pitchers Used'] = pitching_2022_df[pitchers_col].str.len().astype(float)

# Merge DFs

In [236]:
# Preview (without assigning) the removal of the space before a "(n)" suffix in
# 'Date', e.g. "Oct 1 (2)" -> "Oct 1(2)". The pattern is a raw string so that
# "\(" reaches the regex engine instead of being an invalid string escape.
pitching_2022_df['Date'].str.replace(r' \(', '(', regex=True)

0          Apr 7
1          Apr 8
2          Apr 9
3         Apr 10
4         Apr 12
          ...   
4855    Oct 1(2)
4856       Oct 2
4857    Oct 4(1)
4858    Oct 4(2)
4859       Oct 5
Name: Date, Length: 4860, dtype: object

In [233]:
# Remove hidden character to allow merge
# Apply the same 'Date' clean-up to both frames so the merge keys match:
#   1. replace the non-breaking space '\xa0' with a regular space
#      (regex=False is explicit because the default differs across pandas versions)
#   2. drop the space before a "(n)" suffix, e.g. "Apr 19 (1)" -> "Apr 19(1)"
#      (raw string so "\(" is a regex escape, not an invalid string escape)
for frame in (batting_2022_df, pitching_2022_df):
    frame['Date'] = (
        frame['Date']
        .str.replace('\xa0', ' ', regex=False)
        .str.replace(r' \(', '(', regex=True)
    )

In [234]:
# Left-join the pitching log onto the batting log: each batting row is matched
# to the pitching row with the same 'Date' whose 'Team' equals the batting
# row's 'Opp'. Overlapping column names get '_b' (batting) / '_p' (pitching).
combine_2022_df = batting_2022_df.merge(
    pitching_2022_df,
    how='left',
    left_on=['Date', 'Opp'],
    right_on=['Date', 'Team'],
    suffixes=['_b', '_p'],
)

combine_2022_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4860 entries, 0 to 4859
Data columns (total 70 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Gtm_b                               4860 non-null   int64  
 1   Date                                4860 non-null   object 
 2   Home/Away_b                         4860 non-null   object 
 3   Opp_b                               4860 non-null   object 
 4   Rslt_b                              4860 non-null   object 
 5   PA                                  4860 non-null   object 
 6   AB_b                                4860 non-null   object 
 7   R_b                                 4860 non-null   object 
 8   H_b                                 4860 non-null   object 
 9   2B_b                                4860 non-null   object 
 10  3B_b                                4860 non-null   object 
 11  HR_b                                4860 no

In [239]:
# Build the final features frame: pick the columns of interest from the merged
# data, then give the suffixed/abbreviated ones readable names.
features = [
    'Gtm_b', 'Date', 'Team_b', 'Next H/A', 'Opp_b', 'Next Venue', 'HR_b',
    'BA', 'OBP', 'SLG', 'OPS', 'Next Throw', 'ERA', 'Num Pitchers Used',
    'Target HR',
]
features_df = combine_2022_df[features].rename(
    columns={
        'Gtm_b': 'Team Game Number',
        'Team_b': 'Team',
        'Opp_b': 'Opp',
        'HR_b': 'HRs Hit',
        'Next Throw': 'Next Opp Arm',
        'ERA': 'Opp ERA',
    }
)
features_df.head()

Unnamed: 0,Team Game Number,Date,Team,Next H/A,Opp,Next Venue,HRs Hit,BA,OBP,SLG,OPS,Next Opp Arm,Opp ERA,Num Pitchers Used,Target HR
0,1,Apr 7,ARI,H,SDP,ARI,1,0.115,0.324,0.231,0.554,L,4.5,5.0,0.0
1,2,Apr 8,ARI,H,SDP,ARI,0,0.091,0.219,0.145,0.364,R,2.12,4.0,1.0
2,3,Apr 9,ARI,H,SDP,ARI,1,0.116,0.216,0.209,0.426,R,2.08,4.0,1.0
3,4,Apr 10,ARI,H,SDP,ARI,2,0.129,0.257,0.259,0.516,R,2.83,6.0,1.0
4,5,Apr 12,ARI,H,HOU,ARI,1,0.13,0.257,0.26,0.518,L,1.64,6.0,0.0


In [240]:
# Rows containing any NaN are removed; this is intended to discard each team's
# last game of the season, which has no following game and therefore no
# 'Target HR' to predict. The two count columns are then cast to int.
features_df = (
    features_df
    .dropna()
    .reset_index(drop=True)
    .astype({'Num Pitchers Used': int, 'Target HR': int})
)
features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4830 entries, 0 to 4829
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Team Game Number   4830 non-null   int64  
 1   Date               4830 non-null   object 
 2   Team               4830 non-null   object 
 3   Next H/A           4830 non-null   object 
 4   Opp                4830 non-null   object 
 5   Next Venue         4830 non-null   object 
 6   HRs Hit            4830 non-null   int64  
 7   BA                 4830 non-null   float64
 8   OBP                4830 non-null   float64
 9   SLG                4830 non-null   float64
 10  OPS                4830 non-null   float64
 11  Next Opp Arm       4830 non-null   object 
 12  Opp ERA            4830 non-null   float64
 13  Num Pitchers Used  4830 non-null   int64  
 14  Target HR          4830 non-null   int64  
dtypes: float64(5), int64(4), object(6)
memory usage: 566.1+ KB


In [243]:
# One-hot encode the three categorical next-game columns
dummies = pd.get_dummies(
    features_df.loc[:, ['Next H/A', 'Next Venue', 'Next Opp Arm']]
)
dummies.head()

Unnamed: 0,Next H/A_@,Next H/A_H,Next Venue_ARI,Next Venue_ATL,Next Venue_BAL,Next Venue_BOS,Next Venue_CHC,Next Venue_CHW,Next Venue_CIN,Next Venue_CLE,...,Next Venue_SDP,Next Venue_SEA,Next Venue_SFG,Next Venue_STL,Next Venue_TBR,Next Venue_TEX,Next Venue_TOR,Next Venue_WSN,Next Opp Arm_L,Next Opp Arm_R
0,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [244]:
# Put the one-hot columns next to the original feature columns (column-wise
# concatenation aligned on the shared index), then renumber the rows.
feature_dummies_df = pd.concat((features_df, dummies), axis='columns')
feature_dummies_df = feature_dummies_df.reset_index(drop=True)
feature_dummies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4830 entries, 0 to 4829
Data columns (total 49 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Team Game Number   4830 non-null   int64  
 1   Date               4830 non-null   object 
 2   Team               4830 non-null   object 
 3   Next H/A           4830 non-null   object 
 4   Opp                4830 non-null   object 
 5   Next Venue         4830 non-null   object 
 6   HRs Hit            4830 non-null   int64  
 7   BA                 4830 non-null   float64
 8   OBP                4830 non-null   float64
 9   SLG                4830 non-null   float64
 10  OPS                4830 non-null   float64
 11  Next Opp Arm       4830 non-null   object 
 12  Opp ERA            4830 non-null   float64
 13  Num Pitchers Used  4830 non-null   int64  
 14  Target HR          4830 non-null   int64  
 15  Next H/A_@         4830 non-null   uint8  
 16  Next H/A_H         4830 

In [245]:
# Columns removed before modelling: identifier/date columns, the raw
# categorical columns that were one-hot encoded, and one dummy from each
# two-level pair ('Next H/A_@', 'Next Opp Arm_L') - presumably because each is
# the complement of the dummy that is kept.
drop_cols = [
    'Team Game Number', 'Team', 'Date',
    'Next H/A', 'Next H/A_@',
    'Opp', 'Next Venue',
    'Next Opp Arm', 'Next Opp Arm_L',
]
cleaned_features = feature_dummies_df.drop(labels=drop_cols, axis='columns')
cleaned_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4830 entries, 0 to 4829
Data columns (total 40 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   HRs Hit            4830 non-null   int64  
 1   BA                 4830 non-null   float64
 2   OBP                4830 non-null   float64
 3   SLG                4830 non-null   float64
 4   OPS                4830 non-null   float64
 5   Opp ERA            4830 non-null   float64
 6   Num Pitchers Used  4830 non-null   int64  
 7   Target HR          4830 non-null   int64  
 8   Next H/A_H         4830 non-null   uint8  
 9   Next Venue_ARI     4830 non-null   uint8  
 10  Next Venue_ATL     4830 non-null   uint8  
 11  Next Venue_BAL     4830 non-null   uint8  
 12  Next Venue_BOS     4830 non-null   uint8  
 13  Next Venue_CHC     4830 non-null   uint8  
 14  Next Venue_CHW     4830 non-null   uint8  
 15  Next Venue_CIN     4830 non-null   uint8  
 16  Next Venue_CLE     4830 