In [3]:
import pandas as pd
from time import sleep

In [2]:
# Create a list of all mlb team abbreviations used by Baseball-Reference
team_abr = list(pd.read_csv("mlb_teams.csv")["Abbreviation"])

In [9]:
url_1 = "https://www.baseball-reference.com/teams/tgl.cgi?team="
url_2 = "&t=b&year=2022"
batting_dfs = []

for team in team_abr:
    print(team)
    url = f"{url_1}{team}{url_2}"
    team_df = pd.read_html(url)[0]
    
    for i in range(len(team_df)):
        # Convert 'Gtm' to int
        try:
            int(team_df.loc[i, "Gtm"])
        except:
            team_df = team_df.drop(i)

    # Update data types    
    team_df['Gtm'] = team_df['Gtm'].astype(int)
    team_df['HR'] = team_df['HR'].astype(int)

    # Reset index after rows dropped and add a 'Team column'
    team_df = team_df.reset_index(drop=True)
    team_df['Team'] = team
        
    for i in range(len(team_df)):
        
        # Add 'Target HR' column
        if i != len(team_df) - 1:
            next_i = i+1
            if int(team_df.loc[next_i, 'HR']) > 0:
                team_df.loc[i, 'Target HR'] = 1
            else:
                team_df.loc[i, 'Target HR'] = 0
            
            # add "Next Throw" column
            team_df.loc[i, "Next Throw"] = team_df.loc[next_i, "Thr"]
        
    
    # Append team_df to df list
    batting_dfs.append(team_df)
    sleep(5)

print(len(batting_dfs))

ARI
ATL
BAL
BOS
CHC
CHW
CIN
CLE
COL
DET
MIA
HOU
KCR
LAA
LAD
MIL
MIN
NYM
NYY
OAK
PHI
PIT
SDP
SFG
SEA
STL
TBR
TEX
TOR
WSN
30


In [10]:
# Combine all team_dfs into one

batting_2022_df = pd.concat(batting_dfs)
batting_2022_df = batting_2022_df.reset_index(drop=True)
batting_2022_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4860 entries, 0 to 4859
Data columns (total 35 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Rk                    4860 non-null   object 
 1   Gtm                   4860 non-null   int64  
 2   Date                  4860 non-null   object 
 3   Unnamed: 3            2430 non-null   object 
 4   Opp                   4860 non-null   object 
 5   Rslt                  4860 non-null   object 
 6   PA                    4860 non-null   object 
 7   AB                    4860 non-null   object 
 8   R                     4860 non-null   object 
 9   H                     4860 non-null   object 
 10  2B                    4860 non-null   object 
 11  3B                    4860 non-null   object 
 12  HR                    4860 non-null   int64  
 13  RBI                   4860 non-null   object 
 14  BB                    4860 non-null   object 
 15  IBB                  

In [11]:
# Drop 'Rk', create 'Home/Away'

batting_2022_df = batting_2022_df.drop(columns="Rk", axis=1)
batting_2022_df = batting_2022_df.rename(columns={"Unnamed: 3":"Home/Away"})
batting_2022_df['Home/Away'] = batting_2022_df['Home/Away'].fillna("H")
batting_2022_df.head()

Unnamed: 0,Gtm,Date,Home/Away,Opp,Rslt,PA,AB,R,H,2B,...,OBP,SLG,OPS,LOB,#,Thr,Opp. Starter (GmeSc),Team,Target HR,Next Throw
0,1,Apr 7,H,SDP,"W,4-2",34,26,4,3,0,...,0.324,0.231,0.554,6,12,R,Y.Darvish(71),ARI,0.0,L
1,2,Apr 8,H,SDP,"L,0-3",30,29,0,2,0,...,0.219,0.145,0.364,3,11,L,S.Manaea(83),ARI,1.0,R
2,3,Apr 9,H,SDP,"L,2-5",33,31,2,5,2,...,0.216,0.209,0.426,4,10,R,J.Musgrove(62),ARI,1.0,R
3,4,Apr 10,H,SDP,"L,5-10",39,30,5,5,1,...,0.257,0.259,0.516,7,10,R,N.Crismatt(58),ARI,1.0,R
4,5,Apr 12,H,HOU,"L,1-2",36,30,1,4,1,...,0.257,0.26,0.518,8,13,R,L.Garcia(57),ARI,0.0,L


In [13]:
# Create 'Venue' column
for i in range(len(batting_2022_df)):
    if batting_2022_df.loc[i, 'Home/Away'] == "H":
        batting_2022_df.loc[i, 'Venue'] = batting_2022_df.loc[i, 'Team']
    else:
        batting_2022_df.loc[i, 'Venue'] = batting_2022_df.loc[i, 'Opp']

batting_2022_df.head(20)

Unnamed: 0,Gtm,Date,Home/Away,Opp,Rslt,PA,AB,R,H,2B,...,SLG,OPS,LOB,#,Thr,Opp. Starter (GmeSc),Team,Target HR,Next Throw,Venue
0,1,Apr 7,H,SDP,"W,4-2",34,26,4,3,0,...,0.231,0.554,6,12,R,Y.Darvish(71),ARI,0.0,L,ARI
1,2,Apr 8,H,SDP,"L,0-3",30,29,0,2,0,...,0.145,0.364,3,11,L,S.Manaea(83),ARI,1.0,R,ARI
2,3,Apr 9,H,SDP,"L,2-5",33,31,2,5,2,...,0.209,0.426,4,10,R,J.Musgrove(62),ARI,1.0,R,ARI
3,4,Apr 10,H,SDP,"L,5-10",39,30,5,5,1,...,0.259,0.516,7,10,R,N.Crismatt(58),ARI,1.0,R,ARI
4,5,Apr 12,H,HOU,"L,1-2",36,30,1,4,1,...,0.26,0.518,8,13,R,L.Garcia(57),ARI,0.0,L,ARI
5,6,Apr 13,H,HOU,"W,3-2",47,33,3,6,0,...,0.246,0.532,17,13,L,F.Valdez(49),ARI,1.0,R,ARI
6,7,Apr 15,@,NYM,"L,3-10",35,29,3,3,2,...,0.25,0.532,5,12,R,C.Bassitt(68),ARI,1.0,R,NYM
7,8,Apr 16,@,NYM,"W,3-2",40,36,3,9,2,...,0.27,0.558,10,10,R,C.Carrasco(67),ARI,0.0,L,NYM
8,9,Apr 17,@,NYM,"L,0-5",36,32,0,5,2,...,0.264,0.548,9,11,L,D.Peterson(59),ARI,1.0,R,NYM
9,10,Apr 19 (1),@,WSN,"L,1-6",34,31,1,6,2,...,0.274,0.555,6,11,R,J.Gray(64),ARI,0.0,R,WSN


In [14]:
# New df with columns to use
feature_cols = ['Gtm', 'Date', 'Team', 'Opp', 'Home/Away', 'Venue', 'BA', 'OBP', 'SLG', 'OPS', 'Next Throw', 'Target HR']
batting_features_df = batting_2022_df[feature_cols]
batting_features_df.head()

Unnamed: 0,Gtm,Date,Team,Opp,Home/Away,Venue,BA,OBP,SLG,OPS,Next Throw,Target HR
0,1,Apr 7,ARI,SDP,H,ARI,0.115,0.324,0.231,0.554,L,0.0
1,2,Apr 8,ARI,SDP,H,ARI,0.091,0.219,0.145,0.364,R,1.0
2,3,Apr 9,ARI,SDP,H,ARI,0.116,0.216,0.209,0.426,R,1.0
3,4,Apr 10,ARI,SDP,H,ARI,0.129,0.257,0.259,0.516,R,1.0
4,5,Apr 12,ARI,HOU,H,ARI,0.13,0.257,0.26,0.518,L,0.0


In [16]:
# Drop NA to remove each teams last game since it has no target HR to predict
batting_features_df = batting_features_df.dropna()
batting_features_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4830 entries, 0 to 4858
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Gtm         4830 non-null   int64  
 1   Date        4830 non-null   object 
 2   Team        4830 non-null   object 
 3   Opp         4830 non-null   object 
 4   Home/Away   4830 non-null   object 
 5   Venue       4830 non-null   object 
 6   BA          4830 non-null   object 
 7   OBP         4830 non-null   object 
 8   SLG         4830 non-null   object 
 9   OPS         4830 non-null   object 
 10  Next Throw  4830 non-null   object 
 11  Target HR   4830 non-null   float64
dtypes: float64(1), int64(1), object(10)
memory usage: 490.5+ KB
