In [1]:
# Import needed dependencies
import requests
import pandas as pd
import numpy as np
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
# Capture today's date once so every value derived below agrees.
today = date.today()

# Four-digit calendar year kept as a string (e.g. "2022"); callers convert with int()
current_year = str(today.year)

# Previous calendar year, shown as the cell's output
int(current_year) - 1

2021

In [3]:
# The five calendar years before the current one, most recent first
last_five_years = [int(current_year) - years_back for years_back in range(1, 6)]

last_five_years

[2021, 2020, 2019, 2018, 2017]

In [4]:
# Create a function to create a dataframe from Baseball Reference tables
def batting_stats_dataframe(year=None, min_pa=100):
    """Scrape one season of MLB standard batting stats from Baseball Reference.

    Parameters
    ----------
    year : int, optional
        Season to download. Defaults to the calendar year before today.
    min_pa : int, optional
        Minimum number of plate appearances a player must have to be kept.
        Defaults to 100.

    Returns
    -------
    pandas.DataFrame
        One row per player name with the columns Year, Name, Age, R, HR,
        RBI, SB, BA, PA, OPS, OPS+ and "Pos Summary" (that header contains a
        non-breaking space). PA is converted to int; the other scraped
        columns are left as the strings found in the page.

    Raises
    ------
    requests.HTTPError
        If the page request returns an error status.
    ValueError
        If the batting table or its header row cannot be found in the page.
    """
    # Resolve the season locally instead of relying on a notebook-level global
    if year is None:
        year = date.today().year - 1
    year = int(year)

    # Download the page; fail loudly on HTTP errors rather than parsing an error page
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-batting.shtml'
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # The table markup is read from the HTML comment that follows the wrapper
    # element, so locate that comment and parse its contents as HTML.
    wrapper = soup.select_one('#all_players_standard_batting')
    if wrapper is None:
        raise ValueError(f'Batting table wrapper not found at {url}')
    commented_table = wrapper.find_next(string=lambda x: isinstance(x, Comment))
    if commented_table is None:
        raise ValueError(f'Commented-out batting table not found at {url}')
    table = BeautifulSoup(commented_table, 'html.parser')

    # One list of cell texts per data row (rows that contain <td> cells)
    batter_stats = [
        [td.get_text(strip=True) for td in tr.select('td')]
        for tr in table.select('tr:has(td)')
    ]

    # The first row containing <th> cells supplies the column headers
    header_row = table.select_one('tr:has(th)')
    if header_row is None:
        raise ValueError(f'Header row not found in batting table at {url}')
    df_headers = [th.get_text(strip=True) for th in header_row.select('th')]

    # 'Rk' labels the row-number <th> cell, which is not collected with the <td> data
    if 'Rk' in df_headers:
        df_headers.remove('Rk')

    # Create dataframe for batter statistics and label its columns
    batter_stats_df = pd.DataFrame(batter_stats)
    batter_stats_df.columns = df_headers

    batter_stats_df['Year'] = year

    # Coerce PA to numbers so both missing (None) and blank ('') cells become NaN,
    # drop those rows, then store PA as an integer.
    batter_stats_df['PA'] = pd.to_numeric(batter_stats_df['PA'], errors='coerce')
    batter_stats_df = batter_stats_df.dropna(subset=['PA'])
    batter_stats_df = batter_stats_df.astype({'PA': int})

    # Remove any players with fewer than min_pa plate appearances
    filtered_batter_stats_df = batter_stats_df[batter_stats_df['PA'] >= min_pa]

    # Select the columns we want for our batter analysis
    final_batter_stats_df = filtered_batter_stats_df[['Year','Name','Age','R','HR','RBI','SB','BA','PA','OPS','OPS+','Pos\xa0Summary']]

    # Keep only the first row per player name. For players listed more than once
    # this relies on the aggregate row being listed first on the page.
    # TODO: select the total-stats row explicitly instead of relying on row order.
    final_batter_stats_df = final_batter_stats_df.drop_duplicates('Name')

    return final_batter_stats_df

# TODO: use a for loop to call the function for each of the past five years' worth of batting stats data
# batting_stats_dataframe()


In [5]:
# Scrape the batting table with the function's defaults and display the resulting dataframe
batting_stats_dataframe()

Unnamed: 0,Year,Name,Age,R,HR,RBI,SB,BA,PA,OPS,OPS+,Pos Summary
4,2021,José Abreu,34,86,30,117,1,.261,659,.831,125,*3D/5
5,2021,Ronald Acuna Jr.,23,72,24,52,17,.283,360,.990,155,9/8H
7,2021,Willy Adames,25,77,25,73,5,.262,555,.818,120,*6/HD
12,2021,Riley Adams,25,13,2,10,0,.222,120,.742,105,2H
15,2021,Jo Adell,22,17,4,26,2,.246,140,.703,90,79
...,...,...,...,...,...,...,...,...,...,...,...,...
1715,2021,Seby Zavala,27,15,5,15,0,.183,104,.616,66,2/HD
1716,2021,Bradley Zimmer*,28,44,8,35,15,.227,348,.669,85,89/H7D
1718,2021,Ryan Zimmerman,36,27,14,46,0,.243,273,.756,104,H3/D
1722,2021,Mike Zunino,30,64,33,62,0,.216,375,.860,138,2/H
