In [1]:
# Import needed dependencies
import requests
import pandas as pd
import numpy as np
import scipy.stats as stats
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
today = date.today()

# dd/mm/YY
current_year = today.strftime("%Y")

last_year = int(current_year) - 1

In [3]:
last_five_years = []
for i in range(1,6):
    last_five_years.append(int(current_year) - i)

last_five_years

[2021, 2020, 2019, 2018, 2017]

In [4]:
# Create a function to create a dataframe from Baseball Reference tables
# for year in last_five_years:
    
# input URL and use BeautifulSoup to parse through the page
url = f'https://www.baseball-reference.com/leagues/majors/{last_year}-standard-batting.shtml'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# Grab the table element that has batter statistics
table = BeautifulSoup(soup.select_one('#all_players_standard_batting').find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')

# Create a list to help create a dataframe from batter statistics data
batter_stats = []

# Grab data from table and put it into the list created above
for tr in table.select('tr:has(td)'):
    tds = [td.get_text(strip=True) for td in tr.select('td')]
    batter_stats.append(tds)

# Create dataframe for batter statistics
batter_stats_df = pd.DataFrame(batter_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')

# Set column headers equal to our list
batter_stats_df.columns = df_headers

batter_stats_df['Year'] = last_year


# Change types of columns to numeric for columns with number values
batter_stats_df[['Age', 'R','HR','RBI','SB','BA','PA','OPS','OPS+']] = batter_stats_df[['Age', 'R','HR','RBI','SB','BA','PA','OPS','OPS+']].apply(pd.to_numeric)

# Drop any players with 0 plate appearances to remove null values and change PA type to integer
batter_stats_df.dropna(subset=['PA'], axis = 0 , inplace= True)


# Remove any players with fewer than 100 plate appearances
filtered_batter_stats_df = batter_stats_df[batter_stats_df['PA'] >= 100]

# Select the columns we want for our batter analysis
final_batter_stats_df = filtered_batter_stats_df[['Year','Name','Age','R','HR','RBI','SB','BA','PA','OPS','OPS+','Pos\xa0Summary']]

# This drops duplicate players and (for now) keeps their aggregate stats because that is what is listed first on baseball reference
# There should be a better way to guarentee total stats is what we get, will try to code that later
final_batter_stats_df = final_batter_stats_df.drop_duplicates('Name')

#dataTypeDict = dict(filtered_batter_stats_df.dtypes)

# return the dataframe
final_batter_stats_df = final_batter_stats_df.set_index('Name')

# final_batter_stats_df

# # use a for loop to call the function for the past five years' worth of batting stats data
# batting_stats_dataframe()


In [12]:
R_zscores = stats.zscore(final_batter_stats_df['R'])
HR_zscores = stats.zscore(final_batter_stats_df['HR'])
RBI_zscores = stats.zscore(final_batter_stats_df['RBI'])
SB_zscores = stats.zscore(final_batter_stats_df['SB'])/5
BA_zscores = stats.zscore(final_batter_stats_df['BA'])

compare_players_df = pd.DataFrame({
    'Z_R': R_zscores,
    'Z_HR': HR_zscores,
    'Z-RBI': RBI_zscores,
    'Z_SB': SB_zscores,
    'Z_BA': BA_zscores,})

compare_players_df['average_z'] = compare_players_df.mean(axis=1)
compare_players_df['std_z'] = compare_players_df.std(axis=1)
compare_players_df['avg_confidence'] = (compare_players_df['average_z'] - compare_players_df['std_z'])
compare_players_df['position'] = final_batter_stats_df['Pos\xa0Summary']
final_compare_players_df = compare_players_df.sort_values(by=['avg_confidence'], ascending=False)
# final_compare_players_df.head(50)

In [13]:
final_batter_stats_df['Average Z'] = final_compare_players_df['average_z']
final_batter_stats_df['Z Confidence'] = final_compare_players_df['avg_confidence']
sorted_final_batter_stats_df = final_batter_stats_df.sort_values(by=['Z Confidence'], ascending=False)
sorted_final_batter_stats_df.head(50)

Unnamed: 0_level_0,Year,Age,R,HR,RBI,SB,BA,PA,OPS,OPS+,Pos Summary,Average Z,Z Confidence
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Trea Turner,2021,28.0,107,28,77,32,0.328,646,0.911,146.0,64/H,1.642379,1.06664
Bo Bichette,2021,23.0,121,29,102,25,0.298,690,0.828,123.0,*6D,1.741212,1.042461
Vladimir Guerrero Jr.,2021,22.0,123,48,111,4,0.311,698,1.002,169.0,*3D/5,2.133745,0.937926
Fernando Tatis Jr.,2021,22.0,99,42,97,25,0.282,546,0.975,166.0,69/8HD,1.713417,0.932869
Bryce Harper*,2021,28.0,101,35,84,13,0.309,599,1.044,179.0,*9/D3,1.564678,0.870972
Jose Ramirez#,2021,28.0,111,36,103,27,0.266,636,0.893,141.0,*5D/H,1.652869,0.864561
Paul Goldschmidt,2021,33.0,102,31,99,12,0.294,679,0.879,143.0,*3/DH,1.515859,0.836645
Juan Soto*,2021,22.0,111,29,95,9,0.313,654,0.999,175.0,*9/HD,1.596618,0.827235
Teoscar Hernandez,2021,28.0,92,32,116,12,0.296,595,0.87,133.0,*97D/8H,1.598435,0.81048
Kyle Tucker*,2021,24.0,83,30,92,14,0.294,567,0.917,147.0,*9/8DH,1.318703,0.784245


In [7]:
# batting_stats_dataframe()

In [8]:
# for year in last_five_years:
#     batting_stats_dataframe(year)
#     final_batter_stats_df.sort_values(by=['HR'])
#     final_batter_stats_df

In [9]:
# For 5 year trend plan, consider a difference between new year and prev year, put difference into list, then average 
# the values in the list. Won't tell you much other than if they've trended up, down, or plateau'd