In [1]:
# Import needed dependencies
import requests
import pandas as pd
import numpy as np
import scipy.stats as stats
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
today = date.today()

# dd/mm/YY
current_year = today.strftime("%Y")

last_year = int(current_year) - 1

In [3]:
last_five_years = []
for i in range(1,6):
    last_five_years.append(int(current_year) - i)

last_five_years

[2021, 2020, 2019, 2018, 2017]

In [4]:
# Create a function to create a dataframe from Baseball Reference tables
# for year in last_five_years:
    
# input URL and use BeautifulSoup to parse through the page
url = f'https://www.baseball-reference.com/leagues/majors/{last_year}-standard-pitching.shtml'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

# Grab the table element that has batter statistics
table = BeautifulSoup(soup.select_one('#all_players_standard_pitching').find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')

# Create a list to help create a dataframe from batter statistics data
pitcher_stats = []

# Grab data from table and put it into the list created above
for tr in table.select('tr:has(td)'):
    tds = [td.get_text(strip=True) for td in tr.select('td')]
    pitcher_stats.append(tds)

# Create dataframe for batter statistics
pitcher_stats_df = pd.DataFrame(pitcher_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')

# Set column headers equal to our list
pitcher_stats_df.columns = df_headers

pitcher_stats_df['Year'] = last_year


# Change types of columns to numeric for columns with number values
pitcher_stats_df[['Age', 'W','ERA','SO','SV','WHIP','ERA+','SO9','IP']] = pitcher_stats_df[['Age', 'W','ERA','SO','SV','WHIP','ERA+','SO9','IP']].apply(pd.to_numeric)


# Drop any players with 0 plate appearances to remove null values and change PA type to integer
pitcher_stats_df.dropna(subset=['IP'], axis = 0 , inplace= True)

pitcher_stats_df.replace([np.inf, -np.inf], np.nan, inplace=True)
pitcher_stats_df.dropna(subset=['ERA'], axis = 0 , inplace= True)
pitcher_stats_df.dropna(subset=['WHIP'], axis = 0 , inplace= True)

pitcher_stats_df['ERA'] = pitcher_stats_df['ERA'] * -1
pitcher_stats_df['WHIP'] = pitcher_stats_df['WHIP'] * -1


# Remove any players with fewer than 100 plate appearances
# filtered_batter_stats_df = batter_stats_df[batter_stats_df['PA'] >= 100]

# Select the columns we want for our batter analysis
final_pitcher_stats_df = pitcher_stats_df[['Year','Name','Age', 'W','ERA','SO','SV','WHIP','ERA+','SO9','IP']]

# This drops duplicate players and (for now) keeps their aggregate stats because that is what is listed first on baseball reference
# There should be a better way to guarentee total stats is what we get, will try to code that later
final_pitcher_stats_df = final_pitcher_stats_df.drop_duplicates('Name')

#dataTypeDict = dict(filtered_batter_stats_df.dtypes)

# return the dataframe
final_pitcher_stats_df = final_pitcher_stats_df.set_index('Name')

final_pitcher_stats_df

# # use a for loop to call the function for the past five years' worth of batting stats data
# batting_stats_dataframe()


Unnamed: 0_level_0,Year,Age,W,ERA,SO,SV,WHIP,ERA+,SO9,IP
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Fernando Abad*,2021,35.0,0,-5.60,10,0,-1.698,83.0,5.1,17.2
Cory Abbott,2021,25.0,0,-6.75,12,0,-1.788,64.0,6.2,17.1
Albert Abreu,2021,25.0,2,-5.15,35,1,-1.255,84.0,8.6,36.2
Bryan Abreu,2021,24.0,3,-5.75,36,1,-1.472,75.0,9.0,36.0
Domingo Acevedo,2021,27.0,0,-3.27,9,0,-1.182,129.0,7.4,11.0
...,...,...,...,...,...,...,...,...,...,...
Kyle Zimmer,2021,29.0,4,-4.83,46,2,-1.407,95.0,7.7,54.0
Bruce Zimmermann*,2021,26.0,4,-5.04,56,0,-1.508,91.0,7.8,64.1
Jordan Zimmermann,2021,35.0,0,-7.94,0,0,-1.765,57.0,0.0,5.2
Tyler Zuber,2021,26.0,0,-6.26,25,0,-1.573,74.0,8.2,27.1


In [13]:
W_zscores = stats.zscore(final_pitcher_stats_df['W'])
ERA_zscores = stats.zscore(final_pitcher_stats_df['ERA']) 
K_zscores = stats.zscore(final_pitcher_stats_df['SO'])
SV_zscores = stats.zscore(final_pitcher_stats_df['SV']) / 2.5
WHIP_zscores = stats.zscore(final_pitcher_stats_df['WHIP'])

compare_players_df = pd.DataFrame({
    'Z_W': W_zscores,
    'Z_ERA': ERA_zscores,
    'Z-K': K_zscores,
    'Z_SV': SV_zscores,
    'Z_WHIP': WHIP_zscores})

compare_players_df['average_z'] = compare_players_df.mean(axis=1)
compare_players_df['std_z'] = compare_players_df.std(axis=1)
compare_players_df['avg_confidence'] = (compare_players_df['average_z'] - compare_players_df['std_z'])
compare_players_df['Innings'] = final_pitcher_stats_df['IP']
final_compare_players_df = compare_players_df.sort_values(by=['average_z'], ascending=False)
final_compare_players_df.head(50)

Unnamed: 0_level_0,Z_W,Z_ERA,Z-K,Z_SV,Z_WHIP,average_z,std_z,avg_confidence,Innings
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Julio Urias*,5.065224,0.462965,3.000034,-0.111298,0.580745,1.799534,1.95085,-0.151316,185.2
Gerrit Cole,3.895232,0.414707,3.97025,-0.111298,0.53359,1.740496,1.803229,-0.062733,181.1
Max Scherzer,3.602734,0.552333,3.82876,-0.111298,0.757864,1.726079,1.651313,0.074765,179.1
Zack Wheeler,3.310236,0.495138,4.051102,-0.111298,0.592246,1.667485,1.677788,-0.010303,213.1
Walker Buehler,3.895232,0.550545,3.343652,-0.111298,0.638251,1.663277,1.627467,0.035809,207.2
Robbie Ray*,3.017739,0.484413,4.071315,-0.111298,0.549692,1.602372,1.636678,-0.034306,193.1
Kevin Gausman,3.310236,0.489776,3.646845,-0.111298,0.553142,1.57774,1.572835,0.004905,192.0
Adam Wainwright,4.18773,0.446879,2.575564,-0.111298,0.53589,1.526953,1.613193,-0.086239,206.1
Charlie Morton,3.310236,0.395046,3.424503,-0.111298,0.549692,1.513636,1.529708,-0.016072,185.2
Liam Hendriks,1.555249,0.538034,1.34258,3.088987,0.90968,1.486906,0.874724,0.612182,71.0


In [14]:
final_pitcher_stats_df['Average Z'] = final_compare_players_df['average_z']
final_pitcher_stats_df['Z Confidence'] = final_compare_players_df['avg_confidence']
sorted_final_pitcher_stats_df = final_pitcher_stats_df.sort_values(by=['Average Z'], ascending=False)
sorted_final_pitcher_stats_df.head(50)

Unnamed: 0_level_0,Year,Age,W,ERA,SO,SV,WHIP,ERA+,SO9,IP,Average Z,Z Confidence
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Julio Urias*,2021,24.0,20,-2.96,195,0,-1.018,138.0,9.5,185.2,1.799534,-0.151316
Gerrit Cole,2021,30.0,16,-3.23,243,0,-1.059,133.0,12.1,181.1,1.740496,-0.062733
Max Scherzer,2021,36.0,15,-2.46,236,0,-0.864,166.0,11.8,179.1,1.726079,0.074765
Zack Wheeler,2021,31.0,14,-2.78,247,0,-1.008,150.0,10.4,213.1,1.667485,-0.010303
Walker Buehler,2021,26.0,16,-2.47,212,0,-0.968,165.0,9.2,207.2,1.663277,0.035809
Robbie Ray*,2021,29.0,13,-2.84,248,0,-1.045,154.0,11.5,193.1,1.602372,-0.034306
Kevin Gausman,2021,30.0,14,-2.81,227,0,-1.042,145.0,10.6,192.0,1.57774,0.004905
Adam Wainwright,2021,39.0,17,-3.05,174,0,-1.057,127.0,7.6,206.1,1.526953,-0.086239
Charlie Morton,2021,37.0,14,-3.34,216,0,-1.045,132.0,10.5,185.2,1.513636,-0.016072
Liam Hendriks,2021,32.0,8,-2.54,113,38,-0.732,171.0,14.3,71.0,1.486906,0.612182


In [7]:
# batting_stats_dataframe()

In [8]:
# for year in last_five_years:
#     batting_stats_dataframe(year)
#     final_batter_stats_df.sort_values(by=['HR'])
#     final_batter_stats_df

In [9]:
# For 5 year trend plan, consider a difference between new year and prev year, put difference into list, then average 
# the values in the list. Won't tell you much other than if they've trended up, down, or plateau'd