In [1]:
# Import needed dependencies
import requests
import pandas as pd
import numpy as np
import scipy.stats as stats
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
today = date.today()

# dd/mm/YY
current_year = today.strftime("%Y")

last_year = int(current_year) - 1

In [3]:
last_five_years = []
for i in range(1,6):
    last_five_years.append(int(current_year) - i)

In [4]:
# Create a list to help create a dataframe from batter statistics data
pitcher_stats = []

for year in last_five_years:

    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-pitching.shtml'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has batter statistics
    table = BeautifulSoup(soup.select_one('#all_players_standard_pitching').find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')

    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        pitcher_stats.append(tds)

In [5]:
# Create dataframe for batter statistics
pitcher_stats_df = pd.DataFrame(pitcher_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')
df_headers.append("Year")

# Set column headers equal to our list
pitcher_stats_df.columns = df_headers

In [6]:
# Change types of columns to numeric for columns with number values
pitcher_stats_df[['Age', 'W','ERA','SO','SV','WHIP','ERA+','SO9','IP']] = pitcher_stats_df[['Age', 'W','ERA','SO','SV','WHIP','ERA+','SO9','IP']].apply(pd.to_numeric)

# Drop any players with NaN innings pitched, ERA, and WHIP to remove null values 
pitcher_stats_df.dropna(subset=['IP'], axis = 0 , inplace= True)
pitcher_stats_df.dropna(subset=['ERA'], axis = 0 , inplace= True)
pitcher_stats_df.dropna(subset=['WHIP'], axis = 0 , inplace= True)
pitcher_stats_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Make ERA and WHIP negative so high values become "low" when sorted with all other columns
pitcher_stats_df['ERA'] = pitcher_stats_df['ERA'] * -1
pitcher_stats_df['WHIP'] = pitcher_stats_df['WHIP'] * -1

# Select the columns we want for our pitcher analysis
final_pitcher_stats_df = pitcher_stats_df[['Year','Name','Age', 'W','ERA','SO','SV','WHIP','ERA+','SO9','IP']]


In [7]:
# Create variables for Z scores for all stats counted in ESPN Fantasy Baseball category leagues
W_zscores = stats.zscore(final_pitcher_stats_df['W'])
ERA_zscores = stats.zscore(final_pitcher_stats_df['ERA']) 
K_zscores = stats.zscore(final_pitcher_stats_df['SO'])
SV_zscores = stats.zscore(final_pitcher_stats_df['SV']) / 2.5
WHIP_zscores = stats.zscore(final_pitcher_stats_df['WHIP'])

# Add Z scores to a new DataFrame
compare_players_df = pd.DataFrame({
    'Z_W': W_zscores,
    'Z_ERA': ERA_zscores,
    'Z-K': K_zscores,
    'Z_SV': SV_zscores,
    'Z_WHIP': WHIP_zscores})

# Calculate average of 5 Z scores, stadard deviation, and a confidence level in the average Z score
# Confidence level is to try to avoid players with just one, large Z-score
compare_players_df['average_z'] = compare_players_df.mean(axis=1)
compare_players_df['std_z'] = compare_players_df.std(axis=1)
compare_players_df['avg_confidence'] = (compare_players_df['average_z'] - compare_players_df['std_z'])
compare_players_df['Innings'] = final_pitcher_stats_df['IP']
final_compare_players_df = compare_players_df.sort_values(by=['average_z'], ascending=False)


In [8]:
# Add avg Z scores, confidence columns to batter stats DataFrame
final_pitcher_stats_df['Average Z'] = final_compare_players_df['average_z']
final_pitcher_stats_df['Z Confidence'] = final_compare_players_df['avg_confidence']
sorted_final_pitcher_stats_df = final_pitcher_stats_df.sort_values(by=['Average Z'], ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_pitcher_stats_df['Average Z'] = final_compare_players_df['average_z']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_pitcher_stats_df['Z Confidence'] = final_compare_players_df['avg_confidence']


In [9]:
# Eliminate Baseball Reference's name badges for accolades
sorted_final_pitcher_stats_df['Name'] = sorted_final_pitcher_stats_df['Name'].str.extract('([^\*|#]*)')

In [10]:
# Sort by index to prepare to drop duplicates
sorted_final_pitcher_stats_df = sorted_final_pitcher_stats_df.sort_index()

In [11]:
# Drop duplicate entries of Player Name and Year
# This is to eliminate partial season data for players who played for 2+ teams in one season
sorted_final_pitcher_stats_df = sorted_final_pitcher_stats_df.drop_duplicates(subset=['Year', 'Name'])

In [12]:
# Sort data by name alphabetically, then by year in descending order
sorted_final_pitcher_stats_df = sorted_final_pitcher_stats_df.sort_values(['Name','Year'], ascending=[True, False])
sorted_final_pitcher_stats_df

Unnamed: 0,Year,Name,Age,W,ERA,SO,SV,WHIP,ERA+,SO9,IP,Average Z,Z Confidence
18,2021,A.J. Alexy,23.0,3,-4.70,17,0,-1.304,94.0,6.7,23.0,-0.002064,-0.292951
203,2021,A.J. Cole,29.0,0,-1.13,7,1,-0.875,413.0,7.9,8.0,-0.022088,-0.654742
1305,2020,A.J. Cole,28.0,3,-3.09,20,1,-1.200,144.0,7.7,23.1,0.093784,-0.222355
2155,2019,A.J. Cole,27.0,3,-3.81,30,1,-1.500,125.0,10.4,26.0,0.056798,-0.127925
3169,2018,A.J. Cole,26.0,4,-6.14,59,0,-1.593,69.0,11.0,48.1,0.139489,-0.098384
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4676,2017,Zac Reininger,24.0,0,-7.45,5,0,-1.966,62.0,4.7,9.2,-0.441688,-0.709500
896,2021,Zac Rosscup,33.0,0,-3.00,4,0,-1.333,175.0,12.0,3.0,-0.195388,-0.687407
2763,2019,Zac Rosscup,31.0,2,-5.00,26,0,-2.278,91.0,13.0,18.0,-0.225009,-0.499667
3781,2018,Zac Rosscup,30.0,0,-4.76,20,0,-1.147,84.0,15.9,11.1,-0.139707,-0.553368
