In [1]:
# Import needed dependencies
import requests
import pandas as pd
import numpy as np
import scipy.stats as stats
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
today = date.today()

# dd/mm/YY
current_year = today.strftime("%Y")

last_year = int(current_year) - 1

In [3]:
last_five_years = []
for i in range(1,6):
    last_five_years.append(int(current_year) - i)

In [4]:
# Create a list to help create a dataframe from batter statistics data
pitcher_stats = []

for year in last_five_years:

    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-pitching.shtml'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has batter statistics
    table = BeautifulSoup(soup.select_one('#all_players_standard_pitching').find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')

    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        pitcher_stats.append(tds)

In [5]:
# Create dataframe for batter statistics
pitcher_stats_df = pd.DataFrame(pitcher_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')
df_headers.append("Year")

# Set column headers equal to our list
pitcher_stats_df.columns = df_headers

In [6]:
# Change types of columns to numeric for columns with number values
pitcher_stats_df[['Age', 'W','ERA','SO','SV','WHIP','ERA+','SO9','IP']] = pitcher_stats_df[['Age', 'W','ERA','SO','SV','WHIP','ERA+','SO9','IP']].apply(pd.to_numeric)

# Drop any players with NaN innings pitched, ERA, and WHIP to remove null values 
pitcher_stats_df.dropna(subset=['IP'], axis = 0 , inplace= True)
pitcher_stats_df.dropna(subset=['ERA'], axis = 0 , inplace= True)
pitcher_stats_df.dropna(subset=['WHIP'], axis = 0 , inplace= True)
pitcher_stats_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Remove any pitchers with fewer than 30 innings pitched
pitcher_stats_df = pitcher_stats_df[pitcher_stats_df['IP'] >= 30]

# Make ERA and WHIP negative so high values become "low" when sorted with all other columns
pitcher_stats_df['ERA'] = pitcher_stats_df['ERA'] * -1
pitcher_stats_df['WHIP'] = pitcher_stats_df['WHIP'] * -1

# Select the columns we want for our pitcher analysis
final_pitcher_stats_df = pitcher_stats_df[['Year','Name','Age', 'W','ERA','SO','SV','WHIP','ERA+','SO9','IP']]

# Eliminate Baseball Reference's name badges for accolades
final_pitcher_stats_df['Name'] = final_pitcher_stats_df['Name'].str.extract('([^\*|#]*)')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_pitcher_stats_df['Name'] = final_pitcher_stats_df['Name'].str.extract('([^\*|#]*)')


In [7]:
# Sort by index to prepare to drop duplicates
final_pitcher_stats_df = final_pitcher_stats_df.sort_index()

# Drop duplicate entries of Player Name and Year
# This is to eliminate partial season data for players who played for 2+ teams in one season
final_pitcher_stats_df = final_pitcher_stats_df.drop_duplicates(subset=['Year', 'Name'])

In [8]:
# Sort data by name alphabetically, then by year in descending order
final_pitcher_stats_df = final_pitcher_stats_df.sort_values(['Year','Name'], ascending=[True, True])

In [9]:
# Create a new dataframe for stats percentile calculations
percentile_df = pd.DataFrame(columns = ['Year', 'Name', 'Age', 'W', 'ERA', 'SO', 'SV', 'WHIP', 'SO9', 'IP'])

# Carry over columnns from final_pitcher_stats_df that shouldn't be comparatively ranked 
percentile_df['Year'] = final_pitcher_stats_df['Year']
percentile_df['Name'] = final_pitcher_stats_df['Name']
percentile_df['Age'] = final_pitcher_stats_df['Age']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
for year in last_five_years:
    year_df = final_pitcher_stats_df.loc[final_pitcher_stats_df['Year'] == year]
    year_df['W_Percentile'] = year_df['W'].rank(pct=True)
    year_df['ERA_Percentile'] = year_df['ERA'].rank(pct=True)
    year_df['SO_Percentile'] = year_df['SO'].rank(pct=True)
    year_df['SV_Percentile'] = year_df['SV'].rank(pct=True)
    year_df['WHIP_Percentile'] = year_df['WHIP'].rank(pct=True)
    year_df.sort_values('Name', ascending=True)

    # Each of the seasons are added back to the percentile dataframe
    percentile_df = percentile_df.append(year_df, ignore_index=True)

percentile_df = percentile_df.sort_values(['Year','Name'], ascending=[True, True])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['W_Percentile'] = year_df['W'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['ERA_Percentile'] = year_df['ERA'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['SO_Percentile'] = year_df['SO'].rank(pct=True)
A value is trying to be set on a copy

In [10]:
# Clean up the percentile dataframe, drop NaNs and remove unnecessary columns
percentile_df = percentile_df.dropna()

# Add a rank column that adds the percentiles from each category
percentile_df['Rank'] = (percentile_df['W_Percentile'] + percentile_df['ERA_Percentile'] + percentile_df['SO_Percentile'] + percentile_df['SV_Percentile'] + percentile_df['WHIP_Percentile'])


In [11]:
# Create a list of each unique player we have in our dataframe
player_list = percentile_df.Name.unique().tolist()

# Create a new dataframe for combined, averaged percentiles over the past 5 seasons
new_df = pd.DataFrame(columns = ['Name', 'Rank', 'Trend', 'Years', 'W_Percentile', 'ERA_Percentile', 'SO_Percentile', 'SV_Percentile', 'WHIP_Percentile'])

# Create a list for each percentile stat category for upcoming loop
player_trends = []
average_W = []
average_ERA = []
average_SO = []
average_SV = []
average_WHIP = []
average_Rank = []
year_count = []
pos = []

# Loop through each player, check if they played in the past two seasons. If not, remove them
for player in player_list:
    filter_df = percentile_df.loc[percentile_df['Name'] == player]
    filter_df = filter_df.sort_values(['Year'], ascending=[False])
    year_list = filter_df.Year.tolist()
    if (year_list[0] != last_year) and (year_list[0] != (last_year - 1)):
        player_list.remove(player)

# Update new dataframe with updated unique player list
new_df['Name'] = player_list        

# Loop through each player, locate their percentile stats for each season, average them out
for player in player_list:
    player_df = percentile_df.loc[percentile_df['Name'] == player]
    
    # We want to find the slope of the line of best fit for each player's overall ranking each season
    x = np.array(player_df['Year'], dtype = float)
    y = np.array(player_df['Rank'], dtype = float)
    slope, intercept = np.polyfit(x, y, 1)
    player_trends.append(slope)
    
    # Find average of each player's percentiles from previous 5 seasons
    average_W.append(sum(player_df['W_Percentile']) / len(player_df['W_Percentile']))
    average_ERA.append(sum(player_df['ERA_Percentile']) / len(player_df['ERA_Percentile']))
    average_SO.append(sum(player_df['SO_Percentile']) / len(player_df['SO_Percentile']))
    average_SV.append(sum(player_df['SV_Percentile']) / len(player_df['SV_Percentile']))
    average_WHIP.append(sum(player_df['WHIP_Percentile']) / len(player_df['WHIP_Percentile']))
    average_Rank.append(sum(player_df['Rank']) / len(player_df['Rank']))
    year_count.append(len(x))

# Update new dataframe with the list data from each stat
new_df['Trend'] = player_trends
new_df['W_Percentile'] = average_W
new_df['ERA_Percentile'] = average_ERA
new_df['SO_Percentile'] = average_SO
new_df['SV_Percentile'] = average_SV
new_df['WHIP_Percentile'] = average_WHIP
new_df['Rank'] = average_Rank

# Keep track of how many seasons are being considered, so we know how reliable the data is
new_df['Years'] = year_count



  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

In [12]:
# Create a weighted rank column by adding trend data to the rank data and account for number of seasons played
# Basically, if you played all 5 seasons, your trend stat is added directly
# If you played fewer than all 5 seasons, your trend stat is reduced depending on how few seasons you played
new_df['Weighted Rank'] = (new_df['Rank'] + ((new_df['Trend'] * (new_df['Years'] - 1) / 4)))

# shift column 'Weighted Rank' to first position
first_column = new_df.pop('Weighted Rank')
  
# insert column using insert(position,column_name,first_column) function
new_df.insert(1, 'Weighted Rank', first_column)

new_df = new_df.sort_values('Weighted Rank', ascending = False)
new_df.head(50)

Unnamed: 0,Name,Weighted Rank,Rank,Trend,Years,W_Percentile,ERA_Percentile,SO_Percentile,SV_Percentile,WHIP_Percentile
607,Alexis Díaz,4.308917,4.308917,0.001066,1,0.791932,0.972399,0.676221,0.929936,0.938429
185,Justin Verlander,4.230547,4.22841,0.004273,3,0.990764,0.938395,0.985115,0.326198,0.987939
492,Devin Williams,4.200687,4.117935,0.331008,2,0.779559,0.943907,0.71247,0.901062,0.780937
638,Evan Phillips,4.185775,4.185775,0.001035,1,0.791932,0.997877,0.627389,0.77707,0.991507
381,Liam Hendriks,4.170686,4.216773,-0.092174,3,0.630785,0.905899,0.762817,0.98661,0.930662
527,Jordan Romano,4.089993,4.135401,-0.18163,2,0.708035,0.956547,0.621209,0.980428,0.869181
639,Félix Bautista,4.076433,4.076433,0.001008,1,0.529724,0.930998,0.705945,0.952229,0.957537
500,Emmanuel Clase,3.994186,3.988372,0.023256,2,0.440028,0.990464,0.595935,0.986258,0.975687
450,Alek Manoah,3.967463,3.861677,0.423143,2,0.928096,0.832282,0.90054,0.317778,0.882981
90,Dellin Betances,3.939232,3.939232,0.000976,1,0.528785,0.864606,0.788913,0.86887,0.88806
