In [1]:
# Import needed dependencies
import requests
import re
import pandas as pd
import numpy as np
import scipy.stats as stats
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
today = date.today()

# dd/mm/YY
current_year = today.strftime("%Y")

last_year = int(current_year) - 1

In [3]:
last_five_years = []
for i in range(1,6):
    last_five_years.append(int(current_year) - i)

In [4]:
# Create a list to help create a dataframe from batter statistics data
batter_stats = []

# Create a loop to create a dataframe from Baseball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-batting.shtml'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has batter statistics
    table = soup.select_one('#all_players_standard_batting')


    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        batter_stats.append(tds)

In [5]:
# Create dataframe for batter statistics
batter_stats_df = pd.DataFrame(batter_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')
df_headers.append("Year")

# Set column headers equal to our list
batter_stats_df.columns = df_headers

In [6]:
# batter_name_cleaning_list = batter_stats_df.Player.to_list()
# cleaned_batter_name_list = []

# for item in batter_name_cleaning_list:
#     cleaned_batter_name_list.append(re.sub(r'[^A-Za-z0-9 ]+', '', item))

# batter_stats_df['Player'] = cleaned_batter_name_list
# batter_stats_df

In [7]:
# Change types of columns to numeric for columns with number values
batter_stats_df[['Age', 'R','HR','RBI','SB','BA','PA','OPS','OPS+']] = batter_stats_df[['Age', 'R','HR','RBI','SB','BA','PA','OPS','OPS+']].apply(pd.to_numeric)

# Drop any players with 0 plate appearances to remove null values and change PA type to integer
batter_stats_df.dropna(subset=['PA'], axis = 0 , inplace= True)

# Remove any players with fewer than 100 plate appearances
filtered_batter_stats_df = batter_stats_df[batter_stats_df['PA'] >= 100]

# Select the columns we want for our batter analysis
final_batter_stats_df = filtered_batter_stats_df[['Year','Player','Team','Age','R','HR','RBI','SB','BA','PA','OPS','OPS+','Pos']]


In [8]:
# Sort by index to prepare to drop duplicates
final_batter_stats_df = final_batter_stats_df.sort_index()

# Drop duplicate entries of Player Name and Year
# This is to eliminate partial season data for players who played for 2+ teams in one season
final_batter_stats_df = final_batter_stats_df.drop_duplicates(subset=['Year', 'Player'])

In [9]:
# Sort data by name alphabetically, then by year in descending order
final_batter_stats_df = final_batter_stats_df.sort_values(['Year','Player'], ascending=[True, True])

# Eliminate Baseball Reference's name badges for accolades
final_batter_stats_df['Player'] = final_batter_stats_df['Player'].str.extract('([^\*|#]*)')

cleaned_player_list = []
for player in final_batter_stats_df['Player']:
    player = player.replace("\xa0", " ")
    cleaned_player_list.append(player)

final_batter_stats_df['Player'] = cleaned_player_list        

In [10]:
# Create a new dataframe for stats percentile calculations
percentile_df = pd.DataFrame(columns = ['Year', 'Player', 'Team', 'Age', 'BA', 'R', 'HR', 'RBI', 'SB', 'PA'])

# Carry over columnns from final_batter_stats_df that shouldn't be comparatively ranked 
percentile_df['Year'] = final_batter_stats_df['Year']
percentile_df['Player'] = final_batter_stats_df['Player']
percentile_df['Age'] = final_batter_stats_df['Age']
percentile_df['Team'] = final_batter_stats_df['Team']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
for year in last_five_years:
    year_df = final_batter_stats_df.loc[final_batter_stats_df['Year'] == year]
    year_df['BA_Percentile'] = year_df['BA'].rank(pct=True)
    year_df['R_Percentile'] = year_df['R'].rank(pct=True)
    year_df['HR_Percentile'] = year_df['HR'].rank(pct=True)
    year_df['RBI_Percentile'] = year_df['RBI'].rank(pct=True)
    year_df['SB_Percentile'] = year_df['SB'].rank(pct=True)
    year_df.sort_values('Player', ascending=True)

    # Each of the seasons are added back to the percentile dataframe
    percentile_df = percentile_df.append(year_df, ignore_index=True)

percentile_df = percentile_df.sort_values(['Year','Player'], ascending=[True, True])
percentile_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['BA_Percentile'] = year_df['BA'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['R_Percentile'] = year_df['R'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['HR_Percentile'] = year_df['HR'].rank(pct=True)
A value is trying to be set on a copy o

Unnamed: 0,Year,Player,Team,Age,BA,R,HR,RBI,SB,PA,OPS,OPS+,Pos,BA_Percentile,R_Percentile,HR_Percentile,RBI_Percentile,SB_Percentile
0,2020.0,AJ Pollock,LAD,32.0,,,,,,,,,,,,,,
4006,2020.0,AJ Pollock,LAD,32.0,0.276,30.0,16.0,34.0,2.0,210.0,0.881,132.0,*78D/H,0.722581,0.766129,0.969355,0.880645,0.620968
1,2020.0,Aaron Hicks,NYY,30.0,,,,,,,,,,,,,,
4007,2020.0,Aaron Hicks,NYY,30.0,0.225,28.0,6.0,21.0,4.0,211.0,0.793,122.0,*8/HD,0.280645,0.712903,0.500000,0.464516,0.793548
2,2020.0,Aaron Judge,NYY,28.0,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,2024.0,Zach Neto,LAA,23.0,0.249,70.0,23.0,77.0,30.0,602.0,0.761,113.0,*6/H,0.637363,0.815385,0.885714,0.892308,0.951648
2156,2024.0,Zack Gelof,OAK,24.0,,,,,,,,,,,,,,
2611,2024.0,Zack Gelof,OAK,24.0,0.211,60.0,17.0,49.0,25.0,547.0,0.632,82.0,*4/H,0.212088,0.727473,0.751648,0.631868,0.936264
2157,2024.0,Ángel Martínez,CLE,22.0,,,,,,,,,,,,,,


In [11]:
# Clean up the percentile dataframe, drop NaNs and remove unnecessary columns
percentile_df = percentile_df.dropna()

# Add a rank column that adds the percentiles from each category
percentile_df['Rank'] = (percentile_df['BA_Percentile'] + percentile_df['R_Percentile'] + percentile_df['HR_Percentile'] + percentile_df['RBI_Percentile'] + percentile_df['SB_Percentile'])
percentile_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  percentile_df['Rank'] = (percentile_df['BA_Percentile'] + percentile_df['R_Percentile'] + percentile_df['HR_Percentile'] + percentile_df['RBI_Percentile'] + percentile_df['SB_Percentile'])


Unnamed: 0,Year,Player,Team,Age,BA,R,HR,RBI,SB,PA,OPS,OPS+,Pos,BA_Percentile,R_Percentile,HR_Percentile,RBI_Percentile,SB_Percentile,Rank
4006,2020.0,AJ Pollock,LAD,32.0,0.276,30.0,16.0,34.0,2.0,210.0,0.881,132.0,*78D/H,0.722581,0.766129,0.969355,0.880645,0.620968,3.959677
4007,2020.0,Aaron Hicks,NYY,30.0,0.225,28.0,6.0,21.0,4.0,211.0,0.793,122.0,*8/HD,0.280645,0.712903,0.500000,0.464516,0.793548,2.751613
4008,2020.0,Aaron Judge,NYY,28.0,0.257,23.0,9.0,22.0,0.0,114.0,0.891,143.0,9/DH,0.585484,0.558065,0.745161,0.500000,0.180645,2.569355
4009,2020.0,Adalberto Mondesí,KCR,24.0,0.256,33.0,6.0,22.0,24.0,233.0,0.710,90.0,*6,0.577419,0.838710,0.500000,0.500000,1.000000,3.416129
4010,2020.0,Adam Duvall,ATL,31.0,0.237,34.0,16.0,33.0,0.0,209.0,0.833,114.0,*79/H8D,0.403226,0.862903,0.969355,0.856452,0.180645,3.272581
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2608,2024.0,Yordan Alvarez,HOU,27.0,0.308,88.0,35.0,86.0,6.0,635.0,0.959,172.0,D7,0.978022,0.940659,0.978022,0.929670,0.608791,4.435165
2609,2024.0,Zach McKinstry,DET,29.0,0.215,32.0,4.0,23.0,16.0,325.0,0.614,74.0,65H479/1D,0.245055,0.372527,0.213187,0.259341,0.854945,1.945055
2610,2024.0,Zach Neto,LAA,23.0,0.249,70.0,23.0,77.0,30.0,602.0,0.761,113.0,*6/H,0.637363,0.815385,0.885714,0.892308,0.951648,4.182418
2611,2024.0,Zack Gelof,OAK,24.0,0.211,60.0,17.0,49.0,25.0,547.0,0.632,82.0,*4/H,0.212088,0.727473,0.751648,0.631868,0.936264,3.259341


In [12]:
# Create a list of each unique player we have in our dataframe
player_list = percentile_df.Player.unique().tolist()

# Create a new dataframe for combined, averaged percentiles over the past 5 seasons
new_df = pd.DataFrame(columns = ['Player', 'Rank', 'Trend', 'Pos', 'Years', 'BA_Percentile', 'R_Percentile', 'HR_Percentile', 'RBI_Percentile', 'SB_Percentile'])

# Create a list for each percentile stat category for upcoming loop
player_trends = []
average_BA = []
average_R = []
average_HR = []
average_RBI = []
average_SB = []
average_Rank = []
year_count = []
pos = []

# Loop through each player, check if they played in the past two seasons. If not, remove them
for player in player_list:
    filter_df = percentile_df.loc[percentile_df['Player'] == player]
    filter_df = filter_df.sort_values(['Year'], ascending=[False])
    year_list = filter_df.Year.tolist()
    if (year_list[0] != last_year) and (year_list[0] != (last_year - 1)):
        player_list.remove(player)

# Update new dataframe with updated unique player list
new_df['Player'] = player_list        

# Loop through each player, locate their percentile stats for each season, average them out
for player in player_list:
    player_df = percentile_df.loc[percentile_df['Player'] == player]
    
    # We want to find the slope of the line of best fit for each player's overall ranking each season
    x = np.array(player_df['Year'], dtype = float)
    y = np.array(player_df['Rank'], dtype = float)
    slope, intercept = np.polyfit(x, y, 1)
    player_trends.append(slope)
    
    # Find average of each player's percentiles from previous 5 seasons
    average_BA.append(sum(player_df['BA_Percentile']) / len(player_df['BA_Percentile']))
    average_R.append(sum(player_df['R_Percentile']) / len(player_df['R_Percentile']))
    average_HR.append(sum(player_df['HR_Percentile']) / len(player_df['HR_Percentile']))
    average_RBI.append(sum(player_df['RBI_Percentile']) / len(player_df['RBI_Percentile']))
    average_SB.append(sum(player_df['SB_Percentile']) / len(player_df['SB_Percentile']))
    average_Rank.append(sum(player_df['Rank']) / len(player_df['Rank']))
    year_count.append(len(x))
    
    # Keep player positions for reference purposes during the draft
    pos.append(player_df['Pos'].unique())

# Update new dataframe with the list data from each stat
new_df['Pos'] = pos
new_df['Trend'] = player_trends
new_df['BA_Percentile'] = average_BA
new_df['R_Percentile'] = average_R
new_df['HR_Percentile'] = average_HR
new_df['RBI_Percentile'] = average_RBI
new_df['SB_Percentile'] = average_SB
new_df['Rank'] = average_Rank

# Keep track of how many seasons are being considered, so we know how reliable the data is
new_df['Years'] = year_count



  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

In [13]:
# Create a weighted rank column by adding trend data to the rank data and account for number of seasons played
# Basically, if you played all 5 seasons, your trend stat is added directly
# If you played fewer than all 5 seasons, your trend stat is reduced depending on how few seasons you played
new_df['Weighted Rank'] = (new_df['Rank'] + ((new_df['Trend'] * (new_df['Years'] - 1) / 4)))

# shift column 'Weighted Rank' to first position
first_column = new_df.pop('Weighted Rank')
  
# insert column using insert(position,column_name,first_column) function
new_df.insert(1, 'Weighted Rank', first_column)

new_df = new_df.sort_values('Weighted Rank', ascending = False)

In [14]:
# separate position column into a list for editing
new_pos_list = []
pos_list = new_df['Pos'].tolist()

# loop through list and pull only the last item, which represents player position listed from most recent season
for i in pos_list:
    j = i[-1]
    
    if re.search('/', j):
        k = re.sub("([^\/]+$)","",j)
        new_pos_list.append(k)
    else:
        new_pos_list.append(j)

cleaned_list = []
for pos in new_pos_list:
    placeholder = re.findall("[a-zA-Z0-9]+", pos)
    placeholder_2 = ''.join(placeholder)
    placeholder_3 = [d for d in placeholder_2]
    cleaned_list.append(placeholder_3)

cleaned_pos_list = []
for n_list in cleaned_list:
    
    placeholder_list = []
    for pos in n_list:
        if pos == '1':
            placeholder_list.append('P')
        elif pos == '2':
            placeholder_list.append('C')
        elif pos == '3':
            placeholder_list.append('1B')
        elif pos == '4':
            placeholder_list.append('2B')
        elif pos == '5':
            placeholder_list.append('3B')
        elif pos == '6':
            placeholder_list.append('SS')
        elif pos == ('7'):
            placeholder_list.append('OF')
        elif pos == ('8'):
            placeholder_list.append('OF')
        elif pos == ('9'):
            placeholder_list.append('OF')
        elif pos == ('D'):
            placeholder_list.append('DH')
        
    cleaned_pos_list.append(placeholder_list)        

temp_pos_list = []
for item in cleaned_pos_list:
    new_string = []
    for pos in item:
        string = str(pos)
        new_string = f'{new_string},{string}'
    temp_pos_list.append(new_string)

    
final_pos_list = []
for i in temp_pos_list:
    i = i.replace('[],', '')
    final_pos_list.append(i)
    
# replace old position column with new position column
new_df.drop('Pos', axis = 1, inplace = True)
new_df['Pos'] = final_pos_list

new_df.head(25)

Unnamed: 0,Player,Weighted Rank,Rank,Trend,Years,BA_Percentile,R_Percentile,HR_Percentile,RBI_Percentile,SB_Percentile,Pos
199,Shohei Ohtani,4.796784,4.328196,0.468588,5,0.704882,0.892269,0.910337,0.892377,0.928331,DH
365,Bobby Witt Jr.,4.790617,4.652996,0.275242,3,0.846372,0.95459,0.92222,0.948723,0.98109,SS
122,José Ramírez,4.717098,4.712929,0.004169,5,0.857743,0.964488,0.954428,0.964809,0.971461,"3B,DH"
71,Freddie Freeman,4.58857,4.596189,-0.007619,5,0.969491,0.978282,0.903572,0.952754,0.79209,1B
568,Jackson Merrill,4.535165,4.535165,0.00112,1,0.956044,0.876923,0.903297,0.943956,0.854945,OF
210,Trea Turner,4.532218,4.633157,-0.100939,5,0.930688,0.973361,0.882337,0.883647,0.963124,SS
70,Francisco Lindor,4.50992,4.202946,0.306974,5,0.651391,0.899645,0.859854,0.878345,0.913712,SS
167,Mookie Betts,4.498335,4.502801,-0.004466,5,0.864219,0.957867,0.912408,0.890535,0.877773,"SS,OF,2B"
2,Aaron Judge,4.496712,4.122588,0.374124,5,0.848865,0.866336,0.942103,0.858185,0.607099,"OF,DH"
402,Julio Rodríguez,4.473259,4.556567,-0.166615,3,0.871964,0.920724,0.905554,0.893947,0.964377,"OF,DH"


In [15]:
# Create a list to help create a dataframe from pitcher statistics data
pitcher_stats = []

for year in last_five_years:

    # input URL and use BeautifulSoup to parse through the page
    pitching_url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-pitching.shtml'
    pitching_soup = BeautifulSoup(requests.get(pitching_url).content, 'html.parser')

    # Grab the table element that has batter statistics
    pitching_table = pitching_soup.select_one('#all_players_standard_pitching')#.find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')

    # Grab data from table and put it into the list created above
    for tr in pitching_table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        pitcher_stats.append(tds)
        

In [16]:
# Create dataframe for batter statistics
pitcher_stats_df = pd.DataFrame(pitcher_stats)

# Create an empty list to store dataframe header information
pitcher_header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in pitching_table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    pitcher_header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
pitcher_df_headers = pitcher_header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
pitcher_df_headers.remove('Rk')
pitcher_df_headers.append("Year")

# Set column headers equal to our list
pitcher_stats_df.columns = pitcher_df_headers

In [17]:
# Change types of columns to numeric for columns with number values
pitcher_stats_df[['Age', 'W','ERA','SO','SV','WHIP','ERA+','SO9','IP','G','GS']] = pitcher_stats_df[['Age', 'W','ERA','SO','SV','WHIP','ERA+','SO9','IP','G','GS']].apply(pd.to_numeric)

# Drop any players with NaN innings pitched, ERA, and WHIP to remove null values 
pitcher_stats_df.dropna(subset=['IP'], axis = 0 , inplace= True)
pitcher_stats_df.dropna(subset=['ERA'], axis = 0 , inplace= True)
pitcher_stats_df.dropna(subset=['WHIP'], axis = 0 , inplace= True)
pitcher_stats_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Remove any pitchers with fewer than 30 innings pitched
pitcher_stats_df = pitcher_stats_df[pitcher_stats_df['IP'] >= 30]

# Make ERA and WHIP negative so high values become "low" when sorted with all other columns
pitcher_stats_df['ERA'] = pitcher_stats_df['ERA'] * -1
pitcher_stats_df['WHIP'] = pitcher_stats_df['WHIP'] * -1

# pitcher_stats_df['GS%'] = pitcher_stats_df['GS'] / pitcher_stats_df['G']
# games_started_list = pitcher_stats_df['GS%'].to_list()
# pitcher_pos_assignment = []

# for item in games_started_list:
#     if item < .25:
#         pitcher_pos_assignment.append('RP')
#     if item >= .25 and item <= .75:
#         pitcher_pos_assignment.append('SP,RP')
#     if item > .75:
#         pitcher_pos_assignment.append('SP')

# pitcher_stats_df['Pos'] = pitcher_pos_assignment
        
# Select the columns we want for our pitcher analysis
final_pitcher_stats_df = pitcher_stats_df[['Year','Player','Age', 'W','ERA','SO','SV','WHIP','ERA+','SO9','IP','G','GS',]]

# Eliminate Baseball Reference's name badges for accolades
final_pitcher_stats_df['Player'] = final_pitcher_stats_df['Player'].str.extract('([^\*|#]*)')

pitcher_list = final_pitcher_stats_df.Player.tolist()

cleaned_pitcher_list = []
for pitcher in pitcher_list:
    pitcher = pitcher.replace("\xa0", " ")
    cleaned_pitcher_list.append(pitcher) 
    
final_pitcher_stats_df['Player'] = cleaned_pitcher_list        


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_pitcher_stats_df['Player'] = final_pitcher_stats_df['Player'].str.extract('([^\*|#]*)')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_pitcher_stats_df['Player'] = cleaned_pitcher_list


In [18]:
# Sort by index to prepare to drop duplicates
final_pitcher_stats_df = final_pitcher_stats_df.sort_index()

# Drop duplicate entries of Player Name and Year
# This is to eliminate partial season data for players who played for 2+ teams in one season
final_pitcher_stats_df = final_pitcher_stats_df.drop_duplicates(subset=['Year', 'Player'])

# Weighting ERA and WHIP with Innings Pitched so that relievers do not dominate these categories
final_pitcher_stats_df['ERA++'] = final_pitcher_stats_df['IP'] * -(1 / final_pitcher_stats_df['ERA'])
final_pitcher_stats_df['WHIP++'] = final_pitcher_stats_df['IP'] * -(1 / final_pitcher_stats_df['WHIP'])

In [19]:
# Sort data by name alphabetically, then by year in descending order
final_pitcher_stats_df = final_pitcher_stats_df.sort_values(['Year','Player'], ascending=[True, True])

In [20]:
final_pitcher_stats_df

Unnamed: 0,Year,Player,Age,W,ERA,SO,SV,WHIP,ERA+,SO9,IP,G,GS,ERA++,WHIP++
4917,2020.0,Aaron Civale,25.0,4.0,-4.74,69.0,0.0,-1.324,94.000,8.4,74.0,12.0,12.0,15.611814,55.891239
4924,2020.0,Aaron Nola,27.0,5.0,-3.28,96.0,0.0,-1.079,137.000,12.1,71.1,12.0,12.0,21.676829,65.894347
4942,2020.0,Adam Wainwright,38.0,5.0,-3.15,54.0,0.0,-1.051,133.000,7.4,65.2,10.0,10.0,20.698413,62.036156
4973,2020.0,Adrian Houser,27.0,1.0,-5.30,44.0,0.0,-1.500,86.000,7.1,56.0,12.0,11.0,10.566038,37.333333
4948,2020.0,Alec Mills,28.0,5.0,-4.48,46.0,0.0,-1.155,101.000,6.6,62.1,11.0,11.0,13.861607,53.766234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
429,2024.0,Ángel Zerpa,24.0,2.0,-3.86,49.0,0.0,-1.453,110.000,8.2,53.2,60.0,0.0,13.782383,36.613902
2302,,Jordan Montgomery,30.0,1.0,-6.00,0.0,31.0,-0.900,1.355,,37.0,5.0,0.0,6.166667,41.111111
4796,,Max Fried,27.0,2.0,-5.00,1.0,27.2,-1.000,1.193,,30.0,5.0,0.0,6.000000,30.000000
2300,,Nathan Eovaldi,33.0,0.0,-6.00,1.0,36.2,-0.700,1.118,,31.0,6.0,0.0,5.166667,44.285714


In [21]:
# Create a new dataframe for stats percentile calculations
pitcher_percentile_df = pd.DataFrame(columns = ['Year', 'Player', 'Age', 'W', 'ERA', 'SO', 'SV', 'WHIP', 'SO9', 'IP'])

# Carry over columnns from final_pitcher_stats_df that shouldn't be comparatively ranked 
pitcher_percentile_df['Year'] = final_pitcher_stats_df['Year']
pitcher_percentile_df['Player'] = final_pitcher_stats_df['Player']
pitcher_percentile_df['Age'] = final_pitcher_stats_df['Age']
# pitcher_percentile_df['Pos'] = final_pitcher_stats_df['Pos']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
for year in last_five_years:
    year_df = final_pitcher_stats_df.loc[final_pitcher_stats_df['Year'] == year]
    year_df['W_Percentile'] = year_df['W'].rank(pct=True)
    year_df['ERA_Percentile'] = year_df['ERA++'].rank(pct=True)
    year_df['SO_Percentile'] = year_df['SO'].rank(pct=True)
    year_df['SV_Percentile'] = year_df['SV'].rank(pct=True)
    year_df['WHIP_Percentile'] = year_df['WHIP++'].rank(pct=True)
    year_df.sort_values('Player', ascending=True)
    
    
    if year == last_year:
        year_df['GS%'] = year_df['GS'] / year_df['G']
        games_started_list = year_df['GS%'].to_list()
        pitcher_pos_assignment = []

        for item in games_started_list:
            if item < .25:
                pitcher_pos_assignment.append('RP')
            if item >= .25 and item <= .75:
                pitcher_pos_assignment.append('SP,RP')
            if item > .75:
                pitcher_pos_assignment.append('SP')

        year_df['Pos'] = pitcher_pos_assignment
        
    else:
        year_df['Pos'] = ''

    
    
    # Each of the seasons are added back to the percentile dataframe
    pitcher_percentile_df = pitcher_percentile_df.append(year_df, ignore_index=True)

pitcher_percentile_df = pitcher_percentile_df.sort_values(['Year','Player'], ascending=[True, True])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['W_Percentile'] = year_df['W'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['ERA_Percentile'] = year_df['ERA++'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['SO_Percentile'] = year_df['SO'].rank(pct=True)
A value is trying to be set on a co

In [22]:
# Clean up the percentile dataframe, drop NaNs and remove unnecessary columns
pitcher_percentile_df = pitcher_percentile_df[pitcher_percentile_df['IP'].notna()]

# Add a rank column that adds the percentiles from each category
pitcher_percentile_df['Rank'] = (pitcher_percentile_df['W_Percentile'] + pitcher_percentile_df['ERA_Percentile'] + pitcher_percentile_df['SO_Percentile'] + pitcher_percentile_df['SV_Percentile'] + pitcher_percentile_df['WHIP_Percentile'])


In [23]:
pd.set_option('display.max_columns', None)
pitcher_percentile_df.tail(50)

Unnamed: 0,Year,Player,Age,W,ERA,SO,SV,WHIP,SO9,IP,ERA+,G,GS,ERA++,WHIP++,W_Percentile,ERA_Percentile,SO_Percentile,SV_Percentile,WHIP_Percentile,GS%,Pos,Rank
2478,2024.0,Taylor Rogers,33.0,1.0,-2.4,64.0,0.0,-1.25,9.6,60.0,161.0,64.0,0.0,25.0,48.0,0.117089,0.670886,0.484177,0.304852,0.42616,0.0,RP,2.003165
2479,2024.0,Thyago Vieira,31.0,0.0,-5.21,39.0,1.0,-1.658,9.2,38.0,83.0,28.0,0.0,7.293666,22.91918,0.030591,0.078059,0.17616,0.693038,0.054852,0.0,RP,1.0327
2480,2024.0,Tim Herrin,27.0,5.0,-1.92,68.0,0.0,-0.975,9.3,65.2,213.0,75.0,0.0,33.958333,66.871795,0.612869,0.793249,0.530591,0.304852,0.649789,0.0,RP,2.89135
2481,2024.0,Tim Hill,34.0,4.0,-3.36,31.0,0.0,-1.433,4.2,67.0,123.0,62.0,0.0,19.940476,46.755059,0.521097,0.567511,0.090717,0.304852,0.407173,0.0,RP,1.89135
2482,2024.0,Tim Mayza,32.0,0.0,-6.33,28.0,0.0,-1.617,5.9,42.2,65.0,50.0,0.0,6.666667,26.097712,0.030591,0.061181,0.066456,0.304852,0.099156,0.0,RP,0.562236
2483,2024.0,Tobias Myers,25.0,9.0,-3.0,127.0,0.0,-1.174,8.3,138.0,140.0,27.0,25.0,46.0,117.546848,0.85654,0.915612,0.834388,0.304852,0.841772,0.925926,SP,3.753165
2484,2024.0,Tommy Henry,26.0,2.0,-7.04,30.0,0.0,-1.748,7.0,38.1,60.0,9.0,7.0,5.411932,21.796339,0.244726,0.029536,0.083333,0.304852,0.042194,0.777778,SP,0.704641
2485,2024.0,Tommy Kahnle,34.0,0.0,-2.11,46.0,1.0,-1.148,9.7,42.2,196.0,50.0,0.0,20.0,36.759582,0.030591,0.57173,0.251055,0.693038,0.253165,0.0,RP,1.799578
2486,2024.0,Tony Santillan,27.0,3.0,-3.0,46.0,0.0,-1.0,13.8,30.0,148.0,29.0,1.0,10.0,30.0,0.39557,0.185654,0.251055,0.304852,0.147679,0.034483,RP,1.28481
2487,2024.0,Trent Thornton,30.0,4.0,-3.61,77.0,1.0,-1.134,9.6,72.1,102.0,71.0,0.0,19.972299,63.580247,0.521097,0.56962,0.619198,0.693038,0.624473,0.0,RP,3.027426


In [24]:
# Create a list of each unique player we have in our dataframe
pitcher_list = pitcher_percentile_df.Player.unique().tolist()

# Create a new dataframe for combined, averaged percentiles over the past 5 seasons
new_pitcher_df = pd.DataFrame(columns = ['Player', 'Rank', 'Trend', 'Years', 'W_Percentile', 'ERA_Percentile', 'SO_Percentile', 'SV_Percentile', 'WHIP_Percentile','Pos'])

# Create a list for each percentile stat category for upcoming loop
pitcher_trends = []
average_W = []
average_ERA = []
average_SO = []
average_SV = []
average_WHIP = []
pitcher_average_Rank = []
pitcher_year_count = []
pos = []

cleaned_pitcher_list = []
for pitcher in pitcher_list:
    pitcher = pitcher.replace("\xa0", " ")
    cleaned_pitcher_list.append(pitcher) 

# new_pitcher_df
    
# Loop through each player, check if they played in the past two seasons. If not, remove them
for pitcher in cleaned_pitcher_list:
    filter_df = pitcher_percentile_df.loc[pitcher_percentile_df['Player'] == pitcher]
    filter_df = filter_df.sort_values(['Year'], ascending=[False])
    year_list = filter_df.Year.tolist()
    if (year_list[0] != last_year) and (year_list[0] != (last_year - 1)):
        pitcher_list.remove(pitcher)

# Update new dataframe with updated unique player list
new_pitcher_df['Player'] = pitcher_list        

# Loop through each player, locate their percentile stats for each season, average them out
for pitcher in pitcher_list:
    pitcher_df = pitcher_percentile_df.loc[pitcher_percentile_df['Player'] == pitcher]
    
    # We want to find the slope of the line of best fit for each player's overall ranking each season
    x = np.array(pitcher_df['Year'], dtype = float)
    y = np.array(pitcher_df['Rank'], dtype = float)
    slope, intercept = np.polyfit(x, y, 1)
    pitcher_trends.append(slope)
    
    # Find average of each player's percentiles from previous 5 seasons
    average_W.append(sum(pitcher_df['W_Percentile']) / len(pitcher_df['W_Percentile']))
    average_ERA.append(sum(pitcher_df['ERA_Percentile']) / len(pitcher_df['ERA_Percentile']))
    average_SO.append(sum(pitcher_df['SO_Percentile']) / len(pitcher_df['SO_Percentile']))
    average_SV.append(sum(pitcher_df['SV_Percentile']) / len(pitcher_df['SV_Percentile']))
    average_WHIP.append(sum(pitcher_df['WHIP_Percentile']) / len(pitcher_df['WHIP_Percentile']))
    pitcher_average_Rank.append(sum(pitcher_df['Rank']) / len(pitcher_df['Rank']))
    pitcher_year_count.append(len(x))

    # Keep player positions for reference purposes during the draft
    pitcher_pos = pitcher_df['Pos'].unique()
    
    pitcher_pos_list = list(pitcher_pos)
    pitcher_pos_string = ''

    for item in pitcher_pos_list:
        if pitcher_pos_string == '':
            pitcher_pos_string = item
        else:
            pitcher_pos_string = pitcher_pos_string + ',' + item

    pos.append(pitcher_pos_string)


    
# Update new dataframe with the list data from each stat
new_pitcher_df['Pos'] = pos
new_pitcher_df['Trend'] = pitcher_trends
new_pitcher_df['W_Percentile'] = average_W
new_pitcher_df['ERA_Percentile'] = average_ERA
new_pitcher_df['SO_Percentile'] = average_SO
new_pitcher_df['SV_Percentile'] = average_SV
new_pitcher_df['WHIP_Percentile'] = average_WHIP
new_pitcher_df['Rank'] = pitcher_average_Rank

# Keep track of how many seasons are being considered, so we know how reliable the data is
new_pitcher_df['Years'] = pitcher_year_count

new_pitcher_df = new_pitcher_df.sort_values('Rank', ascending = False)

new_pitcher_df.head(50)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

Unnamed: 0,Player,Rank,Trend,Years,W_Percentile,ERA_Percentile,SO_Percentile,SV_Percentile,WHIP_Percentile,Pos
605,Shota Imanaga,4.186709,0.001034,1,0.983122,0.974684,0.945148,0.304852,0.978903,SP
24,Corbin Burnes,4.137758,0.054384,5,0.890847,0.976761,0.966472,0.347159,0.956519,SP
472,Kodai Senga,4.12395,0.001019,1,0.942227,0.970588,0.971639,0.313025,0.926471,
386,Spencer Strider,4.112315,0.250161,2,0.953723,0.923753,0.989362,0.3113,0.934177,
1,Aaron Nola,4.112232,0.036961,5,0.907588,0.913401,0.979958,0.347159,0.964126,SP
109,Zack Wheeler,4.083005,0.126895,5,0.91741,0.954335,0.909795,0.347159,0.954306,SP
35,Gerrit Cole,4.074129,-0.194745,5,0.945286,0.909579,0.941776,0.347159,0.93033,SP
33,Framber Valdez,4.060859,0.089012,5,0.940067,0.928186,0.91862,0.347159,0.926827,SP
594,Paul Skenes,4.049578,0.001,1,0.918776,0.987342,0.931435,0.304852,0.907173,SP
267,Shohei Ohtani,4.01353,0.028846,3,0.913058,0.931004,0.942989,0.316643,0.909835,


In [25]:
# Create a weighted rank column by adding trend data to the rank data and account for number of seasons played
# Basically, if you played all 5 seasons, your trend stat is added directly
# If you played fewer than all 5 seasons, your trend stat is reduced depending on how few seasons you played
new_pitcher_df['Weighted Rank'] = (new_pitcher_df['Rank'] + ((new_pitcher_df['Trend'] * (new_pitcher_df['Years'] - 1) / 4)))

# shift column 'Weighted Rank' to first position
first_pitcher_column = new_pitcher_df.pop('Weighted Rank')
  
# insert column using insert(position,column_name,first_column) function
new_pitcher_df.insert(1, 'Weighted Rank', first_pitcher_column)

new_pitcher_df = new_pitcher_df.sort_values('Weighted Rank', ascending = False)

# Removing duplicate Pos listings for players
uncleaned_pos_column = new_pitcher_df['Pos'].to_list()
cleaned_pos_list = []
for item in uncleaned_pos_column:
    temp_cleaning_list = []
    temp_list = item.split(',')
    for temp_item in temp_list:
        if temp_item not in temp_cleaning_list:
            temp_cleaning_list.append(temp_item)
        else:
            continue
    cleaned_pos_list.append(temp_cleaning_list)
    
cleaned_pos_column = []
for item in cleaned_pos_list:
    temp_string = ''
    for mini_item in item:
        if temp_string == '':
            temp_string = mini_item
        else:
            temp_string = temp_string + ',' + mini_item
    cleaned_pos_column.append(temp_string)
    
new_pitcher_df['Pos'] = cleaned_pos_column

new_pitcher_df = new_pitcher_df[new_pitcher_df.Player != 'LgAvg per 180 IP']

new_pitcher_df.head(25)


Unnamed: 0,Player,Weighted Rank,Rank,Trend,Years,W_Percentile,ERA_Percentile,SO_Percentile,SV_Percentile,WHIP_Percentile,Pos
109,Zack Wheeler,4.2099,4.083005,0.126895,5,0.91741,0.954335,0.909795,0.347159,0.954306,SP
24,Corbin Burnes,4.192142,4.137758,0.054384,5,0.890847,0.976761,0.966472,0.347159,0.956519,SP
605,Shota Imanaga,4.186709,4.186709,0.001034,1,0.983122,0.974684,0.945148,0.304852,0.978903,SP
386,Spencer Strider,4.174855,4.112315,0.250161,2,0.953723,0.923753,0.989362,0.3113,0.934177,
33,Framber Valdez,4.149871,4.060859,0.089012,5,0.940067,0.928186,0.91862,0.347159,0.926827,SP
1,Aaron Nola,4.149193,4.112232,0.036961,5,0.907588,0.913401,0.979958,0.347159,0.964126,SP
61,Logan Webb,4.143993,3.766763,0.377231,5,0.865377,0.845889,0.858272,0.347159,0.850066,SP
329,George Kirby,4.135331,4.011499,0.247664,3,0.923987,0.924337,0.914251,0.309151,0.939773,SP
472,Kodai Senga,4.12395,4.12395,0.001019,1,0.942227,0.970588,0.971639,0.313025,0.926471,
229,Logan Gilbert,4.122456,3.950322,0.229513,4,0.867496,0.89571,0.935953,0.313696,0.937468,SP


In [26]:
# Create a new dataframe for stats percentile calculations
new_percentile_df = pd.DataFrame(columns = ['Player','Pos'])

# Carry over columnns from final_batter_stats_df that shouldn't be comparatively ranked 
new_percentile_df['Player'] = new_df['Player']
new_percentile_df['Pos'] = new_df['Pos']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
pos_list = ['C','1B','2B','3B','SS','OF','DH']

for pos in pos_list:
    pos_filtered_df = new_df[new_df['Pos'].str.contains(pos)]

    pos_filtered_df[f'{pos} Weighted Rank'] = pos_filtered_df['Weighted Rank'].rank(pct=True)
#     pos_filtered_df[f'{pos}_AVG_FPTS_Percentile'] = pos_filtered_df['AVG_FPTS'].rank(pct=True)

    # Each of the seasons are added back to the percentile dataframe
    new_percentile_df = new_percentile_df.append(pos_filtered_df, ignore_index=True)
        
#     new_percentile_df['New_Pos'] = pos

        
new_percentile_df = new_percentile_df[new_percentile_df['Rank'].notna()]

new_percentile_df = new_percentile_df.sort_values('Player', ascending=True)
# new_percentile_df = new_percentile_df.drop(['Tm','G','R','H','2B','3B','HR','RBI','SB','TB','BB','SO','PO','A','E','PA','OPS','OPS+','1B'],axis=1)

new_percentile_df = new_percentile_df.fillna(0)

catcher_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('C')]
catcher_percentile_df['Ranked_Pos'] = 'C'
catcher_percentile_df = catcher_percentile_df[['Player','Ranked_Pos','Years','Weighted Rank','C Weighted Rank']]
catcher_percentile_df['Pos_Ranking'] = (catcher_percentile_df['C Weighted Rank'])
catcher_percentile_df = catcher_percentile_df.drop(['C Weighted Rank'],axis = 1)


firstbase_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('1B')]
firstbase_percentile_df['Ranked_Pos'] = '1B'
firstbase_percentile_df = firstbase_percentile_df[['Player','Ranked_Pos','Years','Weighted Rank','1B Weighted Rank']]
firstbase_percentile_df['Pos_Ranking'] = (firstbase_percentile_df['1B Weighted Rank'])
firstbase_percentile_df = firstbase_percentile_df.drop(['1B Weighted Rank'],axis = 1)


secondbase_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('2B')]
secondbase_percentile_df['Ranked_Pos'] = '2B'
secondbase_percentile_df = secondbase_percentile_df[['Player','Ranked_Pos','Years','Weighted Rank','2B Weighted Rank']]
secondbase_percentile_df['Pos_Ranking'] = (secondbase_percentile_df['2B Weighted Rank'])
secondbase_percentile_df = secondbase_percentile_df.drop(['2B Weighted Rank'],axis = 1)


thirdbase_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('3B')]
thirdbase_percentile_df['Ranked_Pos'] = '3B'
thirdbase_percentile_df = thirdbase_percentile_df[['Player','Ranked_Pos','Years','Weighted Rank','3B Weighted Rank']]
thirdbase_percentile_df['Pos_Ranking'] = (thirdbase_percentile_df['3B Weighted Rank'])
thirdbase_percentile_df = thirdbase_percentile_df.drop(['3B Weighted Rank'],axis = 1)


SS_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('SS')]
SS_percentile_df['Ranked_Pos'] = 'SS'
SS_percentile_df = SS_percentile_df[['Player','Ranked_Pos','Years','Weighted Rank','SS Weighted Rank']]
SS_percentile_df['Pos_Ranking'] = (SS_percentile_df['SS Weighted Rank'])
SS_percentile_df = SS_percentile_df.drop(['SS Weighted Rank'],axis = 1)


OF_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('OF')]
OF_percentile_df['Ranked_Pos'] = 'OF'
OF_percentile_df = OF_percentile_df[['Player','Ranked_Pos','Years','Weighted Rank','OF Weighted Rank']]
OF_percentile_df['Pos_Ranking'] = (OF_percentile_df['OF Weighted Rank'])
OF_percentile_df = OF_percentile_df.drop(['OF Weighted Rank'],axis = 1)

DH_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('DH')]
DH_percentile_df['Ranked_Pos'] = 'DH'
DH_percentile_df = DH_percentile_df[['Player','Ranked_Pos','Years','Weighted Rank','DH Weighted Rank']]
DH_percentile_df['Pos_Ranking'] = (DH_percentile_df['DH Weighted Rank'])
DH_percentile_df = DH_percentile_df.drop(['DH Weighted Rank'],axis = 1)

# OF_percentile_df.sort_values('Pos_Ranking',ascending = False).head(20)
pos_rank_df = pd.concat([catcher_percentile_df, firstbase_percentile_df,secondbase_percentile_df,thirdbase_percentile_df,SS_percentile_df,OF_percentile_df,DH_percentile_df])

final_df = pd.merge(pos_rank_df, new_df,  how='left', left_on=['Player','Years','Weighted Rank'], right_on = ['Player','Years','Weighted Rank'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos_filtered_df[f'{pos} Weighted Rank'] = pos_filtered_df['Weighted Rank'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  catcher_percentile_df['Ranked_Pos'] = 'C'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  firstbase_percentile_df['Ranked_Pos'] = '1B'
A value is trying to be set 

In [27]:
new_pitcher_percentile_df = pd.DataFrame(columns = ['Player','Pos'])

# # Carry over columnns from final_batter_stats_df that shouldn't be comparatively ranked 
new_pitcher_percentile_df['Player'] = new_pitcher_df['Player']
new_pitcher_percentile_df['Pos'] = new_pitcher_df['Pos']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
pos_list = ['SP','RP']

for pos in pos_list:
    pos_filtered_df = new_pitcher_df[new_pitcher_df['Pos'].str.contains(pos)]

    pos_filtered_df[f'{pos} Weighted Rank'] = pos_filtered_df['Weighted Rank'].rank(pct=True)
#     pos_filtered_df[f'{pos}_AVG_FPTS_Percentile'] = pos_filtered_df['AVG_FPTS'].rank(pct=True)

    # Each of the seasons are added back to the percentile dataframe
    new_pitcher_percentile_df = new_pitcher_percentile_df.append(pos_filtered_df, ignore_index=True)
            
        
new_pitcher_percentile_df = new_pitcher_percentile_df[new_pitcher_percentile_df['Rank'].notna()]

new_pitcher_percentile_df = new_pitcher_percentile_df.sort_values('Player', ascending=True)

new_pitcher_percentile_df = new_pitcher_percentile_df.fillna(0)

SP_percentile_df = new_pitcher_percentile_df[new_pitcher_percentile_df['Pos'].str.contains('SP')]
SP_percentile_df['Ranked_Pos'] = 'SP'
SP_percentile_df = SP_percentile_df[['Player','Ranked_Pos','Years','Weighted Rank','SP Weighted Rank']]
SP_percentile_df['Pos_Ranking'] = (SP_percentile_df['SP Weighted Rank'])
SP_percentile_df = SP_percentile_df.drop(['SP Weighted Rank'],axis = 1)

RP_percentile_df = new_pitcher_percentile_df[new_pitcher_percentile_df['Pos'].str.contains('RP')]
RP_percentile_df['Ranked_Pos'] = 'RP'
RP_percentile_df = RP_percentile_df[['Player','Ranked_Pos','Years','Weighted Rank','RP Weighted Rank']]
RP_percentile_df['Pos_Ranking'] = (RP_percentile_df['RP Weighted Rank'])
RP_percentile_df = RP_percentile_df.drop(['RP Weighted Rank'],axis = 1)

pitch_rank_df = pd.concat([SP_percentile_df, RP_percentile_df])

final_pitch_df = pd.merge(pitch_rank_df, new_pitcher_df,  how='left', left_on=['Player','Years','Weighted Rank'], right_on = ['Player','Years','Weighted Rank'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos_filtered_df[f'{pos} Weighted Rank'] = pos_filtered_df['Weighted Rank'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SP_percentile_df['Ranked_Pos'] = 'SP'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  RP_percentile_df['Ranked_Pos'] = 'RP'


In [28]:
draft_df = pd.concat([final_df, final_pitch_df], ignore_index=True, sort=False)

# draft_df.drop(columns = ['Rank','FPTS_Percentile','AVG_FPTS_Percentile'])


# draft_df['FPTS_Percentile'] = draft_df['FPTS'].rank(pct=True)
# draft_df['AVG_FPTS_Percentile'] = draft_df['AVG_FPTS'].rank(pct=True)
    
# draft_df['Rank'] = (draft_df['FPTS_Percentile'] + (draft_df['AVG_FPTS_Percentile'] / 2))

# draft_df['Weighted_Rank'] = ''
# draft_df['Weighted_Rank'] = abs(draft_df['FPTS'] * (1 + (draft_df['Trend'] * (draft_df['Years'] - 1) / 4)))
# shift column 'Weighted Rank' to first position
# weighted_rank_column = draft_df.pop('Weighted Rank')
  
# # insert column using insert(position,column_name,first_column) function
# draft_df.insert(1, 'Weighted Rank', weighted_rank_column)


# draft_df['Weighted_Rank'] = ''
# draft_df['Weighted_Rank'] = abs(draft_df['FPTS'] * (1 + draft_df['Trend']))

weighted_rank_column = draft_df.pop('Weighted Rank')
  
# insert column using insert(position,column_name,first_column) function
draft_df.insert(1, 'Weighted Rank', weighted_rank_column)


# pitcher_percentile_df = pitcher_percentile_df.sort_values(['Year','Player'], ascending=[True, True])

draft_df = draft_df.sort_values(['Pos_Ranking','Weighted Rank'], ascending=[False,False])
# draft_df["G"].fillna("N/A", inplace = True)
# draft_df["GS"].fillna("N/A", inplace = True)

draft_df.head(50)

Unnamed: 0,Player,Weighted Rank,Ranked_Pos,Years,Pos_Ranking,Rank,Trend,BA_Percentile,R_Percentile,HR_Percentile,RBI_Percentile,SB_Percentile,Pos,W_Percentile,ERA_Percentile,SO_Percentile,SV_Percentile,WHIP_Percentile
1881,Shohei Ohtani,4.796784,DH,5.0,1.0,4.328196,0.468588,0.704882,0.892269,0.910337,0.892377,0.928331,DH,,,,,
1006,Bobby Witt Jr.,4.790617,SS,3.0,1.0,4.652996,0.275242,0.846372,0.95459,0.92222,0.948723,0.98109,SS,,,,,
857,José Ramírez,4.717098,3B,5.0,1.0,4.712929,0.004169,0.857743,0.964488,0.954428,0.964809,0.971461,"3B,DH",,,,,
208,Freddie Freeman,4.58857,1B,5.0,1.0,4.596189,-0.007619,0.969491,0.978282,0.903572,0.952754,0.79209,1B,,,,,
1336,Jackson Merrill,4.535165,OF,1.0,1.0,4.535165,0.00112,0.956044,0.876923,0.903297,0.943956,0.854945,OF,,,,,
599,Mookie Betts,4.498335,2B,5.0,1.0,4.502801,-0.004466,0.864219,0.957867,0.912408,0.890535,0.877773,"SS,OF,2B",,,,,
2191,Zack Wheeler,4.2099,SP,5.0,1.0,4.083005,0.126895,,,,,,SP,0.91741,0.954335,0.909795,0.347159,0.954306
113,William Contreras,4.064834,C,4.0,1.0,3.310158,1.006234,0.743184,0.681233,0.728604,0.676658,0.48048,"C,DH",,,,,
2454,Ryan Walker,3.887243,RP,2.0,1.0,3.556131,1.324447,,,,,,RP,0.765003,0.709823,0.691183,0.813921,0.576202
2387,Kirby Yates,3.880727,RP,2.0,0.996503,3.693944,0.747132,,,,,,RP,0.763675,0.739385,0.670024,0.94115,0.57971


In [29]:
######################################################################################################################
######################################################################################################################
######################################################################################################################
#### DRAFT DAY FUNCTIONS
   
# DROP A PLAYER 
def drafted(player):
    global draft_df
    global final_pitcher_stats_df
    global final_batter_stats_df
    draft_df = draft_df[draft_df.Player != player]
    final_pitcher_stats_df = final_pitcher_stats_df[final_pitcher_stats_df.Player != player]
    final_batter_stats_df = final_batter_stats_df[final_batter_stats_df.Player != player]
    return draft_df.head(25)
    
# FILTER PLAYERS BY POSITION
def position_filter(Pos):
    filtered_draft_df = draft_df[draft_df['Pos'].str.contains(Pos)]
    return filtered_draft_df.head(25)

# PULL PITCHING STAT CATEGORY LEADERS
def pitching_stat_leaders(CAT):
    global final_pitcher_stats_df
    pitching_filtered_draft_df = draft_df.sort_values([CAT], ascending=[False])
    return pitching_filtered_draft_df.head(25)

# PULL BATTING STAT CATEGORY LEADERS
def batting_stat_leaders(CAT):
    global final_batter_stats_df
    batting_filtered_draft_df = draft_df.sort_values([CAT], ascending=[False])
    return batting_filtered_draft_df.head(25)

def drop_all_position(POS):
    global draft_df
    draft_df = draft_df[draft_df.Ranked_Pos != POS]
    return draft_df.head(25)

In [152]:
# drafted('Bobby Witt Jr.')
drop_all_position('C')
drafted('Shohei Ohtani')
drafted('Bobby Witt Jr.')
drafted('Aaron Judge')
drafted('Elly De La Cruz')
drafted('José Ramírez')
drafted('Kyle Tucker')
drafted('Juan Soto')
drafted('Freddie Freeman')
drafted('Mookie Betts')
drafted('Francisco Lindor')
drafted('Julio Rodríguez')
drafted('Fernando Tatis Jr.')
drafted('Paul Skenes')
drafted('Gunnar Henderson')
drafted('Tarik Skubal')
drafted('Yordan Alvarez')
drafted('Zack Wheeler')
drafted('Jackson Chourio')
drafted('Ronald Acuna Jr.')
drafted('Bryce Harper')
drafted('Jaren Duran')
drafted('Vladimir Guerrero Jr.')
drafted('Jazz Chisholm Jr.')
drafted('Ketel Marte')
drafted('Trea Turner')
drafted('Garrett Crochet')
drafted('Jackson Merrill')
drafted('Manny Machado')
drafted('William Contreras')
drafted('Logan Gilbert')
drafted('Rafael Devers')
drafted('Logan Gilbert')
drafted('Teoscar Hernández')
drafted('Corbin Burnes')
drafted('Austin Riley')
drafted('Cole Ragens')
drafted('Pete Alonso')
drop_all_position('DH')
drafted('CJ Abrams')
drafted('Matt Olson')
drafted('Corey Seager')
drafted('Jose Altuve')
drafted('Chris Sale')
drafted('Aaron Nola')
drafted('Framber Valdez')
drafted('Emmanuel Clase')
drafted('Kirby Yates')
drafted('Junior Caminero')
drafted('Josh Hader')
drafted('Yoshinobu Yamamoto')
drafted('Marcell Ozuna')
drafted('Kyle Schwarber')
drafted('Shota Imanaga')
drafted('Logan Webb')
drafted('Tanner Scott')
drafted('Michael King')
drafted('Blake Snell')
drop_all_position('RP')
drafted('Dylan Cease')
drafted('Bryan Reynolds')
drafted('Cody Bellinger')
drafted('Alex Bregman')
drafted('Spencer Schwellenbach')
drafted('Felix Bautista')
drafted('Christian Yelich')
drafted('Hunter Greene')
drafted('Raisel Iglesias')
drafted('Max Fried')
drafted('Salvador Perez')
drafted('Jordan Westburg')
drafted('Matt McLain')
drafted('Bryce Miller')
drafted('Will Smith')
drafted('Josh Naylor')
drafted('Tyler Glasnow')
drafted('Matt Chapman')
drafted('Luis Robert Jr.')
drafted('Luis Castillo')
drafted('James Wood')
drafted('Pablo López')
drafted('Dylan Crews')
drafted('Vinnie Pasquantino')
drafted('Corbin Carroll')
drafted('Mike Trout')
drafted('George Kirby')
drafted('Marcus Semien')
drafted('Christian Walker')
drafted('Seiya Suzuki')
drafted('Sonny Gray')
drafted('Bailey Ober')
drafted('Freddy Peralta')
drafted('Wyatt Langford')
drafted('Kevin Gausman')
drafted('Tanner Bibee')
drafted('Ozzie Albies')
drafted('Zac Gallen')
drafted('Oneil Cruz')
drafted('Ian Happ')
drafted('Josh Jung')
drafted('Xander Bogaerts')
drafted('Luis Gil')
drafted('Randy Arozarena')
drafted('Adolis García')
drafted('Bryson Stott')
drafted('Jake Burger')
drafted('Matt Shaw')
drafted('Hunter Brown')
drafted('Matt Vierling')
drafted('Sandy Alcantara')
drafted('Kodai Senga')
drafted('Brice Turang')
drafted('Willy Adames')
drafted('Alec Bohm')
drafted('Triston Casas')
drafted('Spencer Steer')
drafted('Joe Ryan')
drafted('Jack Flaherty')
drafted('Robbie Ray')
drafted('Nick Castellanos')
drafted('Eugenio Suárez')
drafted('Carlos Rodon')
drafted('Bryan Woo')
drafted('Brenton Doyle')
drafted('Seth Lugo')
drafted('Jeremy Peña')
drafted('Paul Goldschmidt')
drafted('Dansby Swanson')

Unnamed: 0,Player,Weighted Rank,Ranked_Pos,Years,Pos_Ranking,Rank,Trend,BA_Percentile,R_Percentile,HR_Percentile,RBI_Percentile,SB_Percentile,Pos,W_Percentile,ERA_Percentile,SO_Percentile,SV_Percentile,WHIP_Percentile
923,Nolan Arenado,3.862819,3B,5.0,0.969925,3.681941,0.180877,0.745742,0.782854,0.846164,0.881659,0.425523,3B,,,,,
492,Gleyber Torres,3.890325,2B,5.0,0.969697,3.439582,0.450743,0.679351,0.72309,0.643766,0.689242,0.704132,2B,,,,,
443,Ceddanne Rafaela,3.871429,2B,1.0,0.962121,3.871429,0.000956,0.591209,0.815385,0.703297,0.876923,0.884615,"OF,SS,2B",,,,,
2058,José Berríos,4.050422,SP,5.0,0.947137,3.956241,0.094181,,,,,,SP,0.928182,0.875163,0.906373,0.347159,0.899364
614,Nico Hoerner,3.704639,2B,5.0,0.931818,3.043914,0.660725,0.774559,0.617968,0.289173,0.500118,0.862096,"2B,SS",,,,,
1971,Chris Bassitt,4.015643,SP,5.0,0.929515,3.990767,0.024876,,,,,,SP,0.932417,0.928536,0.883194,0.347159,0.899461
1048,Ezequiel Tovar,4.066,SS,2.0,0.929412,3.953978,0.448087,0.714734,0.891525,0.795616,0.86376,0.688344,SS,,,,,
1982,Colin Rea,4.003553,SP,2.0,0.92511,3.796136,0.829668,,,,,,SP,0.826554,0.791684,0.817967,0.503032,0.856899
949,Ryan McMahon,3.621427,3B,5.0,0.924812,3.444108,0.177318,0.481558,0.783209,0.822084,0.807951,0.549307,3B,,,,,
408,Andrés Giménez,3.654298,2B,5.0,0.924242,3.219519,0.434779,0.616312,0.640485,0.486061,0.541261,0.9354,2B,,,,,


In [153]:
position_filter('SP')

Unnamed: 0,Player,Weighted Rank,Ranked_Pos,Years,Pos_Ranking,Rank,Trend,BA_Percentile,R_Percentile,HR_Percentile,RBI_Percentile,SB_Percentile,Pos,W_Percentile,ERA_Percentile,SO_Percentile,SV_Percentile,WHIP_Percentile
2058,José Berríos,4.050422,SP,5.0,0.947137,3.956241,0.094181,,,,,,SP,0.928182,0.875163,0.906373,0.347159,0.899364
1971,Chris Bassitt,4.015643,SP,5.0,0.929515,3.990767,0.024876,,,,,,SP,0.932417,0.928536,0.883194,0.347159,0.899461
1982,Colin Rea,4.003553,SP,2.0,0.92511,3.796136,0.829668,,,,,,SP,0.826554,0.791684,0.817967,0.503032,0.856899
2039,Jameson Taillon,3.992661,SP,4.0,0.9163,3.943478,0.065577,,,,,,SP,0.888915,0.864236,0.870307,0.410859,0.90916
2144,Sean Manaea,3.945838,SP,5.0,0.903084,3.709191,0.236647,,,,,,SP,0.838046,0.768984,0.834757,0.42489,0.842514
1970,Charlie Morton,3.917943,SP,5.0,0.894273,3.509088,0.408855,,,,,,SP,0.792803,0.761377,0.848314,0.347159,0.759437
2143,Ryne Nelson,3.905416,SP,2.0,0.889868,3.742266,0.652599,,,,,,SP,0.855864,0.771651,0.775904,0.503032,0.835815
2119,Nick Pivetta,3.898521,SP,4.0,0.885463,4.006276,-0.143673,,,,,,SP,0.834115,0.832477,0.944244,0.509112,0.886329
2109,Mitch Keller,3.892883,SP,4.0,0.881057,3.566291,0.435455,,,,,,SP,0.786867,0.785079,0.868111,0.313696,0.812539
2014,Gerrit Cole,3.879384,SP,5.0,0.876652,4.074129,-0.194745,,,,,,SP,0.945286,0.909579,0.941776,0.347159,0.93033
