In [1]:
# Import needed dependencies
import requests
import re
import pandas as pd
import numpy as np
import scipy.stats as stats
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
today = date.today()

# dd/mm/YY
current_year = today.strftime("%Y")

last_year = int(current_year) - 1

In [3]:
last_five_years = []
for i in range(1,6):
    last_five_years.append(int(current_year) - i)

In [4]:
# Create a list to help create a dataframe from batter statistics data
batter_stats = []

# Create a loop to create a dataframe from Baseball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-batting.shtml'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has batter statistics
    table = BeautifulSoup(soup.select_one('#all_players_standard_batting').find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')


    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        batter_stats.append(tds)

In [5]:
# Create dataframe for batter statistics
batter_stats_df = pd.DataFrame(batter_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')
df_headers.append("Year")

# Set column headers equal to our list
batter_stats_df.columns = df_headers

In [6]:
# Change types of columns to numeric for columns with number values
batter_stats_df[['Age', 'R','HR','RBI','SB','BA','PA','OPS','OPS+']] = batter_stats_df[['Age', 'R','HR','RBI','SB','BA','PA','OPS','OPS+']].apply(pd.to_numeric)

# Drop any players with 0 plate appearances to remove null values and change PA type to integer
batter_stats_df.dropna(subset=['PA'], axis = 0 , inplace= True)

# Remove any players with fewer than 100 plate appearances
filtered_batter_stats_df = batter_stats_df[batter_stats_df['PA'] >= 100]

# Select the columns we want for our batter analysis
final_batter_stats_df = filtered_batter_stats_df[['Year','Name','Tm','Age','R','HR','RBI','SB','BA','PA','OPS','OPS+','Pos\xa0Summary']]

In [7]:
# Sort by index to prepare to drop duplicates
final_batter_stats_df = final_batter_stats_df.sort_index()

# Drop duplicate entries of Player Name and Year
# This is to eliminate partial season data for players who played for 2+ teams in one season
final_batter_stats_df = final_batter_stats_df.drop_duplicates(subset=['Year', 'Name'])

In [8]:
# Sort data by name alphabetically, then by year in descending order
final_batter_stats_df = final_batter_stats_df.sort_values(['Year','Name'], ascending=[True, True])

# Eliminate Baseball Reference's name badges for accolades
final_batter_stats_df['Name'] = final_batter_stats_df['Name'].str.extract('([^\*|#]*)')

cleaned_player_list = []
for player in final_batter_stats_df['Name']:
    player = player.replace("\xa0", " ")
    cleaned_player_list.append(player)

final_batter_stats_df['Name'] = cleaned_player_list        

In [9]:
# Create a new dataframe for stats percentile calculations
percentile_df = pd.DataFrame(columns = ['Year', 'Name', 'Tm', 'Age', 'BA', 'R', 'HR', 'RBI', 'SB', 'PA'])

# Carry over columnns from final_batter_stats_df that shouldn't be comparatively ranked 
percentile_df['Year'] = final_batter_stats_df['Year']
percentile_df['Name'] = final_batter_stats_df['Name']
percentile_df['Age'] = final_batter_stats_df['Age']
percentile_df['Tm'] = final_batter_stats_df['Tm']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
for year in last_five_years:
    year_df = final_batter_stats_df.loc[final_batter_stats_df['Year'] == year]
    year_df['BA_Percentile'] = year_df['BA'].rank(pct=True)
    year_df['R_Percentile'] = year_df['R'].rank(pct=True)
    year_df['HR_Percentile'] = year_df['HR'].rank(pct=True)
    year_df['RBI_Percentile'] = year_df['RBI'].rank(pct=True)
    year_df['SB_Percentile'] = year_df['SB'].rank(pct=True)
    year_df.sort_values('Name', ascending=True)

    # Each of the seasons are added back to the percentile dataframe
    percentile_df = percentile_df.append(year_df, ignore_index=True)

percentile_df = percentile_df.sort_values(['Year','Name'], ascending=[True, True])
percentile_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['BA_Percentile'] = year_df['BA'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['R_Percentile'] = year_df['R'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['HR_Percentile'] = year_df['HR'].rank(pct=True)
A value is trying to be set on a copy o

Unnamed: 0,Year,Name,Tm,Age,BA,R,HR,RBI,SB,PA,OPS,OPS+,Pos Summary,BA_Percentile,R_Percentile,HR_Percentile,RBI_Percentile,SB_Percentile
0,2019,AJ Pollock,LAD,31.0,,,,,,,,,,,,,,
3866,2019,AJ Pollock,LAD,31.0,0.266,49,15,47,5,342,0.795,107.0,87/HD,0.650442,0.553097,0.603982,0.542035,0.707965
1,2019,Aaron Hicks,NYY,29.0,,,,,,,,,,,,,,
3867,2019,Aaron Hicks,NYY,29.0,0.235,41,12,36,1,255,0.769,103.0,8/DH,0.324115,0.441372,0.502212,0.404867,0.331858
2,2019,Aaron Judge,NYY,27.0,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2618,2023,Zach Remillard,CHW,29.0,0.252,16,1,18,4,160,0.615,69.0,4/H5976D,0.584416,0.139610,0.041126,0.179654,0.516234
2157,2023,Zack Gelof,OAK,23.0,,,,,,,,,,,,,,
2619,2023,Zack Gelof,OAK,23.0,0.267,40,14,32,14,300,0.840,137.0,4,0.778139,0.465368,0.627706,0.380952,0.835498
2158,2023,Zack Short,DET,28.0,,,,,,,,,,,,,,


In [10]:
# Clean up the percentile dataframe, drop NaNs and remove unnecessary columns
percentile_df = percentile_df.dropna()

# Add a rank column that adds the percentiles from each category
percentile_df['Rank'] = (percentile_df['BA_Percentile'] + percentile_df['R_Percentile'] + percentile_df['HR_Percentile'] + percentile_df['RBI_Percentile'] + percentile_df['SB_Percentile'])
percentile_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  percentile_df['Rank'] = (percentile_df['BA_Percentile'] + percentile_df['R_Percentile'] + percentile_df['HR_Percentile'] + percentile_df['RBI_Percentile'] + percentile_df['SB_Percentile'])


Unnamed: 0,Year,Name,Tm,Age,BA,R,HR,RBI,SB,PA,OPS,OPS+,Pos Summary,BA_Percentile,R_Percentile,HR_Percentile,RBI_Percentile,SB_Percentile,Rank
3866,2019,AJ Pollock,LAD,31.0,0.266,49,15,47,5,342,0.795,107.0,87/HD,0.650442,0.553097,0.603982,0.542035,0.707965,3.057522
3867,2019,Aaron Hicks,NYY,29.0,0.235,41,12,36,1,255,0.769,103.0,8/DH,0.324115,0.441372,0.502212,0.404867,0.331858,2.004425
3868,2019,Aaron Judge,NYY,27.0,0.272,75,27,55,3,447,0.921,143.0,9D/H,0.722345,0.786504,0.849558,0.613938,0.549779,3.522124
3869,2019,Adalberto Mondesí,KCR,23.0,0.263,58,9,62,43,443,0.715,85.0,6/DH,0.617257,0.646018,0.375000,0.700221,0.997788,3.336283
3870,2019,Adam Duvall,ATL,30.0,0.267,17,10,19,0,130,0.882,117.0,7H/9,0.664823,0.127212,0.412611,0.173673,0.131637,1.509956
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2616,2023,Zach McKinstry,DET,28.0,0.231,60,9,35,16,518,0.654,79.0,549H67/8D1,0.332251,0.698052,0.451299,0.433983,0.871212,2.786797
2617,2023,Zach Neto,LAA,22.0,0.225,38,9,34,5,329,0.685,86.0,6/H,0.277056,0.441558,0.451299,0.416667,0.579004,2.165584
2618,2023,Zach Remillard,CHW,29.0,0.252,16,1,18,4,160,0.615,69.0,4/H5976D,0.584416,0.139610,0.041126,0.179654,0.516234,1.461039
2619,2023,Zack Gelof,OAK,23.0,0.267,40,14,32,14,300,0.840,137.0,4,0.778139,0.465368,0.627706,0.380952,0.835498,3.087662


In [11]:
# Create a list of each unique player we have in our dataframe
player_list = percentile_df.Name.unique().tolist()

# Create a new dataframe for combined, averaged percentiles over the past 5 seasons
new_df = pd.DataFrame(columns = ['Name', 'Rank', 'Trend', 'Pos', 'Years', 'BA_Percentile', 'R_Percentile', 'HR_Percentile', 'RBI_Percentile', 'SB_Percentile'])

# Create a list for each percentile stat category for upcoming loop
player_trends = []
average_BA = []
average_R = []
average_HR = []
average_RBI = []
average_SB = []
average_Rank = []
year_count = []
pos = []

# Loop through each player, check if they played in the past two seasons. If not, remove them
for player in player_list:
    filter_df = percentile_df.loc[percentile_df['Name'] == player]
    filter_df = filter_df.sort_values(['Year'], ascending=[False])
    year_list = filter_df.Year.tolist()
    if (year_list[0] != last_year) and (year_list[0] != (last_year - 1)):
        player_list.remove(player)

# Update new dataframe with updated unique player list
new_df['Name'] = player_list        

# Loop through each player, locate their percentile stats for each season, average them out
for player in player_list:
    player_df = percentile_df.loc[percentile_df['Name'] == player]
    
    # We want to find the slope of the line of best fit for each player's overall ranking each season
    x = np.array(player_df['Year'], dtype = float)
    y = np.array(player_df['Rank'], dtype = float)
    slope, intercept = np.polyfit(x, y, 1)
    player_trends.append(slope)
    
    # Find average of each player's percentiles from previous 5 seasons
    average_BA.append(sum(player_df['BA_Percentile']) / len(player_df['BA_Percentile']))
    average_R.append(sum(player_df['R_Percentile']) / len(player_df['R_Percentile']))
    average_HR.append(sum(player_df['HR_Percentile']) / len(player_df['HR_Percentile']))
    average_RBI.append(sum(player_df['RBI_Percentile']) / len(player_df['RBI_Percentile']))
    average_SB.append(sum(player_df['SB_Percentile']) / len(player_df['SB_Percentile']))
    average_Rank.append(sum(player_df['Rank']) / len(player_df['Rank']))
    year_count.append(len(x))
    
    # Keep player positions for reference purposes during the draft
    pos.append(player_df['Pos\xa0Summary'].unique())

# Update new dataframe with the list data from each stat
new_df['Pos'] = pos
new_df['Trend'] = player_trends
new_df['BA_Percentile'] = average_BA
new_df['R_Percentile'] = average_R
new_df['HR_Percentile'] = average_HR
new_df['RBI_Percentile'] = average_RBI
new_df['SB_Percentile'] = average_SB
new_df['Rank'] = average_Rank

# Keep track of how many seasons are being considered, so we know how reliable the data is
new_df['Years'] = year_count



  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

In [12]:
# Create a weighted rank column by adding trend data to the rank data and account for number of seasons played
# Basically, if you played all 5 seasons, your trend stat is added directly
# If you played fewer than all 5 seasons, your trend stat is reduced depending on how few seasons you played
new_df['Weighted Rank'] = (new_df['Rank'] + ((new_df['Trend'] * (new_df['Years'] - 1) / 4)))

# shift column 'Weighted Rank' to first position
first_column = new_df.pop('Weighted Rank')
  
# insert column using insert(position,column_name,first_column) function
new_df.insert(1, 'Weighted Rank', first_column)

new_df = new_df.sort_values('Weighted Rank', ascending = False)

In [13]:
# separate position column into a list for editing
new_pos_list = []
pos_list = new_df['Pos'].tolist()

# loop through list and pull only the last item, which represents player position listed from most recent season
for i in pos_list:
    j = i[-1]
    
    if re.search('/', j):
        k = re.sub("([^\/]+$)","",j)
        new_pos_list.append(k)
    else:
        new_pos_list.append(j)

cleaned_list = []
for pos in new_pos_list:
    placeholder = re.findall("[a-zA-Z0-9]+", pos)
    placeholder_2 = ''.join(placeholder)
    placeholder_3 = [d for d in placeholder_2]
    cleaned_list.append(placeholder_3)

cleaned_pos_list = []
for n_list in cleaned_list:
    
    placeholder_list = []
    for pos in n_list:
        if pos == '1':
            placeholder_list.append('P')
        elif pos == '2':
            placeholder_list.append('C')
        elif pos == '3':
            placeholder_list.append('1B')
        elif pos == '4':
            placeholder_list.append('2B')
        elif pos == '5':
            placeholder_list.append('3B')
        elif pos == '6':
            placeholder_list.append('SS')
        elif pos == ('7'):
            placeholder_list.append('OF')
        elif pos == ('8'):
            placeholder_list.append('OF')
        elif pos == ('9'):
            placeholder_list.append('OF')
        elif pos == ('D'):
            placeholder_list.append('DH')
        
    cleaned_pos_list.append(placeholder_list)        

temp_pos_list = []
for item in cleaned_pos_list:
    new_string = []
    for pos in item:
        string = str(pos)
        new_string = f'{new_string},{string}'
    temp_pos_list.append(new_string)

    
final_pos_list = []
for i in temp_pos_list:
    i = i.replace('[],', '')
    final_pos_list.append(i)
    
# replace old position column with new position column
new_df.drop('Pos', axis = 1, inplace = True)
new_df['Pos'] = final_pos_list

new_df.head(25)

Unnamed: 0,Name,Weighted Rank,Rank,Trend,Years,BA_Percentile,R_Percentile,HR_Percentile,RBI_Percentile,SB_Percentile,Pos
101,Freddie Freeman,4.714065,4.647196,0.06687,5,0.966538,0.994326,0.923942,0.963171,0.799218,1B
494,Julio Rodríguez,4.709913,4.684977,0.099742,2,0.876893,0.946037,0.9491,0.93063,0.982318,OF
169,José Ramírez,4.645912,4.519598,0.126314,5,0.783998,0.913682,0.91537,0.938574,0.967974,"3B,DH"
290,Trea Turner,4.643129,4.582798,0.060331,5,0.922151,0.968028,0.8525,0.855654,0.984464,SS
338,Kyle Tucker,4.625874,4.490333,0.180721,4,0.807053,0.878376,0.886734,0.96802,0.950149,OF
451,Bobby Witt Jr.,4.613446,4.527759,0.342751,2,0.772244,0.934224,0.900576,0.929281,0.991434,"SS,DH"
231,Mookie Betts,4.579494,4.544659,0.034834,5,0.855381,0.983853,0.92686,0.884627,0.893938,"OF,2B,SS"
273,Shohei Ohtani,4.473284,4.069562,0.403722,5,0.679595,0.805005,0.845811,0.832532,0.90662,"DH,P"
97,Fernando Tatis Jr.,4.458339,4.42148,0.049145,4,0.809775,0.892786,0.911165,0.844821,0.962933,OF
261,Ronald Acuna Jr.,4.441384,4.403861,0.037523,5,0.80178,0.921198,0.899455,0.809049,0.972379,OF


In [14]:
# Create a list to help create a dataframe from batter statistics data
pitcher_stats = []

for year in last_five_years:

    # input URL and use BeautifulSoup to parse through the page
    pitching_url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-pitching.shtml'
    pitching_soup = BeautifulSoup(requests.get(pitching_url).content, 'html.parser')

    # Grab the table element that has batter statistics
    pitching_table = BeautifulSoup(pitching_soup.select_one('#all_players_standard_pitching').find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')

    # Grab data from table and put it into the list created above
    for tr in pitching_table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        pitcher_stats.append(tds)
        

In [15]:
# Create dataframe for batter statistics
pitcher_stats_df = pd.DataFrame(pitcher_stats)

# Create an empty list to store dataframe header information
pitcher_header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in pitching_table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    pitcher_header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
pitcher_df_headers = pitcher_header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
pitcher_df_headers.remove('Rk')
pitcher_df_headers.append("Year")

# Set column headers equal to our list
pitcher_stats_df.columns = pitcher_df_headers

In [16]:
# Change types of columns to numeric for columns with number values
pitcher_stats_df[['Age', 'W','ERA','SO','SV','WHIP','ERA+','SO9','IP','G','GS']] = pitcher_stats_df[['Age', 'W','ERA','SO','SV','WHIP','ERA+','SO9','IP','G','GS']].apply(pd.to_numeric)

# Drop any players with NaN innings pitched, ERA, and WHIP to remove null values 
pitcher_stats_df.dropna(subset=['IP'], axis = 0 , inplace= True)
pitcher_stats_df.dropna(subset=['ERA'], axis = 0 , inplace= True)
pitcher_stats_df.dropna(subset=['WHIP'], axis = 0 , inplace= True)
pitcher_stats_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Remove any pitchers with fewer than 30 innings pitched
pitcher_stats_df = pitcher_stats_df[pitcher_stats_df['IP'] >= 30]

# Make ERA and WHIP negative so high values become "low" when sorted with all other columns
pitcher_stats_df['ERA'] = pitcher_stats_df['ERA'] * -1
pitcher_stats_df['WHIP'] = pitcher_stats_df['WHIP'] * -1

# pitcher_stats_df['GS%'] = pitcher_stats_df['GS'] / pitcher_stats_df['G']
# games_started_list = pitcher_stats_df['GS%'].to_list()
# pitcher_pos_assignment = []

# for item in games_started_list:
#     if item < .25:
#         pitcher_pos_assignment.append('RP')
#     if item >= .25 and item <= .75:
#         pitcher_pos_assignment.append('SP,RP')
#     if item > .75:
#         pitcher_pos_assignment.append('SP')

# pitcher_stats_df['Pos'] = pitcher_pos_assignment
        
# Select the columns we want for our pitcher analysis
final_pitcher_stats_df = pitcher_stats_df[['Year','Name','Age', 'W','ERA','SO','SV','WHIP','ERA+','SO9','IP','G','GS',]]

# Eliminate Baseball Reference's name badges for accolades
final_pitcher_stats_df['Name'] = final_pitcher_stats_df['Name'].str.extract('([^\*|#]*)')

pitcher_list = final_pitcher_stats_df.Name.tolist()

cleaned_pitcher_list = []
for pitcher in pitcher_list:
    pitcher = pitcher.replace("\xa0", " ")
    cleaned_pitcher_list.append(pitcher) 
    
final_pitcher_stats_df['Name'] = cleaned_pitcher_list        


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_pitcher_stats_df['Name'] = final_pitcher_stats_df['Name'].str.extract('([^\*|#]*)')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_pitcher_stats_df['Name'] = cleaned_pitcher_list


In [17]:
# Sort by index to prepare to drop duplicates
final_pitcher_stats_df = final_pitcher_stats_df.sort_index()

# Drop duplicate entries of Player Name and Year
# This is to eliminate partial season data for players who played for 2+ teams in one season
final_pitcher_stats_df = final_pitcher_stats_df.drop_duplicates(subset=['Year', 'Name'])

# Weighting ERA and WHIP with Innings Pitched so that relievers do not dominate these categories
final_pitcher_stats_df['ERA++'] = final_pitcher_stats_df['IP'] * -(1 / final_pitcher_stats_df['ERA'])
final_pitcher_stats_df['WHIP++'] = final_pitcher_stats_df['IP'] * -(1 / final_pitcher_stats_df['WHIP'])

In [18]:
# Sort data by name alphabetically, then by year in descending order
final_pitcher_stats_df = final_pitcher_stats_df.sort_values(['Year','Name'], ascending=[True, True])

In [19]:
# Create a new dataframe for stats percentile calculations
pitcher_percentile_df = pd.DataFrame(columns = ['Year', 'Name', 'Age', 'W', 'ERA', 'SO', 'SV', 'WHIP', 'SO9', 'IP'])

# Carry over columnns from final_pitcher_stats_df that shouldn't be comparatively ranked 
pitcher_percentile_df['Year'] = final_pitcher_stats_df['Year']
pitcher_percentile_df['Name'] = final_pitcher_stats_df['Name']
pitcher_percentile_df['Age'] = final_pitcher_stats_df['Age']
# pitcher_percentile_df['Pos'] = final_pitcher_stats_df['Pos']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
for year in last_five_years:
    year_df = final_pitcher_stats_df.loc[final_pitcher_stats_df['Year'] == year]
    year_df['W_Percentile'] = year_df['W'].rank(pct=True)
    year_df['ERA_Percentile'] = year_df['ERA++'].rank(pct=True)
    year_df['SO_Percentile'] = year_df['SO'].rank(pct=True)
    year_df['SV_Percentile'] = year_df['SV'].rank(pct=True)
    year_df['WHIP_Percentile'] = year_df['WHIP++'].rank(pct=True)
    year_df.sort_values('Name', ascending=True)
    
    
    if year == last_year:
        year_df['GS%'] = year_df['GS'] / year_df['G']
        games_started_list = year_df['GS%'].to_list()
        pitcher_pos_assignment = []

        for item in games_started_list:
            if item < .25:
                pitcher_pos_assignment.append('RP')
            if item >= .25 and item <= .75:
                pitcher_pos_assignment.append('SP,RP')
            if item > .75:
                pitcher_pos_assignment.append('SP')

        year_df['Pos'] = pitcher_pos_assignment
        
    else:
        year_df['Pos'] = ''

    
    
    # Each of the seasons are added back to the percentile dataframe
    pitcher_percentile_df = pitcher_percentile_df.append(year_df, ignore_index=True)

pitcher_percentile_df = pitcher_percentile_df.sort_values(['Year','Name'], ascending=[True, True])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['W_Percentile'] = year_df['W'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['ERA_Percentile'] = year_df['ERA++'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['SO_Percentile'] = year_df['SO'].rank(pct=True)
A value is trying to be set on a co

In [20]:
# Clean up the percentile dataframe, drop NaNs and remove unnecessary columns
pitcher_percentile_df = pitcher_percentile_df[pitcher_percentile_df['IP'].notna()]

# Add a rank column that adds the percentiles from each category
pitcher_percentile_df['Rank'] = (pitcher_percentile_df['W_Percentile'] + pitcher_percentile_df['ERA_Percentile'] + pitcher_percentile_df['SO_Percentile'] + pitcher_percentile_df['SV_Percentile'] + pitcher_percentile_df['WHIP_Percentile'])


In [21]:
# Create a list of each unique player we have in our dataframe
pitcher_list = pitcher_percentile_df.Name.unique().tolist()

# Create a new dataframe for combined, averaged percentiles over the past 5 seasons
new_pitcher_df = pd.DataFrame(columns = ['Name', 'Rank', 'Trend', 'Years', 'W_Percentile', 'ERA_Percentile', 'SO_Percentile', 'SV_Percentile', 'WHIP_Percentile','Pos'])

# Create a list for each percentile stat category for upcoming loop
pitcher_trends = []
average_W = []
average_ERA = []
average_SO = []
average_SV = []
average_WHIP = []
pitcher_average_Rank = []
pitcher_year_count = []
pos = []

cleaned_pitcher_list = []
for pitcher in pitcher_list:
    pitcher = pitcher.replace("\xa0", " ")
    cleaned_pitcher_list.append(pitcher) 

# new_pitcher_df
    
# Loop through each player, check if they played in the past two seasons. If not, remove them
for pitcher in cleaned_pitcher_list:
    filter_df = pitcher_percentile_df.loc[pitcher_percentile_df['Name'] == pitcher]
    filter_df = filter_df.sort_values(['Year'], ascending=[False])
    year_list = filter_df.Year.tolist()
    if (year_list[0] != last_year) and (year_list[0] != (last_year - 1)):
        pitcher_list.remove(pitcher)

# Update new dataframe with updated unique player list
new_pitcher_df['Name'] = pitcher_list        

# Loop through each player, locate their percentile stats for each season, average them out
for pitcher in pitcher_list:
    pitcher_df = pitcher_percentile_df.loc[pitcher_percentile_df['Name'] == pitcher]
    
    # We want to find the slope of the line of best fit for each player's overall ranking each season
    x = np.array(pitcher_df['Year'], dtype = float)
    y = np.array(pitcher_df['Rank'], dtype = float)
    slope, intercept = np.polyfit(x, y, 1)
    pitcher_trends.append(slope)
    
    # Find average of each player's percentiles from previous 5 seasons
    average_W.append(sum(pitcher_df['W_Percentile']) / len(pitcher_df['W_Percentile']))
    average_ERA.append(sum(pitcher_df['ERA_Percentile']) / len(pitcher_df['ERA_Percentile']))
    average_SO.append(sum(pitcher_df['SO_Percentile']) / len(pitcher_df['SO_Percentile']))
    average_SV.append(sum(pitcher_df['SV_Percentile']) / len(pitcher_df['SV_Percentile']))
    average_WHIP.append(sum(pitcher_df['WHIP_Percentile']) / len(pitcher_df['WHIP_Percentile']))
    pitcher_average_Rank.append(sum(pitcher_df['Rank']) / len(pitcher_df['Rank']))
    pitcher_year_count.append(len(x))

    # Keep player positions for reference purposes during the draft
    pitcher_pos = pitcher_df['Pos'].unique()
    
    pitcher_pos_list = list(pitcher_pos)
    pitcher_pos_string = ''

    for item in pitcher_pos_list:
        if pitcher_pos_string == '':
            pitcher_pos_string = item
        else:
            pitcher_pos_string = pitcher_pos_string + ',' + item

    pos.append(pitcher_pos_string)


    
# Update new dataframe with the list data from each stat
new_pitcher_df['Pos'] = pos
new_pitcher_df['Trend'] = pitcher_trends
new_pitcher_df['W_Percentile'] = average_W
new_pitcher_df['ERA_Percentile'] = average_ERA
new_pitcher_df['SO_Percentile'] = average_SO
new_pitcher_df['SV_Percentile'] = average_SV
new_pitcher_df['WHIP_Percentile'] = average_WHIP
new_pitcher_df['Rank'] = pitcher_average_Rank

# Keep track of how many seasons are being considered, so we know how reliable the data is
new_pitcher_df['Years'] = pitcher_year_count



  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

In [22]:
# Create a weighted rank column by adding trend data to the rank data and account for number of seasons played
# Basically, if you played all 5 seasons, your trend stat is added directly
# If you played fewer than all 5 seasons, your trend stat is reduced depending on how few seasons you played
new_pitcher_df['Weighted Rank'] = (new_pitcher_df['Rank'] + ((new_pitcher_df['Trend'] * (new_pitcher_df['Years'] - 1) / 4)))

# shift column 'Weighted Rank' to first position
first_pitcher_column = new_pitcher_df.pop('Weighted Rank')
  
# insert column using insert(position,column_name,first_column) function
new_pitcher_df.insert(1, 'Weighted Rank', first_pitcher_column)

new_pitcher_df = new_pitcher_df.sort_values('Weighted Rank', ascending = False)

# Removing duplicate Pos listings for players
uncleaned_pos_column = new_pitcher_df['Pos'].to_list()
cleaned_pos_list = []
for item in uncleaned_pos_column:
    temp_cleaning_list = []
    temp_list = item.split(',')
    for temp_item in temp_list:
        if temp_item not in temp_cleaning_list:
            temp_cleaning_list.append(temp_item)
        else:
            continue
    cleaned_pos_list.append(temp_cleaning_list)
    
cleaned_pos_column = []
for item in cleaned_pos_list:
    temp_string = ''
    for mini_item in item:
        if temp_string == '':
            temp_string = mini_item
        else:
            temp_string = temp_string + ',' + mini_item
    cleaned_pos_column.append(temp_string)
    
new_pitcher_df['Pos'] = cleaned_pos_column

new_pitcher_df = new_pitcher_df[new_pitcher_df.Name != 'LgAvg per 180 IP']

new_pitcher_df.head(25)


Unnamed: 0,Name,Weighted Rank,Rank,Trend,Years,W_Percentile,ERA_Percentile,SO_Percentile,SV_Percentile,WHIP_Percentile,Pos
81,Gerrit Cole,4.251872,4.267592,-0.01572,5,0.980988,0.965397,0.98993,0.350248,0.981029,SP
128,Justin Verlander,4.187474,4.215025,-0.055101,3,0.987762,0.98322,0.949487,0.315538,0.979018,SP
504,Spencer Strider,4.173957,4.111018,0.251759,2,0.953822,0.923914,0.989384,0.310643,0.933255,SP
48,Corbin Burnes,4.150733,3.588914,0.56182,5,0.717889,0.784213,0.875522,0.430161,0.781129,SP
587,Kodai Senga,4.121593,4.121593,0.001019,1,0.942348,0.97065,0.971698,0.312369,0.924528,SP
2,Aaron Nola,4.10781,4.091528,0.016282,5,0.895756,0.909592,0.977921,0.350248,0.958011,SP
244,Zack Wheeler,4.105253,4.027637,0.077616,5,0.895986,0.941005,0.898948,0.350248,0.94145,SP
77,Framber Valdez,4.097662,3.627305,0.470357,5,0.849418,0.796099,0.827803,0.350248,0.803738,SP
38,Chris Bassitt,4.097531,3.965874,0.131657,5,0.923637,0.924724,0.8664,0.350248,0.900865,SP
131,Kevin Gausman,4.092603,3.737113,0.35549,5,0.759839,0.837704,0.925801,0.350248,0.863522,SP


In [23]:
# Create a new dataframe for stats percentile calculations
new_percentile_df = pd.DataFrame(columns = ['Name','Pos'])

# Carry over columnns from final_batter_stats_df that shouldn't be comparatively ranked 
new_percentile_df['Name'] = new_df['Name']
new_percentile_df['Pos'] = new_df['Pos']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
pos_list = ['C','1B','2B','3B','SS','OF','DH']

for pos in pos_list:
    pos_filtered_df = new_df[new_df['Pos'].str.contains(pos)]

    pos_filtered_df[f'{pos} Weighted Rank'] = pos_filtered_df['Weighted Rank'].rank(pct=True)
#     pos_filtered_df[f'{pos}_AVG_FPTS_Percentile'] = pos_filtered_df['AVG_FPTS'].rank(pct=True)

    # Each of the seasons are added back to the percentile dataframe
    new_percentile_df = new_percentile_df.append(pos_filtered_df, ignore_index=True)
        
#     new_percentile_df['New_Pos'] = pos

        
new_percentile_df = new_percentile_df[new_percentile_df['Rank'].notna()]

new_percentile_df = new_percentile_df.sort_values('Name', ascending=True)
# new_percentile_df = new_percentile_df.drop(['Tm','G','R','H','2B','3B','HR','RBI','SB','TB','BB','SO','PO','A','E','PA','OPS','OPS+','1B'],axis=1)

new_percentile_df = new_percentile_df.fillna(0)

catcher_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('C')]
catcher_percentile_df['Ranked_Pos'] = 'C'
catcher_percentile_df = catcher_percentile_df[['Name','Ranked_Pos','Years','Weighted Rank','C Weighted Rank']]
catcher_percentile_df['Pos_Ranking'] = (catcher_percentile_df['C Weighted Rank'])
catcher_percentile_df = catcher_percentile_df.drop(['C Weighted Rank'],axis = 1)


firstbase_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('1B')]
firstbase_percentile_df['Ranked_Pos'] = '1B'
firstbase_percentile_df = firstbase_percentile_df[['Name','Ranked_Pos','Years','Weighted Rank','1B Weighted Rank']]
firstbase_percentile_df['Pos_Ranking'] = (firstbase_percentile_df['1B Weighted Rank'])
firstbase_percentile_df = firstbase_percentile_df.drop(['1B Weighted Rank'],axis = 1)


secondbase_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('2B')]
secondbase_percentile_df['Ranked_Pos'] = '2B'
secondbase_percentile_df = secondbase_percentile_df[['Name','Ranked_Pos','Years','Weighted Rank','2B Weighted Rank']]
secondbase_percentile_df['Pos_Ranking'] = (secondbase_percentile_df['2B Weighted Rank'])
secondbase_percentile_df = secondbase_percentile_df.drop(['2B Weighted Rank'],axis = 1)


thirdbase_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('3B')]
thirdbase_percentile_df['Ranked_Pos'] = '3B'
thirdbase_percentile_df = thirdbase_percentile_df[['Name','Ranked_Pos','Years','Weighted Rank','3B Weighted Rank']]
thirdbase_percentile_df['Pos_Ranking'] = (thirdbase_percentile_df['3B Weighted Rank'])
thirdbase_percentile_df = thirdbase_percentile_df.drop(['3B Weighted Rank'],axis = 1)


SS_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('SS')]
SS_percentile_df['Ranked_Pos'] = 'SS'
SS_percentile_df = SS_percentile_df[['Name','Ranked_Pos','Years','Weighted Rank','SS Weighted Rank']]
SS_percentile_df['Pos_Ranking'] = (SS_percentile_df['SS Weighted Rank'])
SS_percentile_df = SS_percentile_df.drop(['SS Weighted Rank'],axis = 1)


OF_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('OF')]
OF_percentile_df['Ranked_Pos'] = 'OF'
OF_percentile_df = OF_percentile_df[['Name','Ranked_Pos','Years','Weighted Rank','OF Weighted Rank']]
OF_percentile_df['Pos_Ranking'] = (OF_percentile_df['OF Weighted Rank'])
OF_percentile_df = OF_percentile_df.drop(['OF Weighted Rank'],axis = 1)

DH_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('DH')]
DH_percentile_df['Ranked_Pos'] = 'DH'
DH_percentile_df = DH_percentile_df[['Name','Ranked_Pos','Years','Weighted Rank','DH Weighted Rank']]
DH_percentile_df['Pos_Ranking'] = (DH_percentile_df['DH Weighted Rank'])
DH_percentile_df = DH_percentile_df.drop(['DH Weighted Rank'],axis = 1)

# OF_percentile_df.sort_values('Pos_Ranking',ascending = False).head(20)
pos_rank_df = pd.concat([catcher_percentile_df, firstbase_percentile_df,secondbase_percentile_df,thirdbase_percentile_df,SS_percentile_df,OF_percentile_df,DH_percentile_df])

final_df = pd.merge(pos_rank_df, new_df,  how='left', left_on=['Name','Years','Weighted Rank'], right_on = ['Name','Years','Weighted Rank'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos_filtered_df[f'{pos} Weighted Rank'] = pos_filtered_df['Weighted Rank'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  catcher_percentile_df['Ranked_Pos'] = 'C'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  firstbase_percentile_df['Ranked_Pos'] = '1B'
A value is trying to be set 

In [24]:
new_pitcher_percentile_df = pd.DataFrame(columns = ['Name','Pos'])

# # Carry over columnns from final_batter_stats_df that shouldn't be comparatively ranked 
new_pitcher_percentile_df['Name'] = new_pitcher_df['Name']
new_pitcher_percentile_df['Pos'] = new_pitcher_df['Pos']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
pos_list = ['SP','RP']

for pos in pos_list:
    pos_filtered_df = new_pitcher_df[new_pitcher_df['Pos'].str.contains(pos)]

    pos_filtered_df[f'{pos} Weighted Rank'] = pos_filtered_df['Weighted Rank'].rank(pct=True)
#     pos_filtered_df[f'{pos}_AVG_FPTS_Percentile'] = pos_filtered_df['AVG_FPTS'].rank(pct=True)

    # Each of the seasons are added back to the percentile dataframe
    new_pitcher_percentile_df = new_pitcher_percentile_df.append(pos_filtered_df, ignore_index=True)
            
        
new_pitcher_percentile_df = new_pitcher_percentile_df[new_pitcher_percentile_df['Rank'].notna()]

new_pitcher_percentile_df = new_pitcher_percentile_df.sort_values('Name', ascending=True)

new_pitcher_percentile_df = new_pitcher_percentile_df.fillna(0)

SP_percentile_df = new_pitcher_percentile_df[new_pitcher_percentile_df['Pos'].str.contains('SP')]
SP_percentile_df['Ranked_Pos'] = 'SP'
SP_percentile_df = SP_percentile_df[['Name','Ranked_Pos','Years','Weighted Rank','SP Weighted Rank']]
SP_percentile_df['Pos_Ranking'] = (SP_percentile_df['SP Weighted Rank'])
SP_percentile_df = SP_percentile_df.drop(['SP Weighted Rank'],axis = 1)

RP_percentile_df = new_pitcher_percentile_df[new_pitcher_percentile_df['Pos'].str.contains('RP')]
RP_percentile_df['Ranked_Pos'] = 'RP'
RP_percentile_df = RP_percentile_df[['Name','Ranked_Pos','Years','Weighted Rank','RP Weighted Rank']]
RP_percentile_df['Pos_Ranking'] = (RP_percentile_df['RP Weighted Rank'])
RP_percentile_df = RP_percentile_df.drop(['RP Weighted Rank'],axis = 1)

pitch_rank_df = pd.concat([SP_percentile_df, RP_percentile_df])

final_pitch_df = pd.merge(pitch_rank_df, new_pitcher_df,  how='left', left_on=['Name','Years','Weighted Rank'], right_on = ['Name','Years','Weighted Rank'])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos_filtered_df[f'{pos} Weighted Rank'] = pos_filtered_df['Weighted Rank'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SP_percentile_df['Ranked_Pos'] = 'SP'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  RP_percentile_df['Ranked_Pos'] = 'RP'


In [25]:
draft_df = pd.concat([final_df, final_pitch_df], ignore_index=True, sort=False)

# draft_df.drop(columns = ['Rank','FPTS_Percentile','AVG_FPTS_Percentile'])


# draft_df['FPTS_Percentile'] = draft_df['FPTS'].rank(pct=True)
# draft_df['AVG_FPTS_Percentile'] = draft_df['AVG_FPTS'].rank(pct=True)
    
# draft_df['Rank'] = (draft_df['FPTS_Percentile'] + (draft_df['AVG_FPTS_Percentile'] / 2))

# draft_df['Weighted_Rank'] = ''
# draft_df['Weighted_Rank'] = abs(draft_df['FPTS'] * (1 + (draft_df['Trend'] * (draft_df['Years'] - 1) / 4)))
# shift column 'Weighted Rank' to first position
# weighted_rank_column = draft_df.pop('Weighted Rank')
  
# # insert column using insert(position,column_name,first_column) function
# draft_df.insert(1, 'Weighted Rank', weighted_rank_column)


# draft_df['Weighted_Rank'] = ''
# draft_df['Weighted_Rank'] = abs(draft_df['FPTS'] * (1 + draft_df['Trend']))

weighted_rank_column = draft_df.pop('Weighted Rank')
  
# insert column using insert(position,column_name,first_column) function
draft_df.insert(1, 'Weighted Rank', weighted_rank_column)


draft_df = draft_df.sort_values(by='Pos_Ranking', ascending=False)
# draft_df["G"].fillna("N/A", inplace = True)
# draft_df["GS"].fillna("N/A", inplace = True)

draft_df.head(50)

Unnamed: 0,Name,Weighted Rank,Ranked_Pos,Years,Pos_Ranking,Rank,Trend,BA_Percentile,R_Percentile,HR_Percentile,RBI_Percentile,SB_Percentile,Pos,W_Percentile,ERA_Percentile,SO_Percentile,SV_Percentile,WHIP_Percentile
48,J.T. Realmuto,4.056043,C,5.0,1.0,4.07993,-0.023887,0.716757,0.828873,0.817723,0.843093,0.873485,C,,,,,
2151,Gerrit Cole,4.251872,SP,5.0,1.0,4.267592,-0.01572,,,,,,SP,0.980988,0.965397,0.98993,0.350248,0.981029
219,Freddie Freeman,4.714065,1B,5.0,1.0,4.647196,0.06687,0.966538,0.994326,0.923942,0.963171,0.799218,1B,,,,,
1502,Julio Rodríguez,4.709913,OF,2.0,1.0,4.684977,0.099742,0.876893,0.946037,0.9491,0.93063,0.982318,OF,,,,,
1219,Trea Turner,4.643129,SS,5.0,1.0,4.582798,0.060331,0.922151,0.968028,0.8525,0.855654,0.984464,SS,,,,,
2616,Nick Pivetta,4.070234,RP,4.0,1.0,3.860753,0.279308,,,,,,"SP,RP",0.793458,0.761179,0.876156,0.613208,0.816752
1885,José Ramírez,4.645912,DH,5.0,1.0,4.519598,0.126314,0.783998,0.913682,0.91537,0.938574,0.967974,"3B,DH",,,,,
895,José Ramírez,4.645912,3B,5.0,1.0,4.519598,0.126314,0.783998,0.913682,0.91537,0.938574,0.967974,"3B,DH",,,,,
624,Mookie Betts,4.579494,2B,5.0,1.0,4.544659,0.034834,0.855381,0.983853,0.92686,0.884627,0.893938,"OF,2B,SS",,,,,
2470,Félix Bautista,4.012308,RP,2.0,0.996678,3.868432,0.575505,,,,,,RP,0.674191,0.823305,0.743958,0.967729,0.659249


In [26]:
######################################################################################################################
######################################################################################################################
######################################################################################################################
#### DRAFT DAY FUNCTIONS
   
# DROP A PLAYER 
def drafted(player):
    global draft_df
    global final_pitcher_stats_df
    global final_batter_stats_df
    draft_df = draft_df[draft_df.Name != player]
    final_pitcher_stats_df = final_pitcher_stats_df[final_pitcher_stats_df.Name != player]
    final_batter_stats_df = final_batter_stats_df[final_batter_stats_df.Name != player]
    return draft_df.head(25)
    
# FILTER PLAYERS BY POSITION
def position_filter(Pos):
    filtered_draft_df = draft_df[draft_df['Pos'].str.contains(Pos)]
    return filtered_draft_df.head(25)

# PULL PITCHING STAT CATEGORY LEADERS
def pitching_stat_leaders(CAT):
    global final_pitcher_stats_df
    pitching_filtered_draft_df = draft_df.sort_values([CAT], ascending=[False])
    return pitching_filtered_draft_df.head(25)

# PULL BATTING STAT CATEGORY LEADERS
def batting_stat_leaders(CAT):
    global final_batter_stats_df
    batting_filtered_draft_df = draft_df.sort_values([CAT], ascending=[False])
    return batting_filtered_draft_df.head(25)

def drop_all_position(POS):
    global draft_df
    draft_df = draft_df[draft_df.Ranked_Pos != POS]
    return draft_df.head(25)

In [31]:
drafted('Clayton Kershaw')
drafted('Max Scherzer')
drafted('Devin Williams')
drafted('Mookie Betts')
drafted('Ronald Acuna Jr.')
drafted('Bobby Witt Jr.')
drafted('Julio Rodríguez')
drafted('Shohei Ohtani')
drafted('Kyle Tucker')
drafted('Freddie Freeman')
drafted('Trea Turner')
drafted('Juan Soto')
drafted('Spencer Strider')
drafted('Fernando Tatis Jr.')
drafted('Zack Wheeler')
drafted('Bryce Harper')
drafted('Aaron Judge')
drafted('José Ramírez')
drafted('Gerrit Cole')
drafted('Matt Olson')
drafted('Vladimir Guerrero Jr.')
drafted('Austin Riley')
drafted('Corbin Burnes')
drafted('Pete Alonso')
drafted('Marcus Semien')
drafted('José Altuve')
drafted('Yordan Alvarez')
drafted('Luis Castillo')
drafted('Zac Gallen')
drafted('Tyler Glasnow')
drafted('George Kirby')
drafted('Logan Webb')
drafted('Francisco Lindor')
drafted('Rafael Devers')
drafted('J.T. Realmuto')
drafted('Félix Bautista')
drafted('Adley Rutschman')
drafted('Framber Valdez')
drafted('Michael Harris II')
drafted('Paul Goldschmidt')
drafted('Manny Machado')
drop_all_position('C')
drafted('Mike Trout')
drop_all_position('2B')
drafted('Randy Arozarena')
drafted('Camilo Doval')
drafted('Aaron Nola')
drafted('Nolan Arenado')
drafted('Logan Gilbert')
drafted('Adolis García')
drafted('Jesus Luzardo')
drafted('Xander Bogaerts')
drafted('Royce Lewis')
drafted('Raisel Iglesias')
drafted('Dylan Cease')
drafted('Matt McLain')
position_filter('OF')
drafted('Luis Robert Jr.')
drafted('Nolan Jones')
drafted('Chris Bassitt')
drafted('Teoscar Hernández')
drafted('Kevin Gausman')
drafted('Bo Bichette')
drafted('Steven Kwan')
position_filter('SP')
drafted('Pablo López')
drafted('Blake Snell')
drafted('Alexis Díaz')
drafted('Ha-Seong Kim')
drafted('Emmanuel Clase')
drafted('Bryan Reynolds')
drafted('Alex Bregman')
drafted('Christian Yelich')
drafted('Cody Bellinger')
drafted('Jordan Romano')
drafted('George Springer')
position_filter('SP')
drafted('José Berríos')
drafted('Lane Thomas')
drafted('Michael King')
drafted('Yordan Álvarez')
drafted('Eloy Jimenez')
drafted('Nick Castellanos')
drafted('Gunnar Henderson')
drafted('Masataka Yoshida')
drop_all_position('OF')
drafted('Dansby Swanson')
drafted('Joe Ryan')
drafted('Craig Kimbrel')
drafted('Corey Seager')
position_filter('SP')
drafted('Yu Darvish')
# drop_all_position('SP')
# drop_all_position('RP')
position_filter('SP')
drafted('Jordan Montgomery')
position_filter('SP')
drafted('Taijuan Walker')




Unnamed: 0,Name,Weighted Rank,Ranked_Pos,Years,Pos_Ranking,Rank,Trend,BA_Percentile,R_Percentile,HR_Percentile,RBI_Percentile,SB_Percentile,Pos,W_Percentile,ERA_Percentile,SO_Percentile,SV_Percentile,WHIP_Percentile
2616,Nick Pivetta,4.070234,RP,4.0,1.0,3.860753,0.279308,,,,,,"SP,RP",0.793458,0.761179,0.876156,0.613208,0.816752
2211,Justin Verlander,4.187474,SP,3.0,0.995763,4.215025,-0.055101,,,,,,SP,0.987762,0.98322,0.949487,0.315538,0.979018
2667,Sean Manaea,3.795142,RP,4.0,0.990033,3.595872,0.265693,,,,,,"SP,RP",0.810655,0.723431,0.801245,0.453438,0.807103
2614,Nick Martínez,3.727722,RP,2.0,0.983389,3.707016,0.082823,,,,,,RP,0.620732,0.779891,0.754143,0.804457,0.747793
2222,Kodai Senga,4.121593,SP,1.0,0.983051,4.121593,0.001019,,,,,,SP,0.942348,0.97065,0.971698,0.312369,0.924528
2569,Kevin Ginkel,3.661426,RP,1.0,0.980066,3.661426,0.000905,,,,,,RP,0.858491,0.714885,0.545073,0.880503,0.662474
2443,Dane Dunning,3.579028,RP,4.0,0.973422,3.012707,0.755096,,,,,,"SP,RP",0.602263,0.678037,0.689403,0.356478,0.686525
2590,Martín Pérez,3.554531,RP,5.0,0.9701,3.445505,0.109025,,,,,,"SP,RP",0.795746,0.763158,0.746425,0.350248,0.789928
2456,Drew Smyly,3.536232,RP,4.0,0.966777,3.4839,0.069777,,,,,,"SP,RP",0.794363,0.706362,0.80251,0.418204,0.762461
2678,Tanner Scott,3.501505,RP,3.0,0.960133,2.957869,1.087272,,,,,,RP,0.66605,0.454804,0.66977,0.743263,0.423982
