In [1]:
# Import needed dependencies
import requests
import re
import pandas as pd
import numpy as np
import scipy.stats as stats
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
today = date.today()

# dd/mm/YY
current_year = today.strftime("%Y")

last_year = int(current_year) - 1

In [3]:
last_five_years = []
for i in range(1,6):
    last_five_years.append(int(current_year) - i)

In [4]:
# Create a list to help create a dataframe from batter statistics data
batter_stats = []

# Create a loop to create a dataframe from Baseball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-batting.shtml'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has batter statistics
    table = BeautifulSoup(soup.select_one('#all_players_standard_batting').find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')


    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        batter_stats.append(tds)

In [5]:
# Create dataframe for batter statistics
batter_stats_df = pd.DataFrame(batter_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')
df_headers.append("Year")

# Set column headers equal to our list
batter_stats_df.columns = df_headers
batter_stats_df

Unnamed: 0,Name,Age,Tm,Lg,G,PA,AB,R,H,2B,...,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos Summary,Year
0,CJ Abrams*,21,TOT,NL,90,302,284,33,70,12,...,.604,76,92,5,9,2,2,0,64/H9D,2022
1,CJ Abrams*,21,SDP,NL,46,139,125,16,29,5,...,.605,77,40,4,6,2,2,0,64/H9D,2022
2,CJ Abrams*,21,WSN,NL,44,163,159,17,41,7,...,.603,75,52,1,3,0,0,0,6/H,2022
3,Albert Abreu,26,TOT,AL,1,0,0,0,0,0,...,,,0,0,0,0,0,0,1,2022
4,Albert Abreu,26,KCR,AL,1,0,0,0,0,0,...,,,0,0,0,0,0,0,/1,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6505,Ryan Zimmerman,33,WSN,NL,85,323,288,33,76,21,...,.824,114,140,10,3,0,2,1,3H,2018
6506,Jordan Zimmermann,32,DET,AL,2,2,2,0,0,0,...,.000,-100,0,0,0,0,0,0,1,2018
6507,Ben Zobrist#,37,CHC,NL,139,520,455,67,139,28,...,.817,117,200,8,2,1,7,1,497H/3D,2018
6508,Mike Zunino,27,SEA,AL,113,405,373,37,75,18,...,.669,85,153,7,6,0,2,0,*2/HD,2018


In [6]:
# Create a list to help create a dataframe from batter statistics data
fielding_stats = []

# Create a loop to create a dataframe from Baseball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-fielding.shtml'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has batter statistics
    table = soup.select_one('#all_players_players_standard_fielding_fielding')#.find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')


    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        fielding_stats.append(tds)

# Create dataframe for batter statistics
fielding_stats_df = pd.DataFrame(fielding_stats)

# Create an empty list to store dataframe header information
fielding_header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    fielding_header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
fielding_df_headers = fielding_header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
fielding_df_headers.remove('Rk')
fielding_df_headers.append("Year")

# Set column headers equal to our list
fielding_stats_df.columns = fielding_df_headers

final_fielding_stats_df = fielding_stats_df[['Name','PO','E','Year','Tm']]

position_players_df = pd.merge(final_fielding_stats_df, batter_stats_df, on=['Name','Year','Tm'])
position_players_df

    

Unnamed: 0,Name,PO,E,Year,Tm,Age,Lg,G,PA,AB,...,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos Summary
0,Albert Abreu,2,2,2022,TOT,26,AL,1,0,0,...,,,,0,0,0,0,0,0,1
1,José Abreu,954,11,2022,CHW,35,AL,157,679,601,...,.446,.824,133,268,19,12,0,4,2,*3D
2,Ronald Acuna Jr.,150,4,2022,ATL,24,NL,119,533,467,...,.413,.764,114,193,8,10,0,3,4,9D/H8
3,Willy Adames,178,14,2022,MIL,26,NL,139,617,563,...,.458,.756,112,258,11,1,0,4,3,*6/DH
4,Riley Adams,327,2,2022,WSN,26,NL,48,155,142,...,.310,.555,60,44,2,1,0,0,0,2/HD3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3337,Rob Zastryzny,1,0,2018,CHC,26,NL,6,0,0,...,,,,0,0,0,0,0,0,/1
3338,Brad Ziegler,4,1,2018,TOT,38,NL,76,0,0,...,,,,0,0,0,0,0,0,1
3339,Ryan Zimmerman,538,2,2018,WSN,33,NL,85,323,288,...,.486,.824,114,140,10,3,0,2,1,3H
3340,Jordan Zimmermann,9,0,2018,DET,32,AL,2,2,2,...,.000,.000,-100,0,0,0,0,0,0,1


In [7]:
position_players_df.columns

Index(['Name', 'PO', 'E', 'Year', 'Tm', 'Age', 'Lg', 'G', 'PA', 'AB', 'R', 'H',
       '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG',
       'OPS', 'OPS+', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB', 'Pos Summary'],
      dtype='object')

In [8]:
# Change types of columns to numeric for columns with number values
position_players_df[['Age','G','R','H','2B','3B','HR','RBI','SB','TB','BB','SO','PO','E','PA','OPS','OPS+']] = position_players_df[['Age','G', 'R','H','2B','3B','HR','RBI','SB','TB','BB','SO','PO','E','PA','OPS','OPS+']].apply(pd.to_numeric)

# Drop any players with 0 plate appearances to remove null values and change PA type to integer
position_players_df.dropna(subset=['PA'], axis = 0 , inplace= True)

# Remove any players with fewer than 100 plate appearances
filtered_position_players_df = position_players_df[position_players_df['PA'] >= 100]

# Select the columns we want for our batter analysis
final_position_players_df = filtered_position_players_df[['Year','Name','Tm','Age','G', 'R','H','2B','3B','HR','RBI','SB','TB','BB','SO','PO','E','PA','OPS','OPS+','Pos\xa0Summary']]

# Sort by index to prepare to drop duplicates
final_position_players_df = final_position_players_df.sort_index()

# Drop duplicate entries of Player Name and Year
final_position_players_df['1B'] = ''

for index, row in final_position_players_df.iterrows():
    final_position_players_df['1B'] = (final_position_players_df['H'] - (final_position_players_df['2B'] + final_position_players_df['3B'] + final_position_players_df['HR']))

final_position_players_df.columns


Index(['Year', 'Name', 'Tm', 'Age', 'G', 'R', 'H', '2B', '3B', 'HR', 'RBI',
       'SB', 'TB', 'BB', 'SO', 'PO', 'E', 'PA', 'OPS', 'OPS+', 'Pos Summary',
       '1B'],
      dtype='object')

In [9]:
final_position_players_df['FPTS'] = ''
final_position_players_df['AVG_FPTS'] = ''

for index, row in final_position_players_df.iterrows():
    final_position_players_df['FPTS'] = (final_position_players_df['1B'] + (2*final_position_players_df['2B']) + (3*final_position_players_df['3B']) + (4*final_position_players_df['HR']) + final_position_players_df['TB'] + final_position_players_df['BB'] + final_position_players_df['R'] + (2*final_position_players_df['RBI']) + (2*final_position_players_df['SB']) + final_position_players_df['PO'] - (2*final_position_players_df['SO']) - final_position_players_df['E'])
    final_position_players_df['AVG_FPTS'] = (final_position_players_df['FPTS']/final_position_players_df['G'])

final_position_players_df
    

Unnamed: 0,Year,Name,Tm,Age,G,R,H,2B,3B,HR,...,SO,PO,E,PA,OPS,OPS+,Pos Summary,1B,FPTS,AVG_FPTS
1,2022,José Abreu,CHW,35,157,85,183,40,0,15,...,110,954,11,679,0.824,133.0,*3D,128,1556,9.910828
2,2022,Ronald Acuna Jr.,ATL,24,119,71,124,24,0,15,...,126,150,4,533,0.764,114.0,9D/H8,85,562,4.722689
3,2022,Willy Adames,MIL,26,139,83,134,31,0,31,...,166,178,14,617,0.756,112.0,*6/DH,72,692,4.978417
4,2022,Riley Adams,WSN,26,48,14,25,4,0,5,...,46,327,2,155,0.555,60.0,2/HD3,16,367,7.645833
5,2022,Jo Adell,LAA,23,88,22,60,12,2,8,...,107,139,5,285,0.637,79.0,79H/D,38,215,2.443182
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3319,2018,Bobby Wilson,MIN,35,47,12,24,8,0,2,...,37,399,1,151,0.523,43.0,2,14,456,9.702128
3333,2018,Austin Wynns,BAL,27,42,16,28,2,0,4,...,25,257,1,118,0.669,83.0,2/HD,22,333,7.928571
3335,2018,Chris Young,LAA,34,56,17,19,2,1,6,...,37,56,1,128,0.615,68.0,987H/D,10,121,2.160714
3339,2018,Ryan Zimmerman,WSN,33,85,33,76,21,2,13,...,55,538,2,323,0.824,114.0,3H,40,873,10.270588


In [10]:
# Sort data by name alphabetically, then by year in descending order
final_position_players_df = final_position_players_df.sort_values(['Year','Name'], ascending=[True, True])

# Eliminate Baseball Reference's name badges for accolades
final_position_players_df['Name'] = final_position_players_df['Name'].str.extract('([^\*|#]*)')

cleaned_player_list = []
for player in final_position_players_df['Name']:
    player = player.replace("\xa0", " ")
    cleaned_player_list.append(player)

final_position_players_df['Name'] = cleaned_player_list 
final_position_players_df = final_position_players_df.sort_values(['FPTS'], ascending=False)

final_position_players_df

Unnamed: 0,Year,Name,Tm,Age,G,R,H,2B,3B,HR,...,SO,PO,E,PA,OPS,OPS+,Pos Summary,1B,FPTS,AVG_FPTS
2798,2018,Paul Goldschmidt,ARI,30,158,95,172,35,5,33,...,173,1323,6,690,0.922,142.0,*3/DH,99,1968,12.455696
781,2021,Vladimir Guerrero Jr.,TOR,22,161,123,188,29,1,48,...,110,1026,8,698,1.002,167.0,*3D/5,110,1963,12.192547
758,2021,Paul Goldschmidt,STL,33,158,102,177,36,2,31,...,136,1144,2,679,0.879,141.0,*3/DH,108,1881,11.905063
161,2022,Paul Goldschmidt,STL,34,151,106,178,41,0,35,...,141,1071,1,651,0.981,180.0,*3D/H,102,1865,12.350993
1980,2019,Paul Goldschmidt,STL,31,161,97,155,25,1,34,...,166,1256,5,682,0.821,115.0,*3/H,95,1862,11.565217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1345,2020,Jo Adell,LAA,21,38,9,20,4,0,3,...,55,72,3,132,0.478,30.0,9/8H,13,55,1.447368
3139,2018,Yadiel Rivera,MIA,26,111,13,24,3,0,1,...,51,46,4,160,0.485,37.0,H564/7389,20,54,0.486486
1858,2019,Zack Cozart,LAA,33,38,4,12,2,0,0,...,16,30,4,107,0.322,-12.0,5/H64,10,45,1.184211
1513,2020,Carter Kieboom,WSN,22,33,15,20,1,0,0,...,33,16,3,122,0.556,57.0,5/HD,19,39,1.181818


In [11]:
# Create a new dataframe for stats percentile calculations
percentile_df = pd.DataFrame(columns = ['Year', 'Name', 'Age','Pos\xa0Summary', 'Tm', 'FPTS_Percentile', 'AVG_FPTS_Percentile'])

# Carry over columnns from final_batter_stats_df that shouldn't be comparatively ranked 
percentile_df['Year'] = final_position_players_df['Year']
percentile_df['Name'] = final_position_players_df['Name']
percentile_df['Age'] = final_position_players_df['Age']
percentile_df['Tm'] = final_position_players_df['Tm']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
for year in last_five_years:
    year_df = final_position_players_df.loc[final_position_players_df['Year'] == year]
    year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
    year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)
    year_df.sort_values('Name', ascending=True)

    # Each of the seasons are added back to the percentile dataframe
    percentile_df = percentile_df.append(year_df, ignore_index=True)

percentile_df = percentile_df.sort_values(['Year','Name'], ascending=[True, True])
percentile_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)


Unnamed: 0,Year,Name,Age,Pos Summary,Tm,FPTS_Percentile,AVG_FPTS_Percentile,G,R,H,...,BB,SO,PO,E,PA,OPS,OPS+,1B,FPTS,AVG_FPTS
631,2018,A.J. Ellis,37,,SDP,,,,,,...,,,,,,,,,,
2272,2018,A.J. Ellis,37,2H/D7,SDP,0.392,0.72,66.0,19.0,41.0,...,26.0,37.0,321.0,1.0,183.0,0.722,104.0,32.0,425.0,6.439394
346,2018,AJ Pollock,30,,ARI,,,,,,...,,,,,,,,,,
2205,2018,AJ Pollock,30,*8/H,ARI,0.66,0.672,113.0,61.0,106.0,...,31.0,100.0,229.0,1.0,460.0,0.800,108.0,59.0,676.0,5.982301
938,2018,Aaron Altherr,27,,PHI,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1029,2022,Yu Chang,26,,TOT,,,,,,...,,,,,,,,,,
1413,2022,Yu Chang,26,46/531H,TOT,0.129771,0.236641,51.0,14.0,28.0,...,12.0,41.0,116.0,2.0,141.0,0.624,81.0,20.0,168.0,3.294118
1415,2022,Yu Chang,26,46/35HD1,TOT,0.122137,0.064885,69.0,19.0,35.0,...,16.0,59.0,116.0,2.0,190.0,0.605,75.0,25.0,167.0,2.420290
29,2022,Yuli Gurriel,38,,HOU,,,,,,...,,,,,,,,,,


In [12]:
# Clean up the percentile dataframe, drop NaNs and remove unnecessary columns
percentile_df = percentile_df.dropna()

# Add a rank column that adds the percentiles from each category
percentile_df['Rank'] = (percentile_df['FPTS_Percentile'] + percentile_df['AVG_FPTS_Percentile'])

percentile_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  percentile_df['Rank'] = (percentile_df['FPTS_Percentile'] + percentile_df['AVG_FPTS_Percentile'])


Unnamed: 0,Year,Name,Age,Pos Summary,Tm,FPTS_Percentile,AVG_FPTS_Percentile,G,R,H,...,SO,PO,E,PA,OPS,OPS+,1B,FPTS,AVG_FPTS,Rank
2272,2018,A.J. Ellis,37,2H/D7,SDP,0.392,0.72,66.0,19.0,41.0,...,37.0,321.0,1.0,183.0,0.722,104.0,32.0,425.0,6.439394,1.112
2205,2018,AJ Pollock,30,*8/H,ARI,0.66,0.672,113.0,61.0,106.0,...,100.0,229.0,1.0,460.0,0.800,108.0,59.0,676.0,5.982301,1.332
2331,2018,Aaron Altherr,27,9H8/7,PHI,0.156,0.06,105.0,28.0,44.0,...,91.0,95.0,1.0,285.0,0.628,69.0,24.0,220.0,2.095238,0.216
2224,2018,Aaron Judge,26,9D/H8,NYY,0.584,0.588,112.0,77.0,115.0,...,152.0,171.0,3.0,498.0,0.919,150.0,66.0,599.0,5.348214,1.172
2254,2018,Adam Duvall,29,7H3/95,TOT,0.466,0.268,138.0,48.0,75.0,...,117.0,228.0,2.0,427.0,0.639,70.0,40.0,483.0,3.500000,0.734
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1248,2022,Yandy Díaz,30,53DH,TBR,0.759542,0.645038,137.0,71.0,140.0,...,60.0,184.0,6.0,558.0,0.824,143.0,98.0,727.0,5.306569,1.40458
1292,2022,Yonathan Daza,28,*87/H9,COL,0.593511,0.60687,113.0,56.0,112.0,...,58.0,252.0,6.0,408.0,0.733,97.0,87.0,566.0,5.008850,1.200382
1413,2022,Yu Chang,26,46/531H,TOT,0.129771,0.236641,51.0,14.0,28.0,...,41.0,116.0,2.0,141.0,0.624,81.0,20.0,168.0,3.294118,0.366412
1415,2022,Yu Chang,26,46/35HD1,TOT,0.122137,0.064885,69.0,19.0,35.0,...,59.0,116.0,2.0,190.0,0.605,75.0,25.0,167.0,2.420290,0.187023


In [13]:
# Create a list of each unique player we have in our dataframe
player_list = percentile_df.Name.unique().tolist()

# Create a new dataframe for combined, averaged percentiles over the past 5 seasons
new_df = pd.DataFrame(columns = ['Name', 'Rank', 'Trend', 'Pos', 'Years', 'FPTS', 'AVG_FPTS', 'FPTS_Percentile', 'AVG_FPTS_Percentile'])

# Create a list for each percentile stat category for upcoming loop
player_trends = []
average_FPTS = []
average_AVG_FPTS = []
average_FPTS_Percentile = []
average_AVG_FPTS_Percentile = []
average_Rank = []
year_count = []
pos = []

# Loop through each player, check if they played in the past two seasons. If not, remove them
for player in player_list:
    filter_df = percentile_df.loc[percentile_df['Name'] == player]
    filter_df = filter_df.sort_values(['Year'], ascending=[False])
    year_list = filter_df.Year.tolist()
    if (year_list[0] != last_year) and (year_list[0] != (last_year - 1)):
        player_list.remove(player)

# Update new dataframe with updated unique player list
new_df['Name'] = player_list        

# Loop through each player, locate their percentile stats for each season, average them out
for player in player_list:
    player_df = percentile_df.loc[percentile_df['Name'] == player]
    
    # We want to find the slope of the line of best fit for each player's overall ranking each season
    x = np.array(player_df['Year'], dtype = float)
    y = np.array(player_df['Rank'], dtype = float)
    slope, intercept = np.polyfit(x, y, 1)
    player_trends.append(slope)
    
    # Find average of each player's percentiles from previous 5 seasons
    average_FPTS.append(sum(player_df['FPTS']) / len(player_df['FPTS']))
    average_AVG_FPTS.append(sum(player_df['AVG_FPTS']) / len(player_df['AVG_FPTS']))
    average_FPTS_Percentile.append(sum(player_df['FPTS_Percentile']) / len(player_df['FPTS_Percentile']))
    average_AVG_FPTS_Percentile.append(sum(player_df['AVG_FPTS_Percentile']) / len(player_df['AVG_FPTS_Percentile']))
    average_Rank.append(sum(player_df['Rank']) / len(player_df['Rank']))
    year_count.append(len(x))
    
    # Keep player positions for reference purposes during the draft
    pos.append(player_df['Pos\xa0Summary'].unique())

# Update new dataframe with the list data from each stat
new_df['Pos'] = pos
new_df['Trend'] = player_trends
new_df['FPTS'] = average_FPTS
new_df['AVG_FPTS'] = average_AVG_FPTS
new_df['FPTS_Percentile'] = average_FPTS_Percentile
new_df['AVG_FPTS_Percentile'] = average_AVG_FPTS_Percentile
new_df['Rank'] = average_Rank

# Keep track of how many seasons are being considered, so we know how reliable the data is
new_df['Years'] = year_count



  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

In [14]:
# Create a weighted rank column by adding trend data to the rank data and account for number of seasons played
# Basically, if you played all 5 seasons, your trend stat is added directly
# If you played fewer than all 5 seasons, your trend stat is reduced depending on how few seasons you played
new_df['Weighted Rank'] = (new_df['Rank'] + ((new_df['Trend'] * (new_df['Years'] - 1) / 4)))

# shift column 'Weighted Rank' to first position
first_column = new_df.pop('Weighted Rank')
  
# insert column using insert(position,column_name,first_column) function
new_df.insert(1, 'Weighted Rank', first_column)

new_df = new_df.sort_values('Weighted Rank', ascending = False)

In [15]:
# separate position column into a list for editing
new_pos_list = []
pos_list = new_df['Pos'].tolist()

# loop through list and pull only the last item, which represents player position listed from most recent season
for i in pos_list:
    j = i[-1]
    
    if re.search('/', j):
        k = re.sub("([^\/]+$)","",j)
        new_pos_list.append(k)
    else:
        new_pos_list.append(j)

cleaned_list = []
for pos in new_pos_list:
    placeholder = re.findall("[a-zA-Z0-9]+", pos)
    placeholder_2 = ''.join(placeholder)
    placeholder_3 = [d for d in placeholder_2]
    cleaned_list.append(placeholder_3)

cleaned_pos_list = []
for n_list in cleaned_list:
    
    placeholder_list = []
    for pos in n_list:
        if pos == '1':
            placeholder_list.append('P')
        elif pos == '2':
            placeholder_list.append('C')
        elif pos == '3':
            placeholder_list.append('1B')
        elif pos == '4':
            placeholder_list.append('2B')
        elif pos == '5':
            placeholder_list.append('3B')
        elif pos == '6':
            placeholder_list.append('SS')
        elif pos == ('7'):
            placeholder_list.append('OF')
        elif pos == ('8'):
            placeholder_list.append('OF')
        elif pos == ('9'):
            placeholder_list.append('OF')
        elif pos == ('D'):
            placeholder_list.append('DH')
        
    cleaned_pos_list.append(placeholder_list)        

temp_pos_list = []
for item in cleaned_pos_list:
    new_string = []
    for pos in item:
        string = str(pos)
        new_string = f'{new_string},{string}'
    temp_pos_list.append(new_string)

    
final_pos_list = []
for i in temp_pos_list:
    i = i.replace('[],', '')
    final_pos_list.append(i)
    
# replace old position column with new position column
new_df.drop('Pos', axis = 1, inplace = True)
new_df['Pos'] = final_pos_list

new_df.head(25)

Unnamed: 0,Name,Weighted Rank,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos
138,Paul Goldschmidt,1.962599,1.963268,-0.000668,5,1632.4,11.676084,0.995547,0.967721,"1B,DH"
70,J.T. Realmuto,1.949851,1.948361,0.00149,5,1379.8,11.558872,0.976092,0.972268,C
229,Will Smith,1.94014,1.855459,0.112908,4,1062.0,11.989147,0.863381,0.992078,"C,DH"
181,Yuli Gurriel,1.930997,1.921756,0.009241,5,1355.2,10.789009,0.979547,0.942209,1B
89,José Abreu,1.926779,1.937161,-0.010382,5,1415.8,11.080875,0.984405,0.952756,"1B,DH"
36,Christian Vázquez,1.915981,1.871224,0.044756,5,1119.8,10.723128,0.923847,0.947377,C
143,Rhys Hoskins,1.914641,1.849587,0.065053,5,1197.2,10.119458,0.942848,0.90674,1B
219,Pete Alonso,1.911645,1.895619,0.021368,4,1403.75,10.211517,0.972342,0.923277,"1B,DH"
228,Vladimir Guerrero Jr.,1.908146,1.629204,0.371922,4,1169.25,8.859455,0.849491,0.779713,"1B,DH"
26,C.J. Cron,1.895679,1.83838,0.076399,4,1365.75,9.822698,0.938527,0.899853,"1B,DH"


In [16]:
############################################################################################
############################################################################################
############################################################################################

#    However, for example, if your league is set to have a Games Started limit of 12 and you have 10 pitchers at the 
#    completion of Saturday's games and start 4 pitchers on Sunday, you will receive stats for all 14 pitchers. 

#    (Note: This can happen on any day during the week. If managers have 10 pitchers by end of Wednesday and 
#    starts 4 on Thursday, they will receive points for the 4 pitchers on Thursday but for Friday, Saturday and 
#    Sunday, they will not receive any starting pitchers points.)

############################################################################################
############################################################################################
############################################################################################


In [17]:
# Create a list to help create a dataframe from batter statistics data
pitcher_stats = []

for year in last_five_years:

    # input URL and use BeautifulSoup to parse through the page
    pitching_url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-pitching.shtml'
    pitching_soup = BeautifulSoup(requests.get(pitching_url).content, 'html.parser')

    # Grab the table element that has batter statistics
    pitching_table = BeautifulSoup(pitching_soup.select_one('#all_players_standard_pitching').find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')

    # Grab data from table and put it into the list created above
    for tr in pitching_table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        pitcher_stats.append(tds)
        

In [18]:
# Create dataframe for batter statistics
raw_pitcher_stats_df = pd.DataFrame(pitcher_stats)

# Create an empty list to store dataframe header information
pitcher_header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in pitching_table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    pitcher_header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
pitcher_df_headers = pitcher_header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
pitcher_df_headers.remove('Rk')
pitcher_df_headers.append("Year")

# Set column headers equal to our list
raw_pitcher_stats_df.columns = pitcher_df_headers

pitcher_stats_df = pd.merge(final_fielding_stats_df, raw_pitcher_stats_df, on=['Name','Year','Tm'])
pitcher_stats_df.columns


Index(['Name', 'PO', 'E', 'Year', 'Tm', 'Age', 'Lg', 'W', 'L', 'W-L%', 'ERA',
       'G', 'GS', 'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'HR', 'BB',
       'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP', 'WHIP', 'H9',
       'HR9', 'BB9', 'SO9', 'SO/W'],
      dtype='object')

In [19]:
# Create a list to help create a dataframe from batter statistics data
reliever_stats = []

# Create a loop to create a dataframe from Baseball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-reliever-pitching.shtml'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has batter statistics
    table = BeautifulSoup(soup.select_one('#all_players_reliever_pitching').find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')


    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        reliever_stats.append(tds)

# Create dataframe for batter statistics
reliever_stats_df = pd.DataFrame(reliever_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')
df_headers.append("Year")

# Set column headers equal to our list
reliever_stats_df.columns = df_headers


final_reliever_stats_df = reliever_stats_df[['Name','Hold','Year','Tm']]

final_pitcher_stats_df = pd.merge(final_reliever_stats_df, pitcher_stats_df, on=['Name','Year','Tm'])

final_pitcher_stats_df



Unnamed: 0,Name,Hold,Year,Tm,PO,E,Age,Lg,W,L,...,WP,BF,ERA+,FIP,WHIP,H9,HR9,BB9,SO9,SO/W
0,Cory Abbott,0,2022,WSN,2,0,26,NL,0,5,...,2,216,75,6.36,1.438,8.3,2.3,4.7,8.4,1.80
1,Albert Abreu,1,2022,TOT,2,2,26,AL,2,2,...,6,172,124,4.77,1.474,8.1,1.2,5.1,8.8,1.73
2,Bryan Abreu,8,2022,HOU,2,0,25,AL,4,0,...,7,248,200,2.12,1.177,6.7,0.3,3.9,13.1,3.38
3,Domingo Acevedo,20,2022,OAK,3,3,28,AL,4,4,...,1,266,113,4.01,0.990,6.7,1.2,2.3,7.7,3.41
4,Jason Adam,22,2022,TBR,1,0,30,AL,2,3,...,2,237,233,2.86,0.758,4.4,0.7,2.4,10.7,4.41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2581,Mike Wright Jr.,2,2018,BAL,5,1,28,AL,4,2,...,4,388,77,4.78,1.625,10.8,1.3,3.8,7.9,2.06
2582,Steven Wright,3,2018,BOS,1,0,33,AL,3,1,...,0,223,165,4.37,1.248,6.9,0.8,4.4,7.0,1.62
2583,Jimmy Yacabonis,0,2018,BAL,2,0,26,AL,0,2,...,4,177,80,5.84,1.450,9.0,1.8,4.1,7.4,1.83
2584,Kirby Yates,16,2018,SDP,0,1,31,NL,5,3,...,2,250,180,2.54,0.921,5.9,0.9,2.4,12.9,5.29


In [20]:
final_pitcher_stats_df.columns

Index(['Name', 'Hold', 'Year', 'Tm', 'PO', 'E', 'Age', 'Lg', 'W', 'L', 'W-L%',
       'ERA', 'G', 'GS', 'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'HR',
       'BB', 'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP', 'WHIP', 'H9',
       'HR9', 'BB9', 'SO9', 'SO/W'],
      dtype='object')

In [21]:
# Change types of columns to numeric for columns with number values
final_pitcher_stats_df[['Age','G','GS','IP','ER','W','L','SV','SO','H','BB','CG','Hold','PO','E']] = final_pitcher_stats_df[['Age','G','GS','IP','ER','W','L','SV','SO','H','BB','CG','Hold','PO','E']].apply(pd.to_numeric)

# Drop any players with NaN innings pitched, ERA, and WHIP to remove null values 
final_pitcher_stats_df.dropna(subset=['IP'], axis = 0 , inplace= True)
final_pitcher_stats_df.dropna(subset=['ERA'], axis = 0 , inplace= True)
final_pitcher_stats_df.dropna(subset=['WHIP'], axis = 0 , inplace= True)
final_pitcher_stats_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Remove any pitchers with fewer than 30 innings pitched
final_pitcher_stats_df = final_pitcher_stats_df[final_pitcher_stats_df['IP'] >= 30]

# Select the columns we want for our pitcher analysis
final_pitcher_stats_df = final_pitcher_stats_df[['Year','Name','Age','G','GS','IP','ER','W','L','SV','SO','H','BB','CG','Hold','PO','E']]

# Eliminate Baseball Reference's name badges for accolades
final_pitcher_stats_df['Name'] = final_pitcher_stats_df['Name'].str.extract('([^\*|#]*)')

pitcher_list = final_pitcher_stats_df.Name.tolist()

cleaned_pitcher_list = []
for pitcher in pitcher_list:
    pitcher = pitcher.replace("\xa0", " ")
    cleaned_pitcher_list.append(pitcher) 
    
final_pitcher_stats_df['Name'] = cleaned_pitcher_list 




In [22]:
# # Sort by index to prepare to drop duplicates
# final_pitcher_stats_df = final_pitcher_stats_df.sort_index()

# # Drop duplicate entries of Player Name and Year
# # This is to eliminate partial season data for players who played for 2+ teams in one season
# final_pitcher_stats_df = final_pitcher_stats_df.drop_duplicates(subset=['Year', 'Name'])

In [23]:
# Sort data by name alphabetically, then by year in descending order
final_pitcher_stats_df = final_pitcher_stats_df.sort_values(['Year','Name'], ascending=[True, True])



final_pitcher_stats_df['FPTS'] = ''
final_pitcher_stats_df['AVG_FPTS'] = ''
final_pitcher_stats_df['Pos'] = ''
pos_list = []
final_pos_list = []

for index, row in final_pitcher_stats_df.iterrows():
    final_pitcher_stats_df['FPTS'] = (final_pitcher_stats_df['IP'] - (2*final_pitcher_stats_df['ER']) + (2*final_pitcher_stats_df['W']) - (2*final_pitcher_stats_df['L']) + (5*final_pitcher_stats_df['SV']) + (2*final_pitcher_stats_df['SO']) - final_pitcher_stats_df['H'] - final_pitcher_stats_df['BB'] + (3*final_pitcher_stats_df['CG']) + (2*final_pitcher_stats_df['Hold']) + final_pitcher_stats_df['PO'] - final_pitcher_stats_df['E'])
    final_pitcher_stats_df['AVG_FPTS'] = (final_pitcher_stats_df['FPTS']/final_pitcher_stats_df['G'])
#     final_pitcher_stats_df['Pos'] = (final_pitcher_stats_df['GS']/final_pitcher_stats_df['G'])
    
# #    print(final_pitcher_stats_df['GS'] / final_pitcher_stats_df['G'])
    pitcher_ratio = (final_pitcher_stats_df['GS'] / final_pitcher_stats_df['G'])[index]
    pos_list.append(pitcher_ratio)
# #    final_pitcher_stats_df['Pos'] = pitcher_ratio

for i in range(len(pos_list)):
    if pos_list[i] > (2/3):
        final_pos_list.append('SP')
    elif pos_list[i] < (1/3):
        final_pos_list.append('RP')
    else:
        final_pos_list.append('SP,RP')
        
final_pitcher_stats_df['Pos'] = final_pos_list
        
final_pitcher_stats_df = final_pitcher_stats_df.sort_values(['FPTS'], ascending=False)
final_pitcher_stats_df.head(50)


Unnamed: 0,Year,Name,Age,G,GS,IP,ER,W,L,SV,SO,H,BB,CG,Hold,PO,E,FPTS,AVG_FPTS,Pos
2195,2018,Edwin Díaz,24,73,0,73.1,16,0,4,57,124,41,17,0,0,1,0,509.1,6.973973,RP
759,2021,Liam Hendriks,32,69,0,71.0,20,8,3,38,113,45,7,0,0,4,0,409.0,5.927536,RP
2540,2018,Blake Treinen,30,68,0,80.1,7,9,2,38,100,46,21,0,0,4,1,406.1,5.972059,RP
121,2022,Edwin Díaz,28,61,0,62.0,9,3,1,32,118,34,18,0,4,4,1,403.0,6.606557,RP
2074,2019,Kirby Yates,32,60,0,60.2,8,0,5,41,101,41,13,0,0,6,0,393.2,6.553333,RP
1610,2019,Shane Bieber,24,34,33,214.1,78,15,8,0,259,186,40,3,0,12,0,385.1,11.326471,SP
2329,2018,Craig Kimbrel,30,63,0,62.1,19,5,1,42,96,31,31,0,0,2,0,374.1,5.938095,RP
93,2022,Emmanuel Clase,24,77,0,72.2,11,3,4,42,77,43,10,0,0,7,0,366.2,4.755844,RP
2115,2018,Trevor Bauer,27,28,27,175.1,43,12,6,1,221,134,57,0,0,8,0,365.1,13.039286,SP
1776,2019,Liam Hendriks,30,75,2,85.0,17,4,4,25,124,61,21,0,8,6,0,364.0,4.853333,RP


In [24]:
# Create a new dataframe for stats percentile calculations
pitcher_percentile_df = pd.DataFrame(columns = ['Year','Name','Age','G','GS','IP','ER','W','L','SV','SO','H','BB','CG','Hold','PO','E','Pos'])

# Carry over columnns from final_pitcher_stats_df that shouldn't be comparatively ranked 
pitcher_percentile_df['Year'] = final_pitcher_stats_df['Year']
pitcher_percentile_df['Name'] = final_pitcher_stats_df['Name']
pitcher_percentile_df['Age'] = final_pitcher_stats_df['Age']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
for year in last_five_years:
    year_df = final_pitcher_stats_df.loc[final_pitcher_stats_df['Year'] == year]
    year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
    year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)
    year_df.sort_values('Name', ascending=True)    
    
    # Each of the seasons are added back to the percentile dataframe
    pitcher_percentile_df = pitcher_percentile_df.append(year_df, ignore_index=True)

pitcher_percentile_df = pitcher_percentile_df.sort_values(['Year','Name'], ascending=[True, True])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)


In [25]:
# Clean up the percentile dataframe, drop NaNs and remove unnecessary columns
pitcher_percentile_df = pitcher_percentile_df.dropna()

# Add a rank column that adds the percentiles from each category
pitcher_percentile_df['Rank'] = (pitcher_percentile_df['FPTS_Percentile'] + pitcher_percentile_df['AVG_FPTS_Percentile'])

pitcher_percentile_df


Unnamed: 0,Year,Name,Age,G,GS,IP,ER,W,L,SV,...,CG,Hold,PO,E,Pos,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Rank
2077,2018,A.J. Cole,26,32,2,48.1,33,4,2,0,...,0,0,2,1,RP,28.1,0.878125,0.327068,0.342105,0.669173
2032,2018,Adam Cimber,27,70,0,68.1,26,3,8,0,...,0,13,2,1,RP,64.1,0.915714,0.496241,0.345865,0.842105
1905,2018,Adam Ottavino,32,75,0,77.2,21,6,4,6,...,0,34,8,1,RP,291.2,3.882667,0.973684,0.887218,1.860902
2104,2018,Adam Plutko,26,17,12,76.2,45,4,5,1,...,0,0,3,0,SP,11.2,0.658824,0.225564,0.263158,0.488722
2033,2018,Adam Warren,30,47,0,51.2,18,3,2,0,...,0,4,2,0,RP,63.2,1.344681,0.492481,0.454887,0.947368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,2022,Zach Jackson,27,54,0,48.0,16,2,3,3,...,0,26,2,0,RP,156.0,2.888889,0.837945,0.782609,1.620553
1289,2022,Zach Plesac,27,25,24,131.2,63,3,12,0,...,0,0,10,2,SP,21.2,0.848000,0.181818,0.233202,0.415020
1269,2022,Zach Pop,25,35,0,39.0,12,4,0,0,...,0,3,2,1,RP,35.0,1.000000,0.258893,0.282609,0.541502
1326,2022,Zach Thompson,28,29,22,121.2,70,3,10,0,...,0,0,5,0,SP,-31.8,-1.096552,0.035573,0.047431,0.083004


In [26]:
# pitcher_percentile_df = pitcher_percentile_df.sort_values(['FPTS'], ascending=False)
# pitcher_percentile_df.head(50)

In [27]:
# Create a list of each unique player we have in our dataframe
pitcher_list = pitcher_percentile_df.Name.unique().tolist()

# Create a new dataframe for combined, averaged percentiles over the past 5 seasons
new_pitcher_df = pd.DataFrame(columns = ['Name', 'Rank', 'Trend', 'Years', 'FPTS', 'AVG_FPTS', 'FPTS_Percentile', 'AVG_FPTS_Percentile','Pos','G','GS'])

# Create a list for each percentile stat category for upcoming loop
pitcher_trends = []
average_FPTS = []
average_AVG_FPTS = []
average_FPTS_Percentile = []
average_AVG_FPTS_Percentile = []
pitcher_average_Rank = []
pitcher_year_count = []
pos = []
games = []
games_started = []

cleaned_pitcher_list = []
for pitcher in pitcher_list:
    pitcher = pitcher.replace("\xa0", " ")
    cleaned_pitcher_list.append(pitcher) 

# new_pitcher_df
    
# Loop through each player, check if they played in the past two seasons. If not, remove them
for pitcher in cleaned_pitcher_list:
    filter_df = pitcher_percentile_df.loc[pitcher_percentile_df['Name'] == pitcher]
    filter_df = filter_df.sort_values(['Year'], ascending=[False])
    year_list = filter_df.Year.tolist()
    if (year_list[0] != last_year) and (year_list[0] != (last_year - 1)):
        pitcher_list.remove(pitcher)

# Update new dataframe with updated unique player list
new_pitcher_df['Name'] = pitcher_list        

# Loop through each player, locate their percentile stats for each season, average them out
for pitcher in pitcher_list:
    pitcher_df = pitcher_percentile_df.loc[pitcher_percentile_df['Name'] == pitcher]
    
    # We want to find the slope of the line of best fit for each player's overall ranking each season
    x = np.array(pitcher_df['Year'], dtype = float)
    y = np.array(pitcher_df['Rank'], dtype = float)
    slope, intercept = np.polyfit(x, y, 1)
    pitcher_trends.append(slope)
    
    # Find average of each player's percentiles from previous 5 seasons
    average_FPTS.append(sum(pitcher_df['FPTS']) / len(pitcher_df['FPTS']))
    average_AVG_FPTS.append(sum(pitcher_df['AVG_FPTS']) / len(pitcher_df['AVG_FPTS']))
    average_FPTS_Percentile.append(sum(pitcher_df['FPTS_Percentile']) / len(pitcher_df['FPTS_Percentile']))
    average_AVG_FPTS_Percentile.append(sum(pitcher_df['AVG_FPTS_Percentile']) / len(pitcher_df['AVG_FPTS_Percentile']))
    pitcher_average_Rank.append(sum(pitcher_df['Rank']) / len(pitcher_df['Rank']))
    pitcher_year_count.append(len(x))
    pos.append(pitcher_df['Pos'].iloc[-1])
    games.append(pitcher_df['G'].iloc[-1])
    games_started.append(pitcher_df['GS'].iloc[-1])

# Update new dataframe with the list data from each stat
new_pitcher_df['Trend'] = pitcher_trends
new_pitcher_df['Pos'] = pos
new_pitcher_df['G'] = games
new_pitcher_df['GS'] = games_started
new_pitcher_df['FPTS'] = average_FPTS
new_pitcher_df['AVG_FPTS'] = average_AVG_FPTS
new_pitcher_df['FPTS_Percentile'] = average_FPTS_Percentile
new_pitcher_df['AVG_FPTS_Percentile'] = average_AVG_FPTS_Percentile
new_pitcher_df['Rank'] = pitcher_average_Rank

# Keep track of how many seasons are being considered, so we know how reliable the data is
new_pitcher_df['Years'] = pitcher_year_count



  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

In [28]:
# Create a weighted rank column by adding trend data to the rank data and account for number of seasons played
# Basically, if you played all 5 seasons, your trend stat is added directly
# If you played fewer than all 5 seasons, your trend stat is reduced depending on how few seasons you played
new_pitcher_df['Weighted Rank'] = (new_pitcher_df['Rank'] + ((new_pitcher_df['Trend'] * (new_pitcher_df['Years'] - 1) / 4)))

# shift column 'Weighted Rank' to first position
first_pitcher_column = new_pitcher_df.pop('Weighted Rank')
  
# insert column using insert(position,column_name,first_column) function
new_pitcher_df.insert(1, 'Weighted Rank', first_pitcher_column)

new_pitcher_df = new_pitcher_df.sort_values('Weighted Rank', ascending = False)
new_pitcher_df.head(25)


Unnamed: 0,Name,Weighted Rank,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos,G,GS
361,Spencer Strider,1.992095,1.992095,0.000493,1,347.2,11.2,0.992095,1.0,"SP,RP",31,20
132,Liam Hendriks,1.955961,1.955788,0.000347,3,363.066667,5.410865,0.989501,0.966287,RP,58,0
61,Kenley Jansen,1.930442,1.92312,0.009763,4,309.55,4.674164,0.983652,0.939468,RP,65,0
33,Edwin Díaz,1.92961,1.921787,0.010431,4,357.075,5.40242,0.980656,0.941131,RP,61,0
158,Cristian Javier,1.918259,1.861688,0.113142,3,194.466667,7.38,0.921182,0.940505,SP,30,25
199,Emmanuel Clase,1.910483,1.888159,0.089295,2,317.2,4.266654,0.977031,0.911128,RP,77,0
224,Jordan Romano,1.909973,1.906903,0.01228,2,290.5,4.646441,0.972807,0.934096,RP,63,0
238,Luis Garcia,1.89313,1.89313,0.000468,1,200.1,6.67,0.908397,0.984733,SP,30,28
124,Joe Musgrove,1.886465,1.856254,0.120845,2,214.6,6.70625,0.895127,0.961127,SP,32,31
80,Raisel Iglesias,1.873291,1.876995,-0.004938,4,279.25,4.208927,0.966995,0.91,RP,67,0


In [29]:
draft_df = pd.concat([new_df, new_pitcher_df], ignore_index=True, sort=False)
draft_df = draft_df.sort_values(by='Weighted Rank', ascending=False)
draft_df["G"].fillna("N/A", inplace = True)
draft_df["GS"].fillna("N/A", inplace = True)

draft_df.head(25)

Unnamed: 0,Name,Weighted Rank,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos,G,GS
355,Spencer Strider,1.992095,1.992095,0.000493,1,347.2,11.2,0.992095,1.0,"SP,RP",31.0,20.0
0,Paul Goldschmidt,1.962599,1.963268,-0.000668,5,1632.4,11.676084,0.995547,0.967721,"1B,DH",,
356,Liam Hendriks,1.955961,1.955788,0.000347,3,363.066667,5.410865,0.989501,0.966287,RP,58.0,0.0
1,J.T. Realmuto,1.949851,1.948361,0.00149,5,1379.8,11.558872,0.976092,0.972268,C,,
2,Will Smith,1.94014,1.855459,0.112908,4,1062.0,11.989147,0.863381,0.992078,"C,DH",,
3,Yuli Gurriel,1.930997,1.921756,0.009241,5,1355.2,10.789009,0.979547,0.942209,1B,,
357,Kenley Jansen,1.930442,1.92312,0.009763,4,309.55,4.674164,0.983652,0.939468,RP,65.0,0.0
358,Edwin Díaz,1.92961,1.921787,0.010431,4,357.075,5.40242,0.980656,0.941131,RP,61.0,0.0
4,José Abreu,1.926779,1.937161,-0.010382,5,1415.8,11.080875,0.984405,0.952756,"1B,DH",,
359,Cristian Javier,1.918259,1.861688,0.113142,3,194.466667,7.38,0.921182,0.940505,SP,30.0,25.0


In [30]:
######################################################################################################################
######################################################################################################################
######################################################################################################################
#### DRAFT DAY FUNCTIONS
   
# DROP A PLAYER 
def drafted(player):
    global draft_df
    global final_pitcher_stats_df
    global final_batter_stats_df
    draft_df = draft_df[draft_df.Name != player]
    final_pitcher_stats_df = final_pitcher_stats_df[final_pitcher_stats_df.Name != player]
    final_batter_stats_df = final_batter_stats_df[final_batter_stats_df.Name != player]
    return draft_df.head(25)
    
# FILTER PLAYERS BY POSITION
def position_filter(Pos):
    filtered_draft_df = draft_df[draft_df['Pos'].str.contains(Pos)]
    return filtered_draft_df.head(25)

# PULL PITCHING STAT CATEGORY LEADERS
def pitching_stat_leaders(CAT):
    global final_pitcher_stats_df
    pitching_filtered_draft_df = draft_df.sort_values([CAT], ascending=[False])
    return pitching_filtered_draft_df.head(25)

# PULL BATTING STAT CATEGORY LEADERS
def batting_stat_leaders(CAT):
    global final_batter_stats_df
    batting_filtered_draft_df = draft_df.sort_values([CAT], ascending=[False])
    return batting_filtered_draft_df.head(25)