In [1]:
# Import needed dependencies
import requests
import re
import pandas as pd
import numpy as np
import scipy.stats as stats
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
today = date.today()

# dd/mm/YY
current_year = today.strftime("%Y")

last_year = int(current_year) - 1

In [3]:
last_five_years = []
for i in range(1,6):
    last_five_years.append(int(current_year) - i)

In [4]:
# Create a list to help create a dataframe from batter statistics data
batter_stats = []

# Create a loop to create a dataframe from Baseball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-batting.shtml'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has batter statistics
    table = BeautifulSoup(soup.select_one('#all_players_standard_batting').find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')


    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        batter_stats.append(tds)

In [5]:
# Create dataframe for batter statistics
batter_stats_df = pd.DataFrame(batter_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')
df_headers.append("Year")

# Set column headers equal to our list
batter_stats_df.columns = df_headers
batter_stats_df

Unnamed: 0,Name,Age,Tm,Lg,G,PA,AB,R,H,2B,...,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos Summary,Year
0,CJ Abrams*,21,TOT,NL,90,302,284,33,70,12,...,.604,76,92,5,9,2,2,0,64/H9D,2022
1,CJ Abrams*,21,SDP,NL,46,139,125,16,29,5,...,.605,77,40,4,6,2,2,0,64/H9D,2022
2,CJ Abrams*,21,WSN,NL,44,163,159,17,41,7,...,.603,75,52,1,3,0,0,0,6/H,2022
3,Albert Abreu,26,TOT,AL,1,0,0,0,0,0,...,,,0,0,0,0,0,0,1,2022
4,Albert Abreu,26,KCR,AL,1,0,0,0,0,0,...,,,0,0,0,0,0,0,/1,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6505,Ryan Zimmerman,33,WSN,NL,85,323,288,33,76,21,...,.824,114,140,10,3,0,2,1,3H,2018
6506,Jordan Zimmermann,32,DET,AL,2,2,2,0,0,0,...,.000,-100,0,0,0,0,0,0,1,2018
6507,Ben Zobrist#,37,CHC,NL,139,520,455,67,139,28,...,.817,117,200,8,2,1,7,1,497H/3D,2018
6508,Mike Zunino,27,SEA,AL,113,405,373,37,75,18,...,.669,85,153,7,6,0,2,0,*2/HD,2018


In [6]:
# Create a list to help create a dataframe from batter statistics data
fielding_stats = []

# Create a loop to create a dataframe from Baseball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-fielding.shtml'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has batter statistics
    table = soup.select_one('#all_players_players_standard_fielding_fielding')#.find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')


    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        fielding_stats.append(tds)

# Create dataframe for batter statistics
fielding_stats_df = pd.DataFrame(fielding_stats)

# Create an empty list to store dataframe header information
fielding_header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    fielding_header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
fielding_df_headers = fielding_header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
fielding_df_headers.remove('Rk')
fielding_df_headers.append("Year")

# Set column headers equal to our list
fielding_stats_df.columns = fielding_df_headers

final_fielding_stats_df = fielding_stats_df[['Name','PO','E','Year','Tm']]

position_players_df = pd.merge(final_fielding_stats_df, batter_stats_df, on=['Name','Year','Tm'])
position_players_df

    

Unnamed: 0,Name,PO,E,Year,Tm,Age,Lg,G,PA,AB,...,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos Summary
0,Albert Abreu,2,2,2022,TOT,26,AL,1,0,0,...,,,,0,0,0,0,0,0,1
1,José Abreu,954,11,2022,CHW,35,AL,157,679,601,...,.446,.824,133,268,19,12,0,4,2,*3D
2,Ronald Acuna Jr.,150,4,2022,ATL,24,NL,119,533,467,...,.413,.764,114,193,8,10,0,3,4,9D/H8
3,Willy Adames,178,14,2022,MIL,26,NL,139,617,563,...,.458,.756,112,258,11,1,0,4,3,*6/DH
4,Riley Adams,327,2,2022,WSN,26,NL,48,155,142,...,.310,.555,60,44,2,1,0,0,0,2/HD3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3337,Rob Zastryzny,1,0,2018,CHC,26,NL,6,0,0,...,,,,0,0,0,0,0,0,/1
3338,Brad Ziegler,4,1,2018,TOT,38,NL,76,0,0,...,,,,0,0,0,0,0,0,1
3339,Ryan Zimmerman,538,2,2018,WSN,33,NL,85,323,288,...,.486,.824,114,140,10,3,0,2,1,3H
3340,Jordan Zimmermann,9,0,2018,DET,32,AL,2,2,2,...,.000,.000,-100,0,0,0,0,0,0,1


In [7]:
position_players_df.columns

Index(['Name', 'PO', 'E', 'Year', 'Tm', 'Age', 'Lg', 'G', 'PA', 'AB', 'R', 'H',
       '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG',
       'OPS', 'OPS+', 'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB', 'Pos Summary'],
      dtype='object')

In [8]:
# Change types of columns to numeric for columns with number values
position_players_df[['Age','G','R','H','2B','3B','HR','RBI','SB','TB','BB','SO','PO','E','PA','OPS','OPS+']] = position_players_df[['Age','G', 'R','H','2B','3B','HR','RBI','SB','TB','BB','SO','PO','E','PA','OPS','OPS+']].apply(pd.to_numeric)

# Drop any players with 0 plate appearances to remove null values and change PA type to integer
position_players_df.dropna(subset=['PA'], axis = 0 , inplace= True)

# Remove any players with fewer than 100 plate appearances
filtered_position_players_df = position_players_df[position_players_df['PA'] >= 100]

# Select the columns we want for our batter analysis
final_position_players_df = filtered_position_players_df[['Year','Name','Tm','Age','G', 'R','H','2B','3B','HR','RBI','SB','TB','BB','SO','PO','E','PA','OPS','OPS+','Pos\xa0Summary']]

# Sort by index to prepare to drop duplicates
final_position_players_df = final_position_players_df.sort_index()

# Drop duplicate entries of Player Name and Year
final_position_players_df['1B'] = ''

for index, row in final_position_players_df.iterrows():
    final_position_players_df['1B'] = (final_position_players_df['H'] - (final_position_players_df['2B'] + final_position_players_df['3B'] + final_position_players_df['HR']))

final_position_players_df.columns


Index(['Year', 'Name', 'Tm', 'Age', 'G', 'R', 'H', '2B', '3B', 'HR', 'RBI',
       'SB', 'TB', 'BB', 'SO', 'PO', 'E', 'PA', 'OPS', 'OPS+', 'Pos Summary',
       '1B'],
      dtype='object')

In [9]:
final_position_players_df['FPTS'] = ''
final_position_players_df['AVG_FPTS'] = ''

for index, row in final_position_players_df.iterrows():
    final_position_players_df['FPTS'] = (final_position_players_df['1B'] + (2*final_position_players_df['2B']) + (3*final_position_players_df['3B']) + (4*final_position_players_df['HR']) + final_position_players_df['TB'] + final_position_players_df['BB'] + final_position_players_df['R'] + (2*final_position_players_df['RBI']) + (2*final_position_players_df['SB']) + final_position_players_df['PO'] - (2*final_position_players_df['SO']) - final_position_players_df['E'])
    final_position_players_df['AVG_FPTS'] = (final_position_players_df['FPTS']/final_position_players_df['G'])

final_position_players_df
    

Unnamed: 0,Year,Name,Tm,Age,G,R,H,2B,3B,HR,...,SO,PO,E,PA,OPS,OPS+,Pos Summary,1B,FPTS,AVG_FPTS
1,2022,José Abreu,CHW,35,157,85,183,40,0,15,...,110,954,11,679,0.824,133.0,*3D,128,1556,9.910828
2,2022,Ronald Acuna Jr.,ATL,24,119,71,124,24,0,15,...,126,150,4,533,0.764,114.0,9D/H8,85,562,4.722689
3,2022,Willy Adames,MIL,26,139,83,134,31,0,31,...,166,178,14,617,0.756,112.0,*6/DH,72,692,4.978417
4,2022,Riley Adams,WSN,26,48,14,25,4,0,5,...,46,327,2,155,0.555,60.0,2/HD3,16,367,7.645833
5,2022,Jo Adell,LAA,23,88,22,60,12,2,8,...,107,139,5,285,0.637,79.0,79H/D,38,215,2.443182
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3319,2018,Bobby Wilson,MIN,35,47,12,24,8,0,2,...,37,399,1,151,0.523,43.0,2,14,456,9.702128
3333,2018,Austin Wynns,BAL,27,42,16,28,2,0,4,...,25,257,1,118,0.669,83.0,2/HD,22,333,7.928571
3335,2018,Chris Young,LAA,34,56,17,19,2,1,6,...,37,56,1,128,0.615,68.0,987H/D,10,121,2.160714
3339,2018,Ryan Zimmerman,WSN,33,85,33,76,21,2,13,...,55,538,2,323,0.824,114.0,3H,40,873,10.270588


In [10]:
# Sort data by name alphabetically, then by year in descending order
final_position_players_df = final_position_players_df.sort_values(['Year','Name'], ascending=[True, True])

# Eliminate Baseball Reference's name badges for accolades
final_position_players_df['Name'] = final_position_players_df['Name'].str.extract('([^\*|#]*)')

cleaned_player_list = []
for player in final_position_players_df['Name']:
    player = player.replace("\xa0", " ")
    cleaned_player_list.append(player)

final_position_players_df['Name'] = cleaned_player_list 
final_position_players_df = final_position_players_df.sort_values(['FPTS'], ascending=False)

final_position_players_df

Unnamed: 0,Year,Name,Tm,Age,G,R,H,2B,3B,HR,...,SO,PO,E,PA,OPS,OPS+,Pos Summary,1B,FPTS,AVG_FPTS
2798,2018,Paul Goldschmidt,ARI,30,158,95,172,35,5,33,...,173,1323,6,690,0.922,142.0,*3/DH,99,1968,12.455696
781,2021,Vladimir Guerrero Jr.,TOR,22,161,123,188,29,1,48,...,110,1026,8,698,1.002,167.0,*3D/5,110,1963,12.192547
758,2021,Paul Goldschmidt,STL,33,158,102,177,36,2,31,...,136,1144,2,679,0.879,141.0,*3/DH,108,1881,11.905063
161,2022,Paul Goldschmidt,STL,34,151,106,178,41,0,35,...,141,1071,1,651,0.981,180.0,*3D/H,102,1865,12.350993
1980,2019,Paul Goldschmidt,STL,31,161,97,155,25,1,34,...,166,1256,5,682,0.821,115.0,*3/H,95,1862,11.565217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1345,2020,Jo Adell,LAA,21,38,9,20,4,0,3,...,55,72,3,132,0.478,30.0,9/8H,13,55,1.447368
3139,2018,Yadiel Rivera,MIA,26,111,13,24,3,0,1,...,51,46,4,160,0.485,37.0,H564/7389,20,54,0.486486
1858,2019,Zack Cozart,LAA,33,38,4,12,2,0,0,...,16,30,4,107,0.322,-12.0,5/H64,10,45,1.184211
1513,2020,Carter Kieboom,WSN,22,33,15,20,1,0,0,...,33,16,3,122,0.556,57.0,5/HD,19,39,1.181818


In [11]:
# Create a new dataframe for stats percentile calculations
percentile_df = pd.DataFrame(columns = ['Year', 'Name', 'Age','Pos\xa0Summary', 'Tm', 'FPTS_Percentile', 'AVG_FPTS_Percentile'])

# Carry over columnns from final_batter_stats_df that shouldn't be comparatively ranked 
percentile_df['Year'] = final_position_players_df['Year']
percentile_df['Name'] = final_position_players_df['Name']
percentile_df['Age'] = final_position_players_df['Age']
percentile_df['Tm'] = final_position_players_df['Tm']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
for year in last_five_years:
    year_df = final_position_players_df.loc[final_position_players_df['Year'] == year]
    year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
    year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)
    year_df.sort_values('Name', ascending=True)

    # Each of the seasons are added back to the percentile dataframe
    percentile_df = percentile_df.append(year_df, ignore_index=True)

percentile_df = percentile_df.sort_values(['Year','Name'], ascending=[True, True])
percentile_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)


Unnamed: 0,Year,Name,Age,Pos Summary,Tm,FPTS_Percentile,AVG_FPTS_Percentile,G,R,H,...,BB,SO,PO,E,PA,OPS,OPS+,1B,FPTS,AVG_FPTS
631,2018,A.J. Ellis,37,,SDP,,,,,,...,,,,,,,,,,
2272,2018,A.J. Ellis,37,2H/D7,SDP,0.392,0.72,66.0,19.0,41.0,...,26.0,37.0,321.0,1.0,183.0,0.722,104.0,32.0,425.0,6.439394
346,2018,AJ Pollock,30,,ARI,,,,,,...,,,,,,,,,,
2205,2018,AJ Pollock,30,*8/H,ARI,0.66,0.672,113.0,61.0,106.0,...,31.0,100.0,229.0,1.0,460.0,0.800,108.0,59.0,676.0,5.982301
938,2018,Aaron Altherr,27,,PHI,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1029,2022,Yu Chang,26,,TOT,,,,,,...,,,,,,,,,,
1413,2022,Yu Chang,26,46/531H,TOT,0.129771,0.236641,51.0,14.0,28.0,...,12.0,41.0,116.0,2.0,141.0,0.624,81.0,20.0,168.0,3.294118
1415,2022,Yu Chang,26,46/35HD1,TOT,0.122137,0.064885,69.0,19.0,35.0,...,16.0,59.0,116.0,2.0,190.0,0.605,75.0,25.0,167.0,2.420290
29,2022,Yuli Gurriel,38,,HOU,,,,,,...,,,,,,,,,,


In [12]:
# Clean up the percentile dataframe, drop NaNs and remove unnecessary columns
percentile_df = percentile_df.dropna()

# Add a rank column that adds the percentiles from each category
percentile_df['Rank'] = (percentile_df['FPTS_Percentile'] + percentile_df['AVG_FPTS_Percentile'])

percentile_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  percentile_df['Rank'] = (percentile_df['FPTS_Percentile'] + percentile_df['AVG_FPTS_Percentile'])


Unnamed: 0,Year,Name,Age,Pos Summary,Tm,FPTS_Percentile,AVG_FPTS_Percentile,G,R,H,...,SO,PO,E,PA,OPS,OPS+,1B,FPTS,AVG_FPTS,Rank
2272,2018,A.J. Ellis,37,2H/D7,SDP,0.392,0.72,66.0,19.0,41.0,...,37.0,321.0,1.0,183.0,0.722,104.0,32.0,425.0,6.439394,1.112
2205,2018,AJ Pollock,30,*8/H,ARI,0.66,0.672,113.0,61.0,106.0,...,100.0,229.0,1.0,460.0,0.800,108.0,59.0,676.0,5.982301,1.332
2331,2018,Aaron Altherr,27,9H8/7,PHI,0.156,0.06,105.0,28.0,44.0,...,91.0,95.0,1.0,285.0,0.628,69.0,24.0,220.0,2.095238,0.216
2224,2018,Aaron Judge,26,9D/H8,NYY,0.584,0.588,112.0,77.0,115.0,...,152.0,171.0,3.0,498.0,0.919,150.0,66.0,599.0,5.348214,1.172
2254,2018,Adam Duvall,29,7H3/95,TOT,0.466,0.268,138.0,48.0,75.0,...,117.0,228.0,2.0,427.0,0.639,70.0,40.0,483.0,3.500000,0.734
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1248,2022,Yandy Díaz,30,53DH,TBR,0.759542,0.645038,137.0,71.0,140.0,...,60.0,184.0,6.0,558.0,0.824,143.0,98.0,727.0,5.306569,1.40458
1292,2022,Yonathan Daza,28,*87/H9,COL,0.593511,0.60687,113.0,56.0,112.0,...,58.0,252.0,6.0,408.0,0.733,97.0,87.0,566.0,5.008850,1.200382
1413,2022,Yu Chang,26,46/531H,TOT,0.129771,0.236641,51.0,14.0,28.0,...,41.0,116.0,2.0,141.0,0.624,81.0,20.0,168.0,3.294118,0.366412
1415,2022,Yu Chang,26,46/35HD1,TOT,0.122137,0.064885,69.0,19.0,35.0,...,59.0,116.0,2.0,190.0,0.605,75.0,25.0,167.0,2.420290,0.187023


In [13]:
# Create a list of each unique player we have in our dataframe
player_list = percentile_df.Name.unique().tolist()

# Create a new dataframe for combined, averaged percentiles over the past 5 seasons
new_df = pd.DataFrame(columns = ['Name', 'Rank', 'Trend', 'Pos', 'Years', 'FPTS', 'AVG_FPTS', 'FPTS_Percentile', 'AVG_FPTS_Percentile'])

# Create a list for each percentile stat category for upcoming loop
player_trends = []
average_FPTS = []
average_AVG_FPTS = []
average_FPTS_Percentile = []
average_AVG_FPTS_Percentile = []
average_Rank = []
year_count = []
pos = []

# Loop through each player, check if they played in the past two seasons. If not, remove them
for player in player_list:
    filter_df = percentile_df.loc[percentile_df['Name'] == player]
    filter_df = filter_df.sort_values(['Year'], ascending=[False])
    year_list = filter_df.Year.tolist()
    if (year_list[0] != last_year) and (year_list[0] != (last_year - 1)):
        player_list.remove(player)

# Update new dataframe with updated unique player list
new_df['Name'] = player_list        

# Loop through each player, locate their percentile stats for each season, average them out
for player in player_list:
    player_df = percentile_df.loc[percentile_df['Name'] == player]
    
    # We want to find the slope of the line of best fit for each player's overall ranking each season
    x = np.array(player_df['Year'], dtype = float)
    y = np.array(player_df['Rank'], dtype = float)
    slope, intercept = np.polyfit(x, y, 1)
    player_trends.append(slope)
    
    # Find average of each player's percentiles from previous 5 seasons
    average_FPTS.append(sum(player_df['FPTS']) / len(player_df['FPTS']))
    average_AVG_FPTS.append(sum(player_df['AVG_FPTS']) / len(player_df['AVG_FPTS']))
    average_FPTS_Percentile.append(sum(player_df['FPTS_Percentile']) / len(player_df['FPTS_Percentile']))
    average_AVG_FPTS_Percentile.append(sum(player_df['AVG_FPTS_Percentile']) / len(player_df['AVG_FPTS_Percentile']))
    average_Rank.append(sum(player_df['Rank']) / len(player_df['Rank']))
    year_count.append(len(x))
    
    # Keep player positions for reference purposes during the draft
    pos.append(player_df['Pos\xa0Summary'].unique())

# Update new dataframe with the list data from each stat
new_df['Pos'] = pos
new_df['Trend'] = player_trends
new_df['FPTS'] = average_FPTS
new_df['AVG_FPTS'] = average_AVG_FPTS
new_df['FPTS_Percentile'] = average_FPTS_Percentile
new_df['AVG_FPTS_Percentile'] = average_AVG_FPTS_Percentile
new_df['Rank'] = average_Rank

# Keep track of how many seasons are being considered, so we know how reliable the data is
new_df['Years'] = year_count



  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

In [14]:
# Create a weighted rank column by adding trend data to the rank data and account for number of seasons played
# Basically, if you played all 5 seasons, your trend stat is added directly
# If you played fewer than all 5 seasons, your trend stat is reduced depending on how few seasons you played
# new_df['Weighted Rank'] = (new_df['Rank'] + ((new_df['Trend'] * (new_df['Years'] - 1) / 4)))

# # shift column 'Weighted Rank' to first position
# first_column = new_df.pop('Weighted Rank')
  
# # insert column using insert(position,column_name,first_column) function
# new_df.insert(1, 'Weighted Rank', first_column)

new_df = new_df.sort_values('FPTS', ascending = False)

In [15]:
# separate position column into a list for editing
new_pos_list = []
pos_list = new_df['Pos'].tolist()

# loop through list and pull only the last item, which represents player position listed from most recent season
for i in pos_list:
    j = i[-1]
    
    if re.search('/', j):
        k = re.sub("([^\/]+$)","",j)
        new_pos_list.append(k)
    else:
        new_pos_list.append(j)

cleaned_list = []
for pos in new_pos_list:
    placeholder = re.findall("[a-zA-Z0-9]+", pos)
    placeholder_2 = ''.join(placeholder)
    placeholder_3 = [d for d in placeholder_2]
    cleaned_list.append(placeholder_3)

cleaned_pos_list = []
for n_list in cleaned_list:
    
    placeholder_list = []
    for pos in n_list:
        if pos == '1':
            placeholder_list.append('P')
        elif pos == '2':
            placeholder_list.append('C')
        elif pos == '3':
            placeholder_list.append('1B')
        elif pos == '4':
            placeholder_list.append('2B')
        elif pos == '5':
            placeholder_list.append('3B')
        elif pos == '6':
            placeholder_list.append('SS')
        elif pos == ('7'):
            placeholder_list.append('OF')
        elif pos == ('8'):
            placeholder_list.append('OF')
        elif pos == ('9'):
            placeholder_list.append('OF')
        elif pos == ('D'):
            placeholder_list.append('DH')
        
    cleaned_pos_list.append(placeholder_list)        

temp_pos_list = []
for item in cleaned_pos_list:
    new_string = []
    for pos in item:
        string = str(pos)
        new_string = f'{new_string},{string}'
    temp_pos_list.append(new_string)

    
final_pos_list = []
for i in temp_pos_list:
    i = i.replace('[],', '')
    final_pos_list.append(i)
    
# replace old position column with new position column
new_df.drop('Pos', axis = 1, inplace = True)
new_df['Pos'] = final_pos_list

new_df.head(25)

Unnamed: 0,Name,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos
138,Paul Goldschmidt,1.963268,-0.000668,5,1632.4,11.676084,0.995547,0.967721,"1B,DH"
89,José Abreu,1.937161,-0.010382,5,1415.8,11.080875,0.984405,0.952756,"1B,DH"
219,Pete Alonso,1.895619,0.021368,4,1403.75,10.211517,0.972342,0.923277,"1B,DH"
70,J.T. Realmuto,1.948361,0.00149,5,1379.8,11.558872,0.976092,0.972268,C
26,C.J. Cron,1.83838,0.076399,4,1365.75,9.822698,0.938527,0.899853,"1B,DH"
181,Yuli Gurriel,1.921756,0.009241,5,1355.2,10.789009,0.979547,0.942209,1B
190,Christian Walker,1.834298,0.021583,4,1197.25,9.611939,0.944284,0.890014,1B
143,Rhys Hoskins,1.849587,0.065053,5,1197.2,10.119458,0.942848,0.90674,1B
25,Buster Posey,1.875741,0.018211,3,1192.333333,10.779375,0.930022,0.945719,C
228,Vladimir Guerrero Jr.,1.629204,0.371922,4,1169.25,8.859455,0.849491,0.779713,"1B,DH"


In [16]:
############################################################################################
############################################################################################
############################################################################################

#    However, for example, if your league is set to have a Games Started limit of 12 and you have 10 pitchers at the 
#    completion of Saturday's games and start 4 pitchers on Sunday, you will receive stats for all 14 pitchers. 

#    (Note: This can happen on any day during the week. If managers have 10 pitchers by end of Wednesday and 
#    starts 4 on Thursday, they will receive points for the 4 pitchers on Thursday but for Friday, Saturday and 
#    Sunday, they will not receive any starting pitchers points.)

############################################################################################
############################################################################################
############################################################################################


In [17]:
# Create a list to help create a dataframe from batter statistics data
pitcher_stats = []

for year in last_five_years:

    # input URL and use BeautifulSoup to parse through the page
    pitching_url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-pitching.shtml'
    pitching_soup = BeautifulSoup(requests.get(pitching_url).content, 'html.parser')

    # Grab the table element that has batter statistics
    pitching_table = BeautifulSoup(pitching_soup.select_one('#all_players_standard_pitching').find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')

    # Grab data from table and put it into the list created above
    for tr in pitching_table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        pitcher_stats.append(tds)
        

In [18]:
# Create dataframe for batter statistics
raw_pitcher_stats_df = pd.DataFrame(pitcher_stats)

# Create an empty list to store dataframe header information
pitcher_header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in pitching_table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    pitcher_header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
pitcher_df_headers = pitcher_header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
pitcher_df_headers.remove('Rk')
pitcher_df_headers.append("Year")

# Set column headers equal to our list
raw_pitcher_stats_df.columns = pitcher_df_headers



In [19]:
raw_pitcher_stats_df

Unnamed: 0,Name,Age,Tm,Lg,W,L,W-L%,ERA,G,GS,...,BF,ERA+,FIP,WHIP,H9,HR9,BB9,SO9,SO/W,Year
0,Cory Abbott,26,WSN,NL,0,5,.000,5.25,16,9,...,216,75,6.36,1.438,8.3,2.3,4.7,8.4,1.80,2022
1,Albert Abreu,26,TOT,AL,2,2,.500,3.26,33,0,...,172,124,4.77,1.474,8.1,1.2,5.1,8.8,1.73,2022
2,Albert Abreu,26,TEX,AL,0,0,,3.12,7,0,...,42,132,8.54,1.846,4.2,2.1,12.5,9.3,0.75,2022
3,Albert Abreu,26,KCR,AL,0,0,,4.15,4,0,...,22,107,8.19,2.308,12.5,2.1,8.3,6.2,0.75,2022
4,Albert Abreu,26,NYY,AL,2,2,.500,3.16,22,0,...,108,125,2.92,1.208,8.8,0.7,2.1,9.1,4.33,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5071,Brad Ziegler,38,TOT,NL,2,6,.250,3.91,82,0,...,305,102,4.31,1.303,8.7,1.0,3.1,6.1,2.00,2018
5072,Brad Ziegler,38,MIA,NL,1,5,.167,3.98,53,0,...,213,97,4.58,1.269,8.5,1.2,2.9,6.4,2.18,2018
5073,Brad Ziegler,38,ARI,NL,1,1,.500,3.74,29,0,...,92,114,3.67,1.385,9.1,0.4,3.3,5.4,1.63,2018
5074,Jordan Zimmermann,32,DET,AL,7,8,.467,4.52,25,25,...,556,99,4.88,1.264,9.6,1.9,1.8,7.6,4.27,2018


In [20]:
# Create a list to help create a dataframe from batter statistics data
reliever_stats = []

# Create a loop to create a dataframe from Baseball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-reliever-pitching.shtml'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has batter statistics
    table = BeautifulSoup(soup.select_one('#all_players_reliever_pitching').find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')


    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        reliever_stats.append(tds)

# Create dataframe for batter statistics
reliever_stats_df = pd.DataFrame(reliever_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')
df_headers.append("Year")

# Set column headers equal to our list
reliever_stats_df.columns = df_headers


final_reliever_stats_df = reliever_stats_df[['Name','Hold','Year','Tm']]

final_pitcher_stats_df = pd.merge(final_reliever_stats_df, raw_pitcher_stats_df, how = 'outer', on=['Name','Year','Tm'])

final_pitcher_stats_df



Unnamed: 0,Name,Hold,Year,Tm,Age,Lg,W,L,W-L%,ERA,...,WP,BF,ERA+,FIP,WHIP,H9,HR9,BB9,SO9,SO/W
0,Cory Abbott,0,2022,WSN,26,NL,0,5,.000,5.25,...,2,216,75,6.36,1.438,8.3,2.3,4.7,8.4,1.80
1,Albert Abreu,1,2022,TOT,26,AL,2,2,.500,3.26,...,6,172,124,4.77,1.474,8.1,1.2,5.1,8.8,1.73
2,Albert Abreu,0,2022,TEX,26,AL,0,0,,3.12,...,0,42,132,8.54,1.846,4.2,2.1,12.5,9.3,0.75
3,Albert Abreu,0,2022,KCR,26,AL,0,0,,4.15,...,1,22,107,8.19,2.308,12.5,2.1,8.3,6.2,0.75
4,Albert Abreu,1,2022,NYY,26,AL,2,2,.500,3.16,...,5,108,125,2.92,1.208,8.8,0.7,2.1,9.1,4.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5123,Taijuan Walker,,2018,ARI,25,NL,0,0,,3.46,...,0,56,124,3.93,1.538,10.4,0.7,3.5,6.2,1.80
5124,Zack Wheeler,,2018,NYM,28,NL,12,7,.632,3.31,...,2,744,112,3.25,1.124,7.4,0.7,2.7,8.8,3.25
5125,Trevor Williams,,2018,PIT,26,NL,14,10,.583,3.11,...,4,701,126,3.86,1.178,7.7,0.8,2.9,6.6,2.29
5126,Jordan Zimmermann,,2018,DET,32,AL,7,8,.467,4.52,...,1,556,99,4.88,1.264,9.6,1.9,1.8,7.6,4.27


In [21]:
final_pitcher_stats_df.columns

Index(['Name', 'Hold', 'Year', 'Tm', 'Age', 'Lg', 'W', 'L', 'W-L%', 'ERA', 'G',
       'GS', 'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'HR', 'BB', 'IBB',
       'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP', 'WHIP', 'H9', 'HR9',
       'BB9', 'SO9', 'SO/W'],
      dtype='object')

In [22]:
# Change types of columns to numeric for columns with number values
final_pitcher_stats_df[['Age','G','GS','IP','ER','W','L','SV','SO','H','BB','CG','Hold']] = final_pitcher_stats_df[['Age','G','GS','IP','ER','W','L','SV','SO','H','BB','CG','Hold']].apply(pd.to_numeric)

# Drop any players with NaN innings pitched, ERA, and WHIP to remove null values 
final_pitcher_stats_df["Hold"].fillna(0, inplace = True)
final_pitcher_stats_df.dropna(subset=['IP'], axis = 0 , inplace= True)
final_pitcher_stats_df.dropna(subset=['ERA'], axis = 0 , inplace= True)
final_pitcher_stats_df.dropna(subset=['WHIP'], axis = 0 , inplace= True)
final_pitcher_stats_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Remove any pitchers with fewer than 30 innings pitched
final_pitcher_stats_df = final_pitcher_stats_df[final_pitcher_stats_df['IP'] >= 30]

# Select the columns we want for our pitcher analysis
final_pitcher_stats_df = final_pitcher_stats_df[['Year','Name','Age','G','GS','IP','ER','W','L','SV','SO','H','BB','CG','Hold']]

# Eliminate Baseball Reference's name badges for accolades
final_pitcher_stats_df['Name'] = final_pitcher_stats_df['Name'].str.extract('([^\*|#]*)')

pitcher_list = final_pitcher_stats_df.Name.tolist()

cleaned_pitcher_list = []
for pitcher in pitcher_list:
    pitcher = pitcher.replace("\xa0", " ")
    cleaned_pitcher_list.append(pitcher) 
    
final_pitcher_stats_df['Name'] = cleaned_pitcher_list 




In [23]:

final_pitcher_stats_df
test_df = final_pitcher_stats_df.loc[final_pitcher_stats_df['Name'] == 'Blake Snell']
test_df


Unnamed: 0,Year,Name,Age,G,GS,IP,ER,W,L,SV,SO,H,BB,CG,Hold
739,2022,Blake Snell,29.0,24,24,128.0,48,8,10,0,171,103,51,0,0.0
4633,2021,Blake Snell,28.0,27,27,128.2,60,7,6,0,170,101,69,0,0.0
4784,2020,Blake Snell,27.0,11,11,50.0,18,4,2,0,63,42,18,0,0.0
4943,2019,Blake Snell,26.0,23,23,107.0,51,6,8,0,147,96,40,0,0.0
5102,2018,Blake Snell,25.0,31,31,180.2,38,21,5,0,221,112,64,0,0.0


In [24]:
# Sort data by name alphabetically, then by year in descending order
final_pitcher_stats_df = final_pitcher_stats_df.sort_values(['Year','Name'], ascending=[True, True])



final_pitcher_stats_df['FPTS'] = ''
final_pitcher_stats_df['AVG_FPTS'] = ''
final_pitcher_stats_df['Pos'] = ''
pos_list = []
final_pos_list = []

for index, row in final_pitcher_stats_df.iterrows():
    final_pitcher_stats_df['FPTS'] = ((3*final_pitcher_stats_df['IP']) - final_pitcher_stats_df['H'] - (2*final_pitcher_stats_df['ER']) - final_pitcher_stats_df['BB'] + (2*final_pitcher_stats_df['W']) - (2*final_pitcher_stats_df['L']) + (5*final_pitcher_stats_df['SV']) + (2*final_pitcher_stats_df['SO']) + (3*final_pitcher_stats_df['CG']) + (2*final_pitcher_stats_df['Hold']))
    final_pitcher_stats_df['AVG_FPTS'] = (final_pitcher_stats_df['FPTS']/final_pitcher_stats_df['G'])
#     final_pitcher_stats_df['Pos'] = (final_pitcher_stats_df['GS']/final_pitcher_stats_df['G'])
    
# #    print(final_pitcher_stats_df['GS'] / final_pitcher_stats_df['G'])
    pitcher_ratio = (final_pitcher_stats_df['GS'] / final_pitcher_stats_df['G'])[index]
    pos_list.append(pitcher_ratio)
# #    final_pitcher_stats_df['Pos'] = pitcher_ratio

for i in range(len(pos_list)):
    if pos_list[i] > (2/3):
        final_pos_list.append('SP')
    elif pos_list[i] < (1/3):
        final_pos_list.append('RP')
    else:
        final_pos_list.append('SP,RP')
        
final_pitcher_stats_df['Pos'] = final_pos_list
        
final_pitcher_stats_df = final_pitcher_stats_df.sort_values(['FPTS'], ascending=False)
final_pitcher_stats_df.head(50)


Unnamed: 0,Year,Name,Age,G,GS,IP,ER,W,L,SV,SO,H,BB,CG,Hold,FPTS,AVG_FPTS,Pos
4838,2019,Gerrit Cole,28.0,33,33,212.1,59,20,5,0,326,142,48,0,0.0,1010.3,30.615152,SP
4954,2019,Justin Verlander,36.0,34,34,223.0,64,21,6,0,300,137,42,2,0.0,998.0,29.352941,SP
5096,2018,Max Scherzer,33.0,33,33,220.2,62,18,7,0,300,150,51,2,0.0,963.6,29.2,SP
5120,2018,Justin Verlander,35.0,34,34,214.0,60,16,9,0,290,156,37,1,0.0,926.0,27.235294,SP
4995,2018,Jacob deGrom,30.0,32,32,217.0,41,10,9,0,269,152,46,1,0.0,914.0,28.5625,SP
4989,2018,Gerrit Cole,27.0,32,32,200.1,64,15,5,0,276,143,64,1,0.0,840.3,26.259375,SP
4843,2019,Jacob deGrom,31.0,32,32,204.0,55,11,8,0,255,154,44,0,0.0,820.0,25.625,SP
4649,2021,Zack Wheeler,31.0,32,32,213.1,66,14,10,0,247,169,46,3,0.0,803.3,25.103125,SP
2652,2019,Shane Bieber,24.0,34,33,214.1,78,15,8,0,259,186,40,3,0.0,801.3,23.567647,SP
5069,2018,Aaron Nola,25.0,33,33,212.1,56,17,6,0,224,149,58,0,0.0,787.3,23.857576,SP


In [25]:
# Create a new dataframe for stats percentile calculations
pitcher_percentile_df = pd.DataFrame(columns = ['Year','Name','Age','G','GS','IP','ER','W','L','SV','SO','H','BB','CG','Hold','Pos'])

# Carry over columnns from final_pitcher_stats_df that shouldn't be comparatively ranked 
pitcher_percentile_df['Year'] = final_pitcher_stats_df['Year']
pitcher_percentile_df['Name'] = final_pitcher_stats_df['Name']
pitcher_percentile_df['Age'] = final_pitcher_stats_df['Age']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
for year in last_five_years:
    year_df = final_pitcher_stats_df.loc[final_pitcher_stats_df['Year'] == year]
    year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
    year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)
    year_df.sort_values('Name', ascending=True)    
    
    # Each of the seasons are added back to the percentile dataframe
    pitcher_percentile_df = pitcher_percentile_df.append(year_df, ignore_index=True)

pitcher_percentile_df = pitcher_percentile_df.sort_values(['Year','Name'], ascending=[True, True])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)


In [26]:
# Clean up the percentile dataframe, drop NaNs and remove unnecessary columns
pitcher_percentile_df = pitcher_percentile_df.dropna()

# Add a rank column that adds the percentiles from each category
pitcher_percentile_df['Rank'] = (pitcher_percentile_df['FPTS_Percentile'] + pitcher_percentile_df['AVG_FPTS_Percentile'])

pitcher_percentile_df


Unnamed: 0,Year,Name,Age,G,GS,IP,ER,W,L,SV,...,H,BB,CG,Hold,Pos,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Rank
4457,2018,A.J. Cole,26.0,28,0,38.0,18,3,1,0,...,39,16,0,0.0,RP,125.0,4.464286,0.234082,0.421348,0.655431
4461,2018,A.J. Cole,26.0,32,2,48.1,33,4,2,0,...,55,22,0,0.0,RP,123.3,3.853125,0.225655,0.325843,0.551498
4203,2018,A.J. Minter,24.0,65,0,61.1,22,4,3,15,...,57,22,0,12.0,RP,299.3,4.604615,0.709738,0.440075,1.149813
4533,2018,Aaron Bummer,24.0,37,0,31.2,15,0,1,0,...,40,10,0,2.0,RP,85.6,2.313514,0.091760,0.058052,0.149813
4456,2018,Aaron Loup,30.0,59,0,39.2,20,0,0,0,...,48,14,0,11.0,RP,125.6,2.128814,0.235955,0.043071,0.279026
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2561,2022,Zach Thompson,28.0,29,22,121.2,70,3,10,0,...,138,46,0,0.0,SP,205.6,7.089655,0.484733,0.597328,1.082061
2493,2022,Zack Greinke,38.0,26,26,137.0,56,4,9,0,...,157,27,0,0.0,SP,251.0,9.653846,0.613550,0.692748,1.306298
2719,2022,Zack Littell,26.0,39,0,44.1,25,3,3,1,...,48,13,0,5.0,RP,114.3,2.930769,0.183206,0.091603,0.274809
2724,2022,Zack Thompson,24.0,22,1,34.2,8,1,1,1,...,20,14,0,0.0,RP,111.6,5.072727,0.173664,0.459924,0.633588


In [27]:


# test_df = raw_pitcher_stats_df.loc[raw_pitcher_stats_df['Name'] == 'Blake\xa0Snell']
# test_df

In [28]:
# Create a list of each unique player we have in our dataframe
pitcher_list = pitcher_percentile_df.Name.unique().tolist()

# Create a new dataframe for combined, averaged percentiles over the past 5 seasons
new_pitcher_df = pd.DataFrame(columns = ['Name', 'Rank', 'Trend', 'Years', 'FPTS', 'AVG_FPTS', 'FPTS_Percentile', 'AVG_FPTS_Percentile','Pos','G','GS'])

# Create a list for each percentile stat category for upcoming loop
pitcher_trends = []
average_FPTS = []
average_AVG_FPTS = []
average_FPTS_Percentile = []
average_AVG_FPTS_Percentile = []
pitcher_average_Rank = []
pitcher_year_count = []
pos = []
games = []
games_started = []

cleaned_pitcher_list = []
for pitcher in pitcher_list:
    pitcher = pitcher.replace("\xa0", " ")
    cleaned_pitcher_list.append(pitcher) 

# new_pitcher_df
    
# Loop through each player, check if they played in the past two seasons. If not, remove them
for pitcher in cleaned_pitcher_list:
    filter_df = pitcher_percentile_df.loc[pitcher_percentile_df['Name'] == pitcher]
    filter_df = filter_df.sort_values(['Year'], ascending=[False])
    year_list = filter_df.Year.tolist()
    if (year_list[0] != last_year) and (year_list[0] != (last_year - 1)):
        pitcher_list.remove(pitcher)

# Update new dataframe with updated unique player list
new_pitcher_df['Name'] = pitcher_list        

# Loop through each player, locate their percentile stats for each season, average them out
for pitcher in pitcher_list:
    pitcher_df = pitcher_percentile_df.loc[pitcher_percentile_df['Name'] == pitcher]
    
    # We want to find the slope of the line of best fit for each player's overall ranking each season
    x = np.array(pitcher_df['Year'], dtype = float)
    y = np.array(pitcher_df['Rank'], dtype = float)
    slope, intercept = np.polyfit(x, y, 1)
    pitcher_trends.append(slope)
    
    # Find average of each player's percentiles from previous 5 seasons
    average_FPTS.append(sum(pitcher_df['FPTS']) / len(pitcher_df['FPTS']))
    average_AVG_FPTS.append(sum(pitcher_df['AVG_FPTS']) / len(pitcher_df['AVG_FPTS']))
    average_FPTS_Percentile.append(sum(pitcher_df['FPTS_Percentile']) / len(pitcher_df['FPTS_Percentile']))
    average_AVG_FPTS_Percentile.append(sum(pitcher_df['AVG_FPTS_Percentile']) / len(pitcher_df['AVG_FPTS_Percentile']))
    pitcher_average_Rank.append(sum(pitcher_df['Rank']) / len(pitcher_df['Rank']))
    pitcher_year_count.append(len(x))
    pos.append(pitcher_df['Pos'].iloc[-1])
    games.append(pitcher_df['G'].iloc[-1])
    games_started.append(pitcher_df['GS'].iloc[-1])

# Update new dataframe with the list data from each stat
new_pitcher_df['Trend'] = pitcher_trends
new_pitcher_df['Pos'] = pos
new_pitcher_df['G'] = games
new_pitcher_df['GS'] = games_started
new_pitcher_df['FPTS'] = average_FPTS
new_pitcher_df['AVG_FPTS'] = average_AVG_FPTS
new_pitcher_df['FPTS_Percentile'] = average_FPTS_Percentile
new_pitcher_df['AVG_FPTS_Percentile'] = average_AVG_FPTS_Percentile
new_pitcher_df['Rank'] = pitcher_average_Rank

# Keep track of how many seasons are being considered, so we know how reliable the data is
new_pitcher_df['Years'] = pitcher_year_count



  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

In [29]:
# Create a weighted rank column by adding trend data to the rank data and account for number of seasons played
# Basically, if you played all 5 seasons, your trend stat is added directly
# If you played fewer than all 5 seasons, your trend stat is reduced depending on how few seasons you played
# new_pitcher_df['Weighted Rank'] = (new_pitcher_df['Rank'] + ((new_pitcher_df['Trend'] * (new_pitcher_df['Years'] - 1) / 4)))

# # shift column 'Weighted Rank' to first position
# first_pitcher_column = new_pitcher_df.pop('Weighted Rank')
  
# # insert column using insert(position,column_name,first_column) function
# new_pitcher_df.insert(1, 'Weighted Rank', first_pitcher_column)

new_pitcher_df = new_pitcher_df.sort_values('FPTS', ascending = False)
new_pitcher_df.head(25)


Unnamed: 0,Name,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos,G,GS
142,Justin Verlander,1.991142,-0.002376,3,878.0,27.315126,0.994923,0.996219,SP,28,28
90,Gerrit Cole,1.973709,-0.003759,5,729.9,25.954178,0.98892,0.98479,SP,33,33
3,Aaron Nola,1.932655,-0.001182,5,611.1,21.651515,0.973629,0.959026,SP,32,32
617,Spencer Strider,1.900763,0.00047,1,604.6,19.503226,0.96374,0.937023,"SP,RP",31,20
175,Max Scherzer,1.907929,-0.019894,7,582.3,25.799976,0.932045,0.975885,SP,23,23
102,Jacob deGrom,1.922392,-0.056638,5,572.66,28.429924,0.92882,0.993572,SP,11,11
358,Alek Manoah,1.894205,0.118079,2,539.1,20.979516,0.926833,0.967372,SP,31,31
47,Charlie Morton,1.701021,-0.016844,5,537.44,18.573587,0.840384,0.860637,SP,31,31
252,Zack Wheeler,1.850295,0.008665,5,535.18,20.303524,0.926475,0.92382,SP,26,26
489,Shane McClanahan,1.869118,0.177795,2,527.3,19.707071,0.924409,0.944709,SP,28,28


In [30]:
draft_df = pd.concat([new_df, new_pitcher_df], ignore_index=True, sort=False)

# draft_df.drop(columns = ['Rank','FPTS_Percentile','AVG_FPTS_Percentile'])


# draft_df['FPTS_Percentile'] = draft_df['FPTS'].rank(pct=True)
# draft_df['AVG_FPTS_Percentile'] = draft_df['AVG_FPTS'].rank(pct=True)
    
# draft_df['Rank'] = (draft_df['FPTS_Percentile'] + (draft_df['AVG_FPTS_Percentile'] / 2))

# draft_df['Weighted Rank'] = (draft_df['Rank'] + (((draft_df['Trend'] / 2) * (draft_df['Years'] - 1) / 4)))
# shift column 'Weighted Rank' to first position
# weighted_rank_column = draft_df.pop('Weighted Rank')
  
# # insert column using insert(position,column_name,first_column) function
# draft_df.insert(1, 'Weighted Rank', weighted_rank_column)


draft_df['Weighted_Rank'] = ''
draft_df['Weighted_Rank'] = abs(draft_df['FPTS'] * (1 + draft_df['Trend']))

weighted_rank_column = draft_df.pop('Weighted_Rank')
  
# insert column using insert(position,column_name,first_column) function
draft_df.insert(1, 'Weighted_Rank', weighted_rank_column)


draft_df = draft_df.sort_values(by='Weighted_Rank', ascending=False)
draft_df["G"].fillna("N/A", inplace = True)
draft_df["GS"].fillna("N/A", inplace = True)

draft_df.head(50)

Unnamed: 0,Name,Weighted_Rank,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos,G,GS
0,Paul Goldschmidt,1631.308945,1.963268,-0.000668,5,1632.4,11.676084,0.995547,0.967721,"1B,DH",,
9,Vladimir Guerrero Jr.,1604.119752,1.629204,0.371922,4,1169.25,8.859455,0.849491,0.779713,"1B,DH",,
4,C.J. Cron,1470.091665,1.83838,0.076399,4,1365.75,9.822698,0.938527,0.899853,"1B,DH",,
2,Pete Alonso,1433.74568,1.895619,0.021368,4,1403.75,10.211517,0.972342,0.923277,"1B,DH",,
1,José Abreu,1401.101654,1.937161,-0.010382,5,1415.8,11.080875,0.984405,0.952756,"1B,DH",,
3,J.T. Realmuto,1381.856144,1.948361,0.00149,5,1379.8,11.558872,0.976092,0.972268,C,,
5,Yuli Gurriel,1367.722787,1.921756,0.009241,5,1355.2,10.789009,0.979547,0.942209,1B,,
24,Ryan Mountcastle,1303.125962,1.540641,0.414391,3,921.333333,7.911266,0.75598,0.784661,"1B,DH",,
7,Rhys Hoskins,1275.08205,1.849587,0.065053,5,1197.2,10.119458,0.942848,0.90674,1B,,
53,Ty France,1240.459979,1.129641,0.625767,4,763.0,6.281866,0.582624,0.547016,1B,,


In [31]:
######################################################################################################################
######################################################################################################################
######################################################################################################################
#### DRAFT DAY FUNCTIONS
   
# DROP A PLAYER 
def drafted(player):
    global draft_df
    global final_pitcher_stats_df
    global final_batter_stats_df
    draft_df = draft_df[draft_df.Name != player]
    final_pitcher_stats_df = final_pitcher_stats_df[final_pitcher_stats_df.Name != player]
    final_batter_stats_df = final_batter_stats_df[final_batter_stats_df.Name != player]
    return draft_df.head(25)
    
# FILTER PLAYERS BY POSITION
def position_filter(POS):
    filtered_draft_df = draft_df[draft_df['Pos'].str.contains(POS)]
    return filtered_draft_df.head(25)

# PULL PITCHING STAT CATEGORY LEADERS
def pitching_stat_leaders(CAT):
    global final_pitcher_stats_df
    pitching_filtered_draft_df = final_pitcher_stats_df.sort_values([CAT], ascending=[False])
    return pitching_filtered_draft_df.head(25)

# PULL BATTING STAT CATEGORY LEADERS
def batting_stat_leaders(CAT):
    global final_position_players_df
    batting_filtered_draft_df = final_position_players_df.sort_values([CAT], ascending=[False])
    return batting_filtered_draft_df.head(25)

def drop_all_position(POS):
    global draft_df
    draft_df = draft_df[draft_df.Pos != POS]
    return draft_df.head(25)