In [1]:
# Import needed dependencies
import requests
import re
import pandas as pd
import numpy as np
import scipy.stats as stats
from datetime import date
from bs4 import BeautifulSoup, Comment

pd.set_option('display.max_columns', None)

In [2]:
today = date.today()

# dd/mm/YY
current_year = today.strftime("%Y")

last_year = int(current_year) - 1

In [3]:
last_five_years = []
for i in range(1,6):
    last_five_years.append(int(current_year) - i)

In [4]:
# Create a list to help create a dataframe from batter statistics data
batter_stats = []

# Create a loop to create a dataframe from Baseball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-batting.shtml'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has batter statistics
    table = soup.select_one('#all_players_standard_batting')


    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        batter_stats.append(tds)

In [5]:
# Create dataframe for batter statistics
batter_stats_df = pd.DataFrame(batter_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')
df_headers.append("Year")

# Set column headers equal to our list
batter_stats_df.columns = df_headers
batter_stats_df

Unnamed: 0,Player,Age,Team,Lg,WAR,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,rOBA,Rbat+,TB,GIDP,HBP,SH,SF,IBB,Pos,Awards,Year
0,Jarren Duran*,27,BOS,AL,8.7,160,735,671,111,191,48,14,21,75,34,7,54,160,.285,.342,.492,.834,129,.373,134,330,6,6,1,3,1,*87,"AS,MVP-8",2024.0
1,Shohei Ohtani*,29,LAD,NL,9.2,159,731,636,134,197,38,7,54,130,59,4,81,162,.310,.390,.646,1.036,190,.449,190,411,7,6,0,5,10,*D,"AS,MVP-1,SS",2024.0
2,Gunnar Henderson*,23,BAL,AL,9.1,159,719,630,118,177,31,7,37,92,21,4,78,159,.281,.364,.529,.893,159,.385,157,333,2,7,0,4,1,*6/D,"AS,MVP-4",2024.0
3,Marcus Semien,33,TEX,AL,4.1,159,718,650,101,154,27,2,23,74,8,3,64,105,.237,.308,.391,.699,100,.310,100,254,9,3,0,1,2,*4,AS,2024.0
4,Juan Soto*,25,NYY,AL,7.9,157,713,576,128,166,31,4,41,109,7,4,129,119,.288,.419,.569,.989,178,.424,179,328,10,4,0,4,2,*9/7DH,"AS,MVP-3,SS",2024.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6671,Kyle Wright,24,ATL,NL,2,0,0,0,0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,/1,,2020,,,,
6672,Ryan Yarbrough,28,TBR,"AL,WS",5,0,0,0,0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,/1,,2020,,,,
6673,Eric Yardley,29,MIL,NL,1,0,0,0,0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,/1,,2020,,,,
6674,Huascar Ynoa,22,ATL,NL,1,0,0,0,0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,/1,,2020,,,,


In [6]:
batter_name_cleaning_list = batter_stats_df.Player.to_list()
cleaned_batter_name_list = []

for item in batter_name_cleaning_list:
    cleaned_batter_name_list.append(re.sub(r'[^A-Za-z0-9 ]+', '', item))

batter_stats_df['Player'] = cleaned_batter_name_list
batter_stats_df

Unnamed: 0,Player,Age,Team,Lg,WAR,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,rOBA,Rbat+,TB,GIDP,HBP,SH,SF,IBB,Pos,Awards,Year
0,Jarren Duran,27,BOS,AL,8.7,160,735,671,111,191,48,14,21,75,34,7,54,160,.285,.342,.492,.834,129,.373,134,330,6,6,1,3,1,*87,"AS,MVP-8",2024.0
1,Shohei Ohtani,29,LAD,NL,9.2,159,731,636,134,197,38,7,54,130,59,4,81,162,.310,.390,.646,1.036,190,.449,190,411,7,6,0,5,10,*D,"AS,MVP-1,SS",2024.0
2,Gunnar Henderson,23,BAL,AL,9.1,159,719,630,118,177,31,7,37,92,21,4,78,159,.281,.364,.529,.893,159,.385,157,333,2,7,0,4,1,*6/D,"AS,MVP-4",2024.0
3,Marcus Semien,33,TEX,AL,4.1,159,718,650,101,154,27,2,23,74,8,3,64,105,.237,.308,.391,.699,100,.310,100,254,9,3,0,1,2,*4,AS,2024.0
4,Juan Soto,25,NYY,AL,7.9,157,713,576,128,166,31,4,41,109,7,4,129,119,.288,.419,.569,.989,178,.424,179,328,10,4,0,4,2,*9/7DH,"AS,MVP-3,SS",2024.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6671,Kyle Wright,24,ATL,NL,2,0,0,0,0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,/1,,2020,,,,
6672,Ryan Yarbrough,28,TBR,"AL,WS",5,0,0,0,0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,/1,,2020,,,,
6673,Eric Yardley,29,MIL,NL,1,0,0,0,0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,/1,,2020,,,,
6674,Huascar Ynoa,22,ATL,NL,1,0,0,0,0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,/1,,2020,,,,


In [7]:
# Create a list to help create a dataframe from batter statistics data
fielding_stats = []

# Create a loop to create a dataframe from Baseball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-fielding.shtml'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has batter statistics
    table = soup.select_one('#all_players_standard_fielding')#.find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')


    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        fielding_stats.append(tds)

# Create dataframe for batter statistics
fielding_stats_df = pd.DataFrame(fielding_stats)

# Create an empty list to store dataframe header information
fielding_header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    fielding_header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
fielding_df_headers = fielding_header_list[1]
# Remove the first item from our headers list, it is the index header that we do not need
fielding_df_headers.remove('Rk')
fielding_df_headers.append("Year")

# Set column headers equal to our list
fielding_stats_df.columns = fielding_df_headers

final_fielding_stats_df = fielding_stats_df[['Player','PO','A','E','Year','Team']]

position_players_df = pd.merge(final_fielding_stats_df, batter_stats_df, on=['Player','Year','Team'])
position_players_df


Unnamed: 0,Player,PO,A,E,Year,Team,Age,Lg,WAR,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,rOBA,Rbat+,TB,GIDP,HBP,SH,SF,IBB,Pos,Awards
0,Matt Olson,1259,105,6,2024.0,ATL,30,NL,3.8,162,685,600,78,148,37,1,29,98,0,0,71,170,.247,.333,.457,.790,118,.342,117,274,15,9,0,5,5,*3,
1,Jarren Duran,356,12,6,2024.0,BOS,27,AL,8.7,160,735,671,111,191,48,14,21,75,34,7,54,160,.285,.342,.492,.834,129,.373,134,330,6,6,1,3,1,*87,"AS,MVP-8"
2,Anthony Volpe,210,344,16,2024.0,NYY,23,AL,3.4,160,689,637,90,155,27,7,12,60,28,7,42,156,.243,.293,.364,.657,86,.296,84,232,9,4,2,3,1,*6/H,
3,Pete Alonso,1179,99,7,2024.0,NYM,29,NL,2.6,162,695,608,91,146,31,0,34,88,3,0,70,172,.240,.329,.459,.788,123,.341,122,279,14,13,0,4,4,*3/DH,AS
4,Willy Adames,204,323,20,2024.0,MIL,28,NL,3.1,161,688,610,93,153,33,0,32,112,21,4,74,173,.251,.331,.462,.794,118,.348,117,282,12,1,0,3,3,*6,MVP-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6547,Andrew Chafin,0,0,0,,CHC,30,NL,1,0,0,0,0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,/1,,2020,,,
6548,Matt Foster,0,0,0,,CHW,25,AL,1,0,0,0,0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,/1,,2020,,,
6549,Kodi Whitley,0,0,0,,STL,25,NL,1,0,0,0,0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,/1,,2020,,,
6550,Amir Garrett,0,0,0,,CIN,28,NL,1,0,0,0,0,0,0,0,0,0,0,0,0,,,,,0,0,0,0,0,0,/1,,2020,,,


In [8]:
# Change types of columns to numeric for columns with number values
position_players_df[['Age','G','R','H','2B','3B','HR','RBI','SB','TB','BB','SO','PO','A','E','PA','OPS','OPS+']] = position_players_df[['Age','G', 'R','H','2B','3B','HR','RBI','SB','TB','BB','SO','PO','A','E','PA','OPS','OPS+']].apply(pd.to_numeric)

# Drop any players with 0 plate appearances to remove null values and change PA type to integer
position_players_df.dropna(subset=['PA'], axis = 0 , inplace= True)

# Remove any players with fewer than 100 plate appearances
filtered_position_players_df = position_players_df[position_players_df['PA'] >= 100]

# Select the columns we want for our batter analysis
final_position_players_df = filtered_position_players_df[['Year','Player','Team','Age','G', 'R','H','2B','3B','HR','RBI','SB','TB','BB','SO','PO','A','E','PA','OPS','OPS+','Pos']]

# Sort by index to prepare to drop duplicates
final_position_players_df = final_position_players_df.sort_index()

# Drop duplicate entries of Player Name and Year
final_position_players_df['1B'] = ''

for index, row in final_position_players_df.iterrows():
    final_position_players_df['1B'] = (final_position_players_df['H'] - (final_position_players_df['2B'] + final_position_players_df['3B'] + final_position_players_df['HR']))

final_position_players_df.columns


Index(['Year', 'Player', 'Team', 'Age', 'G', 'R', 'H', '2B', '3B', 'HR', 'RBI',
       'SB', 'TB', 'BB', 'SO', 'PO', 'A', 'E', 'PA', 'OPS', 'OPS+', 'Pos',
       '1B'],
      dtype='object')

In [9]:
final_position_players_df['FPTS'] = ''
final_position_players_df['AVG_FPTS'] = ''

for index, row in final_position_players_df.iterrows():
    final_position_players_df['FPTS'] = (final_position_players_df['1B'] + (2*final_position_players_df['2B']) + (3*final_position_players_df['3B']) + (4*final_position_players_df['HR']) + final_position_players_df['TB'] + final_position_players_df['BB'] + final_position_players_df['R'] + (2*final_position_players_df['RBI']) + (2*final_position_players_df['SB']) + final_position_players_df['PO'] + final_position_players_df['A'] - (2*final_position_players_df['SO']) - final_position_players_df['E'])
    final_position_players_df['AVG_FPTS'] = (final_position_players_df['FPTS']/final_position_players_df['G'])

final_position_players_df
    

Unnamed: 0,Year,Player,Team,Age,G,R,H,2B,3B,HR,RBI,SB,TB,BB,SO,PO,A,E,PA,OPS,OPS+,Pos,1B,FPTS,AVG_FPTS
0,2024.0,Matt Olson,ATL,30.0,162.0,78.0,148.0,37.0,1.0,29.0,98.0,0.0,274.0,71.0,170.0,1259.0,105.0,6.0,685.0,0.790,118.0,*3,81.0,1911.0,11.796296
1,2024.0,Jarren Duran,BOS,27.0,160.0,111.0,191.0,48.0,14.0,21.0,75.0,34.0,330.0,54.0,160.0,356.0,12.0,6.0,735.0,0.834,129.0,*87,108.0,1085.0,6.781250
2,2024.0,Anthony Volpe,NYY,23.0,160.0,90.0,155.0,27.0,7.0,12.0,60.0,28.0,232.0,42.0,156.0,210.0,344.0,16.0,689.0,0.657,86.0,*6/H,109.0,998.0,6.237500
3,2024.0,Pete Alonso,NYM,29.0,162.0,91.0,146.0,31.0,0.0,34.0,88.0,3.0,279.0,70.0,172.0,1179.0,99.0,7.0,695.0,0.788,123.0,*3/DH,81.0,1828.0,11.283951
4,2024.0,Willy Adames,MIL,28.0,161.0,93.0,153.0,33.0,0.0,32.0,112.0,21.0,282.0,74.0,173.0,204.0,323.0,20.0,688.0,0.794,118.0,*6,88.0,1158.0,7.192547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6224,2020.0,Willie Calhoun,TEX,25.0,29.0,3.0,19.0,2.0,1.0,1.0,13.0,0.0,26.0,5.0,17.0,11.0,0.0,0.0,108.0,0.491,36.0,D/7H,15.0,63.0,2.172414
6225,2020.0,Howie Kendrick,WSN,36.0,25.0,11.0,25.0,4.0,0.0,2.0,14.0,0.0,35.0,7.0,17.0,28.0,4.0,1.0,100.0,0.705,91.0,D/3,19.0,113.0,4.520000
6330,2020.0,Matt Kemp,COL,35.0,43.0,18.0,28.0,3.0,0.0,6.0,21.0,1.0,49.0,15.0,41.0,2.0,0.0,0.0,132.0,0.745,89.0,DH/7,19.0,95.0,2.209302
6331,2020.0,Franmil Reyes,CLE,24.0,59.0,27.0,58.0,10.0,0.0,9.0,34.0,0.0,95.0,24.0,69.0,0.0,0.0,0.0,241.0,0.795,116.0,*D/7H,39.0,171.0,2.898305


In [10]:
final_position_players_df

Unnamed: 0,Year,Player,Team,Age,G,R,H,2B,3B,HR,RBI,SB,TB,BB,SO,PO,A,E,PA,OPS,OPS+,Pos,1B,FPTS,AVG_FPTS
0,2024.0,Matt Olson,ATL,30.0,162.0,78.0,148.0,37.0,1.0,29.0,98.0,0.0,274.0,71.0,170.0,1259.0,105.0,6.0,685.0,0.790,118.0,*3,81.0,1911.0,11.796296
1,2024.0,Jarren Duran,BOS,27.0,160.0,111.0,191.0,48.0,14.0,21.0,75.0,34.0,330.0,54.0,160.0,356.0,12.0,6.0,735.0,0.834,129.0,*87,108.0,1085.0,6.781250
2,2024.0,Anthony Volpe,NYY,23.0,160.0,90.0,155.0,27.0,7.0,12.0,60.0,28.0,232.0,42.0,156.0,210.0,344.0,16.0,689.0,0.657,86.0,*6/H,109.0,998.0,6.237500
3,2024.0,Pete Alonso,NYM,29.0,162.0,91.0,146.0,31.0,0.0,34.0,88.0,3.0,279.0,70.0,172.0,1179.0,99.0,7.0,695.0,0.788,123.0,*3/DH,81.0,1828.0,11.283951
4,2024.0,Willy Adames,MIL,28.0,161.0,93.0,153.0,33.0,0.0,32.0,112.0,21.0,282.0,74.0,173.0,204.0,323.0,20.0,688.0,0.794,118.0,*6,88.0,1158.0,7.192547
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6224,2020.0,Willie Calhoun,TEX,25.0,29.0,3.0,19.0,2.0,1.0,1.0,13.0,0.0,26.0,5.0,17.0,11.0,0.0,0.0,108.0,0.491,36.0,D/7H,15.0,63.0,2.172414
6225,2020.0,Howie Kendrick,WSN,36.0,25.0,11.0,25.0,4.0,0.0,2.0,14.0,0.0,35.0,7.0,17.0,28.0,4.0,1.0,100.0,0.705,91.0,D/3,19.0,113.0,4.520000
6330,2020.0,Matt Kemp,COL,35.0,43.0,18.0,28.0,3.0,0.0,6.0,21.0,1.0,49.0,15.0,41.0,2.0,0.0,0.0,132.0,0.745,89.0,DH/7,19.0,95.0,2.209302
6331,2020.0,Franmil Reyes,CLE,24.0,59.0,27.0,58.0,10.0,0.0,9.0,34.0,0.0,95.0,24.0,69.0,0.0,0.0,0.0,241.0,0.795,116.0,*D/7H,39.0,171.0,2.898305


In [11]:
# Sort data by name alphabetically, then by year in descending order
final_position_players_df = final_position_players_df.sort_values(['Year','Player'], ascending=[True, True])

# Eliminate Baseball Reference's name badges for accolades
final_position_players_df['Player'] = final_position_players_df['Player'].str.extract('([^\*|#]*)')

cleaned_player_list = []
for player in final_position_players_df['Player']:
    player = player.replace("\xa0", " ")
    cleaned_player_list.append(player)

final_position_players_df['Player'] = cleaned_player_list 
final_position_players_df = final_position_players_df.sort_values(['FPTS'], ascending=False)

final_position_players_df

Unnamed: 0,Year,Player,Team,Age,G,R,H,2B,3B,HR,RBI,SB,TB,BB,SO,PO,A,E,PA,OPS,OPS+,Pos,1B,FPTS,AVG_FPTS
1763,2023.0,Matt Olson,ATL,29.0,162.0,127.0,172.0,27.0,3.0,54.0,139.0,1.0,367.0,104.0,167.0,1194.0,96.0,8.0,720.0,0.993,164.0,*3,88.0,2193.0,13.537037
1768,2023.0,Freddie Freeman,LAD,33.0,161.0,131.0,211.0,59.0,2.0,29.0,102.0,23.0,361.0,72.0,121.0,1131.0,128.0,1.0,730.0,0.976,163.0,*3,121.0,2191.0,13.608696
4078,2021.0,Freddie Freeman,ATL,31.0,159.0,120.0,180.0,25.0,2.0,31.0,83.0,8.0,302.0,85.0,107.0,1252.0,101.0,3.0,695.0,0.896,136.0,*3,122.0,2127.0,13.377358
2919,2022.0,Freddie Freeman,LAD,32.0,159.0,117.0,199.0,47.0,2.0,21.0,100.0,13.0,313.0,84.0,102.0,1155.0,91.0,5.0,708.0,0.918,156.0,*3,129.0,2090.0,13.144654
4081,2021.0,Matt Olson,OAK,27.0,156.0,101.0,153.0,35.0,0.0,39.0,111.0,4.0,305.0,88.0,113.0,1156.0,73.0,6.0,673.0,0.911,153.0,*3/D,79.0,2026.0,12.987179
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3632,2022.0,Seth Beer,ARI,25.0,38.0,4.0,21.0,3.0,0.0,1.0,9.0,0.0,27.0,11.0,31.0,38.0,0.0,1.0,126.0,0.521,50.0,D/3H,17.0,62.0,1.631579
5339,2021.0,Khris Davis,2TM,33.0,42.0,11.0,21.0,5.0,1.0,3.0,10.0,0.0,37.0,10.0,31.0,4.0,0.0,0.0,114.0,0.635,75.0,DH/7,12.0,57.0,1.357143
5947,2020.0,Jo Adell,LAA,21.0,38.0,9.0,20.0,4.0,0.0,3.0,7.0,0.0,33.0,7.0,55.0,72.0,2.0,3.0,132.0,0.478,30.0,9/8H,13.0,57.0,1.500000
3382,2022.0,Mike Ford,2TM,29.0,44.0,9.0,26.0,5.0,0.0,3.0,8.0,0.0,40.0,16.0,38.0,11.0,0.0,0.0,137.0,0.647,86.0,3D/H,18.0,56.0,1.272727


In [12]:
# Create a new dataframe for stats percentile calculations
percentile_df = pd.DataFrame(columns = ['Year', 'Player', 'Age','Pos', 'Team', 'FPTS_Percentile', 'AVG_FPTS_Percentile'])

# Carry over columnns from final_batter_stats_df that shouldn't be comparatively ranked 
percentile_df['Year'] = final_position_players_df['Year']
percentile_df['Player'] = final_position_players_df['Player']
percentile_df['Age'] = final_position_players_df['Age']
percentile_df['Team'] = final_position_players_df['Team']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
for year in last_five_years:
    year_df = final_position_players_df.loc[final_position_players_df['Year'] == year]
    year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
    year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)
    year_df.sort_values('Player', ascending=True)

    # Each of the seasons are added back to the percentile dataframe
    percentile_df = percentile_df.append(year_df, ignore_index=True)

percentile_df = percentile_df.sort_values(['Year','Player'], ascending=[True, True])
percentile_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)


Unnamed: 0,Year,Player,Age,Pos,Team,FPTS_Percentile,AVG_FPTS_Percentile,G,R,H,2B,3B,HR,RBI,SB,TB,BB,SO,PO,A,E,PA,OPS,OPS+,1B,FPTS,AVG_FPTS
1383,2020.0,AJ Pollock,32.0,,LAD,,,,,,,,,,,,,,,,,,,,,,
3956,2020.0,AJ Pollock,32.0,*78D/H,LAD,0.581132,0.449057,55.0,30.0,54.0,9.0,0.0,16.0,34.0,2.0,111.0,12.0,45.0,60.0,0.0,1.0,210.0,0.881,132.0,29.0,305.0,5.545455
1450,2020.0,Aaron Hicks,30.0,,NYY,,,,,,,,,,,,,,,,,,,,,,
3972,2020.0,Aaron Hicks,30.0,*8/HD,NYY,0.520755,0.384906,54.0,28.0,38.0,10.0,2.0,6.0,21.0,4.0,70.0,41.0,38.0,97.0,3.0,2.0,211.0,0.793,122.0,20.0,281.0,5.203704
1811,2020.0,Aaron Judge,28.0,,NYY,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2319,2024.0,Zach McKinstry,29.0,65H479/1D,DET,0.4,0.195455,118.0,32.0,64.0,14.0,5.0,4.0,23.0,16.0,100.0,24.0,69.0,84.0,142.0,9.0,325.0,0.614,74.0,41.0,413.0,3.500000
159,2024.0,Zach Neto,23.0,,LAA,,,,,,,,,,,,,,,,,,,,,,
2098,2024.0,Zach Neto,23.0,*6/H,LAA,0.902273,0.770455,155.0,70.0,135.0,34.0,1.0,23.0,77.0,30.0,240.0,39.0,140.0,231.0,405.0,18.0,602.0,0.761,113.0,77.0,1141.0,7.361290
404,2024.0,Zack Gelof,24.0,,OAK,,,,,,,,,,,,,,,,,,,,,,


In [13]:
# Clean up the percentile dataframe, drop NaNs and remove unnecessary columns
percentile_df = percentile_df.dropna()

# Add a rank column that adds the percentiles from each category
percentile_df['Rank'] = (percentile_df['FPTS_Percentile'] + percentile_df['AVG_FPTS_Percentile'])

percentile_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  percentile_df['Rank'] = (percentile_df['FPTS_Percentile'] + percentile_df['AVG_FPTS_Percentile'])


Unnamed: 0,Year,Player,Age,Pos,Team,FPTS_Percentile,AVG_FPTS_Percentile,G,R,H,2B,3B,HR,RBI,SB,TB,BB,SO,PO,A,E,PA,OPS,OPS+,1B,FPTS,AVG_FPTS,Rank
3956,2020.0,AJ Pollock,32.0,*78D/H,LAD,0.581132,0.449057,55.0,30.0,54.0,9.0,0.0,16.0,34.0,2.0,111.0,12.0,45.0,60.0,0.0,1.0,210.0,0.881,132.0,29.0,305.0,5.545455,1.030189
3972,2020.0,Aaron Hicks,30.0,*8/HD,NYY,0.520755,0.384906,54.0,28.0,38.0,10.0,2.0,6.0,21.0,4.0,70.0,41.0,38.0,97.0,3.0,2.0,211.0,0.793,122.0,20.0,281.0,5.203704,0.90566
4051,2020.0,Aaron Judge,28.0,9/DH,NYY,0.224528,0.554717,28.0,23.0,26.0,3.0,0.0,9.0,22.0,0.0,56.0,10.0,32.0,45.0,1.0,0.0,114.0,0.891,143.0,14.0,171.0,6.107143,0.779245
3966,2020.0,Adam Duvall,31.0,*79/H8D,ATL,0.545283,0.354717,57.0,34.0,45.0,8.0,0.0,16.0,33.0,0.0,101.0,15.0,54.0,79.0,1.0,2.0,209.0,0.833,114.0,21.0,287.0,5.035088,0.9
4022,2020.0,Adam Eaton,31.0,*9/H,WSN,0.332075,0.377358,41.0,22.0,36.0,11.0,1.0,4.0,17.0,3.0,61.0,12.0,32.0,81.0,0.0,1.0,176.0,0.669,79.0,20.0,212.0,5.170732,0.709434
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2172,2024.0,Yasmani Grandal,35.0,2/H3,PIT,0.735227,0.943182,72.0,26.0,49.0,10.0,0.0,9.0,27.0,1.0,86.0,24.0,46.0,572.0,10.0,7.0,243.0,0.704,95.0,30.0,761.0,10.569444,1.678409
2152,2024.0,Yordan Alvarez,27.0,D7,HOU,0.778409,0.615909,147.0,88.0,170.0,34.0,2.0,35.0,86.0,6.0,313.0,69.0,95.0,71.0,3.0,1.0,635.0,0.959,172.0,99.0,850.0,5.782313,1.394318
2319,2024.0,Zach McKinstry,29.0,65H479/1D,DET,0.4,0.195455,118.0,32.0,64.0,14.0,5.0,4.0,23.0,16.0,100.0,24.0,69.0,84.0,142.0,9.0,325.0,0.614,74.0,41.0,413.0,3.500000,0.595455
2098,2024.0,Zach Neto,23.0,*6/H,LAA,0.902273,0.770455,155.0,70.0,135.0,34.0,1.0,23.0,77.0,30.0,240.0,39.0,140.0,231.0,405.0,18.0,602.0,0.761,113.0,77.0,1141.0,7.361290,1.672727


In [14]:
# Create a list of each unique player we have in our dataframe
player_list = percentile_df.Player.unique().tolist()

# Create a new dataframe for combined, averaged percentiles over the past 5 seasons
new_df = pd.DataFrame(columns = ['Player', 'Rank', 'Trend', 'Pos', 'Years', 'FPTS', 'AVG_FPTS', 'FPTS_Percentile', 'AVG_FPTS_Percentile'])

# Create a list for each percentile stat category for upcoming loop
player_trends = []
average_FPTS = []
average_AVG_FPTS = []
average_FPTS_Percentile = []
average_AVG_FPTS_Percentile = []
average_Rank = []
year_count = []
pos = []

# Loop through each player, check if they played in the past two seasons. If not, remove them
for player in player_list:
    filter_df = percentile_df.loc[percentile_df['Player'] == player]
    filter_df = filter_df.sort_values(['Year'], ascending=[False])
    year_list = filter_df.Year.tolist()
    if (year_list[0] != last_year) and (year_list[0] != (last_year - 1)):
        player_list.remove(player)

# Update new dataframe with updated unique player list
new_df['Player'] = player_list        

# Loop through each player, locate their percentile stats for each season, average them out
for player in player_list:
    player_df = percentile_df.loc[percentile_df['Player'] == player]
    
    # We want to find the slope of the line of best fit for each player's overall ranking each season
    x = np.array(player_df['Year'], dtype = float)
    y = np.array(player_df['Rank'], dtype = float)
    slope, intercept = np.polyfit(x, y, 1)
    player_trends.append(slope)
    
    # Find average of each player's percentiles from previous 5 seasons
    average_FPTS.append(sum(player_df['FPTS']) / len(player_df['FPTS']))
    average_AVG_FPTS.append(sum(player_df['AVG_FPTS']) / len(player_df['AVG_FPTS']))
    average_FPTS_Percentile.append(sum(player_df['FPTS_Percentile']) / len(player_df['FPTS_Percentile']))
    average_AVG_FPTS_Percentile.append(sum(player_df['AVG_FPTS_Percentile']) / len(player_df['AVG_FPTS_Percentile']))
    average_Rank.append(sum(player_df['Rank']) / len(player_df['Rank']))
    year_count.append(len(x))
    
    # Keep player positions for reference purposes during the draft
    pos.append(player_df['Pos'].unique())

# Update new dataframe with the list data from each stat
new_df['Pos'] = pos
new_df['Trend'] = player_trends
new_df['FPTS'] = average_FPTS
new_df['AVG_FPTS'] = average_AVG_FPTS
new_df['FPTS_Percentile'] = average_FPTS_Percentile
new_df['AVG_FPTS_Percentile'] = average_AVG_FPTS_Percentile
new_df['Rank'] = average_Rank

# Keep track of how many seasons are being considered, so we know how reliable the data is
new_df['Years'] = year_count



  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

In [15]:
# Create a weighted rank column by adding trend data to the rank data and account for number of seasons played
# Basically, if you played all 5 seasons, your trend stat is added directly
# If you played fewer than all 5 seasons, your trend stat is reduced depending on how few seasons you played
# new_df['Weighted Rank'] = (new_df['Rank'] + ((new_df['Trend'] * (new_df['Years'] - 1) / 4)))

# # shift column 'Weighted Rank' to first position
# first_column = new_df.pop('Weighted Rank')
  
# # insert column using insert(position,column_name,first_column) function
# new_df.insert(1, 'Weighted Rank', first_column)

new_df = new_df.sort_values('FPTS', ascending = False)

In [16]:
# separate position column into a list for editing
new_pos_list = []
pos_list = new_df['Pos'].tolist()

# loop through list and pull only the last item, which represents player position listed from most recent season
for i in pos_list:
    j = i[-1]
    
    if re.search('/', j):
        k = re.sub("([^\/]+$)","",j)
        new_pos_list.append(k)
    else:
        new_pos_list.append(j)

cleaned_list = []
for pos in new_pos_list:
    placeholder = re.findall("[a-zA-Z0-9]+", pos)
    placeholder_2 = ''.join(placeholder)
    placeholder_3 = [d for d in placeholder_2]
    cleaned_list.append(placeholder_3)

cleaned_pos_list = []
for n_list in cleaned_list:
    
    placeholder_list = []
    for pos in n_list:
        if pos == '1':
            placeholder_list.append('P')
        elif pos == '2':
            placeholder_list.append('C')
        elif pos == '3':
            placeholder_list.append('1B')
        elif pos == '4':
            placeholder_list.append('2B')
        elif pos == '5':
            placeholder_list.append('3B')
        elif pos == '6':
            placeholder_list.append('SS')
        elif pos == ('7'):
            placeholder_list.append('OF')
        elif pos == ('8'):
            placeholder_list.append('OF')
        elif pos == ('9'):
            placeholder_list.append('OF')
        elif pos == ('D'):
            placeholder_list.append('DH')
        
    cleaned_pos_list.append(placeholder_list)        

temp_pos_list = []
for item in cleaned_pos_list:
    new_string = []
    for pos in item:
        string = str(pos)
        new_string = f'{new_string},{string}'
    temp_pos_list.append(new_string)

    
final_pos_list = []
for i in temp_pos_list:
    i = i.replace('[],', '')
    final_pos_list.append(i)
    
# replace old position column with new position column
new_df.drop('Pos', axis = 1, inplace = True)
new_df['Pos'] = final_pos_list

new_df.head(25)

Unnamed: 0,Player,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos
63,Freddie Freeman,1.99774,-0.001374,5,1815.2,13.360563,0.998632,0.999108,1B
129,Matt Olson,1.967747,0.024647,5,1731.8,12.070769,0.994159,0.973588,1B
263,Nathaniel Lowe,1.954128,-0.006737,4,1721.25,11.177006,0.985975,0.968153,1B
150,Paul Goldschmidt,1.964666,0.008719,5,1603.0,11.711919,0.989434,0.975232,"1B,DH"
151,Pete Alonso,1.926627,0.04781,5,1520.8,10.765601,0.977923,0.948704,1B
500,Michael Busch,1.846591,0.000456,1,1396.0,9.184211,0.955682,0.890909,1B
299,Adley Rutschman,1.882705,0.004028,3,1386.0,9.994343,0.952495,0.93021,"C,DH"
36,Christian Walker,1.904026,0.048555,5,1360.6,10.687374,0.95864,0.945386,1B
182,Will Smith,1.95092,0.014888,5,1339.4,12.071568,0.961179,0.98974,C
336,Keibert Ruiz,1.907921,0.008198,3,1322.333333,10.566225,0.9514,0.95652,"C,DH"


In [17]:
############################################################################################
############################################################################################
############################################################################################

#    However, for example, if your league is set to have a Games Started limit of 12 and you have 10 pitchers at the 
#    completion of Saturday's games and start 4 pitchers on Sunday, you will receive stats for all 14 pitchers. 

#    (Note: This can happen on any day during the week. If managers have 10 pitchers by end of Wednesday and 
#    starts 4 on Thursday, they will receive points for the 4 pitchers on Thursday but for Friday, Saturday and 
#    Sunday, they will not receive any starting pitchers points.)

############################################################################################
############################################################################################
############################################################################################


In [18]:
# Create a list to help create a dataframe from batter statistics data
pitcher_stats = []

for year in last_five_years:

    # input URL and use BeautifulSoup to parse through the page
    pitching_url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-pitching.shtml'
    pitching_soup = BeautifulSoup(requests.get(pitching_url).content, 'html.parser')

    # Grab the table element that has batter statistics
    pitching_table = pitching_soup.select_one('#all_players_standard_pitching')#.find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')

    # Grab data from table and put it into the list created above
    for tr in pitching_table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        pitcher_stats.append(tds)
        

In [19]:
# Create dataframe for batter statistics
raw_pitcher_stats_df = pd.DataFrame(pitcher_stats)

# Create an empty list to store dataframe header information
pitcher_header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in pitching_table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    pitcher_header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
pitcher_df_headers = pitcher_header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
pitcher_df_headers.remove('Rk')
pitcher_df_headers.append("Year")

# Set column headers equal to our list
raw_pitcher_stats_df.columns = pitcher_df_headers



In [20]:
# raw_pitcher_stats_df['Name'] 
raw_pitcher_stats_df = raw_pitcher_stats_df.rename(columns={"Player": "Name"})


In [21]:
raw_pitcher_stats_df

Unnamed: 0,Name,Age,Team,Lg,WAR,W,L,W-L%,ERA,G,GS,GF,CG,SHO,SV,IP,H,R,ER,HR,BB,IBB,SO,HBP,BK,WP,BF,ERA+,FIP,WHIP,H9,HR9,BB9,SO9,SO/BB,Awards,Year
0,Logan Gilbert,27,SEA,AL,2.8,9,12,.429,3.23,33,33,0,1,0,0,208.2,148,83,75,26,37,1,220,4,0,11,803,113,3.27,0.887,6.4,1.1,1.6,9.5,5.95,"AS,CYA-6",2024.0
1,Seth Lugo,34,KCR,AL,5.3,16,9,.640,3.00,33,33,0,1,0,0,206.2,177,75,69,16,48,1,181,9,0,4,836,141,3.25,1.089,7.7,0.7,2.1,7.9,3.77,"AS,CYA-2,MVP-15,GG",2024.0
2,Logan Webb,27,SFG,NL,3.7,13,10,.565,3.47,33,33,0,1,1,0,204.2,202,83,79,11,50,3,172,2,1,4,841,111,2.95,1.231,8.9,0.5,2.2,7.6,3.44,"AS,CYA-6",2024.0
3,Zack Wheeler,34,PHI,NL,6.1,16,7,.696,2.57,32,32,0,0,0,0,200.0,139,62,57,20,52,0,224,8,0,8,787,158,3.13,0.955,6.3,0.9,2.3,10.1,4.31,"AS,CYA-2,MVP-12",2024.0
4,Aaron Nola,31,PHI,NL,3.6,14,8,.636,3.57,33,33,0,1,1,0,199.1,189,84,79,30,50,0,197,3,0,2,820,114,3.94,1.199,8.5,1.4,2.3,8.9,3.94,CYA-11,2024.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5902,Amir Garrett*,28,CIN,NL,0,0,,,1,0,1,0,0,0,0.0,1,0,0,0,0,0,0,0,0,0,1,,,,,,,,,2020,,
5903,James Karinchak,24,CLE,AL,0,0,,,1,0,0,0,0,0,0.0,1,1,1,1,2,0,0,0,0,0,3,,,,,,,0.00,,2020,,
5904,Carlos Rodón*,27,CHW,AL,0,0,,,1,0,0,0,0,0,0.0,1,2,2,0,2,1,0,0,0,0,3,,,,,,,0.00,,2020,,
5905,Jordan Weems,27,OAK,AL,0,0,,,1,0,0,0,0,0,0.0,2,2,2,0,1,0,0,0,0,0,3,,,,,,,0.00,,2020,,


In [22]:
# Create a list to help create a dataframe from batter statistics data
reliever_stats = []

# Create a loop to create a dataframe from Baseball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-reliever-pitching.shtml'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has batter statistics
    table = BeautifulSoup(soup.select_one('#all_players_reliever_pitching').find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')


    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        reliever_stats.append(tds)

# Create dataframe for batter statistics
reliever_stats_df = pd.DataFrame(reliever_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')
df_headers.append("Year")

# Set column headers equal to our list
reliever_stats_df.columns = df_headers


final_reliever_stats_df = reliever_stats_df[['Name','Hold','Year','Tm']]
final_reliever_stats_df = final_reliever_stats_df.rename(columns={"Tm": "Team"})

final_pitcher_stats_df = pd.merge(final_reliever_stats_df, raw_pitcher_stats_df, how = 'outer', on=['Name','Year','Team'])

final_pitcher_stats_df



Unnamed: 0,Name,Hold,Year,Team,Age,Lg,WAR,W,L,W-L%,ERA,G,GS,GF,CG,SHO,SV,IP,H,R,ER,HR,BB,IBB,SO,HBP,BK,WP,BF,ERA+,FIP,WHIP,H9,HR9,BB9,SO9,SO/BB,Awards
0,Bryan Abreu,38,2024.0,HOU,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Jason Adam,31,2024.0,TOT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Jason Adam,19,2024.0,TBR,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Jason Adam,12,2024.0,SDP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Austin Adams,22,2024.0,OAK,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10209,Matt Foster,,,CHW,25,AL,0,0,,0.00,1,0,0,0,0,0,0.1,0,0,0,0,2,0,0,0,0,0,3,21.19,6.000,0.0,0.0,54.0,0.0,0.00,,2020,
10210,Óliver Pérez*,,,CLE,38,AL,0,0,,0.00,1,0,1,0,0,0,0.1,1,0,0,0,0,0,0,0,0,0,2,3.19,3.000,27.0,0.0,0.0,0.0,,,2020,
10211,Kodi Whitley,,,STL,25,NL,0,0,,27.00,1,0,1,0,0,0,0.1,1,1,1,1,0,0,0,0,0,0,2,42.19,3.000,27.0,27.0,0.0,0.0,,,2020,
10212,Amir Garrett*,,,CIN,28,NL,0,0,,,1,0,1,0,0,0,0.0,1,0,0,0,0,0,0,0,0,0,1,,,,,,,,,2020,


In [23]:
final_pitcher_stats_df.columns

Index(['Name', 'Hold', 'Year', 'Team', 'Age', 'Lg', 'WAR', 'W', 'L', 'W-L%',
       'ERA', 'G', 'GS', 'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'HR',
       'BB', 'IBB', 'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP', 'WHIP', 'H9',
       'HR9', 'BB9', 'SO9', 'SO/BB', 'Awards'],
      dtype='object')

In [24]:
# Change types of columns to numeric for columns with number values
final_pitcher_stats_df[['Age','G','GS','IP','ER','W','L','SV','SO','H','BB','CG','Hold']] = final_pitcher_stats_df[['Age','G','GS','IP','ER','W','L','SV','SO','H','BB','CG','Hold']].apply(pd.to_numeric)

# Drop any players with NaN innings pitched, ERA, and WHIP to remove null values 
final_pitcher_stats_df["Hold"].fillna(0, inplace = True)
final_pitcher_stats_df.dropna(subset=['IP'], axis = 0 , inplace= True)
final_pitcher_stats_df.dropna(subset=['ERA'], axis = 0 , inplace= True)
final_pitcher_stats_df.dropna(subset=['WHIP'], axis = 0 , inplace= True)
final_pitcher_stats_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Remove any pitchers with fewer than 30 innings pitched
final_pitcher_stats_df = final_pitcher_stats_df[final_pitcher_stats_df['IP'] >= 30]

# Select the columns we want for our pitcher analysis
final_pitcher_stats_df = final_pitcher_stats_df[['Year','Name','Age','G','GS','IP','ER','W','L','SV','SO','H','BB','CG','Hold']]

# Eliminate Baseball Reference's name badges for accolades
final_pitcher_stats_df['Name'] = final_pitcher_stats_df['Name'].str.extract('([^\*|#]*)')

pitcher_list = final_pitcher_stats_df.Name.tolist()

cleaned_pitcher_list = []
for pitcher in pitcher_list:
    pitcher = pitcher.replace("\xa0", " ")
    cleaned_pitcher_list.append(pitcher) 
    
final_pitcher_stats_df['Name'] = cleaned_pitcher_list 




In [25]:

final_pitcher_stats_df
test_df = final_pitcher_stats_df.loc[final_pitcher_stats_df['Name'] == 'Blake Snell']
test_df


Unnamed: 0,Year,Name,Age,G,GS,IP,ER,W,L,SV,SO,H,BB,CG,Hold
4447,2024.0,Blake Snell,31.0,20.0,20.0,104.0,36.0,5.0,3.0,0.0,145.0,65.0,44.0,1.0,0.0
5660,2023.0,Blake Snell,30.0,32.0,32.0,180.0,45.0,14.0,9.0,0.0,234.0,115.0,99.0,0.0,0.0
6933,2022.0,Blake Snell,29.0,24.0,24.0,128.0,48.0,8.0,10.0,0.0,171.0,103.0,51.0,0.0,0.0
8117,2021.0,Blake Snell,28.0,27.0,27.0,128.2,60.0,7.0,6.0,0.0,170.0,101.0,69.0,0.0,0.0
9363,2020.0,Blake Snell,27.0,11.0,11.0,50.0,18.0,4.0,2.0,0.0,63.0,42.0,18.0,0.0,0.0


In [26]:
# Sort data by name alphabetically, then by year in descending order
final_pitcher_stats_df = final_pitcher_stats_df.sort_values(['Year','Name'], ascending=[True, True])



final_pitcher_stats_df['FPTS'] = ''
final_pitcher_stats_df['AVG_FPTS'] = ''
final_pitcher_stats_df['Pos'] = ''
pos_list = []
final_pos_list = []

for index, row in final_pitcher_stats_df.iterrows():
    final_pitcher_stats_df['FPTS'] = ((3*final_pitcher_stats_df['IP']) - final_pitcher_stats_df['H'] - (2*final_pitcher_stats_df['ER']) - final_pitcher_stats_df['BB'] + (2*final_pitcher_stats_df['W']) - (2*final_pitcher_stats_df['L']) + (5*final_pitcher_stats_df['SV']) + (2*final_pitcher_stats_df['SO']) + (3*final_pitcher_stats_df['CG']) + (2*final_pitcher_stats_df['Hold']))
    final_pitcher_stats_df['AVG_FPTS'] = (final_pitcher_stats_df['FPTS']/final_pitcher_stats_df['G'])
#     final_pitcher_stats_df['Pos'] = (final_pitcher_stats_df['GS']/final_pitcher_stats_df['G'])
    
# #    print(final_pitcher_stats_df['GS'] / final_pitcher_stats_df['G'])
    pitcher_ratio = (final_pitcher_stats_df['GS'] / final_pitcher_stats_df['G'])[index]
    pos_list.append(pitcher_ratio)
# #    final_pitcher_stats_df['Pos'] = pitcher_ratio

for i in range(len(pos_list)):
    if pos_list[i] > (2/3):
        final_pos_list.append('SP')
    elif pos_list[i] < (1/3):
        final_pos_list.append('RP')
    else:
        final_pos_list.append('SP,RP')
        
final_pitcher_stats_df['Pos'] = final_pos_list
        
final_pitcher_stats_df = final_pitcher_stats_df.sort_values(['FPTS'], ascending=False)
final_pitcher_stats_df.head(50)


Unnamed: 0,Year,Name,Age,G,GS,IP,ER,W,L,SV,SO,H,BB,CG,Hold,FPTS,AVG_FPTS,Pos
8021,2021.0,Zack Wheeler,31.0,32.0,32.0,213.1,66.0,14.0,10.0,0.0,247.0,169.0,46.0,3.0,0.0,803.3,25.103125,SP
6831,2022.0,Sandy Alcántara,26.0,32.0,32.0,228.2,58.0,14.0,9.0,0.0,207.0,174.0,50.0,6.0,0.0,786.6,24.58125,SP
5648,2023.0,Spencer Strider,24.0,32.0,32.0,186.2,80.0,20.0,5.0,0.0,281.0,146.0,58.0,0.0,0.0,786.6,24.58125,SP
8048,2021.0,Max Scherzer,36.0,30.0,30.0,179.1,49.0,15.0,4.0,0.0,236.0,119.0,36.0,1.0,0.0,781.3,26.043333,SP
4314,2024.0,Tarik Skubal,27.0,31.0,31.0,192.0,51.0,18.0,4.0,0.0,228.0,142.0,35.0,0.0,0.0,781.0,25.193548,SP
6834,2022.0,Corbin Burnes,27.0,33.0,33.0,202.0,66.0,12.0,8.0,0.0,243.0,144.0,51.0,0.0,0.0,773.0,23.424242,SP
5631,2023.0,Gerrit Cole,32.0,33.0,33.0,209.0,61.0,15.0,4.0,0.0,222.0,157.0,48.0,2.0,0.0,772.0,23.393939,SP
6836,2022.0,Gerrit Cole,31.0,33.0,33.0,200.2,78.0,13.0,8.0,0.0,257.0,154.0,50.0,0.0,0.0,764.6,23.169697,SP
8025,2021.0,Robbie Ray,29.0,32.0,32.0,193.1,61.0,13.0,7.0,0.0,248.0,150.0,52.0,0.0,0.0,763.3,23.853125,SP
4310,2024.0,Zack Wheeler,34.0,32.0,32.0,200.0,57.0,16.0,7.0,0.0,224.0,139.0,52.0,0.0,0.0,761.0,23.78125,SP


In [27]:
# Create a new dataframe for stats percentile calculations
pitcher_percentile_df = pd.DataFrame(columns = ['Year','Name','Age','G','GS','IP','ER','W','L','SV','SO','H','BB','CG','Hold','Pos'])

# Carry over columnns from final_pitcher_stats_df that shouldn't be comparatively ranked 
pitcher_percentile_df['Year'] = final_pitcher_stats_df['Year']
pitcher_percentile_df['Name'] = final_pitcher_stats_df['Name']
pitcher_percentile_df['Age'] = final_pitcher_stats_df['Age']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
for year in last_five_years:
    year_df = final_pitcher_stats_df.loc[final_pitcher_stats_df['Year'] == year]
    year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
    year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)
    year_df.sort_values('Name', ascending=True)    
    
    # Each of the seasons are added back to the percentile dataframe
    pitcher_percentile_df = pitcher_percentile_df.append(year_df, ignore_index=True)

pitcher_percentile_df = pitcher_percentile_df.sort_values(['Year','Name'], ascending=[True, True])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)


In [28]:
# Clean up the percentile dataframe, drop NaNs and remove unnecessary columns
pitcher_percentile_df = pitcher_percentile_df.dropna()

# Add a rank column that adds the percentiles from each category
pitcher_percentile_df['Rank'] = (pitcher_percentile_df['FPTS_Percentile'] + pitcher_percentile_df['AVG_FPTS_Percentile'])

pitcher_percentile_df


Unnamed: 0,Year,Name,Age,G,GS,IP,ER,W,L,SV,SO,H,BB,CG,Hold,Pos,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Rank
4506,2020.0,Aaron Civale,25.0,12.0,12.0,74.0,39.0,4.0,6.0,0.0,69.0,82.0,16.0,1.0,0.0,SP,183.0,15.250000,0.765432,0.669753,1.435185
4476,2020.0,Aaron Nola,27.0,12.0,12.0,71.1,26.0,5.0,5.0,0.0,96.0,54.0,23.0,2.0,0.0,SP,282.3,23.525000,0.950617,0.944444,1.895062
4498,2020.0,Adam Wainwright,38.0,10.0,10.0,65.2,23.0,5.0,3.0,0.0,54.0,54.0,15.0,2.0,0.0,SP,198.6,19.860000,0.814815,0.845679,1.660494
4587,2020.0,Adrian Houser,27.0,12.0,11.0,56.0,33.0,1.0,6.0,0.0,44.0,63.0,21.0,0.0,0.0,SP,96.0,8.000000,0.265432,0.256173,0.521605
4524,2020.0,Alec Mills,28.0,11.0,11.0,62.1,31.0,5.0,5.0,0.0,46.0,53.0,19.0,1.0,0.0,SP,147.3,13.390909,0.657407,0.561728,1.219136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2625,2024.0,Zack Kelly,29.0,49.0,3.0,56.2,25.0,6.0,3.0,0.0,61.0,44.0,27.0,0.0,0.0,RP,175.6,3.583673,0.434743,0.318015,0.752757
2385,2024.0,Zack Littell,28.0,29.0,29.0,156.1,63.0,8.0,10.0,0.0,141.0,164.0,31.0,0.0,0.0,SP,425.3,14.665517,0.875919,0.834559,1.710478
2318,2024.0,Zack Wheeler,34.0,32.0,32.0,200.0,57.0,16.0,7.0,0.0,224.0,139.0,52.0,0.0,0.0,SP,761.0,23.781250,0.998162,0.992647,1.990809
2822,2024.0,Zebby Matthews,24.0,9.0,9.0,37.2,28.0,1.0,4.0,0.0,43.0,51.0,11.0,0.0,0.0,SP,73.6,8.177778,0.071691,0.621324,0.693015


In [29]:


# test_df = raw_pitcher_stats_df.loc[raw_pitcher_stats_df['Name'] == 'Blake\xa0Snell']
# test_df

In [30]:
# Create a list of each unique player we have in our dataframe
pitcher_list = pitcher_percentile_df.Name.unique().tolist()

# Create a new dataframe for combined, averaged percentiles over the past 5 seasons
new_pitcher_df = pd.DataFrame(columns = ['Name', 'Rank', 'Trend', 'Years', 'FPTS', 'AVG_FPTS', 'FPTS_Percentile', 'AVG_FPTS_Percentile','Pos','G','GS'])

# Create a list for each percentile stat category for upcoming loop
pitcher_trends = []
average_FPTS = []
average_AVG_FPTS = []
average_FPTS_Percentile = []
average_AVG_FPTS_Percentile = []
pitcher_average_Rank = []
pitcher_year_count = []
pos = []
games = []
games_started = []

cleaned_pitcher_list = []
for pitcher in pitcher_list:
    pitcher = pitcher.replace("\xa0", " ")
    cleaned_pitcher_list.append(pitcher) 

# new_pitcher_df
    
# Loop through each player, check if they played in the past two seasons. If not, remove them
for pitcher in cleaned_pitcher_list:
    filter_df = pitcher_percentile_df.loc[pitcher_percentile_df['Name'] == pitcher]
    filter_df = filter_df.sort_values(['Year'], ascending=[False])
    year_list = filter_df.Year.tolist()
    if (year_list[0] != last_year) and (year_list[0] != (last_year - 1)):
        pitcher_list.remove(pitcher)

# Update new dataframe with updated unique player list
new_pitcher_df['Name'] = pitcher_list        

# Loop through each player, locate their percentile stats for each season, average them out
for pitcher in pitcher_list:
    pitcher_df = pitcher_percentile_df.loc[pitcher_percentile_df['Name'] == pitcher]
    
    # We want to find the slope of the line of best fit for each player's overall ranking each season
    x = np.array(pitcher_df['Year'], dtype = float)
    y = np.array(pitcher_df['Rank'], dtype = float)
    slope, intercept = np.polyfit(x, y, 1)
    pitcher_trends.append(slope)
    
    # Find average of each player's percentiles from previous 5 seasons
    average_FPTS.append(sum(pitcher_df['FPTS']) / len(pitcher_df['FPTS']))
    average_AVG_FPTS.append(sum(pitcher_df['AVG_FPTS']) / len(pitcher_df['AVG_FPTS']))
    average_FPTS_Percentile.append(sum(pitcher_df['FPTS_Percentile']) / len(pitcher_df['FPTS_Percentile']))
    average_AVG_FPTS_Percentile.append(sum(pitcher_df['AVG_FPTS_Percentile']) / len(pitcher_df['AVG_FPTS_Percentile']))
    pitcher_average_Rank.append(sum(pitcher_df['Rank']) / len(pitcher_df['Rank']))
    pitcher_year_count.append(len(x))
    pos.append(pitcher_df['Pos'].iloc[-1])
    games.append(pitcher_df['G'].iloc[-1])
    games_started.append(pitcher_df['GS'].iloc[-1])

# Update new dataframe with the list data from each stat
new_pitcher_df['Trend'] = pitcher_trends
new_pitcher_df['Pos'] = pos
new_pitcher_df['G'] = games
new_pitcher_df['GS'] = games_started
new_pitcher_df['FPTS'] = average_FPTS
new_pitcher_df['AVG_FPTS'] = average_AVG_FPTS
new_pitcher_df['FPTS_Percentile'] = average_FPTS_Percentile
new_pitcher_df['AVG_FPTS_Percentile'] = average_AVG_FPTS_Percentile
new_pitcher_df['Rank'] = pitcher_average_Rank

# Keep track of how many seasons are being considered, so we know how reliable the data is
new_pitcher_df['Years'] = pitcher_year_count



  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

In [31]:
# Create a weighted rank column by adding trend data to the rank data and account for number of seasons played
# Basically, if you played all 5 seasons, your trend stat is added directly
# If you played fewer than all 5 seasons, your trend stat is reduced depending on how few seasons you played
# new_pitcher_df['Weighted Rank'] = (new_pitcher_df['Rank'] + ((new_pitcher_df['Trend'] * (new_pitcher_df['Years'] - 1) / 4)))

# # shift column 'Weighted Rank' to first position
# first_pitcher_column = new_pitcher_df.pop('Weighted Rank')
  
# # insert column using insert(position,column_name,first_column) function
# new_pitcher_df.insert(1, 'Weighted Rank', first_pitcher_column)

new_pitcher_df = new_pitcher_df.sort_values('FPTS', ascending = False)
new_pitcher_df.head(25)


Unnamed: 0,Name,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos,G,GS
386,Spencer Strider,1.947163,0.102018,2,693.6,21.977722,0.980769,0.966394,SP,32.0,32.0
24,Corbin Burnes,1.948143,0.013779,5,605.1,22.237081,0.978689,0.969453,SP,32.0,32.0
605,Shota Imanaga,1.957721,0.000484,1,602.3,20.768966,0.981618,0.976103,SP,29.0,29.0
472,Kodai Senga,1.954296,0.000483,1,599.3,20.665517,0.976234,0.978062,SP,29.0,29.0
109,Zack Wheeler,1.885241,0.076915,5,590.26,21.539244,0.94449,0.940752,SP,32.0,32.0
35,Gerrit Cole,1.916367,-0.044117,5,576.18,22.905119,0.943414,0.972953,SP,17.0,17.0
594,Paul Skenes,1.957721,0.000484,1,571.0,24.826087,0.961397,0.996324,SP,23.0,23.0
267,Shohei Ohtani,1.937092,0.017105,3,562.433333,22.657557,0.952548,0.984544,SP,23.0,23.0
1,Aaron Nola,1.918524,0.006235,5,561.56,20.373977,0.971679,0.946845,SP,33.0,33.0
229,Logan Gilbert,1.856824,0.100492,4,561.275,18.28267,0.933373,0.923451,SP,33.0,33.0


In [32]:
draft_df = pd.concat([new_df, new_pitcher_df], ignore_index=True, sort=False)

# draft_df.drop(columns = ['Rank','FPTS_Percentile','AVG_FPTS_Percentile'])


# draft_df['FPTS_Percentile'] = draft_df['FPTS'].rank(pct=True)
# draft_df['AVG_FPTS_Percentile'] = draft_df['AVG_FPTS'].rank(pct=True)
    
# draft_df['Rank'] = (draft_df['FPTS_Percentile'] + (draft_df['AVG_FPTS_Percentile'] / 2))

draft_df['Weighted_Rank'] = ''
draft_df['Weighted_Rank'] = abs(draft_df['FPTS'] * (1 + (draft_df['Trend'] * (draft_df['Years'] - 1) / 4)))
# shift column 'Weighted Rank' to first position
# weighted_rank_column = draft_df.pop('Weighted Rank')
  
# # insert column using insert(position,column_name,first_column) function
# draft_df.insert(1, 'Weighted Rank', weighted_rank_column)


# draft_df['Weighted_Rank'] = ''
# draft_df['Weighted_Rank'] = abs(draft_df['FPTS'] * (1 + draft_df['Trend']))

weighted_rank_column = draft_df.pop('Weighted_Rank')
  
# insert column using insert(position,column_name,first_column) function
draft_df.insert(1, 'Weighted_Rank', weighted_rank_column)


draft_df = draft_df.sort_values(by='Weighted_Rank', ascending=False)
draft_df["G"].fillna("N/A", inplace = True)
draft_df["GS"].fillna("N/A", inplace = True)

draft_df.head(50)

Unnamed: 0,Player,Weighted_Rank,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos,Name,G,GS
0,Freddie Freeman,1812.705596,1.99774,-0.001374,5,1815.2,13.360563,0.998632,0.999108,1B,,,
1,Matt Olson,1774.483071,1.967747,0.024647,5,1731.8,12.070769,0.994159,0.973588,1B,,,
2,Nathaniel Lowe,1712.552452,1.954128,-0.006737,4,1721.25,11.177006,0.985975,0.968153,1B,,,
3,Paul Goldschmidt,1616.977187,1.964666,0.008719,5,1603.0,11.711919,0.989434,0.975232,"1B,DH",,,
4,Pete Alonso,1593.508795,1.926627,0.04781,5,1520.8,10.765601,0.977923,0.948704,1B,,,
16,Cal Raleigh,1437.886996,1.724751,0.263321,4,1200.75,9.796927,0.81572,0.909031,"C,DH",,,
7,Christian Walker,1426.663351,1.904026,0.048555,5,1360.6,10.687374,0.95864,0.945386,1B,,,
41,Josh Naylor,1404.075821,1.345565,0.481405,5,947.8,8.03492,0.663481,0.682084,"1B,DH",,,
5,Michael Busch,1396.0,1.846591,0.000456,1,1396.0,9.184211,0.955682,0.890909,1B,,,
6,Adley Rutschman,1388.791575,1.882705,0.004028,3,1386.0,9.994343,0.952495,0.93021,"C,DH",,,


In [33]:
######################################################################################################################
######################################################################################################################
######################################################################################################################
#### DRAFT DAY FUNCTIONS
   
# DROP A PLAYER 
def drafted(player):
    global draft_df
    global final_pitcher_stats_df
    global final_position_players_df
    draft_df = draft_df[draft_df.Name != player]
    final_pitcher_stats_df = final_pitcher_stats_df[final_pitcher_stats_df.Name != player]
    final_position_players_df = final_position_players_df[final_position_players_df.Name != player]
    return draft_df.head(25)
    
# FILTER PLAYERS BY POSITION
def position_filter(POS):
    filtered_draft_df = draft_df[draft_df['Pos'].str.contains(POS)]
    return filtered_draft_df.head(25)

# PULL PITCHING STAT CATEGORY LEADERS
def pitching_stat_leaders(CAT):
    global final_pitcher_stats_df
    pitching_filtered_draft_df = final_pitcher_stats_df.sort_values([CAT], ascending=[False])
    return pitching_filtered_draft_df.head(25)

# PULL BATTING STAT CATEGORY LEADERS
def batting_stat_leaders(CAT):
    global final_position_players_df
    batting_filtered_draft_df = final_position_players_df.sort_values([CAT], ascending=[False])
    return batting_filtered_draft_df.head(25)

def drop_all_position(POS):
    global draft_df
    draft_df = draft_df[draft_df.Pos != POS]
    return draft_df.head(25)

In [34]:
# drop_all_position('')
# drafted('')


In [36]:
# Filter the DataFrame by a string value in the "City" column
filtered_df = draft_df[draft_df["Pos"].str.contains("1B")]
filtered_df.head(60)

Unnamed: 0,Player,Weighted_Rank,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos,Name,G,GS
0,Freddie Freeman,1812.705596,1.99774,-0.001374,5,1815.2,13.360563,0.998632,0.999108,1B,,,
1,Matt Olson,1774.483071,1.967747,0.024647,5,1731.8,12.070769,0.994159,0.973588,1B,,,
2,Nathaniel Lowe,1712.552452,1.954128,-0.006737,4,1721.25,11.177006,0.985975,0.968153,1B,,,
3,Paul Goldschmidt,1616.977187,1.964666,0.008719,5,1603.0,11.711919,0.989434,0.975232,"1B,DH",,,
4,Pete Alonso,1593.508795,1.926627,0.04781,5,1520.8,10.765601,0.977923,0.948704,1B,,,
7,Christian Walker,1426.663351,1.904026,0.048555,5,1360.6,10.687374,0.95864,0.945386,1B,,,
41,Josh Naylor,1404.075821,1.345565,0.481405,5,947.8,8.03492,0.663481,0.682084,"1B,DH",,,
5,Michael Busch,1396.0,1.846591,0.000456,1,1396.0,9.184211,0.955682,0.890909,1B,,,
12,Yainer Diaz,1325.025454,1.794965,0.305525,2,1231.0,9.556783,0.888866,0.906099,"C,DH,1B",,,
32,Andrew Vaughn,1293.648477,1.489322,0.327586,4,1038.5,7.226729,0.812082,0.677241,"1B,DH",,,


In [37]:
copy_test_df = filtered_df

copy_test_df['Test_FPTS_Percentile'] = copy_test_df['FPTS'].rank(pct=True)
copy_test_df['Test_AVG_FPTS_Percentile'] = copy_test_df['AVG_FPTS'].rank(pct=True)

copy_test_df['Test_Rank'] = (copy_test_df['Test_FPTS_Percentile'] + copy_test_df['Test_AVG_FPTS_Percentile'])


copy_test_df.head(60)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  copy_test_df['Test_FPTS_Percentile'] = copy_test_df['FPTS'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  copy_test_df['Test_AVG_FPTS_Percentile'] = copy_test_df['AVG_FPTS'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  copy_test_df['Test_Rank'] = (copy_test_df['Test

Unnamed: 0,Player,Weighted_Rank,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos,Name,G,GS,Test_FPTS_Percentile,Test_AVG_FPTS_Percentile,Test_Rank
0,Freddie Freeman,1812.705596,1.99774,-0.001374,5,1815.2,13.360563,0.998632,0.999108,1B,,,,1.0,1.0,2.0
1,Matt Olson,1774.483071,1.967747,0.024647,5,1731.8,12.070769,0.994159,0.973588,1B,,,,0.989362,0.989362,1.978723
2,Nathaniel Lowe,1712.552452,1.954128,-0.006737,4,1721.25,11.177006,0.985975,0.968153,1B,,,,0.978723,0.968085,1.946809
3,Paul Goldschmidt,1616.977187,1.964666,0.008719,5,1603.0,11.711919,0.989434,0.975232,"1B,DH",,,,0.968085,0.978723,1.946809
4,Pete Alonso,1593.508795,1.926627,0.04781,5,1520.8,10.765601,0.977923,0.948704,1B,,,,0.957447,0.93617,1.893617
7,Christian Walker,1426.663351,1.904026,0.048555,5,1360.6,10.687374,0.95864,0.945386,1B,,,,0.93617,0.925532,1.861702
41,Josh Naylor,1404.075821,1.345565,0.481405,5,947.8,8.03492,0.663481,0.682084,"1B,DH",,,,0.776596,0.723404,1.5
5,Michael Busch,1396.0,1.846591,0.000456,1,1396.0,9.184211,0.955682,0.890909,1B,,,,0.946809,0.797872,1.744681
12,Yainer Diaz,1325.025454,1.794965,0.305525,2,1231.0,9.556783,0.888866,0.906099,"C,DH,1B",,,,0.904255,0.829787,1.734043
32,Andrew Vaughn,1293.648477,1.489322,0.327586,4,1038.5,7.226729,0.812082,0.677241,"1B,DH",,,,0.819149,0.585106,1.404255


In [38]:
copy_test_df.sort_values('Test_Rank', ascending = False).head(60)

Unnamed: 0,Player,Weighted_Rank,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos,Name,G,GS,Test_FPTS_Percentile,Test_AVG_FPTS_Percentile,Test_Rank
0,Freddie Freeman,1812.705596,1.99774,-0.001374,5,1815.2,13.360563,0.998632,0.999108,1B,,,,1.0,1.0,2.0
1,Matt Olson,1774.483071,1.967747,0.024647,5,1731.8,12.070769,0.994159,0.973588,1B,,,,0.989362,0.989362,1.978723
2,Nathaniel Lowe,1712.552452,1.954128,-0.006737,4,1721.25,11.177006,0.985975,0.968153,1B,,,,0.978723,0.968085,1.946809
3,Paul Goldschmidt,1616.977187,1.964666,0.008719,5,1603.0,11.711919,0.989434,0.975232,"1B,DH",,,,0.968085,0.978723,1.946809
4,Pete Alonso,1593.508795,1.926627,0.04781,5,1520.8,10.765601,0.977923,0.948704,1B,,,,0.957447,0.93617,1.893617
7,Christian Walker,1426.663351,1.904026,0.048555,5,1360.6,10.687374,0.95864,0.945386,1B,,,,0.93617,0.925532,1.861702
11,Salvador Perez,1222.219369,1.881125,-0.014975,5,1240.8,10.674428,0.948518,0.932608,"C,1B,DH",,,,0.914894,0.914894,1.829787
21,Rhys Hoskins,1068.953374,1.870748,-0.067832,4,1126.25,10.777448,0.935217,0.935531,"1B,DH",,,,0.87234,0.946809,1.819149
10,Spencer Torkelson,1236.633315,1.845663,-0.011844,3,1244.0,10.228637,0.903703,0.94196,1B,,,,0.925532,0.882979,1.808511
29,Anthony Rizzo,1001.147946,1.816295,-0.026488,7,1042.571429,10.968481,0.863926,0.952369,1B,,,,0.840426,0.957447,1.797872
