In [1]:
# Import needed dependencies
import requests
import re
import pandas as pd
import numpy as np
import scipy.stats as stats
from datetime import date
from bs4 import BeautifulSoup, Comment

pd.set_option('display.max_columns', None)

In [2]:
today = date.today()

# dd/mm/YY
current_year = today.strftime("%Y")

last_year = int(current_year) - 1

In [3]:
last_five_years = []
for i in range(1,6):
    last_five_years.append(int(current_year) - i)

In [4]:
# Create a list to help create a dataframe from batter statistics data
batter_stats = []

# Create a loop to create a dataframe from Baseball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-batting.shtml'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has batter statistics
    table = BeautifulSoup(soup.select_one('#all_players_standard_batting').find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')


    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        batter_stats.append(tds)

In [5]:
# Create dataframe for batter statistics
batter_stats_df = pd.DataFrame(batter_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')
df_headers.append("Year")

# Set column headers equal to our list
batter_stats_df.columns = df_headers

# Eliminate Baseball Reference's name badges for accolades
batter_stats_df['Name'] = batter_stats_df['Name'].str.extract('([^\*|#]*)')

# cleaned_player_list = []
# for player in batter_stats_df['Name']:
#     player = player.replace("\xa0", "")
#     cleaned_player_list.append(player)

# batter_stats_df['Name'] = cleaned_player_list 
# batter_stats_df = final_position_players_df.sort_values(['FPTS'], ascending=False)

batter_stats_df


Unnamed: 0,Name,Age,Tm,Lg,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos Summary,Year
0,CJ Abrams,22,WSN,NL,151,614,563,83,138,28,6,18,64,47,4,32,118,.245,.300,.412,.712,95,232,7,13,3,3,2,*6/H,2023
1,José Abreu,36,HOU,AL,141,594,540,62,128,23,1,18,90,0,1,42,130,.237,.296,.383,.680,87,207,16,6,0,6,1,*3/D,2023
2,Wilyer Abreu,24,BOS,AL,28,85,76,10,24,6,0,2,14,3,1,9,23,.316,.388,.474,.862,132,36,0,0,0,0,0,87/H9D,2023
3,Ronald Acuna Jr.,25,ATL,NL,159,735,643,149,217,35,4,41,106,73,14,80,84,.337,.416,.596,1.012,168,383,15,9,0,3,3,*9/D,2023
4,Willy Adames,27,MIL,NL,149,638,553,73,120,29,2,24,80,5,3,71,165,.217,.310,.407,.717,95,225,12,6,0,6,1,*6/D,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5865,Ryan Zimmerman,34,WSN,NL,52,190,171,20,44,9,0,6,27,0,0,17,39,.257,.321,.415,.736,89,71,4,0,0,2,0,3/HD,2019
5866,Jordan Zimmermann,33,DET,AL,1,2,2,0,0,0,0,0,0,0,0,0,2,.000,.000,.000,.000,-100,0,0,0,0,0,0,1,2019
5867,Ben Zobrist,38,CHC,NL,47,176,150,24,39,5,0,1,17,0,0,23,24,.260,.358,.313,.671,79,47,6,1,0,2,0,49/7HD16,2019
5868,Mike Zunino,28,TBR,AL,90,289,266,30,44,10,1,9,32,0,0,20,98,.165,.232,.312,.544,45,83,4,3,0,0,0,2/H,2019


In [6]:
# Create a list to help create a dataframe from batter statistics data
fielding_stats = []

# Create a loop to create a dataframe from Baseball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-fielding.shtml'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has batter statistics
    table = soup.select_one('#all_players_players_standard_fielding_fielding')#.find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')


    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        fielding_stats.append(tds)

# Create dataframe for batter statistics
fielding_stats_df = pd.DataFrame(fielding_stats)

# Create an empty list to store dataframe header information
fielding_header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    fielding_header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
fielding_df_headers = fielding_header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
fielding_df_headers.remove('Rk')
fielding_df_headers.append("Year")

# Set column headers equal to our list
fielding_stats_df.columns = fielding_df_headers

final_fielding_stats_df = fielding_stats_df[['Name','PO','A','E','Year','Tm']]

position_players_df = pd.merge(final_fielding_stats_df, batter_stats_df, on=['Name','Year','Tm'])
position_players_df


Unnamed: 0,Name,PO,A,E,Year,Tm,Age,Lg,G,PA,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,BA,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB,Pos Summary
0,CJ Abrams,245,384,22,2023,WSN,22,NL,151,614,563,83,138,28,6,18,64,47,4,32,118,.245,.300,.412,.712,95,232,7,13,3,3,2,*6/H
1,José Abreu,1017,67,5,2023,HOU,36,AL,141,594,540,62,128,23,1,18,90,0,1,42,130,.237,.296,.383,.680,87,207,16,6,0,6,1,*3/D
2,Wilyer Abreu,40,2,1,2023,BOS,24,AL,28,85,76,10,24,6,0,2,14,3,1,9,23,.316,.388,.474,.862,132,36,0,0,0,0,0,87/H9D
3,Ronald Acuna Jr.,281,10,5,2023,ATL,25,NL,159,735,643,149,217,35,4,41,106,73,14,80,84,.337,.416,.596,1.012,168,383,15,9,0,3,3,*9/D
4,Willy Adames,159,351,14,2023,MIL,27,NL,149,638,553,73,120,29,2,24,80,5,3,71,165,.217,.310,.407,.717,95,225,12,6,0,6,1,*6/D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4820,Kyle Zimmer,0,1,0,2019,KCR,27,AL,1,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,0,0,0,0,0,0,1
4821,Ryan Zimmerman,307,20,3,2019,WSN,34,NL,52,190,171,20,44,9,0,6,27,0,0,17,39,.257,.321,.415,.736,89,71,4,0,0,2,0,3/HD
4822,Jordan Zimmermann,7,10,1,2019,DET,33,AL,1,2,2,0,0,0,0,0,0,0,0,0,2,.000,.000,.000,.000,-100,0,0,0,0,0,0,1
4823,Ben Zobrist,58,56,3,2019,CHC,38,NL,47,176,150,24,39,5,0,1,17,0,0,23,24,.260,.358,.313,.671,79,47,6,1,0,2,0,49/7HD16


In [7]:
# position_players_df = pd.merge(final_fielding_stats_df, batter_stats_df, on=['Name','Year','Tm'])
# position_players_df
final_fielding_stats_df

Unnamed: 0,Name,PO,A,E,Year,Tm
0,Fernando Abad,0,0,1,2023,COL
1,Andrew Abbott,1,9,0,2023,CIN
2,Cory Abbott,0,1,0,2023,WSN
3,CJ Abrams,245,384,22,2023,WSN
4,Albert Abreu,6,4,2,2023,NYY
...,...,...,...,...,...,...
7113,Kyle Zimmer,0,1,0,2019,KCR
7114,Ryan Zimmerman,307,20,3,2019,WSN
7115,Jordan Zimmermann,7,10,1,2019,DET
7116,Ben Zobrist,58,56,3,2019,CHC


In [8]:
# Change types of columns to numeric for columns with number values
position_players_df[['Age','G','R','H','2B','3B','HR','RBI','SB','TB','BB','SO','PO','A','E','PA','OPS','OPS+']] = position_players_df[['Age','G', 'R','H','2B','3B','HR','RBI','SB','TB','BB','SO','PO','A','E','PA','OPS','OPS+']].apply(pd.to_numeric)

# Drop any players with 0 plate appearances to remove null values and change PA type to integer
position_players_df.dropna(subset=['PA'], axis = 0 , inplace= True)

# Remove any players with fewer than 100 plate appearances
filtered_position_players_df = position_players_df[position_players_df['PA'] >= 100]

# Select the columns we want for our batter analysis
final_position_players_df = filtered_position_players_df[['Year','Name','Tm','Age','G', 'R','H','2B','3B','HR','RBI','SB','TB','BB','SO','PO','A','E','PA','OPS','OPS+','Pos\xa0Summary']]

# Sort by index to prepare to drop duplicates
final_position_players_df = final_position_players_df.sort_index()

# Drop duplicate entries of Player Name and Year
final_position_players_df['1B'] = ''

for index, row in final_position_players_df.iterrows():
    final_position_players_df['1B'] = (final_position_players_df['H'] - (final_position_players_df['2B'] + final_position_players_df['3B'] + final_position_players_df['HR']))

final_position_players_df.columns


Index(['Year', 'Name', 'Tm', 'Age', 'G', 'R', 'H', '2B', '3B', 'HR', 'RBI',
       'SB', 'TB', 'BB', 'SO', 'PO', 'A', 'E', 'PA', 'OPS', 'OPS+',
       'Pos Summary', '1B'],
      dtype='object')

In [9]:
final_position_players_df['FPTS'] = ''
final_position_players_df['AVG_FPTS'] = ''

for index, row in final_position_players_df.iterrows():
    final_position_players_df['FPTS'] = (final_position_players_df['1B'] + (2*final_position_players_df['2B']) + (3*final_position_players_df['3B']) + (4*final_position_players_df['HR']) + final_position_players_df['TB'] + final_position_players_df['BB'] + final_position_players_df['R'] + (2*final_position_players_df['RBI']) + (2*final_position_players_df['SB']) + final_position_players_df['PO'] + final_position_players_df['A'] - (2*final_position_players_df['SO']) - final_position_players_df['E'])
    final_position_players_df['AVG_FPTS'] = (final_position_players_df['FPTS']/final_position_players_df['G'])

final_position_players_df
    

Unnamed: 0,Year,Name,Tm,Age,G,R,H,2B,3B,HR,RBI,SB,TB,BB,SO,PO,A,E,PA,OPS,OPS+,Pos Summary,1B,FPTS,AVG_FPTS
0,2023,CJ Abrams,WSN,22,151,83,138,28,6,18,64,47,232,32,118,245,384,22,614,0.712,95.0,*6/H,86,1172,7.761589
1,2023,José Abreu,HOU,36,141,62,128,23,1,18,90,0,207,42,130,1017,67,5,594,0.680,87.0,*3/D,86,1517,10.758865
3,2023,Ronald Acuna Jr.,ATL,25,159,149,217,35,4,41,106,73,383,80,84,281,10,5,735,1.012,168.0,*9/D,137,1471,9.251572
4,2023,Willy Adames,MIL,27,149,73,120,29,2,24,80,5,225,71,165,159,351,14,638,0.717,95.0,*6/D,65,930,6.241611
6,2023,Riley Adams,WSN,27,44,8,39,13,2,4,21,0,68,11,45,293,8,2,158,0.807,120.0,2/HD,20,406,9.227273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4809,2019,Mike Yastrzemski,SFG,28,107,64,101,22,3,21,55,2,192,32,107,189,2,4,411,0.852,122.0,79H/8,55,567,5.299065
4811,2019,Christian Yelich,MIL,27,130,100,161,29,3,44,97,30,328,80,118,225,7,4,580,1.100,179.0,*9/7HD8,85,1082,8.323077
4821,2019,Ryan Zimmerman,WSN,34,52,20,44,9,0,6,27,0,71,17,39,307,20,3,190,0.736,89.0,3/HD,29,479,9.211538
4823,2019,Ben Zobrist,CHC,38,47,24,39,5,0,1,17,0,47,23,24,58,56,3,176,0.671,79.0,49/7HD16,33,238,5.063830


In [10]:
# Sort data by name alphabetically, then by year in descending order
final_position_players_df = final_position_players_df.sort_values(['Year','Name'], ascending=[True, True])

# Eliminate Baseball Reference's name badges for accolades
final_position_players_df['Name'] = final_position_players_df['Name'].str.extract('([^\*|#]*)')

cleaned_player_list = []
for player in final_position_players_df['Name']:
    player = player.replace("\xa0", " ")
    cleaned_player_list.append(player)

final_position_players_df['Name'] = cleaned_player_list 
final_position_players_df = final_position_players_df.sort_values(['FPTS'], ascending=False)

final_position_players_df

Unnamed: 0,Year,Name,Tm,Age,G,R,H,2B,3B,HR,RBI,SB,TB,BB,SO,PO,A,E,PA,OPS,OPS+,Pos Summary,1B,FPTS,AVG_FPTS
3929,2019,Freddie Freeman,ATL,29,158,113,176,34,2,38,121,6,328,87,127,1296,63,6,692,0.938,135.0,*3/H,102,2209,13.981013
500,2023,Matt Olson,ATL,29,162,127,172,27,3,54,139,1,367,104,167,1194,96,8,720,0.993,162.0,*3,88,2193,13.537037
224,2023,Freddie Freeman,LAD,33,161,131,211,59,2,29,102,23,361,72,121,1131,128,1,730,0.976,161.0,*3,121,2191,13.608696
1965,2021,Freddie Freeman,ATL,31,159,120,180,25,2,31,83,8,302,85,107,1252,101,3,695,0.896,136.0,*3,122,2127,13.377358
1003,2022,Freddie Freeman,LAD,32,159,117,199,47,2,21,100,13,313,84,102,1155,91,5,708,0.918,156.0,*3,129,2090,13.144654
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
829,2022,Seth Beer,ARI,25,38,4,21,3,0,1,9,0,27,11,31,38,0,1,126,0.521,50.0,D/3H,17,62,1.631579
1845,2021,Khris Davis,TOT,33,42,11,21,5,1,3,10,0,37,10,31,4,0,0,114,0.635,75.0,DH/7,12,57,1.357143
2935,2020,Jo Adell,LAA,21,38,9,20,4,0,3,7,0,33,7,55,72,2,3,132,0.478,30.0,9/8H,13,57,1.500000
521,2023,Everson Pereira,NYY,22,27,6,14,4,0,0,10,4,18,8,40,44,1,0,103,0.427,20.0,7,10,43,1.592593


In [11]:
# Create a new dataframe for stats percentile calculations
percentile_df = pd.DataFrame(columns = ['Year', 'Name', 'Age','Pos\xa0Summary', 'Tm', 'FPTS_Percentile', 'AVG_FPTS_Percentile'])

# Carry over columnns from final_batter_stats_df that shouldn't be comparatively ranked 
percentile_df['Year'] = final_position_players_df['Year']
percentile_df['Name'] = final_position_players_df['Name']
percentile_df['Age'] = final_position_players_df['Age']
percentile_df['Tm'] = final_position_players_df['Tm']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
for year in last_five_years:
    year_df = final_position_players_df.loc[final_position_players_df['Year'] == year]
    year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
    year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)
    year_df.sort_values('Name', ascending=True)

    # Each of the seasons are added back to the percentile dataframe
    percentile_df = percentile_df.append(year_df, ignore_index=True)

percentile_df = percentile_df.sort_values(['Year','Name'], ascending=[True, True])
percentile_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)


Unnamed: 0,Year,Name,Age,Pos Summary,Tm,FPTS_Percentile,AVG_FPTS_Percentile,G,R,H,2B,3B,HR,RBI,SB,TB,BB,SO,PO,A,E,PA,OPS,OPS+,1B,FPTS,AVG_FPTS
1206,2019,AJ Pollock,31,,LAD,,,,,,,,,,,,,,,,,,,,,,
4161,2019,AJ Pollock,31,87/HD,LAD,0.329646,0.360619,86.0,49.0,82.0,15.0,1.0,15.0,47.0,5.0,144.0,23.0,74.0,116.0,3.0,2.0,342.0,0.795,107.0,51.0,433.0,5.034884
1529,2019,Aaron Hicks,29,,NYY,,,,,,,,,,,,,,,,,,,,,,
4217,2019,Aaron Hicks,29,8/DH,NYY,0.205752,0.39823,59.0,41.0,52.0,10.0,0.0,12.0,36.0,1.0,98.0,31.0,72.0,115.0,0.0,1.0,255.0,0.769,103.0,30.0,312.0,5.288136
912,2019,Aaron Judge,27,,NYY,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2547,2023,Zach Remillard,29,4/H5976D,CHW,0.149675,0.229935,54.0,16.0,37.0,7.0,0.0,1.0,18.0,4.0,47.0,8.0,48.0,72.0,90.0,4.0,160.0,0.615,69.0,29.0,224.0,4.148148
963,2023,Zack Gelof,23,,OAK,,,,,,,,,,,,,,,,,,,,,,
2388,2023,Zack Gelof,23,4,OAK,0.492408,0.787419,69.0,40.0,72.0,20.0,1.0,14.0,32.0,14.0,136.0,26.0,82.0,137.0,141.0,3.0,300.0,0.840,137.0,37.0,541.0,7.840580
1292,2023,Zack Short,28,,DET,,,,,,,,,,,,,,,,,,,,,,


In [12]:
# Clean up the percentile dataframe, drop NaNs and remove unnecessary columns
percentile_df = percentile_df.dropna()

# Add a rank column that adds the percentiles from each category
percentile_df['Rank'] = (percentile_df['FPTS_Percentile'] + percentile_df['AVG_FPTS_Percentile'])

percentile_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  percentile_df['Rank'] = (percentile_df['FPTS_Percentile'] + percentile_df['AVG_FPTS_Percentile'])


Unnamed: 0,Year,Name,Age,Pos Summary,Tm,FPTS_Percentile,AVG_FPTS_Percentile,G,R,H,2B,3B,HR,RBI,SB,TB,BB,SO,PO,A,E,PA,OPS,OPS+,1B,FPTS,AVG_FPTS,Rank
4161,2019,AJ Pollock,31,87/HD,LAD,0.329646,0.360619,86.0,49.0,82.0,15.0,1.0,15.0,47.0,5.0,144.0,23.0,74.0,116.0,3.0,2.0,342.0,0.795,107.0,51.0,433.0,5.034884,0.690265
4217,2019,Aaron Hicks,29,8/DH,NYY,0.205752,0.39823,59.0,41.0,52.0,10.0,0.0,12.0,36.0,1.0,98.0,31.0,72.0,115.0,0.0,1.0,255.0,0.769,103.0,30.0,312.0,5.288136,0.603982
4103,2019,Aaron Judge,27,9D/H,NYY,0.457965,0.440265,102.0,75.0,103.0,18.0,1.0,27.0,55.0,3.0,204.0,64.0,141.0,177.0,7.0,0.0,447.0,0.921,143.0,57.0,565.0,5.539216,0.89823
4010,2019,Adalberto Mondesí,23,6/DH,KCR,0.663717,0.747788,102.0,58.0,109.0,20.0,10.0,9.0,62.0,43.0,176.0,19.0,132.0,147.0,286.0,7.0,443.0,0.715,85.0,70.0,801.0,7.852941,1.411504
4283,2019,Adam Duvall,30,7H/9,ATL,0.060841,0.212389,41.0,17.0,32.0,4.0,1.0,10.0,19.0,0.0,68.0,7.0,39.0,44.0,2.0,2.0,130.0,0.882,117.0,17.0,164.0,4.000000,0.27323
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2344,2023,Zach McKinstry,28,549H67/8D1,DET,0.590022,0.290672,148.0,60.0,107.0,21.0,4.0,9.0,35.0,16.0,163.0,44.0,113.0,170.0,184.0,9.0,518.0,0.654,79.0,73.0,651.0,4.398649,0.880694
2403,2023,Zach Neto,22,6/H,LAA,0.462039,0.611714,84.0,38.0,65.0,17.0,0.0,9.0,34.0,5.0,109.0,20.0,77.0,114.0,205.0,7.0,329.0,0.685,86.0,39.0,512.0,6.095238,1.073753
2547,2023,Zach Remillard,29,4/H5976D,CHW,0.149675,0.229935,54.0,16.0,37.0,7.0,0.0,1.0,18.0,4.0,47.0,8.0,48.0,72.0,90.0,4.0,160.0,0.615,69.0,29.0,224.0,4.148148,0.37961
2388,2023,Zack Gelof,23,4,OAK,0.492408,0.787419,69.0,40.0,72.0,20.0,1.0,14.0,32.0,14.0,136.0,26.0,82.0,137.0,141.0,3.0,300.0,0.840,137.0,37.0,541.0,7.840580,1.279826


In [13]:
# Create a list of each unique player we have in our dataframe
player_list = percentile_df.Name.unique().tolist()

# Create a new dataframe for combined, averaged percentiles over the past 5 seasons
new_df = pd.DataFrame(columns = ['Name', 'Rank', 'Trend', 'Pos', 'Years', 'FPTS', 'AVG_FPTS', 'FPTS_Percentile', 'AVG_FPTS_Percentile'])

# Create a list for each percentile stat category for upcoming loop
player_trends = []
average_FPTS = []
average_AVG_FPTS = []
average_FPTS_Percentile = []
average_AVG_FPTS_Percentile = []
average_Rank = []
year_count = []
pos = []

# Loop through each player, check if they played in the past two seasons. If not, remove them
for player in player_list:
    filter_df = percentile_df.loc[percentile_df['Name'] == player]
    filter_df = filter_df.sort_values(['Year'], ascending=[False])
    year_list = filter_df.Year.tolist()
    if (year_list[0] != last_year) and (year_list[0] != (last_year - 1)):
        player_list.remove(player)

# Update new dataframe with updated unique player list
new_df['Name'] = player_list        

# Loop through each player, locate their percentile stats for each season, average them out
for player in player_list:
    player_df = percentile_df.loc[percentile_df['Name'] == player]
    
    # We want to find the slope of the line of best fit for each player's overall ranking each season
    x = np.array(player_df['Year'], dtype = float)
    y = np.array(player_df['Rank'], dtype = float)
    slope, intercept = np.polyfit(x, y, 1)
    player_trends.append(slope)
    
    # Find average of each player's percentiles from previous 5 seasons
    average_FPTS.append(sum(player_df['FPTS']) / len(player_df['FPTS']))
    average_AVG_FPTS.append(sum(player_df['AVG_FPTS']) / len(player_df['AVG_FPTS']))
    average_FPTS_Percentile.append(sum(player_df['FPTS_Percentile']) / len(player_df['FPTS_Percentile']))
    average_AVG_FPTS_Percentile.append(sum(player_df['AVG_FPTS_Percentile']) / len(player_df['AVG_FPTS_Percentile']))
    average_Rank.append(sum(player_df['Rank']) / len(player_df['Rank']))
    year_count.append(len(x))
    
    # Keep player positions for reference purposes during the draft
    pos.append(player_df['Pos\xa0Summary'].unique())

# Update new dataframe with the list data from each stat
new_df['Pos'] = pos
new_df['Trend'] = player_trends
new_df['FPTS'] = average_FPTS
new_df['AVG_FPTS'] = average_AVG_FPTS
new_df['FPTS_Percentile'] = average_FPTS_Percentile
new_df['AVG_FPTS_Percentile'] = average_AVG_FPTS_Percentile
new_df['Rank'] = average_Rank

# Keep track of how many seasons are being considered, so we know how reliable the data is
new_df['Years'] = year_count



  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [14]:
# Create a weighted rank column by adding trend data to the rank data and account for number of seasons played
# Basically, if you played all 5 seasons, your trend stat is added directly
# If you played fewer than all 5 seasons, your trend stat is reduced depending on how few seasons you played
# new_df['Weighted Rank'] = (new_df['Rank'] + ((new_df['Trend'] * (new_df['Years'] - 1) / 4)))

# # shift column 'Weighted Rank' to first position
# first_column = new_df.pop('Weighted Rank')
  
# # insert column using insert(position,column_name,first_column) function
# new_df.insert(1, 'Weighted Rank', first_column)

new_df = new_df.sort_values('FPTS', ascending = False)

In [15]:
# separate position column into a list for editing
new_pos_list = []
pos_list = new_df['Pos'].tolist()

# loop through list and pull only the last item, which represents player position listed from most recent season
for i in pos_list:
    j = i[-1]
    
    if re.search('/', j):
        k = re.sub("([^\/]+$)","",j)
        new_pos_list.append(k)
    else:
        new_pos_list.append(j)

cleaned_list = []
for pos in new_pos_list:
    placeholder = re.findall("[a-zA-Z0-9]+", pos)
    placeholder_2 = ''.join(placeholder)
    placeholder_3 = [d for d in placeholder_2]
    cleaned_list.append(placeholder_3)

cleaned_pos_list = []
for n_list in cleaned_list:
    
    placeholder_list = []
    for pos in n_list:
        if pos == '1':
            placeholder_list.append('P')
        elif pos == '2':
            placeholder_list.append('C')
        elif pos == '3':
            placeholder_list.append('1B')
        elif pos == '4':
            placeholder_list.append('2B')
        elif pos == '5':
            placeholder_list.append('3B')
        elif pos == '6':
            placeholder_list.append('SS')
        elif pos == ('7'):
            placeholder_list.append('OF')
        elif pos == ('8'):
            placeholder_list.append('OF')
        elif pos == ('9'):
            placeholder_list.append('OF')
        elif pos == ('D'):
            placeholder_list.append('DH')
        
    cleaned_pos_list.append(placeholder_list)        

temp_pos_list = []
for item in cleaned_pos_list:
    new_string = []
    for pos in item:
        string = str(pos)
        new_string = f'{new_string},{string}'
    temp_pos_list.append(new_string)

    
final_pos_list = []
for i in temp_pos_list:
    i = i.replace('[],', '')
    final_pos_list.append(i)
    
# replace old position column with new position column
new_df.drop('Pos', axis = 1, inplace = True)
new_df['Pos'] = final_pos_list

new_df.head(25)

Unnamed: 0,Name,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos
101,Freddie Freeman,1.999566,-0.000434,5,1896.2,13.702344,0.999566,1.0,1B
210,Matt Olson,1.963284,0.017629,5,1681.8,12.327258,0.988394,0.97489,1B
248,Paul Goldschmidt,1.965587,0.005351,5,1651.2,11.9135,0.990874,0.974713,"1B,DH"
249,Pete Alonso,1.918285,0.024847,5,1540.2,10.900115,0.975016,0.943269,1B
166,José Abreu,1.936185,-0.007414,5,1473.8,11.317376,0.980749,0.955436,"1B,DH"
124,J.T. Realmuto,1.948702,0.013493,5,1459.0,11.991166,0.972636,0.976065,C
233,Nathaniel Lowe,1.620838,0.356275,4,1419.0,9.905221,0.785051,0.835786,1B
59,Christian Walker,1.883481,0.030102,5,1390.0,10.539662,0.952357,0.931124,1B
533,Spencer Torkelson,1.866045,0.18548,2,1389.5,10.163608,0.928899,0.937146,1B
626,Triston Casas,1.889371,0.000467,1,1382.0,10.469697,0.950108,0.939262,1B


In [16]:
############################################################################################
############################################################################################
############################################################################################

#    However, for example, if your league is set to have a Games Started limit of 12 and you have 10 pitchers at the 
#    completion of Saturday's games and start 4 pitchers on Sunday, you will receive stats for all 14 pitchers. 

#    (Note: This can happen on any day during the week. If managers have 10 pitchers by end of Wednesday and 
#    starts 4 on Thursday, they will receive points for the 4 pitchers on Thursday but for Friday, Saturday and 
#    Sunday, they will not receive any starting pitchers points.)

############################################################################################
############################################################################################
############################################################################################


In [17]:
# Create a list to help create a dataframe from batter statistics data
pitcher_stats = []

for year in last_five_years:

    # input URL and use BeautifulSoup to parse through the page
    pitching_url = f'https://www.baseball-reference.com/leagues/majors/{year}-standard-pitching.shtml'
    pitching_soup = BeautifulSoup(requests.get(pitching_url).content, 'html.parser')

    # Grab the table element that has batter statistics
    pitching_table = BeautifulSoup(pitching_soup.select_one('#all_players_standard_pitching').find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')

    # Grab data from table and put it into the list created above
    for tr in pitching_table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        pitcher_stats.append(tds)
        

In [18]:
# Create dataframe for batter statistics
raw_pitcher_stats_df = pd.DataFrame(pitcher_stats)

# Create an empty list to store dataframe header information
pitcher_header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in pitching_table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    pitcher_header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
pitcher_df_headers = pitcher_header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
pitcher_df_headers.remove('Rk')
pitcher_df_headers.append("Year")

# Set column headers equal to our list
raw_pitcher_stats_df.columns = pitcher_df_headers



In [19]:
raw_pitcher_stats_df

Unnamed: 0,Name,Age,Tm,Lg,W,L,W-L%,ERA,G,GS,GF,CG,SHO,SV,IP,H,R,ER,HR,BB,IBB,SO,HBP,BK,WP,BF,ERA+,FIP,WHIP,H9,HR9,BB9,SO9,SO/W,Year
0,Fernando Abad*,37,COL,NL,1,0,1.000,4.26,6,0,2,0,0,0,6.1,11,3,3,2,3,0,2,0,0,0,32,124,8.15,2.211,15.6,2.8,4.3,2.8,0.67,2023
1,Andrew Abbott*,24,CIN,NL,8,6,.571,3.87,21,21,0,0,0,0,109.1,100,47,47,16,44,0,120,1,1,1,459,118,4.20,1.317,8.2,1.3,3.6,9.9,2.73,2023
2,Cory Abbott,27,WSN,NL,1,2,.333,6.64,22,0,10,0,0,0,39.1,48,29,29,9,19,0,40,4,1,4,183,65,5.95,1.703,11.0,2.1,4.3,9.2,2.11,2023
3,Albert Abreu,27,NYY,AL,2,2,.500,4.73,45,0,19,0,0,0,59.0,52,39,31,9,35,1,61,6,0,5,268,92,5.26,1.475,7.9,1.4,5.3,9.3,1.74,2023
4,Bryan Abreu,26,HOU,AL,3,2,.600,1.75,72,0,16,0,0,5,72.0,44,17,14,6,31,0,100,3,0,1,287,241,2.98,1.042,5.5,0.8,3.9,12.5,3.23,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5149,T.J. Zeuch,23,TOR,AL,1,2,.333,4.76,5,3,0,0,0,0,22.2,22,13,12,2,11,0,20,0,0,2,99,96,4.05,1.456,8.7,0.8,4.4,7.9,1.82,2019
5150,Kyle Zimmer,27,KCR,AL,0,1,.000,10.80,15,0,3,0,0,0,18.1,28,22,22,2,19,0,18,0,0,2,102,45,5.78,2.564,13.7,1.0,9.3,8.8,0.95,2019
5151,Jordan Zimmermann,33,DET,AL,1,13,.071,6.91,23,23,0,0,0,0,112.0,145,89,86,19,25,2,82,6,0,3,504,69,4.79,1.518,11.7,1.5,2.0,6.6,3.28,2019
5152,Ben Zobrist,38,CHC,NL,0,0,,0.00,1,0,1,0,0,0,1.0,0,0,0,0,2,0,1,0,0,0,5,,7.21,2.000,0.0,0.0,18.0,9.0,0.50,2019


In [20]:
# Create a list to help create a dataframe from batter statistics data
reliever_stats = []

# Create a loop to create a dataframe from Baseball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.baseball-reference.com/leagues/majors/{year}-reliever-pitching.shtml'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has batter statistics
    table = BeautifulSoup(soup.select_one('#all_players_reliever_pitching').find_next(text=lambda x: isinstance(x, Comment)), 'html.parser')


    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        reliever_stats.append(tds)

# Create dataframe for batter statistics
reliever_stats_df = pd.DataFrame(reliever_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')
df_headers.append("Year")

# Set column headers equal to our list
reliever_stats_df.columns = df_headers


final_reliever_stats_df = reliever_stats_df[['Name','Hold','Year','Tm']]

final_pitcher_stats_df = pd.merge(final_reliever_stats_df, raw_pitcher_stats_df, how = 'outer', on=['Name','Year','Tm'])

final_pitcher_stats_df



Unnamed: 0,Name,Hold,Year,Tm,Age,Lg,W,L,W-L%,ERA,G,GS,GF,CG,SHO,SV,IP,H,R,ER,HR,BB,IBB,SO,HBP,BK,WP,BF,ERA+,FIP,WHIP,H9,HR9,BB9,SO9,SO/W
0,Fernando Abad*,0,2023,COL,37,NL,1,0,1.000,4.26,6,0,2,0,0,0,6.1,11,3,3,2,3,0,2,0,0,0,32,124,8.15,2.211,15.6,2.8,4.3,2.8,0.67
1,Cory Abbott,0,2023,WSN,27,NL,1,2,.333,6.64,22,0,10,0,0,0,39.1,48,29,29,9,19,0,40,4,1,4,183,65,5.95,1.703,11.0,2.1,4.3,9.2,2.11
2,Albert Abreu,3,2023,NYY,27,AL,2,2,.500,4.73,45,0,19,0,0,0,59.0,52,39,31,9,35,1,61,6,0,5,268,92,5.26,1.475,7.9,1.4,5.3,9.3,1.74
3,Bryan Abreu,24,2023,HOU,26,AL,3,2,.600,1.75,72,0,16,0,0,5,72.0,44,17,14,6,31,0,100,3,0,1,287,241,2.98,1.042,5.5,0.8,3.9,12.5,3.23
4,Domingo Acevedo,2,2023,OAK,29,AL,0,0,,10.61,9,0,1,0,0,0,9.1,16,11,11,2,2,1,7,1,0,1,47,40,5.51,1.929,15.4,1.9,1.9,6.8,3.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5205,Alex Wood*,,2019,CIN,28,NL,1,3,.250,5.80,7,7,0,0,0,0,35.2,41,25,23,11,9,0,30,1,0,0,153,81,6.38,1.402,10.3,2.8,2.3,7.6,3.33
5206,Brandon Woodruff,,2019,MIL,26,NL,11,3,.786,3.62,22,22,0,0,0,0,121.2,109,49,49,12,30,0,143,5,1,1,493,123,3.01,1.142,8.1,0.9,2.2,10.6,4.77
5207,Jordan Yamamoto,,2019,MIA,23,NL,4,5,.444,4.46,15,15,0,0,0,0,78.2,54,42,39,11,36,1,82,5,0,5,325,96,4.51,1.144,6.2,1.3,4.1,9.4,2.28
5208,Jordan Zimmermann,,2019,DET,33,AL,1,13,.071,6.91,23,23,0,0,0,0,112.0,145,89,86,19,25,2,82,6,0,3,504,69,4.79,1.518,11.7,1.5,2.0,6.6,3.28


In [21]:
final_pitcher_stats_df.columns

Index(['Name', 'Hold', 'Year', 'Tm', 'Age', 'Lg', 'W', 'L', 'W-L%', 'ERA', 'G',
       'GS', 'GF', 'CG', 'SHO', 'SV', 'IP', 'H', 'R', 'ER', 'HR', 'BB', 'IBB',
       'SO', 'HBP', 'BK', 'WP', 'BF', 'ERA+', 'FIP', 'WHIP', 'H9', 'HR9',
       'BB9', 'SO9', 'SO/W'],
      dtype='object')

In [22]:
# Change types of columns to numeric for columns with number values
final_pitcher_stats_df[['Age','G','GS','IP','ER','W','L','SV','SO','H','BB','CG','Hold']] = final_pitcher_stats_df[['Age','G','GS','IP','ER','W','L','SV','SO','H','BB','CG','Hold']].apply(pd.to_numeric)

# Drop any players with NaN innings pitched, ERA, and WHIP to remove null values 
final_pitcher_stats_df["Hold"].fillna(0, inplace = True)
final_pitcher_stats_df.dropna(subset=['IP'], axis = 0 , inplace= True)
final_pitcher_stats_df.dropna(subset=['ERA'], axis = 0 , inplace= True)
final_pitcher_stats_df.dropna(subset=['WHIP'], axis = 0 , inplace= True)
final_pitcher_stats_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Remove any pitchers with fewer than 30 innings pitched
final_pitcher_stats_df = final_pitcher_stats_df[final_pitcher_stats_df['IP'] >= 30]

# Select the columns we want for our pitcher analysis
final_pitcher_stats_df = final_pitcher_stats_df[['Year','Name','Age','G','GS','IP','ER','W','L','SV','SO','H','BB','CG','Hold']]

# Eliminate Baseball Reference's name badges for accolades
final_pitcher_stats_df['Name'] = final_pitcher_stats_df['Name'].str.extract('([^\*|#]*)')

pitcher_list = final_pitcher_stats_df.Name.tolist()

cleaned_pitcher_list = []
for pitcher in pitcher_list:
    pitcher = pitcher.replace("\xa0", " ")
    cleaned_pitcher_list.append(pitcher) 
    
final_pitcher_stats_df['Name'] = cleaned_pitcher_list 




In [23]:

final_pitcher_stats_df
test_df = final_pitcher_stats_df.loc[final_pitcher_stats_df['Name'] == 'Blake Snell']
test_df


Unnamed: 0,Year,Name,Age,G,GS,IP,ER,W,L,SV,SO,H,BB,CG,Hold
1639,2022,Blake Snell,29.0,24,24,128.0,48,8,10,0,171,103,51,0,0.0
4493,2023,Blake Snell,30.0,32,32,180.0,45,14,9,0,234,115,99,0,0.0
4877,2021,Blake Snell,28.0,27,27,128.2,60,7,6,0,170,101,69,0,0.0
5028,2020,Blake Snell,27.0,11,11,50.0,18,4,2,0,63,42,18,0,0.0
5187,2019,Blake Snell,26.0,23,23,107.0,51,6,8,0,147,96,40,0,0.0


In [24]:
# Sort data by name alphabetically, then by year in descending order
final_pitcher_stats_df = final_pitcher_stats_df.sort_values(['Year','Name'], ascending=[True, True])



final_pitcher_stats_df['FPTS'] = ''
final_pitcher_stats_df['AVG_FPTS'] = ''
final_pitcher_stats_df['Pos'] = ''
pos_list = []
final_pos_list = []

for index, row in final_pitcher_stats_df.iterrows():
    final_pitcher_stats_df['FPTS'] = ((3*final_pitcher_stats_df['IP']) - final_pitcher_stats_df['H'] - (2*final_pitcher_stats_df['ER']) - final_pitcher_stats_df['BB'] + (2*final_pitcher_stats_df['W']) - (2*final_pitcher_stats_df['L']) + (5*final_pitcher_stats_df['SV']) + (2*final_pitcher_stats_df['SO']) + (3*final_pitcher_stats_df['CG']) + (2*final_pitcher_stats_df['Hold']))
    final_pitcher_stats_df['AVG_FPTS'] = (final_pitcher_stats_df['FPTS']/final_pitcher_stats_df['G'])
#     final_pitcher_stats_df['Pos'] = (final_pitcher_stats_df['GS']/final_pitcher_stats_df['G'])
    
# #    print(final_pitcher_stats_df['GS'] / final_pitcher_stats_df['G'])
    pitcher_ratio = (final_pitcher_stats_df['GS'] / final_pitcher_stats_df['G'])[index]
    pos_list.append(pitcher_ratio)
# #    final_pitcher_stats_df['Pos'] = pitcher_ratio

for i in range(len(pos_list)):
    if pos_list[i] > (2/3):
        final_pos_list.append('SP')
    elif pos_list[i] < (1/3):
        final_pos_list.append('RP')
    else:
        final_pos_list.append('SP,RP')
        
final_pitcher_stats_df['Pos'] = final_pos_list
        
final_pitcher_stats_df = final_pitcher_stats_df.sort_values(['FPTS'], ascending=False)
final_pitcher_stats_df.head(50)


Unnamed: 0,Year,Name,Age,G,GS,IP,ER,W,L,SV,SO,H,BB,CG,Hold,FPTS,AVG_FPTS,Pos
5082,2019,Gerrit Cole,28.0,33,33,212.1,59,20,5,0,326,142,48,0,0.0,1010.3,30.615152,SP
5198,2019,Justin Verlander,36.0,34,34,223.0,64,21,6,0,300,137,42,2,0.0,998.0,29.352941,SP
5087,2019,Jacob deGrom,31.0,32,32,204.0,55,11,8,0,255,154,44,0,0.0,820.0,25.625,SP
4893,2021,Zack Wheeler,31.0,32,32,213.1,66,14,10,0,247,169,46,3,0.0,803.3,25.103125,SP
3552,2019,Shane Bieber,24.0,34,33,214.1,78,15,8,0,259,186,40,3,0.0,801.3,23.567647,SP
4496,2023,Spencer Strider,24.0,32,32,186.2,80,20,5,0,281,146,58,0,0.0,786.6,24.58125,SP
4532,2022,Sandy Alcántara,26.0,32,32,228.2,58,14,9,0,207,174,50,6,0.0,786.6,24.58125,SP
5189,2019,Stephen Strasburg,30.0,33,33,209.0,77,18,6,0,251,161,56,0,0.0,782.0,23.69697,SP
4871,2021,Max Scherzer,36.0,30,30,179.1,49,15,4,0,236,119,36,1,0.0,781.3,26.043333,SP
4548,2022,Corbin Burnes,27.0,33,33,202.0,66,12,8,0,243,144,51,0,0.0,773.0,23.424242,SP


In [25]:
# Create a new dataframe for stats percentile calculations
pitcher_percentile_df = pd.DataFrame(columns = ['Year','Name','Age','G','GS','IP','ER','W','L','SV','SO','H','BB','CG','Hold','Pos'])

# Carry over columnns from final_pitcher_stats_df that shouldn't be comparatively ranked 
pitcher_percentile_df['Year'] = final_pitcher_stats_df['Year']
pitcher_percentile_df['Name'] = final_pitcher_stats_df['Name']
pitcher_percentile_df['Age'] = final_pitcher_stats_df['Age']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
for year in last_five_years:
    year_df = final_pitcher_stats_df.loc[final_pitcher_stats_df['Year'] == year]
    year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
    year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)
    year_df.sort_values('Name', ascending=True)    
    
    # Each of the seasons are added back to the percentile dataframe
    pitcher_percentile_df = pitcher_percentile_df.append(year_df, ignore_index=True)

pitcher_percentile_df = pitcher_percentile_df.sort_values(['Year','Name'], ascending=[True, True])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)


In [26]:
# Clean up the percentile dataframe, drop NaNs and remove unnecessary columns
pitcher_percentile_df = pitcher_percentile_df.dropna()

# Add a rank column that adds the percentiles from each category
pitcher_percentile_df['Rank'] = (pitcher_percentile_df['FPTS_Percentile'] + pitcher_percentile_df['AVG_FPTS_Percentile'])

pitcher_percentile_df


Unnamed: 0,Year,Name,Age,G,GS,IP,ER,W,L,SV,SO,H,BB,CG,Hold,Pos,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Rank
4355,2019,Aaron Brooks,29.0,29,18,110.0,69,6,8,0,82,118,34,0,0.0,"SP,RP",200.0,6.896552,0.508588,0.601145,1.109733
4522,2019,Aaron Brooks,29.0,15,6,50.1,28,2,3,0,43,49,14,0,0.0,"SP,RP",115.3,7.686667,0.190840,0.633588,0.824427
4567,2019,Aaron Brooks,29.0,14,12,59.2,41,4,5,0,39,69,20,0,0.0,SP,82.6,5.900000,0.104962,0.549618,0.654580
4258,2019,Aaron Bummer,25.0,58,0,67.2,16,0,0,1,60,43,24,0,27.0,RP,281.6,4.855172,0.694656,0.463740,1.158397
4411,2019,Aaron Civale,24.0,10,10,57.2,15,3,4,0,46,44,16,0,0.0,SP,171.6,17.160000,0.402672,0.906489,1.309160
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2517,2023,Zack Greinke,39.0,30,27,142.1,80,2,15,0,97,158,23,0,0.0,SP,253.3,8.443333,0.628159,0.651625,1.279783
2561,2023,Zack Littell,27.0,28,14,90.0,41,3,6,0,74,94,12,0,0.0,"SP,RP",224.0,8.000000,0.549639,0.638989,1.188628
2562,2023,Zack Littell,27.0,26,14,87.0,38,3,6,0,72,91,9,0,0.0,"SP,RP",223.0,8.576923,0.546931,0.657040,1.203971
2624,2023,Zack Thompson,25.0,25,9,66.1,33,5,7,0,72,69,25,0,1.0,"SP,RP",180.3,7.212000,0.435018,0.590253,1.025271


In [27]:


# test_df = raw_pitcher_stats_df.loc[raw_pitcher_stats_df['Name'] == 'Blake\xa0Snell']
# test_df

In [28]:
# Create a list of each unique player we have in our dataframe
pitcher_list = pitcher_percentile_df.Name.unique().tolist()

# Create a new dataframe for combined, averaged percentiles over the past 5 seasons
new_pitcher_df = pd.DataFrame(columns = ['Name', 'Rank', 'Trend', 'Years', 'FPTS', 'AVG_FPTS', 'FPTS_Percentile', 'AVG_FPTS_Percentile','Pos','G','GS'])

# Create a list for each percentile stat category for upcoming loop
pitcher_trends = []
average_FPTS = []
average_AVG_FPTS = []
average_FPTS_Percentile = []
average_AVG_FPTS_Percentile = []
pitcher_average_Rank = []
pitcher_year_count = []
pos = []
games = []
games_started = []

cleaned_pitcher_list = []
for pitcher in pitcher_list:
    pitcher = pitcher.replace("\xa0", " ")
    cleaned_pitcher_list.append(pitcher) 

# new_pitcher_df
    
# Loop through each player, check if they played in the past two seasons. If not, remove them
for pitcher in cleaned_pitcher_list:
    filter_df = pitcher_percentile_df.loc[pitcher_percentile_df['Name'] == pitcher]
    filter_df = filter_df.sort_values(['Year'], ascending=[False])
    year_list = filter_df.Year.tolist()
    if (year_list[0] != last_year) and (year_list[0] != (last_year - 1)):
        pitcher_list.remove(pitcher)

# Update new dataframe with updated unique player list
new_pitcher_df['Name'] = pitcher_list        

# Loop through each player, locate their percentile stats for each season, average them out
for pitcher in pitcher_list:
    pitcher_df = pitcher_percentile_df.loc[pitcher_percentile_df['Name'] == pitcher]
    
    # We want to find the slope of the line of best fit for each player's overall ranking each season
    x = np.array(pitcher_df['Year'], dtype = float)
    y = np.array(pitcher_df['Rank'], dtype = float)
    slope, intercept = np.polyfit(x, y, 1)
    pitcher_trends.append(slope)
    
    # Find average of each player's percentiles from previous 5 seasons
    average_FPTS.append(sum(pitcher_df['FPTS']) / len(pitcher_df['FPTS']))
    average_AVG_FPTS.append(sum(pitcher_df['AVG_FPTS']) / len(pitcher_df['AVG_FPTS']))
    average_FPTS_Percentile.append(sum(pitcher_df['FPTS_Percentile']) / len(pitcher_df['FPTS_Percentile']))
    average_AVG_FPTS_Percentile.append(sum(pitcher_df['AVG_FPTS_Percentile']) / len(pitcher_df['AVG_FPTS_Percentile']))
    pitcher_average_Rank.append(sum(pitcher_df['Rank']) / len(pitcher_df['Rank']))
    pitcher_year_count.append(len(x))
    pos.append(pitcher_df['Pos'].iloc[-1])
    games.append(pitcher_df['G'].iloc[-1])
    games_started.append(pitcher_df['GS'].iloc[-1])

# Update new dataframe with the list data from each stat
new_pitcher_df['Trend'] = pitcher_trends
new_pitcher_df['Pos'] = pos
new_pitcher_df['G'] = games
new_pitcher_df['GS'] = games_started
new_pitcher_df['FPTS'] = average_FPTS
new_pitcher_df['AVG_FPTS'] = average_AVG_FPTS
new_pitcher_df['FPTS_Percentile'] = average_FPTS_Percentile
new_pitcher_df['AVG_FPTS_Percentile'] = average_AVG_FPTS_Percentile
new_pitcher_df['Rank'] = pitcher_average_Rank

# Keep track of how many seasons are being considered, so we know how reliable the data is
new_pitcher_df['Years'] = pitcher_year_count



  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [29]:
# Create a weighted rank column by adding trend data to the rank data and account for number of seasons played
# Basically, if you played all 5 seasons, your trend stat is added directly
# If you played fewer than all 5 seasons, your trend stat is reduced depending on how few seasons you played
# new_pitcher_df['Weighted Rank'] = (new_pitcher_df['Rank'] + ((new_pitcher_df['Trend'] * (new_pitcher_df['Years'] - 1) / 4)))

# # shift column 'Weighted Rank' to first position
# first_pitcher_column = new_pitcher_df.pop('Weighted Rank')
  
# # insert column using insert(position,column_name,first_column) function
# new_pitcher_df.insert(1, 'Weighted Rank', first_pitcher_column)

new_pitcher_df = new_pitcher_df.sort_values('FPTS', ascending = False)
new_pitcher_df.head(25)


Unnamed: 0,Name,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos,G,GS
81,Gerrit Cole,1.974887,0.00401,5,716.24,25.381091,0.989682,0.985205,SP,33,33
502,Spencer Strider,1.949479,0.097432,2,695.6,22.042238,0.98187,0.967609,SP,32,32
585,Kodai Senga,1.954874,0.000483,1,599.3,20.665517,0.976534,0.978339,SP,29,29
2,Aaron Nola,1.914393,0.003927,5,568.56,20.47125,0.967907,0.946486,SP,32,32
365,Shohei Ohtani,1.937188,0.017889,3,562.433333,22.657557,0.952475,0.984713,SP,23,23
243,Zack Wheeler,1.861066,0.048409,5,550.72,20.417187,0.932936,0.928129,SP,32,32
128,Justin Verlander,1.777411,-0.090201,5,535.32,21.78772,0.821877,0.955534,SP,11,11
200,Shane Bieber,1.882688,-0.066086,5,516.24,23.537608,0.91858,0.964108,SP,21,21
35,Charlie Morton,1.681653,0.079569,5,512.9,17.755587,0.834116,0.847537,SP,30,30
235,Yu Darvish,1.890662,-0.042978,5,509.76,20.64436,0.945639,0.945023,SP,24,24


In [30]:
draft_df = pd.concat([new_df, new_pitcher_df], ignore_index=True, sort=False)

# draft_df.drop(columns = ['Rank','FPTS_Percentile','AVG_FPTS_Percentile'])


# draft_df['FPTS_Percentile'] = draft_df['FPTS'].rank(pct=True)
# draft_df['AVG_FPTS_Percentile'] = draft_df['AVG_FPTS'].rank(pct=True)
    
# draft_df['Rank'] = (draft_df['FPTS_Percentile'] + (draft_df['AVG_FPTS_Percentile'] / 2))

draft_df['Weighted_Rank'] = ''
draft_df['Weighted_Rank'] = abs(draft_df['FPTS'] * (1 + (draft_df['Trend'] * (draft_df['Years'] - 1) / 4)))
# shift column 'Weighted Rank' to first position
# weighted_rank_column = draft_df.pop('Weighted Rank')
  
# # insert column using insert(position,column_name,first_column) function
# draft_df.insert(1, 'Weighted Rank', weighted_rank_column)


# draft_df['Weighted_Rank'] = ''
# draft_df['Weighted_Rank'] = abs(draft_df['FPTS'] * (1 + draft_df['Trend']))

weighted_rank_column = draft_df.pop('Weighted_Rank')
  
# insert column using insert(position,column_name,first_column) function
draft_df.insert(1, 'Weighted_Rank', weighted_rank_column)


draft_df = draft_df.sort_values(by='Weighted_Rank', ascending=False)
draft_df["G"].fillna("N/A", inplace = True)
draft_df["GS"].fillna("N/A", inplace = True)

draft_df.head(50)

Unnamed: 0,Name,Weighted_Rank,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos,G,GS
0,Freddie Freeman,1895.377354,1.999566,-0.000434,5,1896.2,13.702344,0.999566,1.0,1B,,
6,Nathaniel Lowe,1798.165882,1.620838,0.356275,4,1419.0,9.905221,0.785051,0.835786,1B,,
1,Matt Olson,1711.448073,1.963284,0.017629,5,1681.8,12.327258,0.988394,0.97489,1B,,
2,Paul Goldschmidt,1660.035414,1.965587,0.005351,5,1651.2,11.9135,0.990874,0.974713,"1B,DH",,
3,Pete Alonso,1578.469793,1.918285,0.024847,5,1540.2,10.900115,0.975016,0.943269,1B,,
15,Vladimir Guerrero Jr.,1564.941015,1.73573,0.19025,5,1314.8,9.624007,0.897282,0.838448,"1B,DH",,
5,J.T. Realmuto,1478.685954,1.948702,0.013493,5,1459.0,11.991166,0.972636,0.976065,C,,
4,José Abreu,1462.873519,1.936185,-0.007414,5,1473.8,11.317376,0.980749,0.955436,"1B,DH",,
42,Ty France,1453.9677,1.339648,0.445296,5,1006.0,7.838502,0.677217,0.66243,1B,,
8,Spencer Torkelson,1453.931172,1.866045,0.18548,2,1389.5,10.163608,0.928899,0.937146,1B,,


In [31]:
######################################################################################################################
######################################################################################################################
######################################################################################################################
#### DRAFT DAY FUNCTIONS
   
# DROP A PLAYER 
def drafted(player):
    global draft_df
    global final_pitcher_stats_df
    global final_position_players_df
    draft_df = draft_df[draft_df.Name != player]
    final_pitcher_stats_df = final_pitcher_stats_df[final_pitcher_stats_df.Name != player]
    final_position_players_df = final_position_players_df[final_position_players_df.Name != player]
    return draft_df.head(25)
    
# FILTER PLAYERS BY POSITION
def position_filter(POS):
    filtered_draft_df = draft_df[draft_df['Pos'].str.contains(POS)]
    return filtered_draft_df.head(25)

# PULL PITCHING STAT CATEGORY LEADERS
def pitching_stat_leaders(CAT):
    global final_pitcher_stats_df
    pitching_filtered_draft_df = final_pitcher_stats_df.sort_values([CAT], ascending=[False])
    return pitching_filtered_draft_df.head(25)

# PULL BATTING STAT CATEGORY LEADERS
def batting_stat_leaders(CAT):
    global final_position_players_df
    batting_filtered_draft_df = final_position_players_df.sort_values([CAT], ascending=[False])
    return batting_filtered_draft_df.head(25)

def drop_all_position(POS):
    global draft_df
    draft_df = draft_df[draft_df.Pos != POS]
    return draft_df.head(25)

In [32]:
# drop_all_position('')
# drafted('')


In [33]:
# Filter the DataFrame by a string value in the "City" column
filtered_df = draft_df[draft_df["Pos"].str.contains("1B")]
filtered_df.head(60)

Unnamed: 0,Name,Weighted_Rank,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos,G,GS
0,Freddie Freeman,1895.377354,1.999566,-0.000434,5,1896.2,13.702344,0.999566,1.0,1B,,
6,Nathaniel Lowe,1798.165882,1.620838,0.356275,4,1419.0,9.905221,0.785051,0.835786,1B,,
1,Matt Olson,1711.448073,1.963284,0.017629,5,1681.8,12.327258,0.988394,0.97489,1B,,
2,Paul Goldschmidt,1660.035414,1.965587,0.005351,5,1651.2,11.9135,0.990874,0.974713,"1B,DH",,
3,Pete Alonso,1578.469793,1.918285,0.024847,5,1540.2,10.900115,0.975016,0.943269,1B,,
15,Vladimir Guerrero Jr.,1564.941015,1.73573,0.19025,5,1314.8,9.624007,0.897282,0.838448,"1B,DH",,
4,José Abreu,1462.873519,1.936185,-0.007414,5,1473.8,11.317376,0.980749,0.955436,"1B,DH",,
42,Ty France,1453.9677,1.339648,0.445296,5,1006.0,7.838502,0.677217,0.66243,1B,,
8,Spencer Torkelson,1453.931172,1.866045,0.18548,2,1389.5,10.163608,0.928899,0.937146,1B,,
7,Christian Walker,1431.841242,1.883481,0.030102,5,1390.0,10.539662,0.952357,0.931124,1B,,


In [34]:
copy_test_df = filtered_df

copy_test_df['Test_FPTS_Percentile'] = copy_test_df['FPTS'].rank(pct=True)
copy_test_df['Test_AVG_FPTS_Percentile'] = copy_test_df['AVG_FPTS'].rank(pct=True)

copy_test_df['Test_Rank'] = (copy_test_df['Test_FPTS_Percentile'] + copy_test_df['Test_AVG_FPTS_Percentile'])


copy_test_df.head(60)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  copy_test_df['Test_FPTS_Percentile'] = copy_test_df['FPTS'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  copy_test_df['Test_AVG_FPTS_Percentile'] = copy_test_df['AVG_FPTS'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  copy_test_df['Test_Rank'] = (copy_test_df['Test

Unnamed: 0,Name,Weighted_Rank,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos,G,GS,Test_FPTS_Percentile,Test_AVG_FPTS_Percentile,Test_Rank
0,Freddie Freeman,1895.377354,1.999566,-0.000434,5,1896.2,13.702344,0.999566,1.0,1B,,,1.0,1.0,2.0
6,Nathaniel Lowe,1798.165882,1.620838,0.356275,4,1419.0,9.905221,0.785051,0.835786,1B,,,0.960938,0.882812,1.84375
1,Matt Olson,1711.448073,1.963284,0.017629,5,1681.8,12.327258,0.988394,0.97489,1B,,,0.992188,0.992188,1.984375
2,Paul Goldschmidt,1660.035414,1.965587,0.005351,5,1651.2,11.9135,0.990874,0.974713,"1B,DH",,,0.984375,0.984375,1.96875
3,Pete Alonso,1578.469793,1.918285,0.024847,5,1540.2,10.900115,0.975016,0.943269,1B,,,0.976562,0.945312,1.921875
15,Vladimir Guerrero Jr.,1564.941015,1.73573,0.19025,5,1314.8,9.624007,0.897282,0.838448,"1B,DH",,,0.898438,0.859375,1.757812
4,José Abreu,1462.873519,1.936185,-0.007414,5,1473.8,11.317376,0.980749,0.955436,"1B,DH",,,0.96875,0.960938,1.929688
42,Ty France,1453.9677,1.339648,0.445296,5,1006.0,7.838502,0.677217,0.66243,1B,,,0.839844,0.71875,1.558594
8,Spencer Torkelson,1453.931172,1.866045,0.18548,2,1389.5,10.163608,0.928899,0.937146,1B,,,0.945312,0.90625,1.851562
7,Christian Walker,1431.841242,1.883481,0.030102,5,1390.0,10.539662,0.952357,0.931124,1B,,,0.953125,0.921875,1.875


In [35]:
copy_test_df.sort_values('Test_Rank', ascending = False).head(60)

Unnamed: 0,Name,Weighted_Rank,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos,G,GS,Test_FPTS_Percentile,Test_AVG_FPTS_Percentile,Test_Rank
0,Freddie Freeman,1895.377354,1.999566,-0.000434,5,1896.2,13.702344,0.999566,1.0,1B,,,1.0,1.0,2.0
1,Matt Olson,1711.448073,1.963284,0.017629,5,1681.8,12.327258,0.988394,0.97489,1B,,,0.992188,0.992188,1.984375
2,Paul Goldschmidt,1660.035414,1.965587,0.005351,5,1651.2,11.9135,0.990874,0.974713,"1B,DH",,,0.984375,0.984375,1.96875
4,José Abreu,1462.873519,1.936185,-0.007414,5,1473.8,11.317376,0.980749,0.955436,"1B,DH",,,0.96875,0.960938,1.929688
3,Pete Alonso,1578.469793,1.918285,0.024847,5,1540.2,10.900115,0.975016,0.943269,1B,,,0.976562,0.945312,1.921875
12,Anthony Rizzo,1290.559134,1.919524,-0.046714,5,1353.8,11.634595,0.954608,0.964916,1B,,,0.921875,0.96875,1.890625
13,Rhys Hoskins,1335.469362,1.932309,-0.000779,4,1336.25,11.739209,0.959712,0.972597,1B,,,0.914062,0.976562,1.890625
10,C.J. Cron,1295.784053,1.842104,-0.067146,4,1364.5,11.041019,0.892536,0.949568,1B,,,0.929688,0.953125,1.882812
7,Christian Walker,1431.841242,1.883481,0.030102,5,1390.0,10.539662,0.952357,0.931124,1B,,,0.953125,0.921875,1.875
8,Spencer Torkelson,1453.931172,1.866045,0.18548,2,1389.5,10.163608,0.928899,0.937146,1B,,,0.945312,0.90625,1.851562


In [36]:
new_df.loc[new_df['Name'] == 'Vladimir Guerrero Jr.']

Unnamed: 0,Name,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos
299,Vladimir Guerrero Jr.,1.73573,0.19025,5,1314.8,9.624007,0.897282,0.838448,"1B,DH"


In [37]:
new_df.sort_values('Name').head(60)

Unnamed: 0,Name,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Pos
0,AJ Pollock,0.782843,-0.131911,5,433.4,4.616637,0.426793,0.356051,"DH,OF"
1,Aaron Hicks,0.568651,-0.04909,5,310.8,4.386489,0.293013,0.275638,"OF,OF,OF"
2,Aaron Judge,1.213238,0.159652,5,707.6,6.363672,0.598025,0.615213,"OF,DH,OF"
351,Abraham Toro,0.949883,-0.522648,2,527.0,5.245872,0.493078,0.456805,"2B,3B,DH"
3,Adam Duvall,0.742704,0.079865,5,403.0,4.64514,0.397763,0.344941,"OF,OF"
4,Adam Engel,0.413094,0.034001,3,254.333333,3.68879,0.182611,0.230483,"OF,OF"
5,Adam Frazier,1.399866,-0.032381,5,849.0,6.355607,0.78658,0.613286,2B
6,Adam Jones,0.838496,0.000208,1,640.0,4.671533,0.526549,0.311947,OF
7,Adeiny Hechavarría,0.610619,0.000151,1,396.0,4.714286,0.292035,0.318584,"2B,SS"
443,Adley Rutschman,1.874594,0.090291,2,1362.5,10.150299,0.940955,0.93364,"C,DH"


In [38]:
# Create a new dataframe for stats percentile calculations
new_percentile_df = pd.DataFrame(columns = ['Name','Pos'])

# Carry over columnns from final_batter_stats_df that shouldn't be comparatively ranked 
# new_percentile_df['Year'] = new_df['Year']
new_percentile_df['Name'] = new_df['Name']
# new_percentile_df['Age'] = new_df['Age']
# new_percentile_df['Tm'] = new_df['Tm']
new_percentile_df['Pos'] = new_df['Pos']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
pos_list = ['C','1B','2B','3B','SS','OF']

for pos in pos_list:
    pos_filtered_df = new_df[new_df['Pos'].str.contains(pos)]

    pos_filtered_df[f'{pos}_FPTS_Percentile'] = pos_filtered_df['FPTS'].rank(pct=True)
    pos_filtered_df[f'{pos}_AVG_FPTS_Percentile'] = pos_filtered_df['AVG_FPTS'].rank(pct=True)

    # Each of the seasons are added back to the percentile dataframe
    new_percentile_df = new_percentile_df.append(pos_filtered_df, ignore_index=True)
        
#     new_percentile_df['New_Pos'] = pos

        
new_percentile_df = new_percentile_df[new_percentile_df['Rank'].notna()]

new_percentile_df = new_percentile_df.sort_values('Name', ascending=True)
# new_percentile_df = new_percentile_df.drop(['Tm','G','R','H','2B','3B','HR','RBI','SB','TB','BB','SO','PO','A','E','PA','OPS','OPS+','1B'],axis=1)

new_percentile_df = new_percentile_df.fillna(0)

new_percentile_df.head(60)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos_filtered_df[f'{pos}_FPTS_Percentile'] = pos_filtered_df['FPTS'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos_filtered_df[f'{pos}_AVG_FPTS_Percentile'] = pos_filtered_df['AVG_FPTS'].rank(pct=True)


Unnamed: 0,Name,Pos,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,C_FPTS_Percentile,C_AVG_FPTS_Percentile,1B_FPTS_Percentile,1B_AVG_FPTS_Percentile,2B_FPTS_Percentile,2B_AVG_FPTS_Percentile,3B_FPTS_Percentile,3B_AVG_FPTS_Percentile,SS_FPTS_Percentile,SS_AVG_FPTS_Percentile,OF_FPTS_Percentile,OF_AVG_FPTS_Percentile
1351,AJ Pollock,"DH,OF",0.782843,-0.131911,5.0,433.4,4.616637,0.426793,0.356051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.541219,0.602151
1415,Aaron Hicks,"OF,OF,OF",0.568651,-0.04909,5.0,310.8,4.386489,0.293013,0.275638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.311828,0.523297
1246,Aaron Judge,"OF,DH,OF",1.213238,0.159652,5.0,707.6,6.363672,0.598025,0.615213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.917563,0.9319
922,Abraham Toro,"2B,3B,DH",0.949883,-0.522648,2.0,527.0,5.245872,0.493078,0.456805,0.0,0.0,0.0,0.0,0.569343,0.481752,0.0,0.0,0.0,0.0,0.0,0.0
1050,Abraham Toro,"2B,3B,DH",0.949883,-0.522648,2.0,527.0,5.245872,0.493078,0.456805,0.0,0.0,0.0,0.0,0.0,0.0,0.62406,0.56391,0.0,0.0,0.0,0.0
1368,Adam Duvall,"OF,OF",0.742704,0.079865,5.0,403.0,4.64514,0.397763,0.344941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.480287,0.620072
1437,Adam Engel,"OF,OF",0.413094,0.034001,3.0,254.333333,3.68879,0.182611,0.230483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.232975,0.247312
873,Adam Frazier,2B,1.399866,-0.032381,5.0,849.0,6.355607,0.78658,0.613286,0.0,0.0,0.0,0.0,0.927007,0.759124,0.0,0.0,0.0,0.0,0.0,0.0
1261,Adam Jones,OF,0.838496,0.000208,1.0,640.0,4.671533,0.526549,0.311947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.863799,0.62724
1202,Adeiny Hechavarría,"2B,SS",0.610619,0.000151,1.0,396.0,4.714286,0.292035,0.318584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.233333,0.277778,0.0,0.0


In [43]:
# catcher_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('C')]
# catcher_percentile_df['Ranked_Pos'] = 'C'
# catcher_percentile_df = catcher_percentile_df[['Name','Ranked_Pos','Years','FPTS','AVG_FPTS','C_FPTS_Percentile','C_AVG_FPTS_Percentile']]
# catcher_percentile_df['Pos_Ranking'] = (catcher_percentile_df['C_FPTS_Percentile'] + catcher_percentile_df['C_AVG_FPTS_Percentile'])/2


# firstbase_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('1B')]
# firstbase_percentile_df['Ranked_Pos'] = '1B'
# firstbase_percentile_df = firstbase_percentile_df[['Name','Ranked_Pos','Years','FPTS','AVG_FPTS','1B_FPTS_Percentile','1B_AVG_FPTS_Percentile']]
# firstbase_percentile_df['Pos_Ranking'] = (firstbase_percentile_df['1B_AVG_FPTS_Percentile'] + firstbase_percentile_df['1B_AVG_FPTS_Percentile'])/2


# secondbase_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('2B')]
# secondbase_percentile_df['Ranked_Pos'] = '2B'
# secondbase_percentile_df = secondbase_percentile_df[['Name','Ranked_Pos','Years','FPTS','AVG_FPTS','2B_FPTS_Percentile','2B_AVG_FPTS_Percentile']]
# secondbase_percentile_df['Pos_Ranking'] = (secondbase_percentile_df['2B_FPTS_Percentile'] + secondbase_percentile_df['2B_AVG_FPTS_Percentile'])/2


# thirdbase_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('3B')]
# thirdbase_percentile_df['Ranked_Pos'] = '3B'
# thirdbase_percentile_df = thirdbase_percentile_df[['Name','Ranked_Pos','Years','FPTS','AVG_FPTS','3B_FPTS_Percentile','3B_AVG_FPTS_Percentile']]
# thirdbase_percentile_df['Pos_Ranking'] = (thirdbase_percentile_df['3B_FPTS_Percentile'] + thirdbase_percentile_df['3B_AVG_FPTS_Percentile'])/2


# SS_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('SS')]
# SS_percentile_df['Ranked_Pos'] = 'SS'
# SS_percentile_df = SS_percentile_df[['Name','Ranked_Pos','Years','FPTS','AVG_FPTS','SS_FPTS_Percentile','SS_AVG_FPTS_Percentile']]
# SS_percentile_df['Pos_Ranking'] = (SS_percentile_df['SS_FPTS_Percentile'] + SS_percentile_df['SS_AVG_FPTS_Percentile'])/2


OF_percentile_df = new_percentile_df[new_percentile_df['Pos'].str.contains('OF')]
OF_percentile_df['Ranked_Pos'] = 'OF'
OF_percentile_df = OF_percentile_df[['Name','Ranked_Pos','Years','FPTS','AVG_FPTS','OF_FPTS_Percentile','OF_AVG_FPTS_Percentile']]
OF_percentile_df['Pos_Ranking'] = (OF_percentile_df['OF_FPTS_Percentile'] + OF_percentile_df['OF_AVG_FPTS_Percentile'])/2



OF_percentile_df.sort_values('Pos_Ranking',ascending = False).head(20)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  OF_percentile_df['Ranked_Pos'] = 'OF'


Unnamed: 0,Name,Ranked_Pos,Years,FPTS,AVG_FPTS,OF_FPTS_Percentile,OF_AVG_FPTS_Percentile,Pos_Ranking
1224,Mookie Betts,OF,5.0,969.2,7.801106,0.996416,1.0,0.998208
1225,Whit Merrifield,OF,5.0,968.6,7.206553,0.992832,0.982079,0.987455
1227,Juan Soto,OF,5.0,914.8,7.095612,0.985663,0.978495,0.982079
1230,Cody Bellinger,OF,5.0,858.2,7.340271,0.97491,0.985663,0.980287
1223,Julio Rodríguez,OF,2.0,988.5,6.887781,1.0,0.960573,0.980287
1229,Tommy Edman,OF,5.0,860.2,6.912534,0.978495,0.967742,0.973118
1228,Kyle Tucker,OF,4.0,886.75,6.901448,0.982079,0.964158,0.973118
1238,Fernando Tatis Jr.,OF,4.0,780.75,7.775563,0.946237,0.996416,0.971326
1233,Ronald Acuna Jr.,OF,5.0,805.0,7.019623,0.964158,0.97491,0.969534
1240,Eric Haase,OF,3.0,743.333333,7.495142,0.939068,0.989247,0.964158
