In [1]:
# Import needed dependencies
import requests
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
today = date.today()

# dd/mm/YY
current_year = today.strftime("%Y")

last_year = int(current_year) - 1

In [3]:
last_five_years = []
for i in range(0,5):
    last_five_years.append(int(current_year) - i)

In [4]:
# Create a list to help create a dataframe from batter statistics data
nba_stats = []

# Create a loop to create a dataframe from Basketball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_totals.html'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has NBA player statistics
    table = soup.select_one('#div_totals_stats') 

    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        nba_stats.append(tds)

In [5]:
# Create dataframe for batter statistics
nba_stats_df = pd.DataFrame(nba_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')
df_headers.append("Year")

# Set column headers equal to our list
nba_stats_df.columns = df_headers

In [6]:
# Change types of columns to numeric for columns with number values
nba_stats_df[['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']] = nba_stats_df[['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']].apply(pd.to_numeric)

# Drop all players with NaN games to remove null values
nba_stats_df.dropna(subset=['G'], axis = 0 , inplace= True)

# Remove any players with fewer than 27 games played (1/3rd of the season)
filtered_nba_stats_df = nba_stats_df[nba_stats_df['G'] >= 27]

# points, rebounds, assists, steals, blocks, threes, field-goal percentage, and free-throw percentage

# Select the columns we want for our NBA player analysis
yahoo_nba_df = filtered_nba_stats_df[['Year','Player','Age','Pos','Tm','G','PTS','FG','FGA','FG%','3P','FT','FTA','FT%','TRB','AST','STL','BLK','TOV']]


In [7]:
# Sort by index to prepare to drop duplicates
yahoo_nba_df = yahoo_nba_df.sort_index()

# Drop duplicate entries of Player Name and Year
# This is to eliminate partial season data for players who played for 2+ teams in one season
yahoo_nba_df = yahoo_nba_df.drop_duplicates(subset=['Year', 'Player'])

In [8]:
yahoo_nba_df['FPTS'] = ''
yahoo_nba_df['AVG_FPTS'] = ''

for index, row in yahoo_nba_df.iterrows():
    yahoo_nba_df['FPTS'] = (yahoo_nba_df['PTS'] + yahoo_nba_df['FG'] - yahoo_nba_df['FGA'] + yahoo_nba_df['FT'] - yahoo_nba_df['FTA'] + yahoo_nba_df['TRB'] + yahoo_nba_df['AST'] + (1.5 * yahoo_nba_df['STL']) + (1.5 * yahoo_nba_df['BLK']) - yahoo_nba_df['TOV'])
    yahoo_nba_df['AVG_FPTS'] = (yahoo_nba_df['FPTS']/yahoo_nba_df['G'])
    
yahoo_nba_df


Unnamed: 0,Year,Player,Age,Pos,Tm,G,PTS,FG,FGA,FG%,...,FT,FTA,FT%,TRB,AST,STL,BLK,TOV,FPTS,AVG_FPTS
0,2022,Precious Achiuwa,22,C,TOR,73,664,265,603,0.439,...,78,131,0.595,473,82,37,41,84,861.0,11.794521
1,2022,Steven Adams,28,C,MEM,76,528,210,384,0.547,...,108,199,0.543,760,256,65,60,115,1351.5,17.782895
2,2022,Bam Adebayo,24,C,MIA,56,1068,406,729,0.557,...,256,340,0.753,564,190,80,44,148,1453.0,25.946429
3,2022,Santi Aldama,21,PF,MEM,32,132,53,132,0.402,...,20,32,0.625,87,21,6,10,16,157.0,4.906250
4,2022,LaMarcus Aldridge,36,C,BRK,47,607,252,458,0.550,...,89,102,0.873,258,42,14,47,44,735.5,15.648936
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3533,2018,Cody Zeller,25,C,CHO,33,233,85,156,0.545,...,61,85,0.718,177,31,14,21,33,365.5,11.075758
3534,2018,Tyler Zeller,28,C,TOT,66,441,187,334,0.560,...,57,79,0.722,305,47,15,35,47,652.0,9.878788
3537,2018,Paul Zipser,23,SF,CHI,54,218,81,234,0.346,...,19,25,0.760,131,46,20,15,43,245.5,4.546296
3538,2018,Ante Žižić,21,C,CLE,32,119,49,67,0.731,...,21,29,0.724,60,5,2,13,11,169.5,5.296875


In [9]:
# Sort data by name alphabetically, then by year in descending order
final_yahoo_nba_df = yahoo_nba_df.sort_values(['Year','Player'], ascending=[True, True])
final_yahoo_nba_df

Unnamed: 0,Year,Player,Age,Pos,Tm,G,PTS,FG,FGA,FG%,...,FT,FTA,FT%,TRB,AST,STL,BLK,TOV,FPTS,AVG_FPTS
2955,2018,Aaron Brooks,33,PG,MIN,32,75,28,69,0.406,...,8,11,0.727,17,20,6,0,11,66.0,2.062500
3092,2018,Aaron Gordon,22,PF,ORL,58,1022,375,865,0.434,...,157,225,0.698,457,136,59,45,107,1106.0,19.068966
3323,2018,Abdel Nader,24,SF,BOS,48,146,50,149,0.336,...,23,39,0.590,71,26,15,10,34,131.5,2.739583
3155,2018,Al Horford,31,C,BOS,72,927,368,753,0.489,...,94,120,0.783,530,339,43,78,132,1434.5,19.923611
3184,2018,Al Jefferson,33,C,IND,36,252,111,208,0.534,...,30,36,0.833,143,30,16,23,21,359.5,9.986111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
766,2022,Yuta Watanabe,27,SF,TOR,38,163,56,138,0.406,...,24,40,0.600,92,21,11,16,19,199.5,5.250000
136,2022,Zach Collins,24,C,SAS,28,218,76,155,0.490,...,52,65,0.800,153,61,13,23,45,349.0,12.464286
442,2022,Zach LaVine,26,SG,CHI,67,1635,565,1186,0.476,...,320,375,0.853,308,303,41,23,171,1495.0,22.313433
553,2022,Zeke Nnaji,21,PF,DEN,41,270,96,186,0.516,...,41,65,0.631,147,17,16,13,23,340.5,8.304878


In [10]:
# Create a new dataframe for stats percentile calculations
percentile_df = pd.DataFrame(columns = ['Year', 'Player', 'Age', 'Pos', 'Tm', 'FPTS_Percentile', 'AVG_FPTS_Percentile'])

# Carry over your non-numeric columnns from final_yahoo_nba_df
percentile_df['Year'] = final_yahoo_nba_df['Year']
percentile_df['Player'] = final_yahoo_nba_df['Player']
percentile_df['Age'] = final_yahoo_nba_df['Age']
percentile_df['Pos'] = final_yahoo_nba_df['Pos']
percentile_df['Tm'] = final_yahoo_nba_df['Tm']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
for year in last_five_years:
    year_df = final_yahoo_nba_df.loc[final_yahoo_nba_df['Year'] == year]
    year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
    year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)
    year_df.sort_values('Player', ascending=True)

    # Each of the seasons are added back to the percentile dataframe
    percentile_df = percentile_df.append(year_df, ignore_index=True)

percentile_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)


Unnamed: 0,Year,Player,Age,Pos,Tm,FPTS_Percentile,AVG_FPTS_Percentile,G,PTS,FG,...,FT,FTA,FT%,TRB,AST,STL,BLK,TOV,FPTS,AVG_FPTS
0,2018,Aaron Brooks,33,PG,MIN,,,,,,...,,,,,,,,,,
1,2018,Aaron Gordon,22,PF,ORL,,,,,,...,,,,,,,,,,
2,2018,Abdel Nader,24,SF,BOS,,,,,,...,,,,,,,,,,
3,2018,Al Horford,31,C,BOS,,,,,,...,,,,,,,,,,
4,2018,Al Jefferson,33,C,IND,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3951,2018,Yogi Ferrell,24,SG,DAL,0.618421,0.492105,82.0,838.0,311.0,...,82.0,103.0,0.796,249.0,201.0,64.0,9.0,78.0,879.5,10.725610
3952,2018,Zach Collins,20,C,POR,0.239474,0.152632,66.0,292.0,115.0,...,27.0,42.0,0.643,221.0,52.0,17.0,31.0,58.0,390.0,5.909091
3953,2018,Zach Randolph,36,PF,SAC,0.639474,0.765789,59.0,857.0,361.0,...,84.0,107.0,0.785,397.0,127.0,42.0,10.0,116.0,918.0,15.559322
3954,2018,Zaza Pachulia,33,C,GSW,0.460526,0.426316,69.0,373.0,149.0,...,75.0,93.0,0.806,321.0,109.0,38.0,17.0,72.0,680.5,9.862319


In [19]:
# Clean up the percentile dataframe, drop NaNs and remove unnecessary columns
percentile_df = percentile_df.dropna()
#percentile_df = percentile_df.drop(['Tm','G','PTS','FG','FGA','FG%','3P','FT','FTA','FT%','TRB','AST','STL','BLK','TOV'], axis = 1)

# Add a rank column that adds the percentiles from each category
# Average out percentile categories for FG% and FT% with PTS, as we might still want a player with somewhat  
## lower FG% and FT% if they score a lot of points
percentile_df['Rank'] = (percentile_df['FPTS_Percentile'] + percentile_df['AVG_FPTS_Percentile'])

percentile_df


Unnamed: 0,Year,Player,Age,Pos,FPTS_Percentile,AVG_FPTS_Percentile,FPTS,AVG_FPTS,Rank
1978,2022,Aaron Gordon,26,PF,0.850962,0.795673,1293.0,17.240000,1.646635
1979,2022,Aaron Holiday,25,PG,0.346154,0.252404,488.5,7.753968,0.598558
1980,2022,Aaron Nesmith,22,SF,0.086538,0.045673,198.5,3.817308,0.132212
1981,2022,Aaron Wiggins,23,SG,0.3125,0.362981,464.0,9.280000,0.675481
1982,2022,Admiral Schofield,24,SF,0.064904,0.084135,174.5,4.592105,0.149038
...,...,...,...,...,...,...,...,...,...
3951,2018,Yogi Ferrell,24,SG,0.618421,0.492105,879.5,10.725610,1.110526
3952,2018,Zach Collins,20,C,0.239474,0.152632,390.0,5.909091,0.392105
3953,2018,Zach Randolph,36,PF,0.639474,0.765789,918.0,15.559322,1.405263
3954,2018,Zaza Pachulia,33,C,0.460526,0.426316,680.5,9.862319,0.886842


In [None]:
# # Create a list of each unique player we have in our dataframe
# player_list = percentile_df.Player.unique().tolist()

# # Create a new dataframe for combined, averaged percentiles over the past 5 seasons
# new_df = pd.DataFrame(columns = ['Player', 'Rank', 'Trend', 'Pos', 'Years', '3P_Percentile', 'FG%_Percentile', 'TRB_Percentile', 'FT%_Percentile', 'AST_Percentile', 'STL_Percentile', 'BLK_Percentile', 'PTS_Percentile'])

# # Update new dataframe with unique player list
# new_df['Player'] = player_list

# # Create a list for each percentile stat category for upcoming loop
# player_trends = []
# average_3P = []
# average_FG = []
# average_TRB = []
# average_FT = []
# average_AST = []
# average_STL = []
# average_BLK = []
# average_PTS = []
# average_Rank = []
# year_count = []
# pos = []

# # Loop through each player, locate their percentile stats for each season, average them out
# for player in player_list:
#     player_df = percentile_df.loc[percentile_df['Player'] == player]
    
#     # We want to find the slope of the line of best fit for each player's overall ranking each season
#     x = np.array(player_df['Year'], dtype = float)
#     y = np.array(player_df['Rank'], dtype = float)
#     slope, intercept = np.polyfit(x, y, 1)
#     player_trends.append(slope)
    
#     # Find average of each player's percentiles from previous 5 seasons
#     average_3P.append(sum(player_df['3P_Percentile']) / len(player_df['3P_Percentile']))
#     average_FG.append(sum(player_df['FG%_Percentile']) / len(player_df['FG%_Percentile']))
#     average_TRB.append(sum(player_df['TRB_Percentile']) / len(player_df['TRB_Percentile']))
#     average_FT.append(sum(player_df['FT%_Percentile']) / len(player_df['FT%_Percentile']))
#     average_AST.append(sum(player_df['AST_Percentile']) / len(player_df['AST_Percentile']))
#     average_STL.append(sum(player_df['STL_Percentile']) / len(player_df['STL_Percentile']))
#     average_BLK.append(sum(player_df['BLK_Percentile']) / len(player_df['BLK_Percentile']))
#     average_PTS.append(sum(player_df['PTS_Percentile']) / len(player_df['PTS_Percentile']))
#     average_Rank.append(sum(player_df['Rank']) / len(player_df['Rank']))
#     year_count.append(len(x))
    
#     # Keep player positions for reference purposes during the draft
#     pos.append(player_df['Pos'].unique())

# # Update new dataframe with the list data from each stat
# new_df['Pos'] = pos
# new_df['Trend'] = player_trends
# new_df['3P_Percentile'] = average_3P
# new_df['FG%_Percentile'] = average_FG
# new_df['TRB_Percentile'] = average_TRB
# new_df['FT%_Percentile'] = average_FT
# new_df['AST_Percentile'] = average_AST
# new_df['STL_Percentile'] = average_STL
# new_df['BLK_Percentile'] = average_BLK
# new_df['PTS_Percentile'] = average_PTS
# new_df['Rank'] = average_Rank

# # Keep track of how many seasons are being considered, so we know how reliable the data is
# new_df['Years'] = year_count



In [None]:
# # Create a weighted rank column by adding trend data to the rank data and account for number of seasons played
# # Basically, if you played all 5 seasons, your trend stat is added directly
# # If you played fewer than all 5 seasons, your trend stat is reduced depending on how few seasons you played
# new_df['Weighted Rank'] = (new_df['Rank'] + ((new_df['Trend'] * (new_df['Years'] - 1) / 4)))

# # shift column 'Weighted Rank' to first position
# first_column = new_df.pop('Weighted Rank')
  
# # insert column using insert(position,column_name,first_column) function
# new_df.insert(1, 'Weighted Rank', first_column)

# new_df = new_df.sort_values('Weighted Rank', ascending = False)
# new_df.head(50)

In [None]:
# new_df.to_csv("/Users/michaelbinger/Documents/Projects/Fantasy-Baseball-Analysis/Fantasy_Basketball/nba_trends.csv")
# final_yahoo_nba_df.to_csv("/Users/michaelbinger/Documents/Projects/Fantasy-Baseball-Analysis/Fantasy_Basketball/nba_stats.csv")
