In [1]:
# Import needed dependencies
import requests
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
today = date.today()

# dd/mm/YY
current_year = today.strftime("%Y")

last_year = int(current_year) - 1

In [3]:
last_five_years = []
for i in range(0,5):
    last_five_years.append(int(current_year) - i)

In [4]:
# Create a list to help create a dataframe from batter statistics data
nba_stats = []

# Create a loop to create a dataframe from Basketball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_totals.html'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has NBA player statistics
    table = soup.select_one('#div_totals_stats') 

    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        nba_stats.append(tds)

In [5]:
# Create dataframe for batter statistics
nba_stats_df = pd.DataFrame(nba_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')
df_headers.append("Year")

# Set column headers equal to our list
nba_stats_df.columns = df_headers

In [6]:
# Change types of columns to numeric for columns with number values
nba_stats_df[['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']] = nba_stats_df[['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']].apply(pd.to_numeric)

# Drop all players with NaN games to remove null values
nba_stats_df.dropna(subset=['G'], axis = 0 , inplace= True)

# Remove any players with fewer than 27 games played (1/3rd of the season)
filtered_nba_stats_df = nba_stats_df[nba_stats_df['G'] >= 27]

# points, rebounds, assists, steals, blocks, threes, field-goal percentage, and free-throw percentage

# Select the columns we want for our NBA player analysis
yahoo_nba_df = filtered_nba_stats_df[['Year','Player','Age','Pos','Tm','G','PTS','FG','FGA','FG%','3P','FT','FTA','FT%','TRB','AST','STL','BLK','TOV']]


In [7]:
# Sort by index to prepare to drop duplicates
yahoo_nba_df = yahoo_nba_df.sort_index()

# Drop duplicate entries of Player Name and Year
# This is to eliminate partial season data for players who played for 2+ teams in one season
yahoo_nba_df = yahoo_nba_df.drop_duplicates(subset=['Year', 'Player'])

In [8]:
yahoo_nba_df['FPTS'] = ''
yahoo_nba_df['AVG_FPTS'] = ''

for index, row in yahoo_nba_df.iterrows():
    yahoo_nba_df['FPTS'] = (yahoo_nba_df['PTS'] + yahoo_nba_df['FG'] - yahoo_nba_df['FGA'] + yahoo_nba_df['FT'] - yahoo_nba_df['FTA'] + yahoo_nba_df['TRB'] + yahoo_nba_df['AST'] + (1.5 * yahoo_nba_df['STL']) + (1.5 * yahoo_nba_df['BLK']) - yahoo_nba_df['TOV'])
    yahoo_nba_df['AVG_FPTS'] = (yahoo_nba_df['FPTS']/yahoo_nba_df['G'])


In [9]:
# Sort data by name alphabetically, then by year in descending order
final_yahoo_nba_df = yahoo_nba_df.sort_values(['Year','Player'], ascending=[True, True])


In [10]:
# Create a new dataframe for stats percentile calculations
percentile_df = pd.DataFrame(columns = ['Year', 'Player', 'Age', 'Pos', 'Tm', 'FPTS_Percentile', 'AVG_FPTS_Percentile'])

# Carry over your non-numeric columnns from final_yahoo_nba_df
percentile_df['Year'] = final_yahoo_nba_df['Year']
percentile_df['Player'] = final_yahoo_nba_df['Player']
percentile_df['Age'] = final_yahoo_nba_df['Age']
percentile_df['Pos'] = final_yahoo_nba_df['Pos']
percentile_df['Tm'] = final_yahoo_nba_df['Tm']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
for year in last_five_years:
    year_df = final_yahoo_nba_df.loc[final_yahoo_nba_df['Year'] == year]
    year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
    year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)
    year_df.sort_values('Player', ascending=True)

    # Each of the seasons are added back to the percentile dataframe
    percentile_df = percentile_df.append(year_df, ignore_index=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['FPTS_Percentile'] = year_df['FPTS'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['AVG_FPTS_Percentile'] = year_df['AVG_FPTS'].rank(pct=True)


In [11]:
# Clean up the percentile dataframe, drop NaNs and remove unnecessary columns
percentile_df = percentile_df.dropna()
#percentile_df = percentile_df.drop(['Tm','G','PTS','FG','FGA','FG%','3P','FT','FTA','FT%','TRB','AST','STL','BLK','TOV'], axis = 1)

# Add a rank column that adds the percentiles from each category
# Average out percentile categories for FG% and FT% with PTS, as we might still want a player with somewhat  
## lower FG% and FT% if they score a lot of points
percentile_df['Rank'] = (((2*percentile_df['FPTS_Percentile']) + percentile_df['AVG_FPTS_Percentile'])/3)

percentile_df


Unnamed: 0,Year,Player,Age,Pos,Tm,FPTS_Percentile,AVG_FPTS_Percentile,G,PTS,FG,...,FTA,FT%,TRB,AST,STL,BLK,TOV,FPTS,AVG_FPTS,Rank
2015,2023,A.J. Green,23,SG,MIL,0.055156,0.06235,35.0,154.0,53.0,...,4.0,1.000,45.0,22.0,6.0,0.0,9.0,149.0,4.257143,0.057554
2016,2023,AJ Griffin,19,SF,ATL,0.443645,0.347722,72.0,639.0,248.0,...,47.0,0.894,153.0,73.0,42.0,12.0,42.0,614.0,8.527778,0.411671
2017,2023,Aaron Gordon,27,PF,DEN,0.844125,0.851319,68.0,1109.0,429.0,...,314.0,0.608,446.0,203.0,54.0,51.0,98.0,1362.5,20.036765,0.846523
2018,2023,Aaron Holiday,26,PG,ATL,0.206235,0.117506,63.0,247.0,92.0,...,32.0,0.844,74.0,89.0,37.0,12.0,36.0,314.5,4.992063,0.176659
2019,2023,Aaron Nesmith,23,SF,IND,0.589928,0.505995,73.0,738.0,252.0,...,142.0,0.838,277.0,98.0,55.0,34.0,75.0,810.5,11.102740,0.56195
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4025,2019,Yogi Ferrell,25,PG,SAC,0.309045,0.211055,71.0,420.0,153.0,...,67.0,0.896,109.0,137.0,36.0,4.0,40.0,480.0,6.760563,0.276382
4026,2019,Zach Collins,21,C,POR,0.537688,0.419598,77.0,512.0,189.0,...,126.0,0.746,324.0,71.0,25.0,66.0,77.0,723.5,9.396104,0.498325
4027,2019,Zach LaVine,23,SG,CHI,0.834171,0.89196,63.0,1492.0,530.0,...,375.0,0.832,294.0,283.0,60.0,26.0,215.0,1315.0,20.873016,0.853434
4028,2019,Zaza Pachulia,34,C,DET,0.330402,0.256281,68.0,267.0,85.0,...,124.0,0.782,265.0,91.0,31.0,17.0,57.0,503.0,7.397059,0.305695


In [12]:
# Create a list of each unique player we have in our dataframe
player_list = percentile_df.Player.unique().tolist()

# Create a new dataframe for combined, averaged percentiles over the past 5 seasons
new_df = pd.DataFrame(columns = ['Name', 'Rank', 'Trend', 'Pos', 'Years', 'FPTS', 'AVG_FPTS', 'FPTS_Percentile', 'AVG_FPTS_Percentile'])

# Update new dataframe with unique player list
new_df['Player'] = player_list

# Create a list for each percentile stat category for upcoming loop
player_trends = []
average_FPTS = []
average_AVG_FPTS = []
average_FPTS_Percentile = []
average_AVG_FPTS_Percentile = []
average_Rank = []
year_count = []
pos = []

# Loop through each player, locate their percentile stats for each season, average them out
for player in player_list:
    player_df = percentile_df.loc[percentile_df['Player'] == player]
    
    # We want to find the slope of the line of best fit for each player's overall ranking each season
    x = np.array(player_df['Year'], dtype = float)
    y = np.array(player_df['Rank'], dtype = float)
    slope, intercept = np.polyfit(x, y, 1)
    player_trends.append(slope)
    
    # Find average of each player's percentiles from previous 5 seasons
    average_FPTS.append(sum(player_df['FPTS']) / len(player_df['FPTS']))
    average_AVG_FPTS.append(sum(player_df['AVG_FPTS']) / len(player_df['AVG_FPTS']))
    average_FPTS_Percentile.append(sum(player_df['FPTS_Percentile']) / len(player_df['FPTS_Percentile']))
    average_AVG_FPTS_Percentile.append(sum(player_df['AVG_FPTS_Percentile']) / len(player_df['AVG_FPTS_Percentile']))
    average_Rank.append(sum(player_df['Rank']) / len(player_df['Rank']))
    year_count.append(len(x))
    
    # Keep player positions for reference purposes during the draft
    pos.append(player_df['Pos'].unique())

# Update new dataframe with the list data from each stat
new_df['Pos'] = pos
new_df['Trend'] = player_trends
new_df['FPTS'] = average_FPTS
new_df['AVG_FPTS'] = average_AVG_FPTS
new_df['FPTS_Percentile'] = average_FPTS_Percentile
new_df['AVG_FPTS_Percentile'] = average_AVG_FPTS_Percentile
new_df['Rank'] = average_Rank

# Keep track of how many seasons are being considered, so we know how reliable the data is
new_df['Years'] = year_count


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [13]:
# Create a weighted rank column by adding trend data to the rank data and account for number of seasons played
# Basically, if you played all 5 seasons, your trend stat is added directly
# If you played fewer than all 5 seasons, your trend stat is reduced depending on how few seasons you played
new_df['Weighted Rank'] = (new_df['Rank'] + ((new_df['Trend'] * (new_df['Years'] - 1) / 4)))

# shift column 'Weighted Rank' to first position
first_column = new_df.pop('Weighted Rank')
  
# insert column using insert(position,column_name,first_column) function
new_df.insert(1, 'Weighted Rank', first_column)

new_df = new_df.sort_values('Weighted Rank', ascending = False)

new_df = new_df.round({'FPTS': 0, 'AVG_FPTS': 2})

new_df.head(50)

Unnamed: 0,Name,Weighted Rank,Rank,Trend,Pos,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Player
310,,0.998857,0.995536,0.003321,[C],5,2554.0,34.84,0.997437,0.991733,Nikola Jokić
134,,0.993493,0.995781,-0.002287,[PF],5,2306.0,35.31,0.995117,0.997107,Giannis Antetokounmpo
265,,0.988669,0.973145,0.015524,"[PG, SG]",5,1950.0,29.71,0.972219,0.974998,Luka Dončić
199,,0.985722,0.977345,0.008377,[C],5,2011.0,33.19,0.971399,0.989237,Joel Embiid
105,,0.978783,0.960474,0.018309,"[C, PF]",5,1873.0,27.67,0.961522,0.958377,Domantas Sabonis
188,,0.975077,0.940914,0.034163,"[SF, PF]",5,1779.0,24.87,0.949056,0.924631,Jayson Tatum
311,,0.965569,0.969342,-0.003773,[C],5,1956.0,26.62,0.976501,0.955024,Nikola Vučević
385,,0.963654,0.94522,0.018434,[PG],5,1737.0,24.81,0.953413,0.928834,Trae Young
88,,0.95867,0.953025,0.005645,"[SF, PF, SG]",5,1758.0,24.66,0.961141,0.936792,DeMar DeRozan
363,,0.95738,0.960014,-0.003513,[PG],4,1761.0,28.04,0.956192,0.967659,Stephen Curry


In [14]:
new_pos_list = []
pos_list = new_df['Pos'].tolist()
for item in pos_list:
    new_string = []
    for pos in item:
        string = str(pos)
        new_string = f'{new_string},{string}'
    new_pos_list.append(new_string)

final_pos_list = []
for i in new_pos_list:
    i = i.replace('[],', '')
    final_pos_list.append(i)
    
final_pos_list

new_df.drop('Pos', axis = 1, inplace = True)
new_df['Pos'] = final_pos_list
new_df.drop('Name', axis = 1, inplace = True)


new_df.head(15)

Unnamed: 0,Weighted Rank,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Player,Pos
310,0.998857,0.995536,0.003321,5,2554.0,34.84,0.997437,0.991733,Nikola Jokić,C
134,0.993493,0.995781,-0.002287,5,2306.0,35.31,0.995117,0.997107,Giannis Antetokounmpo,PF
265,0.988669,0.973145,0.015524,5,1950.0,29.71,0.972219,0.974998,Luka Dončić,"PG,SG"
199,0.985722,0.977345,0.008377,5,2011.0,33.19,0.971399,0.989237,Joel Embiid,C
105,0.978783,0.960474,0.018309,5,1873.0,27.67,0.961522,0.958377,Domantas Sabonis,"C,PF"
188,0.975077,0.940914,0.034163,5,1779.0,24.87,0.949056,0.924631,Jayson Tatum,"SF,PF"
311,0.965569,0.969342,-0.003773,5,1956.0,26.62,0.976501,0.955024,Nikola Vučević,C
385,0.963654,0.94522,0.018434,5,1737.0,24.81,0.953413,0.928834,Trae Young,PG
88,0.95867,0.953025,0.005645,5,1758.0,24.66,0.961141,0.936792,DeMar DeRozan,"SF,PF,SG"
363,0.95738,0.960014,-0.003513,4,1761.0,28.04,0.956192,0.967659,Stephen Curry,PG


In [15]:
draft_df = new_df

In [16]:
######################################################################################################################
######################################################################################################################
######################################################################################################################
#### DRAFT DAY FUNCTIONS
   
# DROP A PLAYER 
def drafted(player):
    global draft_df
    draft_df = draft_df[draft_df.Player != player]
    return draft_df.head(25)
    
# FILTER PLAYERS BY POSITION
def position_filter(Pos):
    filtered_draft_df = draft_df[draft_df['Pos'].str.contains(Pos)]
    return filtered_draft_df.head(25)

# PULL STAT CATEGORY LEADERS
def stat_leaders(CAT):
    global final_yahoo_nba_df
    final_yahoo_nba_df = final_yahoo_nba_df.sort_values([CAT], ascending=[False])
    return final_yahoo_nba_df.head(25)

Unnamed: 0,Weighted Rank,Rank,Trend,Years,FPTS,AVG_FPTS,FPTS_Percentile,AVG_FPTS_Percentile,Player,Pos
188,0.975077,0.940914,0.034163,5,1779.0,24.87,0.949056,0.924631,Jayson Tatum,"SF,PF"
88,0.95867,0.953025,0.005645,5,1758.0,24.66,0.961141,0.936792,DeMar DeRozan,"SF,PF,SG"
261,0.956719,0.957967,-0.001248,5,1711.0,30.7,0.945608,0.982685,LeBron James,"PF,C,PG,SF"
195,0.946122,0.931618,0.014504,5,1525.0,25.87,0.922433,0.949989,Jimmy Butler,"PF,SF,SF-SG"
241,0.9301,0.937321,-0.009627,4,1650.0,30.92,0.913366,0.985231,Kevin Durant,"PF,SF"
413,0.925262,0.908923,0.016339,5,1467.0,22.61,0.909275,0.908218,Zach LaVine,"SG,SF"
228,0.92371,0.93564,-0.015907,4,1540.0,27.86,0.919897,0.967124,Kawhi Leonard,SF
185,0.907129,0.846329,0.0608,5,1293.0,20.3,0.850259,0.838471,Jaylen Brown,"SF,SG"
355,0.882235,0.88837,-0.024542,2,1499.0,19.86,0.907579,0.849953,Scottie Barnes,"SF,PF"
383,0.873456,0.900379,-0.026924,5,1508.0,20.82,0.915107,0.870923,Tobias Harris,"SF,PF"
