In [1]:
# Import needed dependencies
import requests
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
today = date.today()

# dd/mm/YY
current_year = today.strftime("%Y")

last_year = int(current_year) - 1

In [3]:
last_five_years = []
for i in range(0,5):
    last_five_years.append(int(current_year) - i)

In [4]:
# Create a list to help create a dataframe from batter statistics data
nba_stats = []

# Create a loop to create a dataframe from Basketball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_totals.html'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has NBA player statistics
    table = soup.select_one('#div_totals_stats') 

    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        nba_stats.append(tds)

In [5]:
# Create dataframe for batter statistics
nba_stats_df = pd.DataFrame(nba_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')
df_headers.append("Year")

# Set column headers equal to our list
nba_stats_df.columns = df_headers

In [6]:
# Change types of columns to numeric for columns with number values
nba_stats_df[['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']] = nba_stats_df[['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']].apply(pd.to_numeric)

# Drop all players with NaN games to remove null values
nba_stats_df.dropna(subset=['G'], axis = 0 , inplace= True)

# Remove any players with fewer than 27 games played (1/3rd of the season)
filtered_nba_stats_df = nba_stats_df[nba_stats_df['G'] >= 27]

# points, rebounds, assists, steals, blocks, threes, field-goal percentage, and free-throw percentage

# Select the columns we want for our NBA player analysis
yahoo_nba_df = filtered_nba_stats_df[['Year','Player','Age','Pos','Tm','G','FG%','3P','FT%','TRB','AST','STL','BLK','PTS']]


In [7]:
# Sort by index to prepare to drop duplicates
yahoo_nba_df = yahoo_nba_df.sort_index()

# Drop duplicate entries of Player Name and Year
# This is to eliminate partial season data for players who played for 2+ teams in one season
yahoo_nba_df = yahoo_nba_df.drop_duplicates(subset=['Year', 'Player'])

In [8]:
# Sort data by name alphabetically, then by year in descending order
final_yahoo_nba_df = yahoo_nba_df.sort_values(['Year','Player'], ascending=[True, True])
final_yahoo_nba_df

Unnamed: 0,Year,Player,Age,Pos,Tm,G,FG%,3P,FT%,TRB,AST,STL,BLK,PTS
3096,2019,Aaron Gordon,23,PF,ORL,78,0.449,121,0.731,574,289,57,56,1246
3147,2019,Aaron Holiday,22,PG,IND,50,0.401,43,0.820,67,87,21,13,294
3344,2019,Abdel Nader,25,SF,OKC,61,0.423,32,0.750,116,20,20,12,241
3158,2019,Al Horford,32,C,BOS,68,0.535,73,0.821,458,283,59,86,925
2859,2019,Al-Farouq Aminu,28,PF,POR,81,0.433,96,0.867,610,104,68,33,760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,2023,Zach Collins,25,C,SAS,63,0.518,55,0.761,402,180,37,49,731
360,2023,Zach LaVine,27,SG,CHI,77,0.485,204,0.848,345,327,69,18,1913
457,2023,Zeke Nnaji,22,PF,DEN,53,0.561,17,0.645,138,18,17,23,277
663,2023,Ziaire Williams,21,SF,MEM,37,0.429,25,0.773,79,35,14,6,210


In [9]:
# Create a new dataframe for stats percentile calculations
percentile_df = pd.DataFrame(columns = ['Year', 'Player', 'Age', 'Pos', 'Tm', '3P_Percentile', 'FG%_Percentile', 'TRB_Percentile', 'FT%_Percentile', 'AST_Percentile', 'STL_Percentile', 'BLK_Percentile', 'PTS_Percentile'])

# Carry over your non-numeric columnns from final_yahoo_nba_df
percentile_df['Year'] = final_yahoo_nba_df['Year']
percentile_df['Player'] = final_yahoo_nba_df['Player']
percentile_df['Age'] = final_yahoo_nba_df['Age']
percentile_df['Pos'] = final_yahoo_nba_df['Pos']
percentile_df['Tm'] = final_yahoo_nba_df['Tm']

# Calculate the percentile rank for each player in each season, seperately, then add all the seasons in one dataframe
for year in last_five_years:
    year_df = final_yahoo_nba_df.loc[final_yahoo_nba_df['Year'] == year]
    year_df['3P_Percentile'] = year_df['3P'].rank(pct=True)
    year_df['FG%_Percentile'] = year_df['FG%'].rank(pct=True)
    year_df['TRB_Percentile'] = year_df['TRB'].rank(pct=True)
    year_df['FT%_Percentile'] = year_df['FT%'].rank(pct=True)
    year_df['AST_Percentile'] = year_df['AST'].rank(pct=True)
    year_df['STL_Percentile'] = year_df['STL'].rank(pct=True)
    year_df['BLK_Percentile'] = year_df['BLK'].rank(pct=True)
    year_df['PTS_Percentile'] = year_df['PTS'].rank(pct=True)
    year_df.sort_values('Player', ascending=True)

    # Each of the seasons are added back to the percentile dataframe
    percentile_df = percentile_df.append(year_df, ignore_index=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['3P_Percentile'] = year_df['3P'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['FG%_Percentile'] = year_df['FG%'].rank(pct=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  year_df['TRB_Percentile'] = year_df['TRB'].rank(pct=True)
A value is trying to be set on a 

In [10]:
# Clean up the percentile dataframe, drop NaNs and remove unnecessary columns
percentile_df = percentile_df.dropna()
percentile_df = percentile_df.drop(['Tm','G','FG%','3P','FT%','TRB','AST','STL','BLK','PTS'], axis = 1)

# Add a rank column that adds the percentiles from each category
# Average out percentile categories for FG% and FT% with PTS, as we might still want a player with somewhat  
## lower FG% and FT% if they score a lot of points
percentile_df['Rank'] = (percentile_df['3P_Percentile'] + ((percentile_df['FG%_Percentile'] + percentile_df['PTS_Percentile']) / 2) + percentile_df['TRB_Percentile'] + ((percentile_df['FT%_Percentile'] + percentile_df['PTS_Percentile']) / 2) + percentile_df['AST_Percentile'] + percentile_df['STL_Percentile'] + percentile_df['BLK_Percentile'] + percentile_df['PTS_Percentile'])


In [11]:
# Create a list of each unique player we have in our dataframe
player_list = percentile_df.Player.unique().tolist()

# Create a new dataframe for combined, averaged percentiles over the past 5 seasons
new_df = pd.DataFrame(columns = ['Player', 'Rank', 'Trend', 'Pos', 'Years', '3P_Percentile', 'FG%_Percentile', 'TRB_Percentile', 'FT%_Percentile', 'AST_Percentile', 'STL_Percentile', 'BLK_Percentile', 'PTS_Percentile'])

# Update new dataframe with unique player list
new_df['Player'] = player_list

# Create a list for each percentile stat category for upcoming loop
player_trends = []
average_3P = []
average_FG = []
average_TRB = []
average_FT = []
average_AST = []
average_STL = []
average_BLK = []
average_PTS = []
average_Rank = []
year_count = []
pos = []

# Loop through each player, locate their percentile stats for each season, average them out
for player in player_list:
    player_df = percentile_df.loc[percentile_df['Player'] == player]
    
    # We want to find the slope of the line of best fit for each player's overall ranking each season
    x = np.array(player_df['Year'], dtype = float)
    y = np.array(player_df['Rank'], dtype = float)
    slope, intercept = np.polyfit(x, y, 1)
    player_trends.append(slope)
    
    # Find average of each player's percentiles from previous 5 seasons
    average_3P.append(sum(player_df['3P_Percentile']) / len(player_df['3P_Percentile']))
    average_FG.append(sum(player_df['FG%_Percentile']) / len(player_df['FG%_Percentile']))
    average_TRB.append(sum(player_df['TRB_Percentile']) / len(player_df['TRB_Percentile']))
    average_FT.append(sum(player_df['FT%_Percentile']) / len(player_df['FT%_Percentile']))
    average_AST.append(sum(player_df['AST_Percentile']) / len(player_df['AST_Percentile']))
    average_STL.append(sum(player_df['STL_Percentile']) / len(player_df['STL_Percentile']))
    average_BLK.append(sum(player_df['BLK_Percentile']) / len(player_df['BLK_Percentile']))
    average_PTS.append(sum(player_df['PTS_Percentile']) / len(player_df['PTS_Percentile']))
    average_Rank.append(sum(player_df['Rank']) / len(player_df['Rank']))
    year_count.append(len(x))
    
    # Keep player positions for reference purposes during the draft
    pos.append(player_df['Pos'].unique())

# Update new dataframe with the list data from each stat
new_df['Pos'] = pos
new_df['Trend'] = player_trends
new_df['3P_Percentile'] = average_3P
new_df['FG%_Percentile'] = average_FG
new_df['TRB_Percentile'] = average_TRB
new_df['FT%_Percentile'] = average_FT
new_df['AST_Percentile'] = average_AST
new_df['STL_Percentile'] = average_STL
new_df['BLK_Percentile'] = average_BLK
new_df['PTS_Percentile'] = average_PTS
new_df['Rank'] = average_Rank

# Keep track of how many seasons are being considered, so we know how reliable the data is
new_df['Years'] = year_count



  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [12]:
# Create a weighted rank column by adding trend data to the rank data and account for number of seasons played
# Basically, if you played all 5 seasons, your trend stat is added directly
# If you played fewer than all 5 seasons, your trend stat is reduced depending on how few seasons you played
new_df['Weighted Rank'] = (new_df['Rank'] + ((new_df['Trend'] * (new_df['Years'] - 1) / 4)))

# shift column 'Weighted Rank' to first position
first_column = new_df.pop('Weighted Rank')
  
# insert column using insert(position,column_name,first_column) function
new_df.insert(1, 'Weighted Rank', first_column)

new_df = new_df.sort_values('Weighted Rank', ascending = False)
new_df.head(50)

Unnamed: 0,Player,Weighted Rank,Rank,Trend,Pos,Years,3P_Percentile,FG%_Percentile,TRB_Percentile,FT%_Percentile,AST_Percentile,STL_Percentile,BLK_Percentile,PTS_Percentile
188,Jayson Tatum,7.215687,7.059511,0.156175,"[SF, PF]",5,0.942598,0.540872,0.928096,0.800661,0.841505,0.906003,0.846255,0.962144
310,Nikola Jokić,7.202487,7.200592,0.001895,[C],5,0.653904,0.88844,0.989854,0.719923,0.990714,0.95454,0.867615,0.969892
265,Luka Dončić,6.98917,6.826627,0.162543,"[PG, SG]",5,0.95225,0.586314,0.940685,0.36745,0.97768,0.873002,0.649173,0.978477
18,Anthony Edwards,6.969735,6.827188,0.285094,[SG],3,0.964282,0.393868,0.824562,0.4859,0.862622,0.972715,0.836178,0.963473
199,Joel Embiid,6.883997,6.725734,0.158262,[C],5,0.596491,0.772548,0.972722,0.716657,0.788697,0.728618,0.9629,0.965852
311,Nikola Vučević,6.859427,6.879796,-0.020369,[C],5,0.784484,0.729476,0.991383,0.612726,0.849675,0.814727,0.900182,0.934122
178,James Harden,6.788852,7.02366,-0.234808,"[PG, SG]",5,0.921297,0.435728,0.868635,0.886135,0.98971,0.908196,0.806794,0.934048
363,Stephen Curry,6.763702,6.816643,-0.070588,[PG],4,0.998173,0.617785,0.782909,0.976174,0.937883,0.864827,0.484271,0.9758
290,Mikal Bridges,6.742914,6.359819,0.383095,"[SF-SG, SF]",5,0.79086,0.69975,0.734649,0.771189,0.737452,0.958706,0.842455,0.780114
134,Giannis Antetokounmpo,6.694353,6.79959,-0.105238,[PF],5,0.561855,0.895047,0.984245,0.228473,0.939378,0.834456,0.940675,0.98861


In [13]:
# new_df.to_csv("/Users/michaelbinger/Documents/Projects/Fantasy-Baseball-Analysis/Fantasy_Basketball/nba_trends.csv")
# final_yahoo_nba_df.to_csv("/Users/michaelbinger/Documents/Projects/Fantasy-Baseball-Analysis/Fantasy_Basketball/nba_stats.csv")


In [14]:
new_pos_list = []
pos_list = new_df['Pos'].tolist()
for item in pos_list:
    new_string = []
    for pos in item:
        string = str(pos)
        new_string = f'{new_string},{string}'
    new_pos_list.append(new_string)

final_pos_list = []
for i in new_pos_list:
    i = i.replace('[],', '')
    final_pos_list.append(i)
    
final_pos_list

new_df.drop('Pos', axis = 1, inplace = True)
new_df['Pos'] = final_pos_list

new_df

Unnamed: 0,Player,Weighted Rank,Rank,Trend,Years,3P_Percentile,FG%_Percentile,TRB_Percentile,FT%_Percentile,AST_Percentile,STL_Percentile,BLK_Percentile,PTS_Percentile,Pos
188,Jayson Tatum,7.215687,7.059511,0.156175,5,0.942598,0.540872,0.928096,0.800661,0.841505,0.906003,0.846255,0.962144,"SF,PF"
310,Nikola Jokić,7.202487,7.200592,0.001895,5,0.653904,0.888440,0.989854,0.719923,0.990714,0.954540,0.867615,0.969892,C
265,Luka Dončić,6.989170,6.826627,0.162543,5,0.952250,0.586314,0.940685,0.367450,0.977680,0.873002,0.649173,0.978477,"PG,SG"
18,Anthony Edwards,6.969735,6.827188,0.285094,3,0.964282,0.393868,0.824562,0.485900,0.862622,0.972715,0.836178,0.963473,SG
199,Joel Embiid,6.883997,6.725734,0.158262,5,0.596491,0.772548,0.972722,0.716657,0.788697,0.728618,0.962900,0.965852,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
474,Leandro Bolmaro,0.633413,0.633413,0.000157,1,0.105769,0.012019,0.020433,0.786058,0.067308,0.024038,0.007212,0.004808,SF
467,Keljin Blevins,0.536659,0.536659,0.000133,1,0.242788,0.009615,0.028846,0.042067,0.049279,0.106971,0.020433,0.031250,SF
524,Juwan Morgan,0.501235,0.501235,0.000124,1,0.111111,0.590123,0.014815,0.002469,0.017284,0.018519,0.038272,0.002469,PF
584,Malcolm Miller,0.325858,0.325858,0.000081,1,0.149077,0.241425,0.002639,0.006596,0.007916,0.011873,0.025066,0.002639,SF


In [15]:
draft_df = new_df

In [16]:
######################################################################################################################
######################################################################################################################
######################################################################################################################
#### DRAFT DAY FUNCTIONS
   
# DROP A PLAYER 
def drafted(player):
    global draft_df
    draft_df = draft_df[draft_df.Player != player]
    return draft_df.head(25)
    
# FILTER PLAYERS BY POSITION
def position_filter(Pos):
    filtered_draft_df = draft_df[draft_df['Pos'].str.contains(Pos)]
    return filtered_draft_df.head(25)

# PULL STAT CATEGORY LEADERS
def stat_leaders(CAT):
    global final_yahoo_nba_df
    final_yahoo_nba_df = final_yahoo_nba_df.sort_values([CAT], ascending=[False])
    return final_yahoo_nba_df.head(25)

In [17]:
# draft_df = draft_df.sort_values(['Weighted Rank'], ascending=[False])
# draft_df.head(50)