In [1]:
# Import needed dependencies
import requests
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
today = date.today()

# dd/mm/YY
current_year = today.strftime("%Y")

last_year = int(current_year) - 1

In [3]:
last_five_years = []
for i in range(0,5):
    last_five_years.append(int(current_year) - i)

In [4]:
# Create a list to help create a dataframe from batter statistics data
nba_stats = []

# Create a loop to create a dataframe from Basketball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_totals.html'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has NBA player statistics
    table = soup.select_one('#div_totals_stats') 

    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        nba_stats.append(tds)

In [5]:
# Create dataframe for batter statistics
nba_stats_df = pd.DataFrame(nba_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')
df_headers.append("Year")

# Set column headers equal to our list
nba_stats_df.columns = df_headers

In [6]:
# Change types of columns to numeric for columns with number values
nba_stats_df[['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']] = nba_stats_df[['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']].apply(pd.to_numeric)

# Drop all players with NaN games to remove null values
nba_stats_df.dropna(subset=['G'], axis = 0 , inplace= True)

# Remove any players with fewer than 27 games played (1/3rd of the season)
filtered_nba_stats_df = nba_stats_df[nba_stats_df['G'] >= 27]

# points, rebounds, assists, steals, blocks, threes, field-goal percentage, and free-throw percentage

# Select the columns we want for our batter analysis
filtered_nba_stats_df = filtered_nba_stats_df[['Year','Player','Age','Pos', 'Tm','G','GS','FG','FGA','FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PTS']]


In [7]:
# Create variables for Z scores for all stats counted in Yahoo Fantasy Basketball category leagues
PTS_zscores = stats.zscore(filtered_nba_stats_df['PTS'])
TRB_zscores = stats.zscore(filtered_nba_stats_df['TRB'])
AST_zscores = stats.zscore(filtered_nba_stats_df['AST'])
STL_zscores = stats.zscore(filtered_nba_stats_df['STL'])  #/5 # Steal Z scores get out of hand, so dividing by 5
BLK_zscores = stats.zscore(filtered_nba_stats_df['BLK'])
Three_P_zscores = stats.zscore(filtered_nba_stats_df['3P'])
FG_Per_zscores = stats.zscore(filtered_nba_stats_df['FG%'])
FT_Per_zscores = stats.zscore(filtered_nba_stats_df['FT%'])

# Add Z scores to a new DataFrame
compare_players_df = pd.DataFrame({
    'Z_PTS': PTS_zscores,
    'Z_TRB': TRB_zscores,
    'Z_AST': AST_zscores,
    'Z_STL': STL_zscores,
    'Z_BLK': BLK_zscores,
    'Z_3P': Three_P_zscores,
    'Z_FG%': FG_Per_zscores,
    'Z_FT%': FT_Per_zscores,})

# Calculate average of 5 Z scores, stadard deviation, and a confidence level in the average Z score
# Confidence level is to try to avoid players with just one, large Z-score
compare_players_df['average_z'] = compare_players_df.mean(axis=1)
compare_players_df['std_z'] = compare_players_df.std(axis=1)
compare_players_df['avg_confidence'] = (compare_players_df['average_z'] - compare_players_df['std_z'])
compare_players_df['position'] = filtered_nba_stats_df['Pos']
final_compare_players_df = compare_players_df.sort_values(by=['avg_confidence'], ascending=False)
final_compare_players_df

Unnamed: 0,Z_PTS,Z_TRB,Z_AST,Z_STL,Z_BLK,Z_3P,Z_FG%,Z_FT%,average_z,std_z,avg_confidence,position
3180,3.887925,2.730618,5.082923,2.744353,1.709586,1.467537,1.148180,,2.681589,1.309201,1.372388,PF
391,3.304843,4.542569,3.734481,2.485655,1.404020,0.558486,1.722522,,2.536082,1.304389,1.231693,C
736,2.865760,2.835828,1.128596,1.118250,2.167936,1.485018,0.966071,,1.795351,0.761036,1.034316,C
2187,3.281236,3.835323,2.410857,1.857388,3.199224,-0.228192,1.652480,,2.286902,1.261896,1.025006,PF
2438,5.226418,1.614222,3.751026,4.296542,1.213040,5.470855,-0.252655,,3.045636,2.034907,1.010729,PG
...,...,...,...,...,...,...,...,...,...,...,...,...
950,-1.057652,-0.858215,-0.956112,-1.246991,-0.925926,-1.119761,2.941249,,-0.460487,1.393972,-1.854459,SF-PF
952,-1.062374,-0.869905,-0.972657,-1.246991,-0.925926,-1.119761,3.081332,,-0.445183,1.444443,-1.889626,SF
216,-1.218177,-1.092015,-1.038838,-1.505689,-0.582164,-1.119761,2.731123,,-0.546503,1.361926,-1.908429,C
161,-1.359816,-1.039410,-1.038838,-1.431775,-0.658555,-1.137243,3.193399,,-0.496034,1.524144,-2.020178,C


In [8]:
# Add avg Z scores, confidence columns to batter stats DataFrame
filtered_nba_stats_df['Average Z'] = final_compare_players_df['average_z']
filtered_nba_stats_df['Z Confidence'] = final_compare_players_df['avg_confidence']
filtered_nba_stats_df = filtered_nba_stats_df.sort_values(by=['Z Confidence'], ascending=False)
yahoo_nba_df = filtered_nba_stats_df.drop(columns=['GS', 'FG', 'FGA', '3PA', '3P%', 'FT', 'FTA', 'TOV'])


In [9]:
# Sort by index to prepare to drop duplicates
yahoo_nba_df = yahoo_nba_df.sort_index()


In [10]:
# Drop duplicate entries of Player Name and Year
# This is to eliminate partial season data for players who played for 2+ teams in one season
yahoo_nba_df = yahoo_nba_df.drop_duplicates(subset=['Year', 'Player'])

In [11]:
# Sort data by name alphabetically, then by year in descending order
final_yahoo_nba_df = yahoo_nba_df.sort_values(['Player','Year'], ascending=[True, False])
final_yahoo_nba_df

Unnamed: 0,Year,Player,Age,Pos,Tm,G,FG%,3P,FT%,TRB,AST,STL,BLK,PTS,Average Z,Z Confidence
2955,2018,Aaron Brooks,33,PG,MIN,32,0.406,11,0.727,17,20,6,0,75,-1.074201,-1.278398
260,2022,Aaron Gordon,26,PF,DEN,75,0.520,87,0.743,439,188,44,44,1126,0.689798,0.303275
1043,2021,Aaron Gordon,25,PF,TOT,50,0.463,59,0.651,284,161,33,34,618,0.060512,-0.145339
1749,2020,Aaron Gordon,24,PF,ORL,62,0.437,73,0.674,475,228,51,39,894,0.497503,0.005478
2417,2019,Aaron Gordon,23,PF,ORL,78,0.449,121,0.731,574,289,57,56,1246,1.039293,0.410201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1286,2021,Zeke Nnaji,20,PF,DEN,42,0.481,24,0.800,65,10,8,4,136,-0.810427,-1.288172
790,2022,Ziaire Williams,20,SF,MEM,62,0.450,76,0.782,129,65,35,12,501,-0.314868,-0.589240
1500,2021,Zion Williamson,20,PF,NOP,61,0.611,10,0.698,441,226,57,39,1647,0.943238,-0.108326
2168,2019,Álex Abrines,25,SG,OKC,31,0.357,41,0.923,48,20,17,6,165,-0.950287,-1.242474


In [12]:
percentile_df = pd.DataFrame(columns = ['Year', 'Player', 'Age', 'Tm', '3P_Percentile', 'FG%_Percentile', 'TRB_Percentile', 'FT%_Percentile', 'AST_Percentile', 'STL_Percentile', 'BLK_Percentile', 'PTS_Percentile'])
percentile_df['Year'] = final_yahoo_nba_df['Year']
percentile_df['Player'] = final_yahoo_nba_df['Player']
percentile_df['Age'] = final_yahoo_nba_df['Age']
percentile_df['Tm'] = final_yahoo_nba_df['Tm']
percentile_df['3P_Percentile'] = (final_yahoo_nba_df['3P'] / final_yahoo_nba_df['3P'].max())
percentile_df['FG%_Percentile'] = (final_yahoo_nba_df['FG%'] / final_yahoo_nba_df['FG%'].max())
percentile_df['TRB_Percentile'] = (final_yahoo_nba_df['TRB'] / final_yahoo_nba_df['TRB'].max())
percentile_df['FT%_Percentile'] = (final_yahoo_nba_df['FT%'] / final_yahoo_nba_df['FT%'].max())
percentile_df['AST_Percentile'] = (final_yahoo_nba_df['AST'] / final_yahoo_nba_df['AST'].max())
percentile_df['STL_Percentile'] = (final_yahoo_nba_df['STL'] / final_yahoo_nba_df['STL'].max())
percentile_df['BLK_Percentile'] = (final_yahoo_nba_df['BLK'] / final_yahoo_nba_df['BLK'].max())
percentile_df['PTS_Percentile'] = (final_yahoo_nba_df['PTS'] / final_yahoo_nba_df['PTS'].max())


In [13]:
percentile_df['Rank'] = (percentile_df['3P_Percentile'] + percentile_df['FG%_Percentile'] + percentile_df['TRB_Percentile'] + percentile_df['FT%_Percentile'] + percentile_df['AST_Percentile'] + percentile_df['STL_Percentile'] + percentile_df['BLK_Percentile'] + percentile_df['PTS_Percentile'])


In [14]:
percentile_df = percentile_df.sort_values(['Player', 'Year'], ascending = [True, True])
percentile_df.Player.nunique()

687

In [15]:
# #Example data 


# #Fit line
# slope, intercept = np.polyfit(x, y, 3)
# print(slope)
player_list = percentile_df.Player.unique().tolist()
# year_list = []
# rank_list = []
new_df = pd.DataFrame(columns = ['Player', 'Rank', 'Trend', '3P_Percentile', 'FG%_Percentile', 'TRB_Percentile', 'FT%_Percentile', 'AST_Percentile', 'STL_Percentile', 'BLK_Percentile', 'PTS_Percentile'])
new_df['Player'] = player_list
player_trends = []
average_3P = []
average_FG = []
average_TRB = []
average_FT = []
average_AST = []
average_STL = []
average_BLK = []
average_PTS = []
average_Rank = []

for player in player_list:
    player_df = percentile_df.loc[percentile_df['Player'] == player]
    x = np.array(player_df['Year'])
    y = np.array(player_df['Rank'])
    slope, intercept = np.polyfit(x, y, 1)
    player_trends.append(slope)
    average_3P.append(sum(player_df['3P_Percentile']) / len(player_df['3P_Percentile']))
    average_FG.append(sum(player_df['FG%_Percentile']) / len(player_df['FG%_Percentile']))
    average_TRB.append(sum(player_df['TRB_Percentile']) / len(player_df['TRB_Percentile']))
    average_FT.append(sum(player_df['FT%_Percentile']) / len(player_df['FT%_Percentile']))
    average_AST.append(sum(player_df['AST_Percentile']) / len(player_df['AST_Percentile']))
    average_STL.append(sum(player_df['STL_Percentile']) / len(player_df['STL_Percentile']))
    average_BLK.append(sum(player_df['BLK_Percentile']) / len(player_df['BLK_Percentile']))
    average_PTS.append(sum(player_df['PTS_Percentile']) / len(player_df['PTS_Percentile']))
    average_Rank.append(sum(player_df['Rank']) / len(player_df['Rank']))

#    average = sum(numbers) / len(numbers)
#     year_list.append(x)
#     rank_list.append(y)

new_df['Trend'] = player_trends
new_df['3P_Percentile'] = average_3P
new_df['FG%_Percentile'] = average_FG
new_df['TRB_Percentile'] = average_TRB
new_df['FT%_Percentile'] = average_FT
new_df['AST_Percentile'] = average_AST
new_df['STL_Percentile'] = average_STL
new_df['BLK_Percentile'] = average_BLK
new_df['PTS_Percentile'] = average_PTS
new_df['Rank'] = average_Rank


  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [20]:
new_df = new_df.sort_values('Rank', ascending = False)
new_df.head(50)

Unnamed: 0,Player,Rank,Trend,3P_Percentile,FG%_Percentile,TRB_Percentile,FT%_Percentile,AST_Percentile,STL_Percentile,BLK_Percentile,PTS_Percentile
283,James Harden,4.693899,-0.386423,0.640741,0.579554,0.353007,0.868,0.7,0.614689,0.238191,0.699716
507,Nikola Jokić,4.534222,0.148757,0.244974,0.704325,0.670088,0.8332,0.666585,0.550282,0.272362,0.592406
220,Giannis Antetokounmpo,4.286589,-0.101342,0.17037,0.729227,0.634483,0.7058,0.459512,0.458757,0.448241,0.680199
597,Stephen Curry,4.18552,0.082368,0.785714,0.617955,0.26263,0.919,0.438415,0.471751,0.080402,0.609652
566,Russell Westbrook,4.176229,-0.307182,0.230688,0.585059,0.544026,0.6958,0.809268,0.616949,0.116583,0.577857
429,Luka Dončić,4.061552,0.172128,0.484127,0.598296,0.452285,0.73625,0.640854,0.392655,0.139447,0.617637
421,LeBron James,4.023044,-0.28206,0.356085,0.676802,0.401604,0.7086,0.630244,0.437288,0.225126,0.587296
369,Karl-Anthony Towns,3.976476,-0.160106,0.342857,0.677851,0.577386,0.8342,0.269512,0.310734,0.424121,0.539815
631,Trae Young,3.970634,0.155743,0.482804,0.574377,0.217522,0.86975,0.77561,0.370056,0.052764,0.62775
114,Damian Lillard,3.919866,-0.389276,0.58254,0.576409,0.221812,0.9044,0.555854,0.354802,0.111558,0.612491


In [17]:
# new_df.loc[new_df['Player'] == 'Rudy Gobert']
# percentile_df.loc[percentile_df['Player'] == 'Rudy Gobert']