In [1]:
# Import needed dependencies
import requests
import pandas as pd
import numpy as np
import scipy.stats as stats
from datetime import date
from bs4 import BeautifulSoup, Comment

In [2]:
today = date.today()

# dd/mm/YY
current_year = today.strftime("%Y")

last_year = int(current_year) - 1

In [3]:
last_five_years = []
for i in range(1,6):
    last_five_years.append(int(current_year) - i)

In [4]:
# Create a list to help create a dataframe from batter statistics data
nba_stats = []

# Create a loop to create a dataframe from Basketball Reference tables
for year in last_five_years:
    
    # input URL and use BeautifulSoup to parse through the page
    url = f'https://www.basketball-reference.com/leagues/NBA_{year}_totals.html'
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    # Grab the table element that has NBA player statistics
    table = soup.select_one('#div_totals_stats') 

    # Grab data from table and put it into the list created above
    for tr in table.select('tr:has(td)'):
        tds = [td.get_text(strip=True) for td in tr.select('td')]
        tds.append(year)
        nba_stats.append(tds)

In [5]:
# Create dataframe for batter statistics
nba_stats_df = pd.DataFrame(nba_stats)

# Create an empty list to store dataframe header information
header_list = []

# Grab the table header information to use as column headers in our dataframe
for tr in table.select('tr:has(th)'):
    ths = [th.get_text(strip=True) for th in tr.select('th')]
    header_list.append(ths)

# For loop returns a list of lists, and we only need the first list 
df_headers = header_list[0]

# Remove the first item from our headers list, it is the index header that we do not need
df_headers.remove('Rk')
df_headers.append("Year")

# Set column headers equal to our list
nba_stats_df.columns = df_headers

In [6]:
# Change types of columns to numeric for columns with number values
nba_stats_df[['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']] = nba_stats_df[['Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']].apply(pd.to_numeric)

# Drop all players with NaN games to remove null values
nba_stats_df.dropna(subset=['G'], axis = 0 , inplace= True)

# Remove any players with fewer than 27 games played (1/3rd of the season)
filtered_nba_stats_df = nba_stats_df[nba_stats_df['G'] >= 27]

# points, rebounds, assists, steals, blocks, threes, field-goal percentage, and free-throw percentage

# Select the columns we want for our batter analysis
filtered_nba_stats_df = filtered_nba_stats_df[['Year','Player','Age','Pos', 'Tm','G','GS','FG','FGA','FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PTS']]


In [7]:
# Create variables for Z scores for all stats counted in Yahoo Fantasy Basketball category leagues
PTS_zscores = stats.zscore(filtered_nba_stats_df['PTS'])
TRB_zscores = stats.zscore(filtered_nba_stats_df['TRB'])
AST_zscores = stats.zscore(filtered_nba_stats_df['AST'])
STL_zscores = stats.zscore(filtered_nba_stats_df['STL'])  #/5 # Steal Z scores get out of hand, so dividing by 5
BLK_zscores = stats.zscore(filtered_nba_stats_df['BLK'])
Three_P_zscores = stats.zscore(filtered_nba_stats_df['3P'])
FG_Per_zscores = stats.zscore(filtered_nba_stats_df['FG%'])
FT_Per_zscores = stats.zscore(filtered_nba_stats_df['FT%'])

# Add Z scores to a new DataFrame
compare_players_df = pd.DataFrame({
    'Z_PTS': PTS_zscores,
    'Z_TRB': TRB_zscores,
    'Z_AST': AST_zscores,
    'Z_STL': STL_zscores,
    'Z_BLK': BLK_zscores,
    'Z_3P': Three_P_zscores,
    'Z_FG%': FG_Per_zscores,
    'Z_FT%': FT_Per_zscores,})

# Calculate average of 5 Z scores, stadard deviation, and a confidence level in the average Z score
# Confidence level is to try to avoid players with just one, large Z-score
compare_players_df['average_z'] = compare_players_df.mean(axis=1)
compare_players_df['std_z'] = compare_players_df.std(axis=1)
compare_players_df['avg_confidence'] = (compare_players_df['average_z'] - compare_players_df['std_z'])
compare_players_df['position'] = filtered_nba_stats_df['Pos']
final_compare_players_df = compare_players_df.sort_values(by=['avg_confidence'], ascending=False)
final_compare_players_df

Unnamed: 0,Z_PTS,Z_TRB,Z_AST,Z_STL,Z_BLK,Z_3P,Z_FG%,Z_FT%,average_z,std_z,avg_confidence,position
2368,3.810919,2.645203,5.002843,2.623019,1.618992,1.525896,1.208493,,2.633624,1.269815,1.363809,PF
1375,3.214766,3.722462,2.375251,1.766007,3.046665,-0.193780,1.730343,,2.237388,1.208572,1.028816,PF
1626,5.126166,1.556544,3.693115,4.122788,1.143101,5.585752,-0.241090,,2.998054,2.034841,0.963213,PG
344,2.992079,3.049887,3.798869,1.873134,0.777031,0.515365,1.556393,,2.080394,1.145790,0.934604,C
2871,2.196436,1.528046,1.366517,0.837578,2.643988,0.958580,1.136014,,1.523880,0.617919,0.905961,PF
...,...,...,...,...,...,...,...,...,...,...,...,...
2010,-1.134597,-0.882962,-0.846192,-1.162115,0.154712,-1.115669,3.411860,,-0.224995,1.544047,-1.769042,C
2042,-1.222744,-0.934260,-1.017026,-1.197823,0.484175,-1.115669,3.585810,,-0.202505,1.642611,-1.845116,C
138,-1.048769,-0.854463,-0.935677,-1.233532,-0.906891,-1.097941,3.063960,,-0.430473,1.431570,-1.862043,SF-PF
140,-1.053408,-0.865863,-0.951947,-1.233532,-0.906891,-1.097941,3.208919,,-0.414380,1.483735,-1.898116,SF


In [8]:
# Add avg Z scores, confidence columns to batter stats DataFrame
filtered_nba_stats_df['Average Z'] = final_compare_players_df['average_z']
filtered_nba_stats_df['Z Confidence'] = final_compare_players_df['avg_confidence']
filtered_nba_stats_df = filtered_nba_stats_df.sort_values(by=['Z Confidence'], ascending=False)
yahoo_nba_df = filtered_nba_stats_df.drop(columns=['GS', 'FG', 'FGA', '3PA', '3P%', 'FT', 'FTA', 'TOV'])


In [9]:
# Sort by index to prepare to drop duplicates
yahoo_nba_df = yahoo_nba_df.sort_index()


In [10]:
# Drop duplicate entries of Player Name and Year
# This is to eliminate partial season data for players who played for 2+ teams in one season
yahoo_nba_df = yahoo_nba_df.drop_duplicates(subset=['Year', 'Player'])

In [11]:
# Sort data by name alphabetically, then by year in descending order
final_yahoo_nba_df = yahoo_nba_df.sort_values(['Player','Year'], ascending=[True, False])
final_yahoo_nba_df

Unnamed: 0,Year,Player,Age,Pos,Tm,G,FG%,3P,FT%,TRB,AST,STL,BLK,PTS,Average Z,Z Confidence
2143,2018,Aaron Brooks,33,PG,MIN,32,0.406,11,0.727,17,20,6,0,75,-1.059376,-1.260982
2797,2017,Aaron Brooks,32,PG,IND,65,0.403,48,0.800,69,125,25,9,322,-0.581673,-0.878535
231,2021,Aaron Gordon,25,PF,TOT,50,0.463,59,0.651,284,161,33,34,618,0.056976,-0.142315
937,2020,Aaron Gordon,24,PF,ORL,62,0.437,73,0.674,475,228,51,39,894,0.481408,0.008013
1605,2019,Aaron Gordon,23,PF,ORL,78,0.449,121,0.731,574,289,57,56,1246,1.015484,0.404793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
688,2021,Zion Williamson,20,PF,NOP,61,0.611,10,0.698,441,226,57,39,1647,0.932304,-0.122103
1356,2019,Álex Abrines,25,SG,OKC,31,0.357,41,0.923,48,20,17,6,165,-0.942309,-1.250053
2064,2018,Álex Abrines,24,SG,OKC,75,0.395,84,0.848,114,28,38,8,353,-0.511790,-0.939159
2728,2017,Álex Abrines,23,SG,OKC,68,0.393,94,0.898,86,40,37,8,406,-0.486997,-0.974040


In [15]:
percentile_df = pd.DataFrame(columns = ['Year', 'Player', 'Age', 'Tm', '3P_Percentile', 'FG%_Percentile', 'TRB_Percentile', 'FT%_Percentile', 'AST_Percentile', 'STL_Percentile', 'BLK_Percentile', 'PTS_Percentile'])
percentile_df['Year'] = final_yahoo_nba_df['Year']
percentile_df['Player'] = final_yahoo_nba_df['Player']
percentile_df['Age'] = final_yahoo_nba_df['Age']
percentile_df['Tm'] = final_yahoo_nba_df['Tm']
percentile_df['3P_Percentile'] = (final_yahoo_nba_df['3P'] / final_yahoo_nba_df['3P'].max())
percentile_df['FG%_Percentile'] = (final_yahoo_nba_df['FG%'] / final_yahoo_nba_df['FG%'].max())
percentile_df['TRB_Percentile'] = (final_yahoo_nba_df['TRB'] / final_yahoo_nba_df['TRB'].max())
percentile_df['FT%_Percentile'] = (final_yahoo_nba_df['FT%'] / final_yahoo_nba_df['FT%'].max())
percentile_df['AST_Percentile'] = (final_yahoo_nba_df['AST'] / final_yahoo_nba_df['AST'].max())
percentile_df['STL_Percentile'] = (final_yahoo_nba_df['STL'] / final_yahoo_nba_df['STL'].max())
percentile_df['BLK_Percentile'] = (final_yahoo_nba_df['BLK'] / final_yahoo_nba_df['BLK'].max())
percentile_df['PTS_Percentile'] = (final_yahoo_nba_df['PTS'] / final_yahoo_nba_df['PTS'].max())


Unnamed: 0,Year,Player,Age,Tm,3P_Percentile,FG%_Percentile,TRB_Percentile,FT%_Percentile,AST_Percentile,STL_Percentile,BLK_Percentile,PTS_Percentile
2143,2018,Aaron Brooks,33,MIN,0.029101,0.532110,0.013633,0.727,0.022051,0.033898,0.000000,0.026615
2797,2017,Aaron Brooks,32,IND,0.126984,0.528178,0.055333,0.800,0.137817,0.141243,0.042056,0.114265
231,2021,Aaron Gordon,25,TOT,0.156085,0.606815,0.227747,0.651,0.177508,0.186441,0.158879,0.219304
937,2020,Aaron Gordon,24,ORL,0.193122,0.572739,0.380914,0.674,0.251378,0.288136,0.182243,0.317246
1605,2019,Aaron Gordon,23,ORL,0.320106,0.588467,0.460305,0.731,0.318633,0.322034,0.261682,0.442158
...,...,...,...,...,...,...,...,...,...,...,...,...
688,2021,Zion Williamson,20,NOP,0.026455,0.800786,0.353649,0.698,0.249173,0.322034,0.182243,0.584457
1356,2019,Álex Abrines,25,OKC,0.108466,0.467890,0.038492,0.923,0.022051,0.096045,0.028037,0.058552
2064,2018,Álex Abrines,24,OKC,0.222222,0.517693,0.091419,0.848,0.030871,0.214689,0.037383,0.125266
2728,2017,Álex Abrines,23,OKC,0.248677,0.515072,0.068966,0.898,0.044101,0.209040,0.037383,0.144074


In [19]:
percentile_df['Rank'] = (percentile_df['3P_Percentile'] + percentile_df['FG%_Percentile'] + percentile_df['TRB_Percentile'] + percentile_df['FT%_Percentile'] + percentile_df['AST_Percentile'] + percentile_df['STL_Percentile'] + percentile_df['BLK_Percentile'] + percentile_df['PTS_Percentile'])
percentile_df = percentile_df.sort_values('Rank', ascending = False)
percentile_df


Unnamed: 0,Year,Player,Age,Tm,3P_Percentile,FG%_Percentile,TRB_Percentile,FT%_Percentile,AST_Percentile,STL_Percentile,BLK_Percentile,PTS_Percentile,Rank
1626,2019,James Harden,29,HOU,1.000000,0.579292,0.415397,0.879,0.646086,0.892655,0.271028,1.000000,5.683459
3286,2017,Russell Westbrook,28,OKC,0.529101,0.557012,0.692863,0.845,0.926130,0.745763,0.144860,0.907736,5.348464
2939,2017,James Harden,27,HOU,0.693122,0.576671,0.528468,0.847,1.000000,0.683616,0.177570,0.836054,5.342501
2368,2018,LeBron James,33,CLE,0.394180,0.710354,0.568565,0.731,0.823594,0.655367,0.331776,0.798793,5.013629
963,2020,James Harden,30,HOU,0.791005,0.581913,0.357658,0.865,0.564498,0.706215,0.280374,0.828602,4.975266
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2420,2018,Jake Layman,23,POR,0.010582,0.390564,0.012831,0.667,0.012128,0.033898,0.018692,0.012065,1.157759
458,2021,Juwan Morgan,23,UTA,0.010582,0.612058,0.022454,0.429,0.009923,0.022599,0.004673,0.012420,1.123708
1126,2020,Malcolm Miller,26,TOR,0.021164,0.542595,0.012831,0.375,0.012128,0.028249,0.009346,0.012420,1.013732
1561,2019,Jacob Evans,21,GSW,0.010582,0.445609,0.020048,0.000,0.025358,0.028249,0.014019,0.014194,0.558060
