In [1]:
import pandas as pd

In [2]:
import requests

In [3]:
# Show every column when a wide DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [4]:
import time

In [5]:
import numpy as np

In [6]:
import plotly.express as px
import plotly.graph_objects as go

In [7]:
#Scraping player stats of the last 10 years from the NBA website

In [8]:
# League-leaders endpoint for a single sample season; used only to discover the table headers.
stats_url = (
    'https://stats.nba.com/stats/leagueLeaders'
    '?LeagueID=00'
    '&PerMode=Totals'
    '&Scope=S'
    '&Season=2013-14'
    '&SeasonType=Regular%20Season'
    '&StatCategory=PTS'
)

In [9]:
# Fetch the sample season once so the column headers can be read from the payload.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error directly instead of letting it
# surface later as a JSON decoding failure.
response = requests.get(url=stats_url, timeout=30)
response.raise_for_status()
data = response.json()

In [10]:
# Column names the API reports for the rows of its result set.
result_set = data['resultSet']
table_headers = result_set['headers']

In [11]:
# Final column layout: the two tagging columns followed by the API's own headers.
df_columns = ['Year', 'Season_type', *table_headers]

In [12]:
# Scrape the league-leaders table for every (season, season type) combination
# and write the combined result to 'nba_player_data.csv'.
season_types = ['Regular%20Season', 'Playoffs']
years = ['2013-14','2014-15','2015-16','2016-17','2017-18','2018-19','2019-20','2020-21','2021-22','2022-23']
begin_loop = time.time()

# One frame per request; they are concatenated once after the loop instead of
# re-copying the growing DataFrame on every iteration.
scraped_frames = []

# Looping through each combination of year and season
for year in years:
    for season in season_types:
        api_url = 'https://stats.nba.com/stats/leagueLeaders?LeagueID=00&PerMode=Totals&Scope=S&Season='+year+'&SeasonType='+season+'&StatCategory=PTS'
        # Fail fast on a stalled connection or an HTTP error rather than
        # hanging indefinitely or failing inside .json() with an opaque message.
        response = requests.get(url=api_url, timeout=30)
        response.raise_for_status()
        data = response.json()
        season_df = pd.DataFrame(data['resultSet']['rowSet'], columns = table_headers)
        # Tag every row with its season and season type as the two leading columns
        season_df.insert(0, 'Year', year)
        season_df.insert(1, 'Season_type', season)
        scraped_frames.append(season_df)
        print(f'Finished scraping data for the {year} {season}.')
        # Introducing a random delay to allow scraping from the website
        lag = np.random.uniform(low=5, high=30)
        print(f'... waiting {round(lag, 1)} seconds')
        time.sleep(lag)
print(f'Process Completed! Total runtime: {round((time.time() - begin_loop)/60,2)}')

# Build the final frame in a single concat; fall back to an empty frame with
# the expected columns if nothing was scraped.
if scraped_frames:
    df = pd.concat(scraped_frames, axis = 0, ignore_index = True)
else:
    df = pd.DataFrame(columns = df_columns)

# Saving the DataFrame to a CSV file
df.to_csv('nba_player_data.csv', index = False)

  df = pd.concat([df, temp_df3], axis = 0)


Finished scraping data for the 2013-14 Regular%20Season.
... waiting 25.9 seconds
Finished scraping data for the 2013-14 Playoffs.
... waiting 21.7 seconds
Finished scraping data for the 2014-15 Regular%20Season.
... waiting 19.6 seconds
Finished scraping data for the 2014-15 Playoffs.
... waiting 13.3 seconds
Finished scraping data for the 2015-16 Regular%20Season.
... waiting 23.0 seconds
Finished scraping data for the 2015-16 Playoffs.
... waiting 24.0 seconds
Finished scraping data for the 2016-17 Regular%20Season.
... waiting 22.9 seconds
Finished scraping data for the 2016-17 Playoffs.
... waiting 14.0 seconds
Finished scraping data for the 2017-18 Regular%20Season.
... waiting 28.8 seconds
Finished scraping data for the 2017-18 Playoffs.
... waiting 27.1 seconds
Finished scraping data for the 2018-19 Regular%20Season.
... waiting 13.1 seconds
Finished scraping data for the 2018-19 Playoffs.
... waiting 18.9 seconds
Finished scraping data for the 2019-20 Regular%20Season.
... wai

In [13]:
# Reload the scraped data from disk so the analysis below does not depend on re-running the scrape.
NBAdata = pd.read_csv(filepath_or_buffer='nba_player_data.csv')

In [14]:
#cleaning data

In [15]:
# Remove the RANK and EFF columns. Rebinding (instead of inplace=True) with
# errors='ignore' keeps this cell safe to re-run: a second execution no longer
# raises KeyError because the columns are already gone.
NBAdata = NBAdata.drop(columns = ['RANK','EFF'], errors = 'ignore')

In [16]:
# First four characters of 'Year' (e.g. '2013-14' -> 2013) as an integer season key.
NBAdata['season_start_year'] = NBAdata['Year'].str.slice(stop=4).astype(int)

In [17]:
# Turn the URL-encoded label into a readable one. Assign the result back to the
# column: calling replace(..., inplace=True) on a column selection is chained
# assignment, which warns in pandas 2.x and leaves NBAdata unchanged under
# Copy-on-Write (so the 'Regular Season' filter further down would match nothing).
NBAdata['Season_type'] = NBAdata['Season_type'].replace('Regular%20Season','Regular Season')

In [18]:
# Split the data by season type for the regular-season vs playoffs comparison.
is_regular_season = NBAdata['Season_type'].eq('Regular Season')
is_playoffs = NBAdata['Season_type'].eq('Playoffs')
rs_df = NBAdata.loc[is_regular_season]
playoffs_df = NBAdata.loc[is_playoffs]

In [19]:
# Counting-stat columns that are summed in the aggregations below; MIN is listed first.
total_columns = [
    'MIN',
    'FGM', 'FGA', 'FG3M', 'FG3A', 'FTM', 'FTA',
    'OREB', 'DREB', 'REB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]

In [20]:
#analyzing player stats correlations

In [21]:
# Per-minute analysis: sum each player's totals per season
# (regular-season and playoff rows are combined by the groupby).
data_per_min = NBAdata.groupby(['PLAYER','PLAYER_ID','Year'])[total_columns].sum().reset_index()

# Columns 0-2 are the group keys and column 3 is MIN, so everything from
# position 4 onward is a counting stat to express per minute played.
minutes_played = data_per_min['MIN']
for stat in data_per_min.columns[4:]:
    data_per_min[stat] = data_per_min[stat] / minutes_played

# Simple ratios, written as name -> (numerator, denominator)
ratio_definitions = {
    'FG%': ('FGM', 'FGA'),
    '3PT%': ('FG3M', 'FG3A'),
    'FT%': ('FTM', 'FTA'),
    'FG3A%': ('FG3A', 'FGA'),
    'PTS/FGA': ('PTS', 'FGA'),
    'FG3M/FGM': ('FG3M', 'FGM'),
    'FTA/FGA': ('FTA', 'FGA'),
}
for ratio_name, (numerator, denominator) in ratio_definitions.items():
    data_per_min[ratio_name] = data_per_min[numerator] / data_per_min[denominator]
data_per_min['TRU%'] = 0.5*data_per_min['PTS'] / (data_per_min['FGA'] + 0.475*data_per_min['FTA'])
data_per_min['AST_TOV'] = data_per_min['AST'] / data_per_min['TOV']

# Filter out players who barely played: their sample size is too small
enough_minutes = data_per_min['MIN'].ge(25)
data_per_min = data_per_min.loc[enough_minutes]

# Identifier columns are not part of the correlation analysis
data_per_min = data_per_min.drop(columns = ['PLAYER_ID', 'Year', 'PLAYER'])

# Heatmap of the correlation matrix of the remaining numeric columns
correlation_matrix = data_per_min.corr()
fig = px.imshow(correlation_matrix)
fig.show(renderer='browser')

In [22]:
#analyzing the distribution of minutes played in the regular season vs the playoffs

In [23]:
# Overlaid, percent-normalised histograms of minutes per game (MIN/GP)
# for the regular season and the playoffs, using identical one-minute bins.
minute_bins = {'start':0,'end':46,'size':1}
fig = go.Figure()
for label, season_df in (('Regular Season', rs_df), ('Playoffs', playoffs_df)):
    fig.add_trace(
        go.Histogram(
            x = season_df['MIN']/season_df['GP'],
            histnorm = 'percent',
            name = label,
            xbins = minute_bins,
        )
    )
fig.update_layout(barmode='overlay')
# Semi-transparent bars so both distributions stay visible where they overlap
fig.update_traces(opacity = 0.5)
fig.show(renderer='browser')

In [24]:
#analyzing how the game changed over the past 10 years

In [25]:
# League-wide totals per season (all rows of NBAdata for a given start year are summed)
change_df = NBAdata.groupby('season_start_year')[total_columns].sum().reset_index()

# Estimated possessions, inserted right after MIN so it sits ahead of the counting stats
possessions_estimate = change_df['FGA'] - change_df['OREB'] + change_df['TOV'] + 0.44*change_df['FTA']
change_df.insert(2, 'POSS_est', possessions_estimate)

# Simple ratios, written as name -> (numerator, denominator)
ratio_definitions = {
    'FG%': ('FGM', 'FGA'),
    '3PT%': ('FG3M', 'FG3A'),
    'FT%': ('FTM', 'FTA'),
    'FG3A%': ('FG3A', 'FGA'),
    'PTS/FGA': ('PTS', 'FGA'),
    'FG3M/FGM': ('FG3M', 'FGM'),
    'FTA/FGA': ('FTA', 'FGA'),
}
for ratio_name, (numerator, denominator) in ratio_definitions.items():
    change_df[ratio_name] = change_df[numerator] / change_df[denominator]
change_df['TRU%'] = 0.5*change_df['PTS'] / (change_df['FGA'] + 0.475*change_df['FTA'])
change_df['AST_TOV'] = change_df['AST'] / change_df['TOV']

In [26]:
change_per48_df = change_df.copy()

# Columns to rescale: the possession estimate plus every counting stat except
# MIN itself (the denominator). The ratio columns are left untouched.
# Selecting by name instead of a fixed positional slice keeps this correct
# if total_columns is ever extended or reordered.
scaled_columns = ['POSS_est'] + [stat for stat in total_columns if stat != 'MIN']

# Scaling statistics to a per-48-minute basis: totals divided by total minutes,
# times 48 * 5 (presumably 48 minutes x 5 players on the floor, i.e. a
# per-team, per-game figure -- confirm the intended interpretation).
for col in scaled_columns:
    change_per48_df[col] = change_per48_df[col]/change_per48_df['MIN'] *48*5

# MIN has served its purpose as the denominator and is not plotted
change_per48_df = change_per48_df.drop(columns = 'MIN')

# Creating a Plotly figure to visualize changes over the years for various statistics
# (one line per remaining column, with season_start_year on the x axis)
fig = go.Figure()
for col in change_per48_df.columns[1:]:
    fig.add_trace(go.Scatter(x=change_per48_df['season_start_year'],
                            y=change_per48_df[col], name = col))
fig.show(renderer='browser')