# NBA webscraping project

> ## The goal of this project was to scrape NBA.com for player data and concatenate it into a pandas DataFrame to be used for later analysis.

In [1]:
import numpy as np
import pandas as pd
import requests
import time

### Using a test URL, I extracted the column headers from the stats table on NBA.com to use as the columns of my own DataFrame.

In [2]:
# Sample league-leaders endpoint (2023-24 regular season, per-game, sorted by
# points); used only to discover the column headers of the stats table.
test_url = (
    'https://stats.nba.com/stats/leagueLeaders'
    '?LeagueID=00&PerMode=PerGame&Scope=S'
    '&Season=2023-24&SeasonType=Regular%20Season&StatCategory=PTS'
)

In [3]:
# Fetch the sample leaderboard and parse the JSON payload into `r`.
# A timeout keeps the cell from hanging forever if the server never answers,
# and raise_for_status() turns an HTTP error (e.g. 403/429) into a clear
# exception instead of an opaque JSON decoding failure.
response = requests.get(test_url, timeout=30)
response.raise_for_status()
r = response.json()

In [4]:
# Column names of the stats table, taken from the 'headers' entry of the
# response's 'resultSet'.
table_headers = r["resultSet"]["headers"]

In [5]:
# Preview the scraped rows as a DataFrame labelled with the extracted headers.
pd.DataFrame(data=r["resultSet"]["rowSet"], columns=table_headers)

Unnamed: 0,PLAYER_ID,RANK,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,FGA,FG_PCT,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PTS,EFF
0,203954,1,Joel Embiid,1610612755,PHI,28,34.0,11.6,21.7,0.536,...,0.885,2.7,9.0,11.7,5.9,1.1,2.0,3.9,34.9,40.3
1,1629029,2,Luka Doncic,1610612742,DAL,34,36.8,11.4,23.5,0.485,...,0.778,0.7,7.4,8.1,9.1,1.4,0.6,3.9,33.6,34.8
2,1628983,3,Shai Gilgeous-Alexander,1610612760,OKC,37,34.5,11.3,20.3,0.558,...,0.891,0.8,5.0,5.9,6.4,2.4,0.8,1.9,31.5,35.0
3,203507,4,Giannis Antetokounmpo,1610612749,MIL,39,35.1,11.6,19.1,0.608,...,0.677,2.7,8.7,11.4,5.9,1.3,1.1,3.6,31.2,36.2
4,201142,5,Kevin Durant,1610612756,PHX,32,36.9,10.2,19.4,0.526,...,0.878,0.4,5.9,6.3,5.8,0.9,1.1,3.2,29.0,29.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,1630550,256,JT Thor,1610612766,CHA,31,14.9,1.3,3.8,0.342,...,0.429,1.1,1.7,2.7,0.4,0.2,0.4,0.2,3.2,4.1
256,1629637,257,Jaxson Hayes,1610612747,LAL,32,10.3,1.3,1.9,0.672,...,0.536,0.5,1.3,1.8,0.3,0.2,0.4,0.6,3.0,4.2
257,1629723,258,John Konchar,1610612763,MEM,31,16.2,1.1,3.1,0.347,...,0.818,1.0,2.8,3.8,1.3,0.7,0.7,0.4,3.0,7.0
258,1641748,259,Andre Jackson Jr.,1610612749,MIL,34,11.0,1.0,2.0,0.515,...,0.857,0.9,1.1,2.0,1.0,0.3,0.1,0.5,2.6,4.5


### I added `Year` and `Season_type` columns to identify the season and season type that each row belongs to.

In [6]:
# Prototype of the per-season frame: the scraped table (temp_df1) prefixed
# with two constant label columns (temp_df2), joined side by side (temp_df3).
temp_df1 = pd.DataFrame(data=r['resultSet']['rowSet'], columns=table_headers)
temp_df2 = pd.DataFrame(
    {
        'Year': ['2023-24'] * len(temp_df1),
        'Season_type': ['Regular%20Season'] * len(temp_df1),
    }
)
temp_df3 = pd.concat([temp_df2, temp_df1], axis=1)
temp_df3

Unnamed: 0,Year,Season_type,PLAYER_ID,RANK,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PTS,EFF
0,2023-24,Regular%20Season,203954,1,Joel Embiid,1610612755,PHI,28,34.0,11.6,...,0.885,2.7,9.0,11.7,5.9,1.1,2.0,3.9,34.9,40.3
1,2023-24,Regular%20Season,1629029,2,Luka Doncic,1610612742,DAL,34,36.8,11.4,...,0.778,0.7,7.4,8.1,9.1,1.4,0.6,3.9,33.6,34.8
2,2023-24,Regular%20Season,1628983,3,Shai Gilgeous-Alexander,1610612760,OKC,37,34.5,11.3,...,0.891,0.8,5.0,5.9,6.4,2.4,0.8,1.9,31.5,35.0
3,2023-24,Regular%20Season,203507,4,Giannis Antetokounmpo,1610612749,MIL,39,35.1,11.6,...,0.677,2.7,8.7,11.4,5.9,1.3,1.1,3.6,31.2,36.2
4,2023-24,Regular%20Season,201142,5,Kevin Durant,1610612756,PHX,32,36.9,10.2,...,0.878,0.4,5.9,6.3,5.8,0.9,1.1,3.2,29.0,29.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255,2023-24,Regular%20Season,1630550,256,JT Thor,1610612766,CHA,31,14.9,1.3,...,0.429,1.1,1.7,2.7,0.4,0.2,0.4,0.2,3.2,4.1
256,2023-24,Regular%20Season,1629637,257,Jaxson Hayes,1610612747,LAL,32,10.3,1.3,...,0.536,0.5,1.3,1.8,0.3,0.2,0.4,0.6,3.0,4.2
257,2023-24,Regular%20Season,1629723,258,John Konchar,1610612763,MEM,31,16.2,1.1,...,0.818,1.0,2.8,3.8,1.3,0.7,0.7,0.4,3.0,7.0
258,2023-24,Regular%20Season,1641748,259,Andre Jackson Jr.,1610612749,MIL,34,11.0,1.0,...,0.857,0.9,1.1,2.0,1.0,0.3,0.1,0.5,2.6,4.5


In [7]:
# Drop the prototype frames; the scraping loop below rebuilds them per request.
del temp_df1
del temp_df2
del temp_df3

In [8]:
# Final column layout: the two label columns followed by the scraped headers.
df_cols = ['Year', 'Season_type', *table_headers]

In [9]:
# Sanity check: an empty frame showing the final column layout.
pd.DataFrame(data=None, columns=df_cols)

Unnamed: 0,Year,Season_type,PLAYER_ID,RANK,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,...,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PTS,EFF


### I then scraped 12 years of regular-season and postseason data in a loop, with a random time lag between requests to avoid any potential bans. I also included some printed text to show the progress of each iteration in the loop. I then saved the concatenated DataFrame to an Excel spreadsheet as a checkpoint.

In [10]:
# Scrape the per-game league-leader table for every (season, season type)
# pair, label each table with its season, stack everything into `df`, and
# checkpoint the result to an Excel file.
season_types = ['Regular%20Season', 'Playoffs']  # already URL-encoded for the query string
years = ['2012-13', '2013-14', '2014-15', '2015-16', '2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24']

# Collect one frame per request and concatenate once at the end. Growing `df`
# with pd.concat inside the loop re-copies all previous rows on every
# iteration, and concatenating onto an empty DataFrame is deprecated in recent
# pandas versions.
season_frames = []

for y in years:
    for s in season_types:
        api_url = 'https://stats.nba.com/stats/leagueLeaders?LeagueID=00&PerMode=PerGame&Scope=S&Season='+y+'&SeasonType='+s+'&StatCategory=PTS'
        # Timeout so a stalled connection cannot hang the whole scrape;
        # raise_for_status() so a blocked/failed request fails loudly here.
        response = requests.get(api_url, timeout=30)
        response.raise_for_status()
        r = response.json()
        temp_df1 = pd.DataFrame(r['resultSet']['rowSet'], columns = table_headers)
        # Scalar values are broadcast over temp_df1's index, giving one label
        # row per scraped row.
        temp_df2 = pd.DataFrame({'Year': y, 'Season_type': s}, index=temp_df1.index)
        temp_df3 = pd.concat([temp_df2, temp_df1], axis=1)
        # Skip empty results (e.g. a season type with no rows returned) so
        # they cannot influence the dtypes of the combined frame.
        if not temp_df3.empty:
            season_frames.append(temp_df3)
        print(f'Finished scraping data for the {y} {s}.')
        lag = np.random.uniform(low=5,high=20)
        print(f'...waiting {round(lag, 1)} seconds')
        time.sleep(lag)

# Single concatenation; ignore_index gives the combined frame a unique
# 0..n-1 index instead of repeating each season's row numbers.
if season_frames:
    df = pd.concat(season_frames, axis=0, ignore_index=True)
else:
    df = pd.DataFrame(columns = df_cols)

print('Completed')
df.to_excel('nba_player_data.xlsx', index=False)

Finished scraping data for the 2012-13 Regular%20Season.
...waiting 5.6 seconds
Finished scraping data for the 2012-13 Playoffs.
...waiting 8.1 seconds
Finished scraping data for the 2013-14 Regular%20Season.
...waiting 15.1 seconds
Finished scraping data for the 2013-14 Playoffs.
...waiting 14.6 seconds
Finished scraping data for the 2014-15 Regular%20Season.
...waiting 12.9 seconds
Finished scraping data for the 2014-15 Playoffs.
...waiting 11.4 seconds
Finished scraping data for the 2015-16 Regular%20Season.
...waiting 5.3 seconds
Finished scraping data for the 2015-16 Playoffs.
...waiting 11.2 seconds
Finished scraping data for the 2016-17 Regular%20Season.
...waiting 8.6 seconds
Finished scraping data for the 2016-17 Playoffs.
...waiting 14.8 seconds
Finished scraping data for the 2017-18 Regular%20Season.
...waiting 15.0 seconds
Finished scraping data for the 2017-18 Playoffs.
...waiting 19.1 seconds
Finished scraping data for the 2018-19 Regular%20Season.
...waiting 12.3 seconds