In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import requests
from pathlib import Path

This project requires Python 3.7 or above:

In [5]:
import sys

# Fail fast when the interpreter is older than Python 3.7.
assert sys.version_info[:2] >= (3, 7)

It also requires Scikit-Learn >= 1.0.1:

In [7]:
from packaging import version

# Compare parsed version objects rather than raw strings so that, e.g.,
# "1.10" is correctly treated as newer than "1.2".
_installed_sklearn = version.parse(sklearn.__version__)
assert _installed_sklearn >= version.parse("1.0.1")

# Scrape the Data

In [37]:
# Download the per-game scoring leaders for every (season, season type) pair
# and stack the results into a single DataFrame `df` whose first two columns
# record which request each row came from.

# Season types are kept URL-encoded because the value is spliced straight into
# the query string (and stored as-is in the 'Season_type' column).
seasonTypes = ['Regular%20Season', 'Playoffs']

# One entry per season, 2012-13 through 2020-21. A repeated season would
# duplicate its rows in `df`.
years = ['2012-13', '2013-14', '2014-15', '2015-16', '2016-17',
         '2017-18', '2018-19', '2019-20', '2020-21']

LEADERS_URL = ("https://stats.nba.com/stats/leagueLeaders?LeagueID=00&PerMode=PerGame"
               "&Scope=S&Season={season}&SeasonType={season_type}&StatCategory=PTS")
REQUEST_TIMEOUT_S = 30  # avoid hanging forever if the server never answers

table_headers = None  # filled in from the first response
season_frames = []    # one DataFrame per request; concatenated once at the end

for y in years:
    for s in seasonTypes:
        response = requests.get(url=LEADERS_URL.format(season=y, season_type=s),
                                timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()  # surface HTTP errors instead of a confusing JSON failure
        r = response.json()

        # The column names come from the payload itself, so no request has to
        # have been made before this cell runs.
        if table_headers is None:
            table_headers = r['resultSet']['headers']

        season_df = pd.DataFrame(r['resultSet']['rowSet'], columns=table_headers)
        # Prepend the request identifiers so the columns end up ordered as
        # ['Year', 'Season_type', *table_headers].
        season_df.insert(0, 'Season_type', s)
        season_df.insert(0, 'Year', y)
        season_frames.append(season_df)

df_col = ['Year', 'Season_type'] + table_headers

# Single concatenation (linear) instead of growing `df` inside the loop.
# The per-request row index is intentionally kept, as before.
df = pd.concat(season_frames, axis=0)


## Data Cleaning

In [38]:
# Number of missing entries in each column.
df.isnull().sum()

Year           0
Season_type    0
PLAYER_ID      0
RANK           0
PLAYER         0
TEAM_ID        0
TEAM           0
GP             0
MIN            0
FGM            0
FGA            0
FG_PCT         0
FG3M           0
FG3A           0
FG3_PCT        0
FTM            0
FTA            0
FT_PCT         0
OREB           0
DREB           0
REB            0
AST            0
STL            0
BLK            0
TOV            0
PTS            0
EFF            0
dtype: int64

In [42]:
# DataFrame.drop returns a new frame and leaves the original untouched, so the
# result is bound back to `df` to make the removal of RANK and EFF persist for
# later cells. errors='ignore' keeps the cell re-runnable once the columns are
# already gone.
df = df.drop(columns=['RANK', 'EFF'], errors='ignore')
df

Unnamed: 0,Year,Season_type,PLAYER_ID,PLAYER,TEAM_ID,TEAM,GP,MIN,FGM,FGA,...,FTA,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TOV,PTS
0,2012-13,Regular%20Season,2546,Carmelo Anthony,1610612752,NYK,67,37.0,10.0,22.2,...,7.6,0.830,2.0,4.9,6.9,2.6,0.8,0.5,2.6,28.7
1,2012-13,Regular%20Season,201142,Kevin Durant,1610612760,OKC,81,38.5,9.0,17.7,...,9.3,0.905,0.6,7.3,7.9,4.6,1.4,1.3,3.5,28.1
2,2012-13,Regular%20Season,977,Kobe Bryant,1610612747,LAL,78,38.6,9.5,20.4,...,8.0,0.839,0.8,4.7,5.6,6.0,1.4,0.3,3.7,27.3
3,2012-13,Regular%20Season,2544,LeBron James,1610612748,MIA,76,37.9,10.1,17.8,...,7.0,0.753,1.3,6.8,8.0,7.3,1.7,0.9,3.0,26.8
4,2012-13,Regular%20Season,201935,James Harden,1610612745,HOU,78,38.3,7.5,17.1,...,10.2,0.851,0.8,4.1,4.9,5.8,1.8,0.5,3.8,25.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,2020-21,Playoffs,203524,Solomon Hill,1610612737,ATL,14,10.4,0.4,1.7,...,0.3,0.500,0.2,1.1,1.4,0.2,0.1,0.1,0.1,1.2
143,2020-21,Playoffs,1627885,Shaquille Harrison,1610612743,DEN,9,4.4,0.3,0.4,...,0.3,0.667,0.0,0.9,0.9,0.3,0.3,0.3,0.6,1.0
144,2020-21,Playoffs,1630264,Anthony Gill,1610612764,WAS,4,8.3,0.0,0.8,...,0.0,0.000,0.3,0.8,1.0,0.0,0.0,0.0,0.8,0.0
145,2020-21,Playoffs,1629067,Isaac Bonga,1610612764,WAS,4,2.5,0.0,1.3,...,0.0,0.000,0.0,0.0,0.0,0.0,0.0,0.3,0.0,0.0


In [46]:
# The first four characters of 'Year' (e.g. '2012-13' -> '2012') are stored as an integer.
df['Season_Start_Year'] = df['Year'].str.slice(stop=4).astype(int)

In [50]:
# Map both the 'NOP' and 'NOH' team codes onto the single code 'NO'.
df['TEAM'] = df['TEAM'].replace({'NOP': 'NO', 'NOH': 'NO'})