# NBA Data Gathering

In [8]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import sqlalchemy as db
import os

## Data Gathering

### Nba Stats per Season (Web Scrapping)

In [9]:
url = "https://www.basketball-reference.com/leagues/NBA_stats_per_game.html"
response = requests.get(url)
soup = BeautifulSoup(response.content)
table = soup.find_all("table", attrs={"id":"stats"})[0]

In [10]:
headers = [header.text.strip() for header in table.find_all('th',attrs={"scope":"col"})][1:]
headers

['Season',
 'Lg',
 'Age',
 'Ht',
 'Wt',
 'G',
 'MP',
 'FG',
 'FGA',
 '3P',
 '3PA',
 'FT',
 'FTA',
 'ORB',
 'DRB',
 'TRB',
 'AST',
 'STL',
 'BLK',
 'TOV',
 'PF',
 'PTS',
 'FG%',
 '3P%',
 'FT%',
 'Pace',
 'eFG%',
 'TOV%',
 'ORB%',
 'FT/FGA',
 'ORtg']

In [11]:
data = [data.text.strip() for data in table.find_all('td')]
data

['2019-20',
 'NBA',
 '26.0',
 '6-6',
 '216',
 '971',
 '241.7',
 '40.8',
 '88.8',
 '12.1',
 '33.9',
 '17.7',
 '22.9',
 '10.1',
 '34.7',
 '44.9',
 '24.3',
 '7.7',
 '4.9',
 '14.5',
 '20.6',
 '111.4',
 '.460',
 '.357',
 '.771',
 '100.2',
 '.528',
 '12.8',
 '22.6',
 '.199',
 '110.4',
 '2018-19',
 'NBA',
 '26.3',
 '6-6',
 '217',
 '1230',
 '241.6',
 '41.1',
 '89.2',
 '11.4',
 '32.0',
 '17.7',
 '23.1',
 '10.3',
 '34.8',
 '45.2',
 '24.6',
 '7.6',
 '5.0',
 '14.1',
 '20.9',
 '111.2',
 '.461',
 '.355',
 '.766',
 '100.0',
 '.524',
 '12.4',
 '22.9',
 '.198',
 '110.4',
 '2017-18',
 'NBA',
 '26.4',
 '6-7',
 '219',
 '1230',
 '241.4',
 '39.6',
 '86.1',
 '10.5',
 '29.0',
 '16.6',
 '21.7',
 '9.7',
 '33.8',
 '43.5',
 '23.2',
 '7.7',
 '4.8',
 '14.3',
 '19.9',
 '106.3',
 '.460',
 '.362',
 '.767',
 '97.3',
 '.521',
 '13.0',
 '22.3',
 '.193',
 '108.6',
 '2016-17',
 'NBA',
 '26.6',
 '6-7',
 '220',
 '1230',
 '241.6',
 '39.0',
 '85.4',
 '9.7',
 '27.0',
 '17.8',
 '23.1',
 '10.1',
 '33.4',
 '43.5',
 '22.6',
 '7.7',

In [12]:
ncols = len(headers)
nrows = int(len(data)/ncols)

In [13]:
seasons = pd.DataFrame(np.array(data).reshape((nrows, ncols)), columns=headers)

In [14]:
#full table from basketball reference, stats are per game
pd.set_option('max_columns', 32)
seasons.head(10)

Unnamed: 0,Season,Lg,Age,Ht,Wt,G,MP,FG,FGA,3P,3PA,FT,FTA,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,FG%,3P%,FT%,Pace,eFG%,TOV%,ORB%,FT/FGA,ORtg
0,2019-20,NBA,26.0,6-6,216,971,241.7,40.8,88.8,12.1,33.9,17.7,22.9,10.1,34.7,44.9,24.3,7.7,4.9,14.5,20.6,111.4,0.46,0.357,0.771,100.2,0.528,12.8,22.6,0.199,110.4
1,2018-19,NBA,26.3,6-6,217,1230,241.6,41.1,89.2,11.4,32.0,17.7,23.1,10.3,34.8,45.2,24.6,7.6,5.0,14.1,20.9,111.2,0.461,0.355,0.766,100.0,0.524,12.4,22.9,0.198,110.4
2,2017-18,NBA,26.4,6-7,219,1230,241.4,39.6,86.1,10.5,29.0,16.6,21.7,9.7,33.8,43.5,23.2,7.7,4.8,14.3,19.9,106.3,0.46,0.362,0.767,97.3,0.521,13.0,22.3,0.193,108.6
3,2016-17,NBA,26.6,6-7,220,1230,241.6,39.0,85.4,9.7,27.0,17.8,23.1,10.1,33.4,43.5,22.6,7.7,4.7,14.0,19.9,105.6,0.457,0.358,0.772,96.4,0.514,12.7,23.3,0.209,108.8
4,2015-16,NBA,26.7,6-7,221,1230,241.8,38.2,84.6,8.5,24.1,17.7,23.4,10.4,33.3,43.8,22.3,7.8,5.0,14.4,20.3,102.7,0.452,0.354,0.757,95.8,0.502,13.2,23.8,0.209,106.4
5,2014-15,NBA,26.7,6-7,222,1230,242.0,37.5,83.6,7.8,22.4,17.1,22.8,10.9,32.4,43.3,22.0,7.7,4.8,14.4,20.2,100.0,0.449,0.35,0.75,93.9,0.496,13.3,25.1,0.205,105.6
6,2013-14,NBA,26.5,6-7,223,1230,242.0,37.7,83.0,7.7,21.5,17.8,23.6,10.9,31.8,42.7,22.0,7.7,4.7,14.6,20.7,101.0,0.454,0.36,0.756,93.9,0.501,13.6,25.5,0.215,106.6
7,2012-13,NBA,26.7,6-7,223,1229,241.9,37.1,82.0,7.2,20.0,16.7,22.2,11.2,31.0,42.1,22.1,7.8,5.1,14.6,19.8,98.1,0.453,0.359,0.753,92.0,0.496,13.7,26.5,0.204,105.8
8,2011-12,NBA,26.6,6-7,223,990,241.9,36.5,81.4,6.4,18.4,16.9,22.5,11.4,30.8,42.2,21.0,7.7,5.1,14.6,19.6,96.3,0.448,0.349,0.752,91.3,0.487,13.8,27.0,0.208,104.6
9,2010-11,NBA,26.6,6-7,223,1230,241.9,37.2,81.2,6.5,18.0,18.6,24.4,10.9,30.5,41.4,21.5,7.3,4.9,14.3,20.7,99.6,0.459,0.358,0.763,92.1,0.498,13.4,26.4,0.229,107.3


### NBA Stats per players (API)

#### All players database

In [15]:
response=requests.get("https://www.balldontlie.io/api/v1/players?per_page=100&page=1")
response

<Response [200]>

In [16]:
pagesrange=range(1,34)

In [18]:
# full table of all players from balldontlie API
players = pd.DataFrame()
for i in pagesrange:
    playerpage=pd.DataFrame(requests.get(f"https://www.balldontlie.io/api/v1/players?per_page=100&page={i}").json()["data"])
    players=pd.concat([players,playerpage])
players.head()

Unnamed: 0,id,first_name,height_feet,height_inches,last_name,position,team,weight_pounds
0,14,Ike,,,Anigbogu,C,"{'id': 12, 'abbreviation': 'IND', 'city': 'Ind...",
1,25,Ron,,,Baker,G,"{'id': 20, 'abbreviation': 'NYK', 'city': 'New...",
2,47,Jabari,,,Bird,G,"{'id': 2, 'abbreviation': 'BOS', 'city': 'Bost...",
3,67,MarShon,,,Brooks,G,"{'id': 15, 'abbreviation': 'MEM', 'city': 'Mem...",
4,71,Lorenzo,,,Brown,G,"{'id': 28, 'abbreviation': 'TOR', 'city': 'Tor...",


In [30]:
players.loc[players["last_name"]=="Carter",:]

Unnamed: 0,id,first_name,height_feet,height_inches,last_name,position,team,weight_pounds
53,1224,Anthony,,,Carter,,"{'id': 16, 'abbreviation': 'MIA', 'city': 'Mia...",
51,1523,Maurice,,,Carter,,"{'id': 14, 'abbreviation': 'LAL', 'city': 'Los...",
59,2469,Ron,,,Carter,,"{'id': 12, 'abbreviation': 'IND', 'city': 'Ind...",
20,2530,Reggie,,,Carter,,"{'id': 20, 'abbreviation': 'NYK', 'city': 'New...",
21,2531,Butch,,,Carter,,"{'id': 14, 'abbreviation': 'LAL', 'city': 'Los...",
46,2656,Howard,,,Carter,,"{'id': 8, 'abbreviation': 'DEN', 'city': 'Denv...",
36,87,Jevon,6.0,2.0,Carter,G,"{'id': 24, 'abbreviation': 'PHX', 'city': 'Pho...",196.0
37,88,Vince,6.0,6.0,Carter,F-G,"{'id': 1, 'abbreviation': 'ATL', 'city': 'Atla...",220.0


In [20]:
players.shape

(3268, 8)

#### Seasons Average database

In [21]:
response = requests.get("https://www.balldontlie.io/api/v1/season_averages?season=1979&player_ids[]=577")
response.json()

{'data': []}

In [31]:
seasons_range=list(range(2010,2019))
seasons_range

[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]

In [23]:
playersidrange = list(players["id"].unique())
len(playersidrange)

3268

In [24]:
playerseason=pd.DataFrame(requests.get(f"https://www.balldontlie.io/api/v1/season_averages?player_ids[]=115&season=2016").json()["data"])
playerseason

Unnamed: 0,games_played,player_id,season,min,fgm,fga,fg3m,fg3a,ftm,fta,oreb,dreb,reb,ast,stl,blk,turnover,pf,pts,fg_pct,fg3_pct,ft_pct
0,79,115,2016,33:23,8.54,18.27,4.1,9.99,4.11,4.58,0.77,3.7,4.47,6.63,1.8,0.22,3.03,2.32,25.3,0.468,0.411,0.898


In [None]:
players_season_average_2010s = pd.DataFrame()
for s in seasons_range:
    for f in tqdm(playersidrange):
        try:
            playerseason=pd.DataFrame(requests.get(f"https://www.balldontlie.io/api/v1/season_averages?season={s}&player_ids[]={f}").json()["data"])
            players_season_average_2010s=pd.concat([players_season_average_2010s,playerseason])
        except:
            pass
players_season_average_2010s.head()

HBox(children=(IntProgress(value=0, max=3268), HTML(value='')))

From cffi callback <function _verify_callback at 0x0000024DCF95AAF8>:
Traceback (most recent call last):
  File "C:\Users\Pedro\Anaconda3\lib\site-packages\OpenSSL\SSL.py", line 306, in wrapper
    @wraps(callback)
KeyboardInterrupt
From cffi callback <function _verify_callback at 0x0000024DCF099798>:
Traceback (most recent call last):
  File "C:\Users\Pedro\Anaconda3\lib\site-packages\OpenSSL\SSL.py", line 306, in wrapper
    @wraps(callback)
KeyboardInterrupt
From cffi callback <function _verify_callback at 0x0000024DCDF543A8>:
Traceback (most recent call last):
  File "C:\Users\Pedro\Anaconda3\lib\site-packages\OpenSSL\SSL.py", line 306, in wrapper
    @wraps(callback)
KeyboardInterrupt


## Data Cleaning and Manipulation

### NBA Stats per Season

In [139]:
seasons_3 = seasons.loc[:,["Season","3P","3PA","PTS","3P%"]]
seasons_3

Unnamed: 0,Season,3P,3PA,PTS,3P%
0,2019-20,12.1,33.9,111.4,.357
1,2018-19,11.4,32.0,111.2,.355
2,2017-18,10.5,29.0,106.3,.362
3,2016-17,9.7,27.0,105.6,.358
4,2015-16,8.5,24.1,102.7,.354
...,...,...,...,...,...
69,1950-51,,,84.1,
70,1949-50,,,80.0,
71,1948-49,,,80.0,
72,1947-48,,,72.7,


In [140]:
seasons_3.dtypes

Season    object
3P        object
3PA       object
PTS       object
3P%       object
dtype: object

In [141]:
#replacing empty values for nan
seasons_3=seasons_3.replace(r'^\s*$', np.nan, regex=True)

In [142]:
seasons_3["Season"]=seasons_3.loc[:,"Season"].apply(lambda x: x[0:4])

In [143]:
seasons_3["Season"]=seasons_3["Season"].astype(int)
seasons_3["3P"]=seasons_3["3P"].astype(float)
seasons_3["3PA"]=seasons_3["3PA"].astype(float)
seasons_3["PTS"]=seasons_3["PTS"].astype(float)
seasons_3["3P%"]=seasons_3["3P%"].astype(float)

In [144]:
seasons_3.dtypes

Season      int32
3P        float64
3PA       float64
PTS       float64
3P%       float64
dtype: object

In [145]:
seasons_3 = seasons_3.dropna(subset=["3P"])

In [146]:
seasons_3 = seasons_3.rename(columns={"3P%":"3P_pct"})

In [148]:
seasons_3=seasons_3.sort_values(by="Season")

In [149]:
#Pct_change_3P calculates the increase in 3Ps from an year to another
#3P%_in_PTS calculates the importance of 3Ps in PTS made
seasons_3["Pct_change_3P"]=seasons_3["3P"].pct_change()
seasons_3["3P_pct_in_PTS"]=(seasons_3["3P"]*3)/seasons_3["PTS"]

In [150]:
seasons_3

Unnamed: 0,Season,3P,3PA,PTS,3P_pct,Pct_change_3P,3P_pct_in_PTS
40,1979,0.8,2.8,109.3,0.28,,0.021958
39,1980,0.5,2.0,108.1,0.245,-0.375,0.013876
38,1981,0.6,2.3,108.6,0.262,0.2,0.016575
37,1982,0.5,2.3,108.5,0.238,-0.166667,0.013825
36,1983,0.6,2.4,110.1,0.25,0.2,0.016349
35,1984,0.9,3.1,110.8,0.282,0.5,0.024368
34,1985,0.9,3.3,110.2,0.282,0.0,0.024501
33,1986,1.4,4.7,109.9,0.301,0.555556,0.038217
32,1987,1.6,5.0,108.2,0.316,0.142857,0.044362
31,1988,2.1,6.6,109.2,0.323,0.3125,0.057692


### NBA Stats per players

In [160]:
players = players.loc[:,["id","first_name","last_name"]]

Unnamed: 0,id,first_name,last_name
0,14,Ike,Anigbogu
1,25,Ron,Baker
2,47,Jabari,Bird
3,67,MarShon,Brooks
4,71,Lorenzo,Brown


In [165]:
players = players.rename(columns={"id":"player_id"})
players.head(5)

Unnamed: 0,player_id,first_name,last_name
0,14,Ike,Anigbogu
1,25,Ron,Baker
2,47,Jabari,Bird
3,67,MarShon,Brooks
4,71,Lorenzo,Brown


In [231]:
players_season_average_1516 = pd.concat([players_season_average_2015,players_season_average_2016],ignore_index=True)

In [232]:
players_season_average_1516

Unnamed: 0,games_played,player_id,season,min,fgm,fga,fg3m,fg3a,ftm,fta,oreb,dreb,reb,ast,stl,blk,turnover,pf,pts,fg_pct,fg3_pct,ft_pct
0,8,71,2015,7:35,1.00,3.13,0.13,1.00,0.38,0.50,0.00,0.88,0.88,1.38,0.38,0.13,1.13,0.50,2.50,0.320,0.125,0.750
1,69,90,2015,27:15,4.33,9.01,1.62,3.97,1.49,2.30,0.84,5.10,5.94,1.38,0.81,0.25,1.36,2.23,11.78,0.481,0.409,0.648
2,75,179,2015,30:04,5.77,10.19,0.00,0.01,1.95,2.76,2.96,6.92,9.88,1.41,0.63,1.28,1.61,2.57,13.49,0.567,0.000,0.705
3,70,1593,2015,20:43,2.50,3.99,0.01,0.01,0.34,0.71,1.73,5.30,7.03,2.31,0.47,1.63,1.19,3.16,5.36,0.627,1.000,0.480
4,79,241,2015,22:47,3.15,5.39,0.13,0.54,0.87,1.53,2.25,4.14,6.39,1.75,0.66,1.05,1.19,2.71,7.30,0.585,0.233,0.570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
679,80,212,2016,29:40,2.29,5.98,1.18,3.38,1.29,1.60,0.64,3.20,3.84,1.76,0.90,0.40,1.03,2.28,7.04,0.383,0.348,0.805
680,67,214,2016,32:41,6.04,13.33,1.49,4.19,1.78,2.51,0.69,3.25,3.94,7.28,1.49,0.66,2.90,1.99,15.36,0.454,0.356,0.708
681,82,215,2016,19:59,2.84,6.56,1.18,3.33,0.80,0.98,0.33,2.41,2.74,1.24,0.79,0.38,0.80,1.33,7.67,0.433,0.355,0.825
682,78,216,2016,22:34,3.01,6.95,0.19,0.86,2.44,3.24,1.23,4.56,5.79,1.97,1.05,0.56,1.49,2.27,8.65,0.434,0.224,0.751


In [233]:
players_season_average_1516=players_season_average_1516.loc[:,["games_played","player_id","season","fg3m","fg3a","fg3_pct","pts"]].reset_index(drop=True)

In [234]:
players_season_average_1516

Unnamed: 0,games_played,player_id,season,fg3m,fg3a,fg3_pct,pts
0,8,71,2015,0.13,1.00,0.125,2.50
1,69,90,2015,1.62,3.97,0.409,11.78
2,75,179,2015,0.00,0.01,0.000,13.49
3,70,1593,2015,0.01,0.01,1.000,5.36
4,79,241,2015,0.13,0.54,0.233,7.30
...,...,...,...,...,...,...,...
679,80,212,2016,1.18,3.38,0.348,7.04
680,67,214,2016,1.49,4.19,0.356,15.36
681,82,215,2016,1.18,3.33,0.355,7.67
682,78,216,2016,0.19,0.86,0.224,8.65


## Data Storage

In [97]:
load_dotenv("./.env")

True

In [98]:
server = 'postgresql'
username = 'postgres'
password = os.getenv("password")
ip = 'localhost'
database = 'nba3pts'

# create the engine
engine = db.create_engine(f'{server}://{username}:{password}@{ip}/{database}')

# open the connection
conn = engine.connect()

In [235]:
players_season_average_1516.to_sql('players_season_average_1516', con=conn, if_exists='replace', index=False)

In [166]:
players.to_sql('players', con=conn, if_exists='replace', index=False)

In [151]:
seasons_3.to_sql('seasons_3', con=conn, if_exists='replace', index=False)