## Advanced NBA Statistics Capstone
# 1. Web Scraping

- The data we will use for this project is from www.nba.com/stats/.
- We will use a combination of Selenium, Beautiful Soup, and Pandas to scrape the data.
- After scraping and merging, this initial dataset will have 540 rows of player data and 132 features of player statistics
- The data that we will use for this project is stored in tabular format on the following webpages:


[Traditional Stats 2017-2018](https://www.nba.com/stats/players/traditional/?Season=2017-18&SeasonType=Regular%20Season&sort=PTS&dir=-1)

[Traditional Stats 2018-2019](https://www.nba.com/stats/players/traditional/?Season=2018-19&SeasonType=Regular%20Season&sort=PTS&dir=-1)

[Advanced Stats 2017-2018](https://www.nba.com/stats/players/advanced/?sort=GP&dir=-1&Season=2017-18&SeasonType=Regular%20Season)

[Advanced Stats 2018-2019](https://www.nba.com/stats/players/advanced/?sort=GP&dir=-1&Season=2018-19&SeasonType=Regular%20Season)

[Hustle Stats 2017-2018](https://www.nba.com/stats/players/hustle/?Season=2017-18&SeasonType=Regular%20Season)

[Shooting Stats 2017-2018](https://www.nba.com/stats/players/shooting/?Season=2017-18&SeasonType=Regular%20Season)

[Opponent Shooting Stats 2017-2018](https://www.nba.com/stats/players/opponent-shooting/?Season=2017-18&SeasonType=Regular%20Season)

[Passing Stats 2017-2018](https://www.nba.com/stats/players/passing/?Season=2017-18&SeasonType=Regular%20Season)

[Rebounding Stats 2017-2018](https://www.nba.com/stats/players/rebounding/?Season=2017-18&SeasonType=Regular%20Season)

[Boxout Stats 2017-2018](https://www.nba.com/stats/players/box-outs/?Season=2017-18&SeasonType=Regular%20Season)

[Player Bio Stats 2017-2018](https://www.nba.com/stats/players/bio/?Season=2017-18&SeasonType=Regular%20Season)
___

## Web Scraping with Selenium + Beautiful Soup

In [1]:
# importing packages

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import pandas as pd
import time

pd.set_option('display.max_columns', None)

In [None]:
# Scraping the Hustle Stats Table from the 2017-2018 season.

driver = webdriver.Chrome(ChromeDriverManager().install())
url = 'https://www.nba.com/stats/players/hustle/?Season=2017-18&SeasonType=Regular%20Season'
driver.get(url)

# waiting five seconds for webpage to fully load
time.sleep(5)

# Selecting xpath for the pagination button on the website
select = Select(driver.find_element_by_xpath(r"/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select"))
select.select_by_index(0)

src = driver.page_source
parser = BeautifulSoup(src, 'lxml')
table = parser.find('div', attrs = {'class': 'nba-stat-table__overflow'})

headers = table.findAll('th')

headerlist = [h.text.strip() for h in headers]
rows = table.findAll('tr')[1:]
player_stats = [[td.getText().strip() for td in rows[i].findAll('td')] for  i in range(len(rows))]

hustle_stats = pd.DataFrame(player_stats, columns= headerlist)

In [3]:
hustle_stats.head()

Unnamed: 0,Player,TEAM,AGE,GP,MIN,ScreenAssists,ScreenAssists PTS,Deflections,OFF Loose BallsRecovered,DEF Loose BallsRecovered,Loose BallsRecovered,% Loose BallsRecovered OFF,% Loose BallsRecovered DEF,ChargesDrawn,Contested2PT Shots,Contested3PT Shots,ContestedShots
0,Aaron Brooks,MIN,33,32,5.9,0.0,0.1,0.3,0.1,0.1,0.2,57.1,42.9,0.03,0.6,0.9,1.5
1,Aaron Gordon,ORL,22,58,32.9,0.9,1.9,1.4,0.5,0.7,1.2,42.9,57.1,0.07,4.7,1.8,6.5
2,Aaron Harrison,DAL,23,9,25.9,0.1,0.3,2.0,0.2,0.7,0.9,25.0,75.0,0.0,3.3,3.2,6.6
3,Aaron Jackson,HOU,32,1,34.5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Abdel Nader,BOS,24,48,10.9,0.1,0.3,0.7,0.1,0.2,0.4,38.9,61.1,0.0,1.5,1.3,2.8


In [None]:
# Scraping the SHOOTING Table from the 2017-2018 season.

driver = webdriver.Chrome(ChromeDriverManager().install())
url = 'https://www.nba.com/stats/players/shooting/?Season=2017-18&SeasonType=Regular%20Season'
driver.get(url)

# waiting five seconds for webpage to fully load
time.sleep(5)

select = Select(driver.find_element_by_xpath(r"/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select"))
select.select_by_index(0)

src = driver.page_source
parser = BeautifulSoup(src, 'lxml')
table = parser.find('div', attrs = {'class': 'nba-stat-table__overflow'})

headers = table.findAll('th')
headerlist = [h.text.strip() for h in headers]
rows = table.findAll('tr')[1:]
player_stats = [[td.getText().strip() for td in rows[i].findAll('td')] for  i in range(len(rows))]

headerlist1 = headerlist[-24:]
player_stats1 = player_stats[5:]
shooting_stats = pd.DataFrame(player_stats1, columns= headerlist1)

In [24]:
# Note: still need to rename column names to include range of shots.

shooting_stats = shooting_stats.iloc[:540]

In [25]:
shooting_stats.head()

Unnamed: 0,Player,TEAM,AGE,FGM,FGA,FG%,FGM.1,FGA.1,FG%.1,FGM.2,FGA.2,FG%.2,FGM.3,FGA.3,FG%.3,FGM.4,FGA.4,FG%.4,FGM.5,FGA.5,FG%.5,FGM.6,FGA.6,FG%.6
0,Aaron Brooks,MIN,33,0.3,0.6,46.7,0.1,0.3,25.0,0.3,0.5,55.6,0.1,0.2,75.0,0.2,0.4,54.5,0.2,0.7,26.3,,,
1,Aaron Gordon,ORL,22,3.2,4.9,66.7,0.2,0.9,20.0,0.4,1.2,35.7,0.5,1.8,29.8,0.7,2.1,35.3,1.4,4.1,33.5,,,
2,Aaron Harrison,DAL,23,0.6,1.0,55.6,0.1,0.2,50.0,0.1,0.4,25.0,0.3,1.1,30.0,0.4,2.6,17.4,0.6,2.3,23.8,,,
3,Aaron Jackson,HOU,32,1.0,4.0,25.0,1.0,1.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,50.0,0.0,2.0,0.0,,,
4,Abdel Nader,BOS,24,0.4,1.1,40.4,0.1,0.5,20.0,0.0,0.1,20.0,0.0,0.1,0.0,0.3,0.7,38.7,0.3,0.8,32.4,,,


In [None]:
# Scraping the OPPONENT SHOOTING Stats Table from the 2017-2018 season.

driver = webdriver.Chrome(ChromeDriverManager().install())
url = 'https://www.nba.com/stats/players/opponent-shooting/?Season=2017-18&SeasonType=Regular%20Season'
driver.get(url)

# waiting five seconds for webpage to fully load
time.sleep(5)

select = Select(driver.find_element_by_xpath(r"/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select"))
select.select_by_index(0)

src = driver.page_source
parser = BeautifulSoup(src, 'lxml')
table = parser.find('div', attrs = {'class': 'nba-stat-table__overflow'})

headers = table.findAll('th')
headerlist = [h.text.strip() for h in headers]
rows = table.findAll('tr')[1:]
player_stats = [[td.getText().strip() for td in rows[i].findAll('td')] for  i in range(len(rows))]

headerlist1 = headerlist[-21:]
player_stats1 = player_stats[5:]
opp_shooting_stats = pd.DataFrame(player_stats1, columns= headerlist1)
opp_shooting_stats = opp_shooting_stats.iloc[:540]

In [27]:
#Note: Still Need to rename column names to include range of shots.
opp_shooting_stats.head()

Unnamed: 0,Player,TEAM,AGE,FGM,FGA,FG%,FGM.1,FGA.1,FG%.1,FGM.2,FGA.2,FG%.2,FGM.3,FGA.3,FG%.3,FGM.4,FGA.4,FG%.4,FGM.5,FGA.5,FG%.5
0,Aaron Brooks,MIN,33,2.6,3.6,72.4,0.2,0.8,27.3,0.6,1.2,47.8,0.5,1.1,44.4,1.6,3.4,48.8,0.8,2.3,32.8
1,Aaron Gordon,ORL,22,14.4,22.8,63.2,2.3,6.1,37.1,2.2,5.1,42.4,2.5,5.9,42.4,3.7,9.3,40.3,3.8,11.1,33.9
2,Aaron Harrison,DAL,23,9.9,14.3,69.0,1.9,5.3,35.4,1.8,2.7,66.7,2.4,4.7,52.4,2.9,8.7,33.3,3.2,8.4,38.2
3,Aaron Jackson,HOU,32,16.0,24.0,66.7,2.0,7.0,28.6,1.0,3.0,33.3,1.0,4.0,25.0,6.0,11.0,54.5,2.0,9.0,22.2
4,Abdel Nader,BOS,24,4.2,7.1,59.8,0.5,1.9,26.8,0.6,1.5,41.9,0.8,1.9,42.2,1.3,3.0,43.1,1.7,4.2,39.8


In [None]:
# Scraping the PASSING Stats Table from the 2017-2018 season.

driver = webdriver.Chrome(ChromeDriverManager().install())
url = 'https://www.nba.com/stats/players/passing/?Season=2017-18&SeasonType=Regular%20Season'
driver.get(url)

# waiting five seconds for webpage to fully load
time.sleep(5)

select = Select(driver.find_element_by_xpath(r"/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select"))
select.select_by_index(0)

src = driver.page_source
parser = BeautifulSoup(src, 'lxml')
table = parser.find('div', attrs = {'class': 'nba-stat-table__overflow'})

headers = table.findAll('th')
headerlist = [h.text.strip() for h in headers]
rows = table.findAll('tr')[1:]
player_stats = [[td.getText().strip() for td in rows[i].findAll('td')] for  i in range(len(rows))]
passing_stats = pd.DataFrame(player_stats, columns= headerlist)

In [22]:
passing_stats.head()

Unnamed: 0,Player,Team,GP,W,L,MIN,PassesMade,PassesReceived,AST,SecondaryAST,PotentialAST,AST PTSCreated,AST PTSCreated.1,ASTAdj,AST ToPass%,AST ToPass% Adj
0,Aaron Brooks,MIN,32,19,13,5.9,9.0,10.5,0.6,0.0,1.3,,1.5,0.7,7.0,7.3
1,Aaron Gordon,ORL,58,19,39,32.9,40.2,32.5,2.3,0.2,4.6,,5.6,2.7,5.8,6.6
2,Aaron Harrison,DAL,9,2,7,25.9,19.6,21.1,1.2,0.0,2.8,,3.2,1.2,6.3,6.3
3,Aaron Jackson,HOU,1,0,1,34.5,49.0,51.0,1.0,0.0,9.0,,3.0,1.0,2.0,2.0
4,Abdel Nader,BOS,47,32,15,11.1,8.7,9.1,0.6,0.1,1.3,,1.5,0.7,6.4,8.6


In [None]:
# Scraping the REBOUNDING Table from the 2017-2018 season.

driver = webdriver.Chrome(ChromeDriverManager().install())
url = 'https://www.nba.com/stats/players/rebounding/?Season=2017-18&SeasonType=Regular%20Season'
driver.get(url)

# waiting five seconds for webpage to fully load
time.sleep(5)

select = Select(driver.find_element_by_xpath(r"/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select"))
select.select_by_index(0)

src = driver.page_source
parser = BeautifulSoup(src, 'lxml')
table = parser.find('div', attrs = {'class': 'nba-stat-table__overflow'})

headers = table.findAll('th')
headerlist = [h.text.strip() for h in headers]
rows = table.findAll('tr')[1:]
player_stats = [[td.getText().strip() for td in rows[i].findAll('td')] for  i in range(len(rows))]

rebounding_stats = pd.DataFrame(player_stats, columns= headerlist)

In [7]:
rebounding_stats.head()

Unnamed: 0,Player,Team,GP,W,L,MIN,REB,ContestedREB,ContestedREB%,REBChances,REBChance%,DeferredREB Chances,AdjustedREB Chance%,AVG REBDistance
0,Aaron Brooks,MIN,32,19,13,5.9,0.5,0.1,17.6,1.1,47.2,0.2,56.7,11.5
1,Aaron Gordon,ORL,58,19,39,32.9,7.9,2.6,33.5,13.6,58.1,0.7,61.1,6.0
2,Aaron Harrison,DAL,9,2,7,25.9,2.7,0.3,12.5,4.7,57.1,0.2,60.0,11.0
3,Aaron Jackson,HOU,1,0,1,34.5,3.0,1.0,33.3,7.0,42.9,0.0,42.9,7.8
4,Abdel Nader,BOS,47,32,15,11.1,1.5,0.4,29.6,2.6,57.7,0.2,62.3,7.0


In [None]:
# Scraping the BOX-OUT Table from the 2017-2018 season. 

driver = webdriver.Chrome(ChromeDriverManager().install())
url = 'https://www.nba.com/stats/players/box-outs/?Season=2017-18&SeasonType=Regular%20Season'
driver.get(url)

# waiting five seconds for webpage to fully load
time.sleep(5)

select = Select(driver.find_element_by_xpath(r"/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select"))
select.select_by_index(0)

src = driver.page_source
parser = BeautifulSoup(src, 'lxml')
table = parser.find('div', attrs = {'class': 'nba-stat-table__overflow'})

headers = table.findAll('th')
headerlist = [h.text.strip() for h in headers]
rows = table.findAll('tr')[1:]
player_stats = [[td.getText().strip() for td in rows[i].findAll('td')] for  i in range(len(rows))]

boxout_stats = pd.DataFrame(player_stats, columns= headerlist)

In [29]:
boxout_stats.head()

Unnamed: 0,Player,TEAM,AGE,GP,MIN,Box Outs,OFF Box Outs,DEF Box Outs,Team RebOn Box Outs,Player RebOn Box Outs,% Box Outs Off,% Box Outs Def,% Team RebWhen Box Out,% Player RebWhen Box Out
0,Aaron Brooks,MIN,33,32,5.9,0.2,0.0,0.2,0.1,0.0,0.0,100.0,75.0,25.0
1,Aaron Gordon,ORL,22,58,32.9,2.7,0.2,2.5,1.3,0.4,7.0,93.0,85.2,23.9
2,Aaron Harrison,DAL,23,9,25.9,0.8,0.0,0.8,0.7,0.0,0.0,100.0,85.7,0.0
3,Aaron Jackson,HOU,32,1,34.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Abdel Nader,BOS,24,48,10.9,0.8,0.1,0.7,0.5,0.1,17.5,82.5,92.0,12.0


In [None]:
# Scraping the BIOS Table from the 2017-2018 season. (features: height,weight, draft number?)

driver = webdriver.Chrome(ChromeDriverManager().install())
url = 'https://www.nba.com/stats/players/bio/?Season=2017-18&SeasonType=Regular%20Season'
driver.get(url)

# waiting five seconds for webpage to fully load
time.sleep(5)

select = Select(driver.find_element_by_xpath(r"/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select"))
select.select_by_index(0)

src = driver.page_source
parser = BeautifulSoup(src, 'lxml')
table = parser.find('div', attrs = {'class': 'nba-stat-table__overflow'})

headers = table.findAll('th')
headerlist = [h.text.strip() for h in headers]
rows = table.findAll('tr')[1:]
player_stats = [[td.getText().strip() for td in rows[i].findAll('td')] for  i in range(len(rows))]

bio_stats = pd.DataFrame(player_stats, columns= headerlist)

In [10]:
bio_stats.head()

Unnamed: 0,Player,Team,Age,Height,Weight,College,Country,Draft Year,Draft Round,Draft Number,GP,PTS,REB,AST,NetRtg,OREB%,DREB%,USG%,TS%,AST%
0,Aaron Brooks,MIN,33,6-0,161,Oregon,USA,2007,1,26,32,2.3,0.5,0.6,-17.2,3.6%,5.7%,19.2%,50.8%,16.1%
1,Aaron Gordon,ORL,22,6-9,220,Arizona,USA,2014,1,4,58,17.6,7.9,2.3,-2.0,4.4%,19.1%,23.8%,53.0%,11.1%
2,Aaron Harrison,DAL,23,6-6,210,Kentucky,USA,Undrafted,Undrafted,Undrafted,9,6.7,2.7,1.2,-14.1,1.5%,9.1%,14.7%,39.2%,7.2%
3,Aaron Jackson,HOU,32,6-3,183,Duquesne,USA,Undrafted,Undrafted,Undrafted,1,8.0,3.0,1.0,-19.8,5.6%,3.1%,14.9%,40.5%,5.3%
4,Abdel Nader,BOS,24,6-6,230,Iowa State,Egypt,2016,2,58,48,3.0,1.5,0.5,-10.3,2.4%,11.1%,16.2%,43.9%,7.8%


In [None]:
# Scraping the ADVANCED Table from the 2017-2018 season. 

driver = webdriver.Chrome(ChromeDriverManager().install())
url= 'https://www.nba.com/stats/players/advanced/?sort=dir%3D-1&Season=2017-18&SeasonType=Regular%20Season'
driver.get(url)

# waiting five seconds for webpage to fully load
time.sleep(5)

select = Select(driver.find_element_by_xpath(r"/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select"))
select.select_by_index(0)

src = driver.page_source
parser = BeautifulSoup(src, 'lxml')
table = parser.find('div', attrs = {'class': 'nba-stat-table__overflow'})

headers = table.findAll('th')
headerlist = [h.text.strip() for h in headers]
rows = table.findAll('tr')[1:]
player_stats = [[td.getText().strip() for td in rows[i].findAll('td')] for  i in range(len(rows))]


headerlist1 = headerlist[:23]

advanced_stats = pd.DataFrame(player_stats, columns= headerlist1)


In [12]:
advanced_stats.head()

Unnamed: 0,Unnamed: 1,PLAYER,TEAM,AGE,GP,W,L,MIN,OFFRTG,DEFRTG,NETRTG,AST%,AST/TO,AST Ratio,OREB%,DREB%,REB%,TO Ratio,eFG%,TS%,USG%,PACE,PIE
0,,Aaron Brooks,MIN,33,32,19,13,5.9,98.5,115.7,-17.2,16.1,1.82,19.0,3.6,5.7,4.6,10.5,48.6,50.8,19.2,100.86,4.4
1,,Aaron Gordon,ORL,22,58,19,39,32.9,106.3,108.3,-2.0,11.1,1.27,11.3,4.4,19.1,11.6,8.9,50.0,53.0,23.8,100.43,11.3
2,,Aaron Harrison,DAL,23,9,2,7,25.9,98.1,112.2,-14.1,7.2,3.67,12.1,1.5,9.1,4.9,3.3,34.1,39.2,14.7,98.98,2.2
3,,Aaron Jackson,HOU,32,1,0,1,34.5,88.1,107.8,-19.8,5.3,1.0,8.3,5.6,3.1,4.4,8.3,38.9,40.5,14.9,91.13,-1.0
4,,Abdel Nader,BOS,24,48,33,15,10.9,97.1,107.3,-10.3,7.8,0.76,11.5,2.4,11.1,6.7,15.0,41.3,43.9,16.2,100.46,3.3


In [None]:
# Scraping the ADVANCED Table from the 2018-2019 season (to obtain new response variable: PIE_2018)

driver = webdriver.Chrome(ChromeDriverManager().install())
url= 'https://www.nba.com/stats/players/advanced/?sort=dir%3D-1&Season=2018-19&SeasonType=Regular%20Season'
driver.get(url)

# waiting five seconds for webpage to fully load
time.sleep(5)

select = Select(driver.find_element_by_xpath(r"/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select"))
select.select_by_index(0)

src = driver.page_source
parser = BeautifulSoup(src, 'lxml')
table = parser.find('div', attrs = {'class': 'nba-stat-table__overflow'})

headers = table.findAll('th')
headerlist = [h.text.strip() for h in headers]
rows = table.findAll('tr')[1:]
player_stats = [[td.getText().strip() for td in rows[i].findAll('td')] for  i in range(len(rows))]


headerlist1 = headerlist[:23]

advanced_stats_2018 = pd.DataFrame(player_stats, columns= headerlist1)

In [14]:
advanced_stats_2018.head()

Unnamed: 0,Unnamed: 1,PLAYER,TEAM,AGE,GP,W,L,MIN,OFFRTG,DEFRTG,NETRTG,AST%,AST/TO,AST Ratio,OREB%,DREB%,REB%,TO Ratio,eFG%,TS%,USG%,PACE,PIE
0,,Aaron Gordon,ORL,23,78,40,38,33.8,107.4,105.9,1.5,16.6,1.78,18.0,4.7,16.5,10.6,10.1,50.7,53.8,21.3,99.28,10.9
1,,Aaron Holiday,IND,22,50,31,19,12.9,107.2,100.2,7.0,18.0,2.17,21.2,0.8,8.8,4.9,9.8,48.3,51.8,20.6,103.35,7.8
2,,Abdel Nader,OKC,25,61,38,23,11.4,96.4,105.8,-9.5,4.4,0.77,7.3,1.7,13.9,7.5,9.5,49.8,52.2,14.8,104.97,6.6
3,,Al Horford,BOS,33,68,41,27,29.0,112.5,106.4,6.1,20.3,2.77,24.7,6.2,16.1,11.3,8.9,58.6,60.5,18.8,100.82,13.4
4,,Al-Farouq Aminu,POR,28,81,52,29,28.3,116.3,108.2,8.2,5.7,1.44,12.3,4.8,20.4,12.7,8.5,51.4,56.8,13.4,101.22,9.7


In [None]:
# Scraping the TRADITIONAL Table from the 2017-2018 season. (scraping explanatory variables MIN_2017)

driver = webdriver.Chrome(ChromeDriverManager().install())
url = 'https://www.nba.com/stats/players/traditional/?sort=PLAYER_NAME&dir=-1&Season=2017-18&SeasonType=Regular%20Season'
driver.get(url)

# waiting five seconds for webpage to fully load
time.sleep(5)

select = Select(driver.find_element_by_xpath(r"/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select"))
select.select_by_index(0)

src = driver.page_source
parser = BeautifulSoup(src, 'lxml')
table = parser.find('div', attrs = {'class': 'nba-stat-table__overflow'})

headers = table.findAll('th')
headerlist = [h.text.strip() for h in headers]
rows = table.findAll('tr')[1:]
player_stats = [[td.getText().strip() for td in rows[i].findAll('td')] for  i in range(len(rows))]

headerlist1 = headerlist[:30]

traditional_stats_2017 = pd.DataFrame(player_stats, columns= headerlist1)

In [31]:
traditional_stats_2017.head()

Unnamed: 0,Unnamed: 1,PLAYER,TEAM,AGE,GP,W,L,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-
0,,Aaron Brooks,MIN,33,32,19,13,5.9,2.3,0.9,2.2,40.6,0.3,1.0,35.5,0.3,0.3,72.7,0.2,0.3,0.5,0.6,0.3,0.2,0.0,0.9,4.1,0,0,-2.3
1,,Aaron Gordon,ORL,22,58,19,39,32.9,17.6,6.5,14.9,43.4,2.0,5.9,33.6,2.7,3.9,69.8,1.5,6.4,7.9,2.3,1.8,1.0,0.8,1.9,34.1,17,0,-1.6
2,,Aaron Harrison,DAL,23,9,2,7,25.9,6.7,2.1,7.7,27.5,1.0,4.8,20.9,1.4,1.9,76.5,0.4,2.2,2.7,1.2,0.3,1.0,0.2,3.0,15.0,0,0,-8.0
3,,Aaron Jackson,HOU,32,1,0,1,34.5,8.0,3.0,9.0,33.3,1.0,4.0,25.0,1.0,2.0,50.0,2.0,1.0,3.0,1.0,1.0,0.0,0.0,4.0,12.1,0,0,-10.0
4,,Abdel Nader,BOS,24,48,33,15,10.9,3.0,1.0,3.1,33.6,0.5,1.4,35.4,0.5,0.8,59.0,0.3,1.2,1.5,0.5,0.7,0.3,0.2,0.9,6.5,0,0,-2.3


In [None]:
# Scraping the TRADITIONAL Table from the 2018-2019 season. (target expalantory variables MIN_2018 and Team_2018)

driver = webdriver.Chrome(ChromeDriverManager().install())
url = 'https://www.nba.com/stats/players/traditional/?sort=PLAYER_NAME&dir=-1&Season=2018-19&SeasonType=Regular%20Season'
driver.get(url)

# waiting five seconds for webpage to fully load
time.sleep(5)

select = Select(driver.find_element_by_xpath(r"/html/body/main/div/div/div[2]/div/div/nba-stat-table/div[1]/div/div/select"))
select.select_by_index(0)

src = driver.page_source
parser = BeautifulSoup(src, 'lxml')
table = parser.find('div', attrs = {'class': 'nba-stat-table__overflow'})

headers = table.findAll('th')
headerlist = [h.text.strip() for h in headers]
rows = table.findAll('tr')[1:]
player_stats = [[td.getText().strip() for td in rows[i].findAll('td')] for  i in range(len(rows))]


headerlist1 = headerlist[:30]

traditional_stats_2018 = pd.DataFrame(player_stats, columns= headerlist1)

In [33]:
traditional_stats_2018.head()

Unnamed: 0,Unnamed: 1,PLAYER,TEAM,AGE,GP,W,L,MIN,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-
0,,Aaron Gordon,ORL,23,78,40,38,33.8,16.0,6.0,13.4,44.9,1.6,4.4,34.9,2.4,3.2,73.1,1.7,5.7,7.4,3.7,2.1,0.7,0.7,2.2,32.6,19,0,1.4
1,,Aaron Holiday,IND,22,50,31,19,12.9,5.9,2.1,5.2,40.1,0.9,2.5,33.9,0.8,1.0,82.0,0.1,1.2,1.3,1.7,0.8,0.4,0.3,1.4,11.3,0,0,2.0
2,,Abdel Nader,OKC,25,61,38,23,11.4,4.0,1.5,3.5,42.3,0.5,1.6,32.0,0.4,0.6,75.0,0.2,1.7,1.9,0.3,0.4,0.3,0.2,1.1,7.9,0,0,-2.5
3,,Al Horford,BOS,33,68,41,27,29.0,13.6,5.7,10.6,53.5,1.1,3.0,36.0,1.1,1.4,82.1,1.8,5.0,6.7,4.2,1.5,0.9,1.3,1.9,32.8,10,1,3.8
4,,Al-Farouq Aminu,POR,28,81,52,29,28.3,9.4,3.2,7.3,43.3,1.2,3.5,34.3,1.9,2.1,86.7,1.4,6.1,7.5,1.3,0.9,0.8,0.4,1.8,23.2,12,0,4.7


___
# Web Scraping Complete: Next Merging Datasets
#### - Also performing prelimary data cleaning and selecting features for project analysis

In [35]:
# renaming Minutes column to differientiate it from response variable.

clean_traditional_stats_2017 = traditional_stats_2017.rename(columns = {'MIN':'MIN_2017'}) 

# clean_traditional_stats_2017.head()

In [36]:
# Beginning to build concatenated dataframe.

df = clean_traditional_stats_2017.copy()

In [37]:
clean_traditional_stats_2018 = traditional_stats_2018.rename(columns = {'MIN':'MIN_2018'}) 
clean_traditional_stats_2018 = clean_traditional_stats_2018[['PLAYER', 'MIN_2018']].copy()

# clean_traditional_stats_2018.head()

In [38]:
# Merging 2018 stats (the only column needed is MIN_2018, which will be our Explanatory/Indepedent Variable 
# for our Regression Model)

df = df.merge(clean_traditional_stats_2018, on='PLAYER', how='left')

# df.head()

In [39]:
# Rearranging MIN columns to keep track of them before merging more dataframes.

df = df[['', 'PLAYER', 'TEAM', 'AGE', 'MIN_2017', 'MIN_2018','GP', 'W', 'L', 'PTS', 'FGM', 'FGA', 'FG%', '3PM', '3PA', '3P%', 'FTM', 'FTA', 'FT%', 'OREB', 'DREB', 'REB', 'AST', 'TOV', 'STL', 'BLK', 'PF', 'FP', 'DD2', 'TD3', '+/-']]

df.head()

Unnamed: 0,Unnamed: 1,PLAYER,TEAM,AGE,MIN_2017,MIN_2018,GP,W,L,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-
0,,Aaron Brooks,MIN,33,5.9,,32,19,13,2.3,0.9,2.2,40.6,0.3,1.0,35.5,0.3,0.3,72.7,0.2,0.3,0.5,0.6,0.3,0.2,0.0,0.9,4.1,0,0,-2.3
1,,Aaron Gordon,ORL,22,32.9,33.8,58,19,39,17.6,6.5,14.9,43.4,2.0,5.9,33.6,2.7,3.9,69.8,1.5,6.4,7.9,2.3,1.8,1.0,0.8,1.9,34.1,17,0,-1.6
2,,Aaron Harrison,DAL,23,25.9,,9,2,7,6.7,2.1,7.7,27.5,1.0,4.8,20.9,1.4,1.9,76.5,0.4,2.2,2.7,1.2,0.3,1.0,0.2,3.0,15.0,0,0,-8.0
3,,Aaron Jackson,HOU,32,34.5,,1,0,1,8.0,3.0,9.0,33.3,1.0,4.0,25.0,1.0,2.0,50.0,2.0,1.0,3.0,1.0,1.0,0.0,0.0,4.0,12.1,0,0,-10.0
4,,Abdel Nader,BOS,24,10.9,11.4,48,33,15,3.0,1.0,3.1,33.6,0.5,1.4,35.4,0.5,0.8,59.0,0.3,1.2,1.5,0.5,0.7,0.3,0.2,0.9,6.5,0,0,-2.3


In [40]:
# Selecting Columns to Merge into master dataframe.
# bio_stats.head()
bio_columns = bio_stats.columns.to_list()
target_bio_columns = ['Player',  'Height', 'Weight', 'College', 'Country', 'Draft Year', 'Draft Round', 'Draft Number']
clean_bio_stats = bio_stats[target_bio_columns].copy()

# clean_bio_stats.head()

In [41]:
#Renaming "PLAYER" from master dataframe to match case of remaining dataframes
df = df.rename(columns = {'PLAYER':'Player'}) 

In [43]:
# Merging Bio Stats to master dataframe

df = df.merge(clean_bio_stats, on='Player', how='left')

# df.head()

In [44]:
col_names = boxout_stats.columns.to_list()
target_col_names = ['Player', 'Box Outs', 'OFF Box Outs', 'DEF Box Outs', 'Team RebOn Box Outs', 'Player RebOn Box Outs', '% Box Outs Off', '% Box Outs Def', '% Team RebWhen Box Out', '% Player RebWhen Box Out']
clean_boxout_stats = boxout_stats[target_col_names].copy()

# clean_boxout_stats.head()

In [45]:
# Merging Boxout Stats to master dataframe

df = df.merge(clean_boxout_stats, on='Player', how='left')

# df.head()

In [46]:
# Selecting Columns From Rebounding Stats to Merge.

col_names = rebounding_stats.columns.to_list()
target_col_names = ['Player', 'ContestedREB', 'ContestedREB%', 'REBChances', 'REBChance%', 'DeferredREB\xa0Chances', 'AdjustedREB\xa0Chance%', 'AVG\xa0REBDistance']
clean_rebounding_stats = rebounding_stats[target_col_names].copy()

# clean_rebounding_stats.head()

In [47]:
# Merging Rebounding Stats to master dataframe

df = df.merge(clean_rebounding_stats, on='Player', how='left')

# df.head()

In [48]:
# Selecting Columns From Passing Stats to Merge.

col_names = passing_stats.columns.to_list()
target_col_names = ['Player', 'PassesMade', 'PassesReceived', 'AST', 'SecondaryAST', 'PotentialAST', 'AST\xa0PTSCreated', 'AST\xa0PTSCreated', 'ASTAdj', 'AST\xa0ToPass%', 'AST\xa0ToPass%\xa0Adj']
clean_passing_stats = passing_stats[target_col_names].copy()

# clean_passing_stats.head()

In [49]:
# Merging Passing Stats to master dataframe

df = df.merge(clean_passing_stats, on='Player', how='left')

df.head()

Unnamed: 0,Unnamed: 1,Player,TEAM,AGE,MIN_2017,MIN_2018,GP,W,L,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST_x,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,Height_x,Weight_x,College_x,Country_x,Draft Year_x,Draft Round_x,Draft Number_x,Height_y,Weight_y,College_y,Country_y,Draft Year_y,Draft Round_y,Draft Number_y,Box Outs,OFF Box Outs,DEF Box Outs,Team RebOn Box Outs,Player RebOn Box Outs,% Box Outs Off,% Box Outs Def,% Team RebWhen Box Out,% Player RebWhen Box Out,ContestedREB,ContestedREB%,REBChances,REBChance%,DeferredREB Chances,AdjustedREB Chance%,AVG REBDistance,PassesMade,PassesReceived,AST_y,SecondaryAST,PotentialAST,AST PTSCreated,AST PTSCreated.1,AST PTSCreated.2,AST PTSCreated.3,ASTAdj,AST ToPass%,AST ToPass% Adj
0,,Aaron Brooks,MIN,33,5.9,,32,19,13,2.3,0.9,2.2,40.6,0.3,1.0,35.5,0.3,0.3,72.7,0.2,0.3,0.5,0.6,0.3,0.2,0.0,0.9,4.1,0,0,-2.3,6-0,161,Oregon,USA,2007,1,26,6-0,161,Oregon,USA,2007,1,26,0.2,0.0,0.2,0.1,0.0,0.0,100.0,75.0,25.0,0.1,17.6,1.1,47.2,0.2,56.7,11.5,9.0,10.5,0.6,0.0,1.3,,1.5,,1.5,0.7,7.0,7.3
1,,Aaron Gordon,ORL,22,32.9,33.8,58,19,39,17.6,6.5,14.9,43.4,2.0,5.9,33.6,2.7,3.9,69.8,1.5,6.4,7.9,2.3,1.8,1.0,0.8,1.9,34.1,17,0,-1.6,6-9,220,Arizona,USA,2014,1,4,6-9,220,Arizona,USA,2014,1,4,2.7,0.2,2.5,1.3,0.4,7.0,93.0,85.2,23.9,2.6,33.5,13.6,58.1,0.7,61.1,6.0,40.2,32.5,2.3,0.2,4.6,,5.6,,5.6,2.7,5.8,6.6
2,,Aaron Harrison,DAL,23,25.9,,9,2,7,6.7,2.1,7.7,27.5,1.0,4.8,20.9,1.4,1.9,76.5,0.4,2.2,2.7,1.2,0.3,1.0,0.2,3.0,15.0,0,0,-8.0,6-6,210,Kentucky,USA,Undrafted,Undrafted,Undrafted,6-6,210,Kentucky,USA,Undrafted,Undrafted,Undrafted,0.8,0.0,0.8,0.7,0.0,0.0,100.0,85.7,0.0,0.3,12.5,4.7,57.1,0.2,60.0,11.0,19.6,21.1,1.2,0.0,2.8,,3.2,,3.2,1.2,6.3,6.3
3,,Aaron Jackson,HOU,32,34.5,,1,0,1,8.0,3.0,9.0,33.3,1.0,4.0,25.0,1.0,2.0,50.0,2.0,1.0,3.0,1.0,1.0,0.0,0.0,4.0,12.1,0,0,-10.0,6-3,183,Duquesne,USA,Undrafted,Undrafted,Undrafted,6-3,183,Duquesne,USA,Undrafted,Undrafted,Undrafted,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,33.3,7.0,42.9,0.0,42.9,7.8,49.0,51.0,1.0,0.0,9.0,,3.0,,3.0,1.0,2.0,2.0
4,,Abdel Nader,BOS,24,10.9,11.4,48,33,15,3.0,1.0,3.1,33.6,0.5,1.4,35.4,0.5,0.8,59.0,0.3,1.2,1.5,0.5,0.7,0.3,0.2,0.9,6.5,0,0,-2.3,6-6,230,Iowa State,Egypt,2016,2,58,6-6,230,Iowa State,Egypt,2016,2,58,0.8,0.1,0.7,0.5,0.1,17.5,82.5,92.0,12.0,0.4,29.6,2.6,57.7,0.2,62.3,7.0,8.7,9.1,0.6,0.1,1.3,,1.5,,1.5,0.7,6.4,8.6


In [50]:
# Selecting Columns from Hustle Stats to merge.

col_names = hustle_stats.columns.to_list()
target_col_names = ['Player', 'ScreenAssists', 'ScreenAssists PTS', 'Deflections', 'OFF Loose BallsRecovered', 'DEF Loose BallsRecovered', 'Loose BallsRecovered', '% Loose BallsRecovered OFF', '% Loose BallsRecovered DEF', 'ChargesDrawn', 'Contested2PT\xa0Shots', 'Contested3PT\xa0Shots', 'ContestedShots']
clean_hustle_stats = hustle_stats[target_col_names].copy()

# clean_hustle_stats.head()

In [51]:
# Merging Hustle Stats to master dataframe

df = df.merge(clean_hustle_stats, on='Player', how='left')

# df.head()

In [52]:
# Selecting Column names for Shooting Stats


# Dropping three null columns on the right (not on website)

shooting_stats = shooting_stats.iloc[:,:-3]

# Renaming Columns that are missing information
col_names = shooting_stats.columns.to_list()
# Corrected Column names
new_col_names = ['Player', 'TEAM', 'AGE', 'FGM_und_5ft', 'FGA_und_5ft', 'FG%_und_5ft', 'FGM_5_9ft', 'FGA_5_9ft', 'FG%_5_9ft', 'FGM_10_14ft', 'FGA_10_14ft', 'FG%_10_14ft', 'FGM_15_19ft', 'FGA_15_19ft', 'FG%_15_19ft', 'FGM_20_24ft', 'FGA_20_24ft', 'FG%_20_24ft', 'FGM_25_29ft', 'FGA_25_29ft', 'FG%_25_29ft']

shooting_stats.columns = new_col_names

# Merging Shooting Stats to master dataframe

df = df.merge(shooting_stats, on=['Player','TEAM','AGE'], how='left')

# df.head()

In [53]:
# Selecting Column names for Opponent Shooting Stats

# Renaming Columns that are missing information
col_names = opp_shooting_stats.columns.to_list()

# Corrected Column names
new_col_names = ['Player', 'TEAM', 'AGE', 'OPP_FGM_und_5ft', 'OPP_FGA_und_5ft', 'OPP_FG%_und_5ft', 'OPP_FGM_5_9ft', 'OPP_FGA_5_9ft', 'OPP_FG%_5_9ft', 'OPP_FGM_10_14ft', 'OPP_FGA_10_14ft', 'OPP_FG%_10_14ft', 'OPP_FGM_15_19ft', 'OPP_FGA_15_19ft', 'OPP_FG%_15_19ft', 'OPP_FGM_20_24ft', 'OPP_FGA_20_24ft', 'OPP_FG%_20_24ft', 'OPP_FGM_25_29ft', 'OPP_FGA_25_29ft', 'OPP_FG%_25_29ft']

opp_shooting_stats.columns = new_col_names

clean_opp_shooting_stats = opp_shooting_stats.copy()

# Merging Shooting Stats to master dataframe

df = df.merge(clean_opp_shooting_stats, on=['Player','TEAM','AGE'], how='left')

# df.head()

In [54]:
# Selecting Columns from ADVANCED Stats to Merge.


# Analyzing for Duplicate Column Names.
col_names =advanced_stats.columns.to_list()

# Columns with Duplicates Removed.
clean_cols = ['PLAYER','DEFRTG', 'NETRTG', 'AST%', 'OREB%', 'DREB%', 'REB%', 'eFG%', 'TS%', 'USG%', 'PACE', 'PIE']

advanced_stats_clean = advanced_stats[clean_cols].copy()

# Renaming PLAYERS so it has same syntax as main dataframe.
advanced_stats_clean.rename(columns={'PLAYER':'Player', 'PIE':'PIE_2017'}, inplace=True)

# Merging ADVANCED Stats to main dataframe

df = df.merge(advanced_stats_clean, on=['Player'], how='left')

# df.head()

In [55]:
# Selecting Columns from ADVANCED Stats to Merge.

# Analyzing for Duplicate Column Names.
col_names =advanced_stats_2018.columns.to_list()

# Only keeping PIE column as that is the target variable.
clean_cols = ['PLAYER','PIE']

advanced_stats_2018_clean = advanced_stats_2018[clean_cols].copy()

# Renaming PLAYERS so it has same syntax as main dataframe.
advanced_stats_2018_clean.rename(columns={'PIE':'PIE_2018','PLAYER':'Player'}, inplace=True)

# Merging ADVANCED Stats to main dataframe

df = df.merge(advanced_stats_2018_clean, on=['Player'], how='left')

# df.head()

In [56]:
#Dropping null value column from web scraping

df.drop(columns='',inplace=True)

In [57]:
df.head()

Unnamed: 0,Player,TEAM,AGE,MIN_2017,MIN_2018,GP,W,L,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST_x,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,Height_x,Weight_x,College_x,Country_x,Draft Year_x,Draft Round_x,Draft Number_x,Height_y,Weight_y,College_y,Country_y,Draft Year_y,Draft Round_y,Draft Number_y,Box Outs,OFF Box Outs,DEF Box Outs,Team RebOn Box Outs,Player RebOn Box Outs,% Box Outs Off,% Box Outs Def,% Team RebWhen Box Out,% Player RebWhen Box Out,ContestedREB,ContestedREB%,REBChances,REBChance%,DeferredREB Chances,AdjustedREB Chance%,AVG REBDistance,PassesMade,PassesReceived,AST_y,SecondaryAST,PotentialAST,AST PTSCreated,AST PTSCreated.1,AST PTSCreated.2,AST PTSCreated.3,ASTAdj,AST ToPass%,AST ToPass% Adj,ScreenAssists,ScreenAssists PTS,Deflections,OFF Loose BallsRecovered,DEF Loose BallsRecovered,Loose BallsRecovered,% Loose BallsRecovered OFF,% Loose BallsRecovered DEF,ChargesDrawn,Contested2PT Shots,Contested3PT Shots,ContestedShots,FGM_und_5ft,FGA_und_5ft,FG%_und_5ft,FGM_5_9ft,FGA_5_9ft,FG%_5_9ft,FGM_10_14ft,FGA_10_14ft,FG%_10_14ft,FGM_15_19ft,FGA_15_19ft,FG%_15_19ft,FGM_20_24ft,FGA_20_24ft,FG%_20_24ft,FGM_25_29ft,FGA_25_29ft,FG%_25_29ft,OPP_FGM_und_5ft,OPP_FGA_und_5ft,OPP_FG%_und_5ft,OPP_FGM_5_9ft,OPP_FGA_5_9ft,OPP_FG%_5_9ft,OPP_FGM_10_14ft,OPP_FGA_10_14ft,OPP_FG%_10_14ft,OPP_FGM_15_19ft,OPP_FGA_15_19ft,OPP_FG%_15_19ft,OPP_FGM_20_24ft,OPP_FGA_20_24ft,OPP_FG%_20_24ft,OPP_FGM_25_29ft,OPP_FGA_25_29ft,OPP_FG%_25_29ft,DEFRTG,NETRTG,AST%,OREB%,DREB%,REB%,eFG%,TS%,USG%,PACE,PIE_2017,PIE_2018
0,Aaron Brooks,MIN,33,5.9,,32,19,13,2.3,0.9,2.2,40.6,0.3,1.0,35.5,0.3,0.3,72.7,0.2,0.3,0.5,0.6,0.3,0.2,0.0,0.9,4.1,0,0,-2.3,6-0,161,Oregon,USA,2007,1,26,6-0,161,Oregon,USA,2007,1,26,0.2,0.0,0.2,0.1,0.0,0.0,100.0,75.0,25.0,0.1,17.6,1.1,47.2,0.2,56.7,11.5,9.0,10.5,0.6,0.0,1.3,,1.5,,1.5,0.7,7.0,7.3,0.0,0.1,0.3,0.1,0.1,0.2,57.1,42.9,0.03,0.6,0.9,1.5,0.3,0.6,46.7,0.1,0.3,25.0,0.3,0.5,55.6,0.1,0.2,75.0,0.2,0.4,54.5,0.2,0.7,26.3,2.6,3.6,72.4,0.2,0.8,27.3,0.6,1.2,47.8,0.5,1.1,44.4,1.6,3.4,48.8,0.8,2.3,32.8,115.7,-17.2,16.1,3.6,5.7,4.6,48.6,50.8,19.2,100.86,4.4,
1,Aaron Gordon,ORL,22,32.9,33.8,58,19,39,17.6,6.5,14.9,43.4,2.0,5.9,33.6,2.7,3.9,69.8,1.5,6.4,7.9,2.3,1.8,1.0,0.8,1.9,34.1,17,0,-1.6,6-9,220,Arizona,USA,2014,1,4,6-9,220,Arizona,USA,2014,1,4,2.7,0.2,2.5,1.3,0.4,7.0,93.0,85.2,23.9,2.6,33.5,13.6,58.1,0.7,61.1,6.0,40.2,32.5,2.3,0.2,4.6,,5.6,,5.6,2.7,5.8,6.6,0.9,1.9,1.4,0.5,0.7,1.2,42.9,57.1,0.07,4.7,1.8,6.5,3.2,4.9,66.7,0.2,0.9,20.0,0.4,1.2,35.7,0.5,1.8,29.8,0.7,2.1,35.3,1.4,4.1,33.5,14.4,22.8,63.2,2.3,6.1,37.1,2.2,5.1,42.4,2.5,5.9,42.4,3.7,9.3,40.3,3.8,11.1,33.9,108.3,-2.0,11.1,4.4,19.1,11.6,50.0,53.0,23.8,100.43,11.3,10.9
2,Aaron Harrison,DAL,23,25.9,,9,2,7,6.7,2.1,7.7,27.5,1.0,4.8,20.9,1.4,1.9,76.5,0.4,2.2,2.7,1.2,0.3,1.0,0.2,3.0,15.0,0,0,-8.0,6-6,210,Kentucky,USA,Undrafted,Undrafted,Undrafted,6-6,210,Kentucky,USA,Undrafted,Undrafted,Undrafted,0.8,0.0,0.8,0.7,0.0,0.0,100.0,85.7,0.0,0.3,12.5,4.7,57.1,0.2,60.0,11.0,19.6,21.1,1.2,0.0,2.8,,3.2,,3.2,1.2,6.3,6.3,0.1,0.3,2.0,0.2,0.7,0.9,25.0,75.0,0.0,3.3,3.2,6.6,0.6,1.0,55.6,0.1,0.2,50.0,0.1,0.4,25.0,0.3,1.1,30.0,0.4,2.6,17.4,0.6,2.3,23.8,9.9,14.3,69.0,1.9,5.3,35.4,1.8,2.7,66.7,2.4,4.7,52.4,2.9,8.7,33.3,3.2,8.4,38.2,112.2,-14.1,7.2,1.5,9.1,4.9,34.1,39.2,14.7,98.98,2.2,
3,Aaron Jackson,HOU,32,34.5,,1,0,1,8.0,3.0,9.0,33.3,1.0,4.0,25.0,1.0,2.0,50.0,2.0,1.0,3.0,1.0,1.0,0.0,0.0,4.0,12.1,0,0,-10.0,6-3,183,Duquesne,USA,Undrafted,Undrafted,Undrafted,6-3,183,Duquesne,USA,Undrafted,Undrafted,Undrafted,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,33.3,7.0,42.9,0.0,42.9,7.8,49.0,51.0,1.0,0.0,9.0,,3.0,,3.0,1.0,2.0,2.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,25.0,1.0,1.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,50.0,0.0,2.0,0.0,16.0,24.0,66.7,2.0,7.0,28.6,1.0,3.0,33.3,1.0,4.0,25.0,6.0,11.0,54.5,2.0,9.0,22.2,107.8,-19.8,5.3,5.6,3.1,4.4,38.9,40.5,14.9,91.13,-1.0,
4,Abdel Nader,BOS,24,10.9,11.4,48,33,15,3.0,1.0,3.1,33.6,0.5,1.4,35.4,0.5,0.8,59.0,0.3,1.2,1.5,0.5,0.7,0.3,0.2,0.9,6.5,0,0,-2.3,6-6,230,Iowa State,Egypt,2016,2,58,6-6,230,Iowa State,Egypt,2016,2,58,0.8,0.1,0.7,0.5,0.1,17.5,82.5,92.0,12.0,0.4,29.6,2.6,57.7,0.2,62.3,7.0,8.7,9.1,0.6,0.1,1.3,,1.5,,1.5,0.7,6.4,8.6,0.1,0.3,0.7,0.1,0.2,0.4,38.9,61.1,0.0,1.5,1.3,2.8,0.4,1.1,40.4,0.1,0.5,20.0,0.0,0.1,20.0,0.0,0.1,0.0,0.3,0.7,38.7,0.3,0.8,32.4,4.2,7.1,59.8,0.5,1.9,26.8,0.6,1.5,41.9,0.8,1.9,42.2,1.3,3.0,43.1,1.7,4.2,39.8,107.3,-10.3,7.8,2.4,11.1,6.7,41.3,43.9,16.2,100.46,3.3,6.6


# Exporting Merged Dataframe for Data Wrangling

In [63]:
df.to_csv('data/scaped_nba_stats_data.csv',index = False)

print("Final Dataframe of Scraped Data. Number of players:", df.shape[0], 'Number of player stats:', df.shape[1])

Final Dataframe of Scraped Data. Number of players: 540 Number of player stats: 132


___
### Exporting a Separate Dataframe with Additional Features for Final Analysis of Regression Model

In [60]:
advanced_stats_2018[['PLAYER', 'TEAM']]

Unnamed: 0,PLAYER,TEAM
0,Aaron Gordon,ORL
1,Aaron Holiday,IND
2,Abdel Nader,OKC
3,Al Horford,BOS
4,Al-Farouq Aminu,POR
...,...,...
525,Zach LaVine,CHI
526,Zach Lofton,DET
527,Zaza Pachulia,DET
528,Zhaire Smith,PHI


In [61]:

# Selecting Team column from ADVANCED Stats to Merge to dataframe to export to compare model accuracy

team_2018_clean = advanced_stats_2018[['PLAYER', 'TEAM']].copy()

# Renaming PLAYERS and Team so it has same syntax as main dataframe.
team_2018_clean.rename(columns={'PLAYER': 'Player', 'TEAM':'Team_2018'}, inplace=True)

team_2018_clean.head()

# Merging Team_2018 column to separate dataframe

model_analysis_df = df.merge(team_2018_clean, on=['Player'], how='left')
model_analysis_df.head()

Unnamed: 0,Player,TEAM,AGE,MIN_2017,MIN_2018,GP,W,L,PTS,FGM,FGA,FG%,3PM,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST_x,TOV,STL,BLK,PF,FP,DD2,TD3,+/-,Height_x,Weight_x,College_x,Country_x,Draft Year_x,Draft Round_x,Draft Number_x,Height_y,Weight_y,College_y,Country_y,Draft Year_y,Draft Round_y,Draft Number_y,Box Outs,OFF Box Outs,DEF Box Outs,Team RebOn Box Outs,Player RebOn Box Outs,% Box Outs Off,% Box Outs Def,% Team RebWhen Box Out,% Player RebWhen Box Out,ContestedREB,ContestedREB%,REBChances,REBChance%,DeferredREB Chances,AdjustedREB Chance%,AVG REBDistance,PassesMade,PassesReceived,AST_y,SecondaryAST,PotentialAST,AST PTSCreated,AST PTSCreated.1,AST PTSCreated.2,AST PTSCreated.3,ASTAdj,AST ToPass%,AST ToPass% Adj,ScreenAssists,ScreenAssists PTS,Deflections,OFF Loose BallsRecovered,DEF Loose BallsRecovered,Loose BallsRecovered,% Loose BallsRecovered OFF,% Loose BallsRecovered DEF,ChargesDrawn,Contested2PT Shots,Contested3PT Shots,ContestedShots,FGM_und_5ft,FGA_und_5ft,FG%_und_5ft,FGM_5_9ft,FGA_5_9ft,FG%_5_9ft,FGM_10_14ft,FGA_10_14ft,FG%_10_14ft,FGM_15_19ft,FGA_15_19ft,FG%_15_19ft,FGM_20_24ft,FGA_20_24ft,FG%_20_24ft,FGM_25_29ft,FGA_25_29ft,FG%_25_29ft,OPP_FGM_und_5ft,OPP_FGA_und_5ft,OPP_FG%_und_5ft,OPP_FGM_5_9ft,OPP_FGA_5_9ft,OPP_FG%_5_9ft,OPP_FGM_10_14ft,OPP_FGA_10_14ft,OPP_FG%_10_14ft,OPP_FGM_15_19ft,OPP_FGA_15_19ft,OPP_FG%_15_19ft,OPP_FGM_20_24ft,OPP_FGA_20_24ft,OPP_FG%_20_24ft,OPP_FGM_25_29ft,OPP_FGA_25_29ft,OPP_FG%_25_29ft,DEFRTG,NETRTG,AST%,OREB%,DREB%,REB%,eFG%,TS%,USG%,PACE,PIE_2017,PIE_2018,Team_2018
0,Aaron Brooks,MIN,33,5.9,,32,19,13,2.3,0.9,2.2,40.6,0.3,1.0,35.5,0.3,0.3,72.7,0.2,0.3,0.5,0.6,0.3,0.2,0.0,0.9,4.1,0,0,-2.3,6-0,161,Oregon,USA,2007,1,26,6-0,161,Oregon,USA,2007,1,26,0.2,0.0,0.2,0.1,0.0,0.0,100.0,75.0,25.0,0.1,17.6,1.1,47.2,0.2,56.7,11.5,9.0,10.5,0.6,0.0,1.3,,1.5,,1.5,0.7,7.0,7.3,0.0,0.1,0.3,0.1,0.1,0.2,57.1,42.9,0.03,0.6,0.9,1.5,0.3,0.6,46.7,0.1,0.3,25.0,0.3,0.5,55.6,0.1,0.2,75.0,0.2,0.4,54.5,0.2,0.7,26.3,2.6,3.6,72.4,0.2,0.8,27.3,0.6,1.2,47.8,0.5,1.1,44.4,1.6,3.4,48.8,0.8,2.3,32.8,115.7,-17.2,16.1,3.6,5.7,4.6,48.6,50.8,19.2,100.86,4.4,,
1,Aaron Gordon,ORL,22,32.9,33.8,58,19,39,17.6,6.5,14.9,43.4,2.0,5.9,33.6,2.7,3.9,69.8,1.5,6.4,7.9,2.3,1.8,1.0,0.8,1.9,34.1,17,0,-1.6,6-9,220,Arizona,USA,2014,1,4,6-9,220,Arizona,USA,2014,1,4,2.7,0.2,2.5,1.3,0.4,7.0,93.0,85.2,23.9,2.6,33.5,13.6,58.1,0.7,61.1,6.0,40.2,32.5,2.3,0.2,4.6,,5.6,,5.6,2.7,5.8,6.6,0.9,1.9,1.4,0.5,0.7,1.2,42.9,57.1,0.07,4.7,1.8,6.5,3.2,4.9,66.7,0.2,0.9,20.0,0.4,1.2,35.7,0.5,1.8,29.8,0.7,2.1,35.3,1.4,4.1,33.5,14.4,22.8,63.2,2.3,6.1,37.1,2.2,5.1,42.4,2.5,5.9,42.4,3.7,9.3,40.3,3.8,11.1,33.9,108.3,-2.0,11.1,4.4,19.1,11.6,50.0,53.0,23.8,100.43,11.3,10.9,ORL
2,Aaron Harrison,DAL,23,25.9,,9,2,7,6.7,2.1,7.7,27.5,1.0,4.8,20.9,1.4,1.9,76.5,0.4,2.2,2.7,1.2,0.3,1.0,0.2,3.0,15.0,0,0,-8.0,6-6,210,Kentucky,USA,Undrafted,Undrafted,Undrafted,6-6,210,Kentucky,USA,Undrafted,Undrafted,Undrafted,0.8,0.0,0.8,0.7,0.0,0.0,100.0,85.7,0.0,0.3,12.5,4.7,57.1,0.2,60.0,11.0,19.6,21.1,1.2,0.0,2.8,,3.2,,3.2,1.2,6.3,6.3,0.1,0.3,2.0,0.2,0.7,0.9,25.0,75.0,0.0,3.3,3.2,6.6,0.6,1.0,55.6,0.1,0.2,50.0,0.1,0.4,25.0,0.3,1.1,30.0,0.4,2.6,17.4,0.6,2.3,23.8,9.9,14.3,69.0,1.9,5.3,35.4,1.8,2.7,66.7,2.4,4.7,52.4,2.9,8.7,33.3,3.2,8.4,38.2,112.2,-14.1,7.2,1.5,9.1,4.9,34.1,39.2,14.7,98.98,2.2,,
3,Aaron Jackson,HOU,32,34.5,,1,0,1,8.0,3.0,9.0,33.3,1.0,4.0,25.0,1.0,2.0,50.0,2.0,1.0,3.0,1.0,1.0,0.0,0.0,4.0,12.1,0,0,-10.0,6-3,183,Duquesne,USA,Undrafted,Undrafted,Undrafted,6-3,183,Duquesne,USA,Undrafted,Undrafted,Undrafted,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,33.3,7.0,42.9,0.0,42.9,7.8,49.0,51.0,1.0,0.0,9.0,,3.0,,3.0,1.0,2.0,2.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,25.0,1.0,1.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,50.0,0.0,2.0,0.0,16.0,24.0,66.7,2.0,7.0,28.6,1.0,3.0,33.3,1.0,4.0,25.0,6.0,11.0,54.5,2.0,9.0,22.2,107.8,-19.8,5.3,5.6,3.1,4.4,38.9,40.5,14.9,91.13,-1.0,,
4,Abdel Nader,BOS,24,10.9,11.4,48,33,15,3.0,1.0,3.1,33.6,0.5,1.4,35.4,0.5,0.8,59.0,0.3,1.2,1.5,0.5,0.7,0.3,0.2,0.9,6.5,0,0,-2.3,6-6,230,Iowa State,Egypt,2016,2,58,6-6,230,Iowa State,Egypt,2016,2,58,0.8,0.1,0.7,0.5,0.1,17.5,82.5,92.0,12.0,0.4,29.6,2.6,57.7,0.2,62.3,7.0,8.7,9.1,0.6,0.1,1.3,,1.5,,1.5,0.7,6.4,8.6,0.1,0.3,0.7,0.1,0.2,0.4,38.9,61.1,0.0,1.5,1.3,2.8,0.4,1.1,40.4,0.1,0.5,20.0,0.0,0.1,20.0,0.0,0.1,0.0,0.3,0.7,38.7,0.3,0.8,32.4,4.2,7.1,59.8,0.5,1.9,26.8,0.6,1.5,41.9,0.8,1.9,42.2,1.3,3.0,43.1,1.7,4.2,39.8,107.3,-10.3,7.8,2.4,11.1,6.7,41.3,43.9,16.2,100.46,3.3,6.6,OKC


In [64]:
model_analysis_df.to_csv('data/unscaled_dataframe_for_model_analysis.csv',index = False)