# Web Scraping (Selenium)

Below are the codes I used to scrape the data I needed from Basketball-Reference.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time, os

chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

In [103]:
# Create list of the 30 NBA teams we will loop for.
# Will create 3 separate lists to pull data, since two teams have changed names.

# NJN FROM 2010-2012, BRK 2013-2020
# NOH FROM 2010-2013, NOP 2014-2020
# CHA FROM 2010-2014, CHO 2015-2020
# WILL SKIP 2012, AS THE SEASON WAS ONLY 66 GAME (LOCKOUT SEASON)

nba_teams = ['BOS', 'TOR', 'NYK', 'PHI', 'CLE', 'MIL', 'CHI', 'IND', 'DET', 'ORL',
                 'ATL', 'MIA', 'WAS', 'DEN', 'UTA', 'POR', 'SEA', 'OKC', 'MIN', 'LAL', 
                 'PHO', 'LAC', 'GSW', 'SAC', 'DAL', 'SAS', 'HOU', 'VAN', 'MEM', 'NJN', 'BRK', 'CHH',
                 'NOH','NOK', 'NOP', 'CHA', 'CHO']

years = ['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2013', '2014', '2015', '2016', '2017', '2018', '2019']

## NBA Players Web Scraping

In [111]:
driver = webdriver.Chrome(chromedriver)
final = []
for each_team in nba_teams:
    for each_year in years:
        url = f'https://www.basketball-reference.com/teams/{each_team}/{each_year}.html'        
        driver.get(url)
        time.sleep(2)
        driver.execute_script('window.scrollTo(0, 5000);')
        time.sleep(5)
        
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        table = soup.find('table', id = 'per_game')
        
        if table == None:
            continue
        else:
            table_row = [row for row in table.find_all('td')]

            # Find the number of names on the roster
            range_end = 0
            for each in table_row:
                if " " in each.text:
                    range_end +=1

            start = 0
            end = 27
            for each in range(range_end):
                temp = []
                for each in range(start,end):
                    temp.append(table_row[each].text)
                start += 27
                end += 27
                temp = [each_team, each_year] + temp
                final.append(temp)

            driver.find_element_by_tag_name('body').send_keys(Keys.COMMAND + 't') 

driver.close()

In [112]:
pd.DataFrame(final)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,BOS,2000,Antoine Walker,23,82,82,36.6,7.9,18.4,.430,...,.699,2.4,5.5,8.0,3.7,1.4,0.4,3.2,3.2,20.5
1,BOS,2000,Paul Pierce,22,73,72,35.4,6.7,15.1,.442,...,.798,1.1,4.3,5.4,3.0,2.1,0.8,2.4,3.2,19.5
2,BOS,2000,Kenny Anderson,29,82,82,31.6,5.3,12.0,.440,...,.775,0.7,2.1,2.7,5.1,1.7,0.1,1.6,2.8,14.0
3,BOS,2000,Adrian Griffin,25,72,47,26.8,2.4,5.7,.424,...,.753,1.8,3.4,5.2,2.5,1.6,0.2,1.3,3.1,6.7
4,BOS,2000,Vitaly Potapenko,24,79,72,22.7,3.9,7.8,.499,...,.681,2.3,4.0,6.3,1.0,0.5,0.4,1.8,3.0,9.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9960,CHO,2019,Bismack Biyombo,26,54,32,14.5,1.6,2.9,.571,...,.637,1.5,3.1,4.6,0.6,0.2,0.8,0.6,1.9,4.4
9961,CHO,2019,Willy Hernangómez,24,58,3,14.0,2.6,5.1,.519,...,.694,2.0,3.3,5.4,1.0,0.3,0.3,1.0,1.7,7.3
9962,CHO,2019,Shelvin Mack,28,4,0,10.5,0.5,3.5,.143,...,.556,0.0,0.5,0.5,0.3,0.5,0.0,1.0,0.8,2.3
9963,CHO,2019,J.P. Macura,23,2,0,8.5,1.5,4.5,.333,...,,0.0,1.5,1.5,1.0,0.0,0.0,0.0,0.0,3.0


In [113]:
nba_players = pd.DataFrame(final)

In [117]:
nba_players.columns = ('Team', 'Year', 'Name', 'Age', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB',
                       'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS/G')

In [118]:
nba_players.to_csv('nba_players.csv', index = False)

## NBA Players Playoff Stats

In [56]:
driver = webdriver.Chrome(chromedriver)
url = f'https://www.basketball-reference.com/playoffs/NBA_2000_per_game.html'
driver.get(url)
time.sleep(5)
driver.execute_script('window.scrollTo(0, 5000);')
time.sleep(5)
soup = BeautifulSoup(driver.page_source, 'html.parser')
table = soup.find('table', id = 'per_game_stats')
table_row = [row for row in table.find_all('td')]

driver.close()

In [None]:
table_row

## NBA Salary

In [None]:
# 2018-2019 Salaries Page Numbers

page_numbers = ['2', '3', '4']

In [42]:
# Set global driver as Chrome
driver = webdriver.Chrome(chromedriver)

temp_list = []
for years in ['2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']:
    url = f'http://www.espn.com/nba/salaries/_/year/{years}/seasontype/4'
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find('div', 'page-numbers')
    end = int(table.text.split(' ')[2])
    for page_number in range(1,end+1):
        names_list = []
        teams_list = []
        salary_list = []
        if page_number == 1:

            url = f'http://www.espn.com/nba/salaries/_/year/{years}/seasontype/4'
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table')
            table_row = [row for row in table.find_all('td')]
            for each in table_row:
                if ',' in each.text and '$' not in each.text:
                    names_list.append(each.text.split(',')[0])
                elif '$' in each.text:
                    salary_list.append(each.text[1:].replace(',', ''))
                elif len(each.text) > 6:
                    teams_list.append(each.text)
            for each in range(len(names_list)):
                temp_list.append([names_list[each], salary_list[each], str(int(years)-1), years, teams_list[each]])
        else:
            url2 = f'http://www.espn.com/nba/salaries/_/year/{years}/page/{page_number}/seasontype/4'
            driver.get(url2)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            table = soup.find('table')
            table_row = [row for row in table.find_all('td')]
            for each in table_row:
                if ',' in each.text and '$' not in each.text:
                    names_list.append(each.text.split(',')[0])
                elif '$' in each.text:
                    salary_list.append(each.text[1:].replace(',', ''))
                elif len(each.text) > 6:
                    teams_list.append(each.text)
            for each in range(len(names_list)):
                temp_list.append([names_list[each], salary_list[each], str(int(years)-1), years, teams_list[each]])

            driver.find_element_by_tag_name('body').send_keys(Keys.COMMAND + 't')



In [44]:
nba_salaries = pd.DataFrame(temp_list)

In [45]:
nba_salaries.to_csv('nba_salaries.csv', index = False)

## US Cities Population

In [None]:
pop_years = [2000, 2001, 2002, 2003, 2004,2005,2006,2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
driver = webdriver.Chrome(chromedriver)

master_poplist = []

for each_year in pop_years:
    pop_url = f'https://biggestuscities.com/{each_year}'
    driver.get(pop_url)
    time.sleep(4)

    # Use BeautifulSoup to parse data.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    table = soup.find('table')
    rows2 = [row for row in table.find_all('td')]
    
    poplist = []
    for i in range(3, 5000, 5):
        temp_list = []
        temp_list.append(each_year)
        temp_list.append(rows2[i-2].text.replace('\n', '').replace('      ', '').replace(' ', ''))
        temp_list.append(rows2[i].text.replace('\n', '').replace('          ', ''))
        master_poplist.append(temp_list)
 
        
driver.close()

In [None]:
pd.DataFrame(master_poplist)

In [None]:
pd.DataFrame(master_poplist).to_csv('city_pop.csv', index = False)