# Web Scraping (Selenium)

Below is the code I used to scrape the data I needed from Basketball-Reference.

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time, os

# Path to the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to point at a different location without editing the notebook;
# otherwise the original default path is used.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver

In [25]:
# Team abbreviations to loop over when building Basketball-Reference team URLs.
# One list holds every code we try, including several codes for franchises whose
# abbreviation changed over the years, so it is longer than the number of teams
# in any single season. Notes carried over on the renames:
#   NJN through 2012, BRK from 2013
#   NOH through 2013, NOP from 2014
#   CHA through 2014, CHO from 2015
nba_teams = [
    'BOS', 'TOR', 'NYK', 'PHI', 'CLE', 'MIL', 'CHI', 'IND', 'DET', 'ORL',
    'ATL', 'MIA', 'WAS', 'DEN', 'UTA', 'POR', 'SEA', 'OKC', 'MIN', 'LAL',
    'PHO', 'LAC', 'GSW', 'SAC', 'DAL', 'SAS', 'HOU', 'VAN', 'MEM', 'NJN',
    'BRK', 'CHH', 'NOH', 'NOK', 'NOP', 'CHA', 'CHO',
]

# Seasons 2000 through 2019, as strings for URL formatting.
# 2012 is skipped because that season was only 66 games (lockout season).
years = [str(season) for season in range(2000, 2020) if season != 2012]

## NBA League Stats/Players Web Scraping

In [26]:
# Master list of scraped rows. Each row is
# [team, year, <22 cells from 'team_and_opponent'>, <22 cells from 'team_misc'>].
nba_total_stats = []

# Number of cells taken from each of the two tables.
cells_per_table = 22

# Set global driver as Chrome
driver = webdriver.Chrome(chromedriver)

try:
    for each_team in nba_teams:
        for each_year in years:
            url = f'https://www.basketball-reference.com/teams/{each_team}/{each_year}.html'

            # Use Selenium to access the site and load up the tables needed to extract data.
            # The scroll plus the pauses give the page time to render the tables.
            driver.get(url)
            time.sleep(5)
            driver.execute_script('window.scrollTo(0, 5000);')
            time.sleep(7)

            # Use BeautifulSoup to parse data.
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            team_stats = soup.find('table', id='team_and_opponent')
            misc_stats = soup.find('table', id='team_misc')

            # Not every team code has a page for every season; skip pages
            # that do not contain both tables.
            if team_stats is None or misc_stats is None:
                continue

            total_team_stats = team_stats.find_all('td')[24:46]
            total_misc_stats = misc_stats.find_all('td')[0:cells_per_table]

            # A page with fewer cells than expected would misalign the columns,
            # so report it and move on instead of failing mid-run.
            if (len(total_team_stats) != cells_per_table
                    or len(total_misc_stats) != cells_per_table):
                print(f'Skipping {each_team} {each_year}: unexpected table layout')
                continue

            # Store each cell's text as exactly one value so every row has
            # 2 + 22 + 22 entries, even when a cell is empty.
            team_row = [each_team, each_year]
            team_row += [cell.text for cell in total_team_stats]
            team_row += [cell.text for cell in total_misc_stats]
            nba_total_stats.append(team_row)

            # Opened tabs instead of windows.
            # NOTE(review): kept from the original; confirm this keystroke is still needed.
            driver.find_element_by_tag_name('body').send_keys(Keys.COMMAND + 't')
finally:
    # Always shut the browser down, even if the run fails or is interrupted;
    # rows collected so far remain in nba_total_stats.
    driver.quit()

## NBA All-Stars Scraping

In [13]:
# Web-Scrape for NBA All Stars

def _every_nth_cell_text(table, stop, step=20):
    """Return the text of every `step`-th <td> in `table`.

    Starts at the first <td> and only considers cells before index `stop`.
    """
    return [cell.text for cell in table.find_all('td')[:stop:step]]


def _find_roster_tables(soup):
    """Locate the two roster tables on an all-star page.

    Tries each known pair of table ids in order and returns
    (first_table, second_table, stop) for the first pair where both tables
    exist, where `stop` is the cell cut-off used for that layout.
    Returns None when no pair is present.
    """
    layouts = (
        ('West', 'East', 240),
        ('LeBron', 'Stephen', 240),
        ('LeBron', 'Giannis', 250),
    )
    for first_id, second_id, stop in layouts:
        first_table = soup.find('table', id=first_id)
        second_table = soup.find('table', id=second_id)
        if first_table is not None and second_table is not None:
            return first_table, second_table, stop
    return None


# Master all_stars list. Each row is [year, first[0], second[0], first[1], second[1], ...].
all_stars = []
driver = webdriver.Chrome(chromedriver)

# Seasons 2000 through 2019 as strings; 2012 is left out.
years = [str(season) for season in range(2000, 2020) if season != 2012]

try:
    for each_year in years:
        url = f'https://www.basketball-reference.com/allstar/NBA_{each_year}.html'
        driver.get(url)
        time.sleep(5)

        # Use BeautifulSoup to parse data.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Separate out the two all-star rosters (West/East, or the captain-named tables).
        found = _find_roster_tables(soup)
        if found is None:
            print(f'No all-star roster tables found for {each_year}; skipping.')
            continue
        first_table, second_table, stop = found

        first_cells = _every_nth_cell_text(first_table, stop)
        second_cells = _every_nth_cell_text(second_table, stop)

        # Interleave the two rosters into one row. zip() stops at the shorter
        # list, so a mismatch in lengths truncates instead of raising.
        temp_all_stars = [each_year]
        for first_text, second_text in zip(first_cells, second_cells):
            temp_all_stars.append(first_text)
            temp_all_stars.append(second_text)
        all_stars.append(temp_all_stars)

        # NOTE(review): kept from the original; confirm this keystroke is still needed.
        driver.find_element_by_tag_name('body').send_keys(Keys.COMMAND + 't')
finally:
    # Always shut the browser down, even if the run fails or is interrupted.
    driver.quit()

In [27]:
# Build the team-stats DataFrame from the master list: one row per entry in nba_total_stats.
nba_team_stats_df = pd.DataFrame(data=nba_total_stats)


In [17]:
# Build the all-star DataFrame from the master list: one row per entry in all_stars.
nba_all_stars = pd.DataFrame(data=all_stars)

In [28]:
# Label the 46 team-stats columns in the order the values were collected.
nba_team_stats_df.columns = [
    # Identifiers added in front of each scraped row.
    'Team', 'Year',
    # The 22 values taken from the 'team_and_opponent' table.
    'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'FT',
    'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    # The 22 values taken from the 'team_misc' table.
    'WINS', 'LOSSES', 'PW', 'PL', 'MOV', 'SOS', 'SRS', 'ORTG', 'DRTG', 'PACE',
    'FTR', '3PAR', 'EFG%', 'TOV%', 'ORB%', 'FT/FGA', 'OEFG%', 'OTOV%', 'ODRB%',
    'OFT/FGA', 'ARENA', 'ATTENDANCE',
]


In [22]:
# Column labels: the season, then 26 positional slots T1..T26.
# Generating the labels avoids hand-typed slips (the literal list had ' T19'
# with a leading space, which produced a mislabelled column).
nba_all_stars.columns = ['Year'] + [f'T{slot}' for slot in range(1, 27)]

In [29]:
# Write the team stats to a CSV file (without the row index) for easy access later.
nba_team_stats_df.to_csv(path_or_buf='nba_team_stats.csv', index=False)

In [24]:
# Write the all-star table to a CSV file (without the row index) for easy access later.
nba_all_stars.to_csv(path_or_buf='nba_all_stars.csv', index=False)

## US Cities Population

In [194]:
# Years to scrape from biggestuscities.com: 2000 through 2018 inclusive.
pop_years = list(range(2000, 2019))
driver = webdriver.Chrome(chromedriver)

# Master list of [year, city, population] rows across all years.
master_poplist = []

# Upper bound on the <td> index read per page (the original fixed range
# stopped at 5000, i.e. at most 1000 table rows of 5 cells each).
max_cells = 5000

try:
    for each_year in pop_years:
        pop_url = f'https://biggestuscities.com/{each_year}'
        driver.get(pop_url)
        time.sleep(4)

        # Use BeautifulSoup to parse data.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        table = soup.find('table')
        if table is None:
            print(f'No table found for {each_year}; skipping.')
            continue
        rows2 = table.find_all('td')

        # Walk the cells in steps of 5: relative to i, the cell at i-2 is read
        # as the city and the cell at i as the population. Bounding the range by
        # the number of cells actually present avoids an IndexError on shorter tables.
        for i in range(3, min(max_cells, len(rows2)), 5):
            # NOTE(review): removing every space also strips spaces inside
            # city names (e.g. 'NewYork'); kept because the saved data uses this form.
            city = rows2[i - 2].text.replace('\n', '').replace(' ', '')
            population = rows2[i].text.replace('\n', '').replace('          ', '')
            master_poplist.append([each_year, city, population])
finally:
    # Always shut the browser down, even if the run fails or is interrupted.
    driver.quit()

In [195]:
# Preview the scraped rows; each row holds the year, the city and the population, in that order.
pd.DataFrame(data=master_poplist)

Unnamed: 0,0,1,2
0,2000,NewYork,8015348
1,2000,LosAngeles,3703921
2,2000,Chicago,2895671
3,2000,Houston,1977811
4,2000,Philadelphia,1513800
...,...,...,...
18995,2018,PlantCity,39156
18996,2018,Norwich,39136
18997,2018,Germantown,39099
18998,2018,Northglenn,39010


In [196]:
# Write the population rows to a CSV file (without the row index); no column labels are assigned here.
pd.DataFrame(data=master_poplist).to_csv(path_or_buf='city_pop.csv', index=False)