# Scrape Regular Season Data

### 1. Imports

In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import datetime
import time
import os
import random
import pandas as pd

### 2. NHL Website Settings

In [2]:
# Where to scrape: the season year is substituted into the %i placeholder.
url_template = 'https://www.nhl.com/standings/%i/league'
# CSS class of the page section that holds the full standings table.
search_class = 'g5-component--standings__full-view'

# First season to scrape when no previously saved data is available.
season_st = 1985

# Pacing and retry behaviour: random pause bounds (seconds) between requests
# and the number of times a page load is attempted before giving up.
sleep_min, sleep_max = 5, 10
load_attempts = 3

# Output location, also used to resume from previously scraped seasons.
csv_file = '../data/regular_season.csv'

### 3. Create Webdriver

In [3]:
# Location of the chromedriver binary on this machine (hard-coded local path).
chromedriver = "/Applications/chromedriver"
# Expose the driver path through the process environment as well.
os.environ["webdriver.chrome.driver"] = chromedriver
# Start the Chrome session used by the scraping cell below.
# NOTE(review): the path is passed positionally; newer Selenium releases
# appear to expect a Service object instead — confirm against the installed
# Selenium version before upgrading.
driver = webdriver.Chrome(chromedriver)

### 4. Load Existing Data File

In [4]:
# Resume from a previous run if possible: load the saved seasons and start
# scraping at the season after the latest one on file. If the file is missing,
# unreadable or lacks a usable 'Season' column, start from scratch and leave
# season_st at its configured value.
try:
    df_from_file = pd.read_csv(csv_file)
    # int() rejects NaN (empty column) and non-numeric values, so season_st
    # is only ever replaced by a valid year.
    last_season = int(df_from_file['Season'].max())
except (OSError, KeyError, TypeError, ValueError):
    # OSError: file missing/unreadable; KeyError: no 'Season' column;
    # ValueError/TypeError: empty or malformed file (pandas' EmptyDataError
    # and ParserError are ValueError subclasses) or unusable season values.
    df_list = []
    print('No seasons loaded from file')
else:
    df_list = [df_from_file]
    print('Loaded seasons through %i from file' % last_season)
    season_st = last_season + 1

Loaded seasons through 2021 from file


### 5. Scrape Standings for Each Season

In [5]:
# Imports needed only by this cell; kept here so the cell is self-contained.
from io import StringIO

from selenium.common.exceptions import TimeoutException, WebDriverException

# Scrape one standings page per season, from season_st up to (but not
# including) the current calendar year, appending one DataFrame per season
# to df_list. The browser is always shut down, even if scraping fails.
season_end = datetime.date.today().year
urls_visited = set()
try:
    for season in range(season_st, season_end):

        # Retry failed page loads, pausing between attempts; the for/else
        # raises only when every attempt failed.
        for attempt in range(load_attempts):
            try:
                driver.get(url_template % season)
                break
            except WebDriverException:
                time.sleep(sleep_max)
        else:
            raise RuntimeError('Failed to load page')

        # Skip this season if the standings section never shows up.
        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CLASS_NAME, search_class))
            )
        except TimeoutException:
            print('Standings section not found for season %i, skipping' % season)
            continue

        # Need to check URL after loading because NHL site does not error out
        # for URLs that are out of range; seeing the same URL twice means
        # there are no further seasons to collect.
        url_current = driver.current_url
        if url_current in urls_visited:
            break
        urls_visited.add(url_current)

        # Season standings are contained in an unlabeled table which can be
        # found by navigating down from the section with the search class
        soup_page = BeautifulSoup(driver.page_source, 'lxml')
        soup_section = soup_page.find('section', {'class': search_class})
        soup_table = None if soup_section is None else soup_section.find('table')
        if soup_table is None:
            print('Standings table not found for season %i, skipping' % season)
            continue
        # Wrap the markup in StringIO: passing literal HTML strings to
        # read_html is deprecated in recent pandas versions.
        df_table = pd.read_html(StringIO(str(soup_table)))[0]

        # The first table column contains both team and rank data: the rank
        # is the text before the first space, the team is the last 3 characters.
        col_main = df_table.columns[0]
        series_team = df_table[col_main].str.slice(start=-3)
        series_rank = df_table[col_main].str.split(pat=' ', n=1, expand=True)[0]
        df_season = pd.DataFrame({'Team': series_team, 'Rank': series_rank})
        df_season['Season'] = season
        df_list.append(df_season)

        print('%i records loaded from %s' % (df_season.shape[0], url_current))
        # Random pause between requests to avoid hammering the site.
        sleep_duration = random.uniform(sleep_min, sleep_max)
        time.sleep(sleep_duration)
finally:
    # quit() ends the whole browser session (and the driver process), not
    # just the current window, and runs even when the loop raised.
    driver.quit()

### 6. Assemble Single DataFrame

In [6]:
# Stack the per-season frames (plus any data loaded from file) into a single
# DataFrame. ignore_index=True renumbers the rows so that labels stay unique;
# each freshly scraped season otherwise restarts its index at 0.
df = pd.concat(df_list, ignore_index=True)
print(df.shape)
# Preview up to 10 random rows; sample() raises when asked for more rows
# than the frame contains, so cap the request at the frame length.
df.sample(min(10, len(df)))

(993, 3)


Unnamed: 0,Team,Rank,Season
477,DET,1,2005
375,NSH,19,2000
855,DAL,19,2017
247,NYI,24,1995
564,ATL,28,2007
905,PIT,7,2019
803,CBJ,27,2015
385,TBL,29,2000
7,CHI,8,1985
306,TOR,5,1998


### 7. Export DataFrame to CSV

In [7]:
# Persist the combined standings; the row index is omitted so the file holds
# only the data columns and can be reloaded on the next run.
df.to_csv(path_or_buf=csv_file, index=False)