# Scrape Playoff Data

### 1. Imports

In [1]:
import datetime
import io
import os
import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

### 2. NHL Website Settings

In [2]:
# --- Source pages ---
# One page per playoff year; the %i placeholder is filled with that year.
url_template = 'https://www.hockey-reference.com/playoffs/NHL_%i.html'
search_id = 'teams'  # HTML id of the <table> that is read from each page
season_st = 1985     # starting point used when no data file has been loaded

# --- Request pacing ---
# Lower and upper bound (seconds) of the random pause between page requests.
sleep_min, sleep_max = 1, 3

# --- Files ---
exception_file = '../data/playoffs_exceptions.csv'  # rows that override scraped seasons
csv_file = '../data/playoffs.csv'                   # read at start-up, rewritten at the end

### 3. Load Existing Data File

In [3]:
# Resume from a previous run when the data file already holds seasons;
# otherwise start with an empty list so scraping begins at the configured
# season_st.
try:
    df_from_file = pd.read_csv(csv_file)
    # A file that parses but contains no rows carries no season to resume from.
    file_loaded = not df_from_file.empty
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only "there is no usable file yet" means "start from scratch". Any other
    # failure (malformed file, permissions, Ctrl-C) is allowed to propagate
    # instead of being hidden and the file later overwritten.
    file_loaded = False

if file_loaded:
    df_list = [df_from_file]
    # Latest season already stored; scraping continues with the one after it.
    season_st = int(df_from_file['Season'].max())
    print('Loaded seasons through %i from file' % season_st)
    season_st += 1
else:
    df_list = []
    print('No seasons loaded from file')

Loaded seasons through 2020 from file


### 4. Scrape Standings for Each Season

In [4]:
# The page for year Y is stored as Season Y - 1 (see the .assign below), so the
# first page to request is one year after the first season still missing.
# A separate name is used instead of `season_st += 1` so that re-running this
# cell does not move the starting point and silently skip a season.
page_year_st = season_st + 1
season_end = datetime.date.today().year + 1  # range() end is exclusive

# Seasons already present in df_list (loaded from file or scraped by an earlier
# run of this cell); they are not requested or appended a second time.
seasons_loaded = {s for frame in df_list for s in frame['Season'].unique()}

for season in range(page_year_st, season_end):
    if season - 1 in seasons_loaded:
        continue

    # Playoff standings pages don't exist for lockouts and need to be skipped.
    # The status is printed so that other failures (e.g. rate limiting) are
    # visible instead of silently leaving a gap in the data.
    season_url = url_template % season
    response = requests.get(season_url)
    if response.status_code != 200:
        print('Skipped %s (HTTP status %i)' % (season_url, response.status_code))
        continue

    # Playoff standings are contained in a table with the search_id.
    # The parser is named explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup_page = BeautifulSoup(response.content, 'html.parser')
    soup_table = soup_page.find('table', {'id': search_id})
    if soup_table is None:
        print('No table with id "%s" found at %s' % (search_id, season_url))
        continue

    # read_html expects a file-like object; passing a literal HTML string is
    # deprecated in recent pandas versions.
    df_raw = pd.read_html(io.StringIO(str(soup_table)))[0]

    # The chain builds a new frame rather than adding a column to a slice,
    # which avoids SettingWithCopyWarning.
    df_table = (
        df_raw
        .rename(columns={'Rk': 'Rank'})
        .iloc[:-1]  # drop the last row (presumably a summary row, not a team - confirm against the page)
        .astype({'Rank': int, 'GP': int, 'W': int})
        .loc[:, ['Team', 'Rank', 'GP', 'W']]
        .assign(Season=season - 1)
    )
    df_list.append(df_table)
    seasons_loaded.add(season - 1)

    print('%i records loaded from %s' % (df_table.shape[0], season_url))
    # Random pause between successful requests to avoid hammering the site.
    sleep_duration = random.uniform(sleep_min, sleep_max)
    time.sleep(sleep_duration)

16 records loaded from https://www.hockey-reference.com/playoffs/NHL_2022.html


### 5. Assemble Single DataFrame

In [5]:
# Stack every frame collected so far (file contents plus newly scraped
# seasons) into a single table, then show its size and a random sample of rows.
df_scraped = pd.concat(objs=df_list, axis=0)
print(df_scraped.shape)
df_scraped.sample(n=10)

(576, 5)


Unnamed: 0,Team,Rank,GP,W,Season
378,Ottawa Senators,11,6,2,2009
413,Detroit Red Wings,14,5,1,2011
293,Detroit Red Wings,6,12,6,2003
318,Dallas Stars,15,5,1,2005
454,Calgary Flames,7,11,5,2014
337,Pittsburgh Penguins,2,20,14,2007
539,Arizona Coyotes,12,5,1,2019
90,Detroit Red Wings,11,7,3,1990
23,Winnipeg Jets,8,10,4,1986
536,Calgary Flames,9,6,2,2019


### 6. Account for Playoff Format Exceptions
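* The playoffs for the 2019 season (2019–20) included a round robin and a play-in round, so the scraped rows for any season listed in the exceptions file are replaced with the rows from that file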

In [6]:
# Rows for seasons listed in the exceptions file replace the scraped rows for
# those same seasons.
df_exceptions = pd.read_csv(exception_file)
exception_seasons = df_exceptions['Season'].unique()
is_exception = df_scraped['Season'].isin(exception_seasons)
df_without_exceptions = df_scraped[~is_exception]

# Combine both sources under a fresh index and order by season, then rank.
df = (
    pd.concat([df_without_exceptions, df_exceptions], ignore_index=True)
    .sort_values(['Season', 'Rank'])
)
print(df.shape)
df.sample(n=10)

(576, 5)


Unnamed: 0,Team,Rank,GP,W,Season
234,Edmonton Oilers,11,5,1,1999
422,Ottawa Senators,7,10,5,2012
540,Nashville Predators,13,6,2,2020
441,Philadelphia Flyers,10,7,3,2013
378,Ottawa Senators,11,6,2,2009
197,Ottawa Senators,6,11,5,1997
85,Los Angeles Kings,6,12,6,1990
158,Dallas Stars,15,5,1,1994
386,Tampa Bay Lightning,3,18,11,2010
171,Toronto Maple Leafs,12,6,2,1995


### 7. Export DataFrame to CSV

In [7]:
# Write the combined table back to the data file; the index is not saved.
df.to_csv(path_or_buf=csv_file, index=False)