# Scrape Playoff Data

### 1. Imports

In [1]:
import datetime
import io
import os
import random
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

### 2. NHL Website Settings

In [2]:
# Playoff summary page on hockey-reference.com; %i is filled in with the page year.
url_template = 'https://www.hockey-reference.com/playoffs/NHL_%i.html'
# id attribute of the HTML <table> that holds the playoff standings.
search_id = 'teams'

# First season to scrape when no previously saved data can be loaded.
season_st = 1985

# Lower / upper bound (seconds) of the random pause taken after each scraped page.
sleep_min, sleep_max = 1, 3

# Data files: saved results (read at start, rewritten at the end), manual
# overrides for seasons with an unusual playoff format, and the team-name lookup.
csv_file = '../data/playoffs.csv'
exception_file = '../data/playoffs_exceptions.csv'
team_lookup_file = '../data/team_lookup.csv'

### 3. Load Existing Data File

In [3]:
# Resume from a previous run when possible: load the saved results so that only
# seasons newer than the latest saved one need to be scraped.
try:
    df_from_file = pd.read_csv(csv_file)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No usable file yet (first run or an empty file): scrape from season_st.
    # Any other error (e.g. a corrupt file) is deliberately left to propagate
    # instead of being silently treated as "no data".
    file_loaded = False
else:
    # A file that only contains a header has no season to resume from.
    file_loaded = not df_from_file.empty

if file_loaded:
    # Keep only the scraped columns, in a fixed order; any other columns in the
    # file (such as FullName / Abbreviation) are discarded here.
    df_from_file = df_from_file[['Team', 'Rank', 'GP', 'W', 'Season']]
    df_list = [df_from_file]
    last_season = int(df_from_file['Season'].max())
    print('Loaded seasons through %i from file' % last_season)
    # Scraping resumes with the season after the last one already on file.
    season_st = last_season + 1
else:
    df_list = []
    print('No seasons loaded from file')

Loaded seasons through 2021 from file


### 4. Scrape Standings for Each Season

In [4]:
# Page years run one ahead of the stored season: the page NHL_<year>.html is
# saved with Season = <year> - 1, so the first page to request is season_st + 1.
# A separate name is used (instead of incrementing season_st in place) so that
# re-running this cell does not shift the starting point by another year.
first_page_year = season_st + 1
season_end = datetime.date.today().year + 1  # exclusive bound: includes the current year
request_timeout = 30  # seconds; without a timeout a stalled connection hangs forever

for season in range(first_page_year, season_end):
    page_url = url_template % season

    # Playoff standings pages don't exist for lockouts and need to be skipped
    response = requests.get(page_url, timeout=request_timeout)
    if response.status_code != 200:
        print('Skipped %s (HTTP status %i)' % (page_url, response.status_code))
        continue

    # Playoff standings are contained in a table with the search_id
    soup_page = BeautifulSoup(response.content, 'html.parser')
    soup_table = soup_page.find('table', {'id': search_id})
    if soup_table is None:
        # Without this guard pd.read_html fails with an opaque "No tables found".
        print('Skipped %s (no table with id "%s")' % (page_url, search_id))
        continue

    # Wrap the markup in a file-like object: passing literal HTML strings to
    # read_html is deprecated in recent pandas versions.
    df_table = pd.read_html(io.StringIO(str(soup_table)))[0]
    # Rename the rank column and discard the table's last row, which is not kept.
    df_table = df_table.rename(columns={'Rk': 'Rank'}).iloc[:-1]
    # Select the wanted columns and convert the numeric ones; astype returns a
    # new frame, so the Season assignment below never writes to a slice.
    df_table = df_table[['Team', 'Rank', 'GP', 'W']].astype(
        {'Rank': int, 'GP': int, 'W': int}
    )
    df_table['Season'] = season - 1
    df_list.append(df_table)

    print('%i records loaded from %s' % (df_table.shape[0], page_url))
    # Random pause between successful page loads to avoid hammering the site.
    sleep_duration = random.uniform(sleep_min, sleep_max)
    time.sleep(sleep_duration)

### 5. Assemble Single DataFrame

In [5]:
# Stack the rows loaded from file (if any) and every newly scraped season into
# one table; the original row labels of each piece are kept as-is.
df_scraped = pd.concat(objs=df_list, axis=0)
print(df_scraped.shape)
# Show a random selection of rows as a quick sanity check.
df_scraped.sample(n=10)

(576, 5)


Unnamed: 0,Team,Rank,GP,W,Season
165,Chicago Blackhawks,6,10,6,1995
466,Tampa Bay Lightning,3,17,11,2015
32,Edmonton Oilers,1,19,16,1987
309,New Jersey Devils,6,9,5,2005
102,Detroit Red Wings,7,11,4,1991
410,Florida Panthers,11,7,3,2011
536,Calgary Flames,9,6,2,2019
232,St. Louis Blues,9,7,3,1999
13,Boston Bruins,14,3,0,1985
368,Chicago Blackhawks,1,22,16,2009


### 6. Account for Playoff Format Exceptions
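* The playoffs for the 2019 season involved a round robin and a play-in round, so the scraped rows for every season listed in the exceptions file are replaced with the rows from that file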

In [6]:
# Seasons listed in the exceptions file override whatever was scraped for them.
df_exceptions = pd.read_csv(exception_file)
exception_seasons = df_exceptions['Season'].unique()

# Remove every scraped row belonging to an overridden season ...
is_exception_row = df_scraped['Season'].isin(exception_seasons)
df_without_exceptions = df_scraped[~is_exception_row]

# ... then append the override rows, renumber, and order by season and rank.
df_with_exceptions = pd.concat(
    [df_without_exceptions, df_exceptions],
    ignore_index=True,
).sort_values(['Season', 'Rank'])

print(df_with_exceptions.shape)
df_with_exceptions.sample(n=10)

(576, 5)


Unnamed: 0,Team,Rank,GP,W,Season
336,Detroit Red Wings,1,22,16,2007
133,Boston Bruins,6,13,6,1993
207,Los Angeles Kings,16,4,0,1997
2,St. Louis Blues,3,19,10,1985
379,Colorado Avalanche,12,6,2,2009
377,Washington Capitals,10,7,3,2009
171,Toronto Maple Leafs,12,6,2,1995
204,Boston Bruins,13,6,2,1997
51,Chicago Blackhawks,4,16,9,1988
265,New York Islanders,10,7,3,2001


### 7. Merge Lookup Data for Team Abbreviations

In [7]:
# Attach the lookup columns to every row by matching the scraped team name
# against the lookup's FullName column. The left join keeps all scraped rows.
df_team_lookup = pd.read_csv(team_lookup_file)
df = df_with_exceptions.merge(df_team_lookup, how='left', left_on='Team', right_on='FullName')

# A row whose FullName is missing after the left join had no match in the
# lookup file. Report those teams instead of silently exporting rows with
# empty lookup values; this must happen before FullName is dropped.
unmatched_teams = sorted(map(str, df.loc[df['FullName'].isna(), 'Team'].dropna().unique()))
if unmatched_teams:
    print('WARNING: no entry in %s for: %s' % (team_lookup_file, ', '.join(unmatched_teams)))

# FullName duplicates Team for matched rows, so it is not kept.
df.drop('FullName', axis=1, inplace=True)
print(df.shape)
df.sample(10)

(576, 6)


Unnamed: 0,Team,Rank,GP,W,Season,Abbreviation
546,New York Islanders,3,19,11,2020,NYI
444,Columbus Blue Jackets,13,6,2,2013,CBJ
494,Calgary Flames,15,4,0,2016,CGY
437,Anaheim Ducks,6,13,7,2013,ANA
166,Philadelphia Flyers,7,12,6,1995,PHI
564,Carolina Hurricanes,5,14,7,2021,CAR
256,Detroit Red Wings,1,23,16,2001,DET
150,San Jose Sharks,7,11,4,1994,SJS
557,Washington Capitals,14,5,1,2020,WSH
389,Nashville Predators,6,12,6,2010,NSH


### 8. Export DataFrame to CSV

In [8]:
# Overwrite the saved results with the merged table; the row labels are not
# written, so the file contains only the data columns.
df.to_csv(path_or_buf=csv_file, index=False)