# The Washington Nationals recently won the 2019 World Series in unprecedented fashion, earning all 4 of their series wins on the road. The FOX broadcast indicated that never before had the away team won every game of a 7-game series, and that this stat also covered series from the NBA and NHL.

# Sports-reference.com runs a separate website for each of these sports, and each one has all of this playoff series data available, so I created a quick scraping tool to see if I could verify that stat.

In [424]:
import re
import urllib.request
from contextlib import closing

import pandas as pd
from bs4 import BeautifulSoup
from requests import get
from requests.exceptions import RequestException

In [425]:
# example scraping functions found online
def simple_get(url, timeout=30):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    raw content, otherwise return None.

    `timeout` is the number of seconds passed to the request as its
    timeout, so that a stalled server cannot hang the whole scrape. A
    timeout is reported like any other request error: it is logged and
    None is returned.
    """
    try:
        # closing() guarantees the streamed connection is released even
        # when the response is rejected.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
    
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response counts as good when its status code is 200 and its
    Content-Type header is present and mentions 'html' (case-insensitive).
    A response without a Content-Type header is reported as not good
    instead of raising KeyError.
    """
    # .get() so a missing header yields None rather than an exception
    content_type = resp.headers.get('Content-Type')
    return (resp.status_code == 200
            and content_type is not None
            and 'html' in content_type.lower())

def log_error(e):
    """
    Report an error.

    The error is simply written to standard output with print; swap the
    body out if errors should go somewhere else.
    """
    message = e
    print(message)

In [572]:
# function used to extract game scores from individual series webpages

def create_series_table(url, series_name):
    """
    Build a table of game results for one playoff series page.

    The page at `url` is downloaded and every game summary box on it is
    turned into one row, labelled with `series_name`. Reading stops as
    soon as one team has four wins, because NHL playoff pages list
    regular season game tables after the playoff games.

    Returns a DataFrame with the columns 'Series', 'Away Team',
    'Away Score', 'Home Team', 'Home Score', 'Away Win' ('Y'/'N') and
    'Winner Name'. The frame is empty when the page has no game summary
    boxes.

    Raises RuntimeError when the page could not be downloaded.
    """
    raw_html = simple_get(url)
    if raw_html is None:
        # fail loudly: a silently missing series would distort the counts
        raise RuntimeError('Could not retrieve series page: {0}'.format(url))
    html = BeautifulSoup(raw_html, 'html.parser')

    # find the game summary boxes for the series
    game_tables = html.find_all('div', {'class': 'game_summary nohover'})
    # NBA game tables have different label
    if not game_tables:
        game_tables = html.find_all('div', {'class': 'game_summary expanded nohover'})

    columns = ['Series', 'Away Team', 'Away Score', 'Home Team',
               'Home Score', 'Away Win', 'Winner Name']
    rows = []
    wins_by_team = {}
    for game in game_tables:
        # the summary text puts each team and score on its own line;
        # the offsets below are the positions the fields appear at
        lines = game.text.split('\n')
        away_team, away_score = lines[5], int(lines[6])
        home_team, home_score = lines[12], int(lines[13])

        away_won = away_score > home_score
        winner = away_team if away_won else home_team
        rows.append([series_name, away_team, away_score, home_team, home_score,
                     'Y' if away_won else 'N', winner])

        # NHL playoff pages have regular season game tables as well, don't want to count those
        wins_by_team[winner] = wins_by_team.get(winner, 0) + 1
        if wins_by_team[winner] >= 4:
            break

    return pd.DataFrame(rows, columns=columns)

In [573]:
create_series_table('https://www.baseball-reference.com/postseason/2019_WS.shtml', 'WS 2019')

Unnamed: 0,Series,Away Team,Away Score,Home Team,Home Score,Away Win,Winner Name
0,WS 2019,Washington Nationals,5,Houston Astros,4,Y,Washington Nationals
1,WS 2019,Washington Nationals,12,Houston Astros,3,Y,Washington Nationals
2,WS 2019,Houston Astros,4,Washington Nationals,1,Y,Houston Astros
3,WS 2019,Houston Astros,8,Washington Nationals,1,Y,Houston Astros
4,WS 2019,Houston Astros,7,Washington Nationals,1,Y,Houston Astros
5,WS 2019,Washington Nationals,7,Houston Astros,2,Y,Washington Nationals
6,WS 2019,Washington Nationals,6,Houston Astros,2,Y,Washington Nationals


In [574]:
create_series_table('https://www.basketball-reference.com/playoffs/1978-nba-finals-bullets-vs-supersonics.html', 'Finals')

Unnamed: 0,Series,Away Team,Away Score,Home Team,Home Score,Away Win,Winner Name
0,Finals,Washington Bullets,102,Seattle SuperSonics,106,N,Seattle SuperSonics
1,Finals,Seattle SuperSonics,98,Washington Bullets,106,N,Washington Bullets
2,Finals,Seattle SuperSonics,93,Washington Bullets,92,Y,Seattle SuperSonics
3,Finals,Washington Bullets,120,Seattle SuperSonics,116,Y,Washington Bullets
4,Finals,Washington Bullets,94,Seattle SuperSonics,98,N,Seattle SuperSonics
5,Finals,Seattle SuperSonics,82,Washington Bullets,117,N,Washington Bullets
6,Finals,Washington Bullets,105,Seattle SuperSonics,99,Y,Washington Bullets


In [575]:
create_series_table('https://www.hockey-reference.com/playoffs/2018-vegas-golden-knights-vs-washington-capitals-stanley-cup-final.html','Finals')

Unnamed: 0,Series,Away Team,Away Score,Home Team,Home Score,Away Win,Winner Name
0,Finals,Washington Capitals,4,Vegas Golden Knights,6,N,Vegas Golden Knights
1,Finals,Washington Capitals,3,Vegas Golden Knights,2,Y,Washington Capitals
2,Finals,Vegas Golden Knights,1,Washington Capitals,3,N,Washington Capitals
3,Finals,Vegas Golden Knights,2,Washington Capitals,6,N,Washington Capitals
4,Finals,Washington Capitals,4,Vegas Golden Knights,3,Y,Washington Capitals


In [557]:
# MLB
# For every MLB postseason series in which a team reached four wins, record
# the number of games played and how many of them the away team won.
mlb_results_table = pd.DataFrame(columns=['League','Series','Total Games','Away Wins'])
series_counter = 0
league = 'MLB'

# using predictable link structure to auto generate all series links
for year in range(1905, 2020):
    if year in [1907, 1912, 1922, 1969, 1970, 1994]:
        # no postseason or shortened postseason
        # NOTE(review): confirm the reason for each of these years
        continue
    for series_type in ['ALCS','NLCS','WS']:
        if year < 1969 and series_type in ['ALCS','NLCS']:
            # no CS before 1969
            continue
        url = "https://www.baseball-reference.com/postseason/" + str(year) + "_" + series_type + ".shtml"
        # create series string
        series_name = series_type + ' ' + str(year)
        series_table = create_series_table(url, series_name)

        # count wins from series table
        away_wins = int((series_table['Away Win'] == 'Y').sum())
        # remove any non 7 game series (no team won at least 4 games);
        # an empty table must be skipped explicitly, otherwise its NaN
        # maximum passes the "< 4" test and a 0-game series is recorded
        win_counts = series_table['Winner Name'].value_counts()
        if win_counts.empty or win_counts.max() < 4:
            continue
        mlb_results_table.loc[series_counter] = [league, series_name, len(series_table), away_wins]
        series_counter += 1

In [558]:
# NBA
# For every NBA playoff series in which a team reached four wins, record
# the number of games played and how many of them the away team won.
nba_results_table = pd.DataFrame(columns=['League','Series','Total Games','Away Wins'])
series_counter = 0
league = 'NBA'

# NBA series links include team names so can't be predicted, need to grab links from playoff homepage
series_index_url = 'https://www.basketball-reference.com/playoffs/series.html'
raw_html = simple_get(series_index_url)
if raw_html is None:
    raise RuntimeError('Could not retrieve series index: {0}'.format(series_index_url))
html = BeautifulSoup(raw_html, 'html.parser')

data_table = html.find('table',{'class':'sortable stats_table'})
if data_table is None:
    raise RuntimeError('No series table found on: {0}'.format(series_index_url))

# create a list of series links
# NOTE(review): the length cut-off presumably separates series pages from
# shorter links in the same table - confirm against the page
url_endings = []
for link in data_table.find_all('a'):
    href = link.get('href')
    # anchors without an href are ignored instead of crashing len()
    if href and len(href) > 30:
        url_endings.append(href)

# loop through all series links
for link in url_endings:
    url = "https://www.basketball-reference.com" + link
    # the hyphen in the host name is split as well, which is why the year
    # sits at index 5 and the league at index 6
    split_link = re.split(r'[-/]', url)
    # filtering out non-NBA leagues
    if split_link[6] != 'nba':
        continue
    # capture series name from url
    if len(split_link) < 12:
        series_name = split_link[5] + ' ' + split_link[7]
    else:
        series_name = split_link[5] + ' ' + split_link[7] + ' ' + split_link[8] + ' ' + split_link[9]

    series_table = create_series_table(url, series_name)

    # count away wins from series table
    away_wins = int((series_table['Away Win'] == 'Y').sum())
    # remove any non 7 game series (no team won at least 4 games);
    # an empty table must be skipped explicitly, otherwise its NaN
    # maximum passes the "< 4" test and a 0-game series is recorded
    win_counts = series_table['Winner Name'].value_counts()
    if win_counts.empty or win_counts.max() < 4:
        continue
    nba_results_table.loc[series_counter] = [league, series_name, len(series_table), away_wins]
    series_counter += 1

In [563]:
# NHL
# For every NHL playoff series in which a team reached four wins, record
# the number of games played and how many of them the away team won.
nhl_results_table = pd.DataFrame(columns=['League','Series','Total Games','Away Wins'])
series_counter = 0
league = 'NHL'

# need to capture all series links from predictable yearly playoff homepages
url_endings = []
for year in range(1923, 2020):
    if year == 2005:
        # no postseason or shortened postseason
        continue
    url = "https://www.hockey-reference.com/playoffs/NHL_" + str(year) + ".html"

    # navigate to yearly playoff page to get series links
    raw_html = simple_get(url)
    if raw_html is None:
        raise RuntimeError('Could not retrieve playoff page: {0}'.format(url))
    html = BeautifulSoup(raw_html, 'html.parser')

    data_table = html.find('table',{'class':'suppress_all nohover sortable stats_table'})
    if data_table is None:
        raise RuntimeError('No series table found on: {0}'.format(url))

    # create a list of series links found on yearly playoff page
    # NOTE(review): the length cut-off presumably separates series pages
    # from shorter links in the same table - confirm against the page
    for link in data_table.find_all('a'):
        href = link.get('href')
        # anchors without an href are ignored instead of crashing len()
        if href and len(href) > 30:
            url_endings.append(href)


# loop through all series links
for link in url_endings:
    url = "https://www.hockey-reference.com" + link
    # the hyphen in the host name is split as well, which is why the year
    # sits at index 5
    split_link = re.split(r'[-/]', url)

    # year plus the last two words of the link, with '.html' stripped
    series_name = split_link[5] + ' ' + split_link[-2] + ' ' + split_link[-1][:-5]

    ## Special exception for NHL, need to manually remove 5 game series in this time window
    # NOTE(review): series_name only holds the last two words of the link,
    # so 'division semi' matches only if the link ends in exactly those
    # words - confirm against the real link format
    if int(split_link[5]) in range(1980,1987) and ('preliminary' in series_name or 'division semi' in series_name):
        continue

    series_table = create_series_table(url, series_name)

    # count away wins from series table
    away_wins = int((series_table['Away Win'] == 'Y').sum())
    # remove any non 7 game series (no team won at least 4 games);
    # an empty table must be skipped explicitly, otherwise its NaN
    # maximum passes the "< 4" test and a 0-game series is recorded
    win_counts = series_table['Winner Name'].value_counts()
    if win_counts.empty or win_counts.max() < 4:
        continue
    nhl_results_table.loc[series_counter] = [league, series_name, len(series_table), away_wins]
    series_counter += 1

In [566]:
# stack the three per-league result tables into a single table
league_tables = [mlb_results_table, nba_results_table, nhl_results_table]
final_results_table = pd.concat(league_tables)
final_results_table

Unnamed: 0,League,Series,Total Games,Away Wins
0,MLB,WS 1905,5,3
1,MLB,WS 1906,6,5
2,MLB,WS 1908,5,4
3,MLB,WS 1909,7,3
4,MLB,WS 1910,5,2
...,...,...,...,...
731,NHL,2019 first round,4,2
732,NHL,2019 first round,7,2
733,NHL,2019 first round,6,5
734,NHL,2019 first round,5,2


In [567]:
# keep only the series in which the away team won every game played
final_results_table.loc[final_results_table['Total Games'] == final_results_table['Away Wins']]

Unnamed: 0,League,Series,Total Games,Away Wins
176,MLB,WS 2019,7,7


# Research confirms that the 2019 World Series is the only 7 game series where the away team won all 7 games. Go Nats!!!