# Scraping Player Injury Data
I scraped all player injury data from TSN.ca's player profiles. In order to find the URLs for each player profile I first scraped the main TSN player page for links to every available profile. I then iterated through each profile to scrape the section containing injury, transaction, and suspension data. I finally converted the list of profile data into a DataFrame, removed non-injury events, and parsed reports for length and type of injury. I utilized Selenium for scraping because both the TSN player menu and profile pages use Javascript to populate their table data.

In [175]:
import os
import re
import itertools
import pickle
import pandas as pd

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

### Scraping and Conversion Functions

In [176]:
def read_profile_links(driver):
    '''Reads all player profile links from TSN.ca's main player page
    
    Args:
        driver (selenium webdriver): a valid webdriver
        
    Returns:
        list: player profile URLs
    '''
    main_url = 'https://www.tsn.ca/nhl/players'
    link_pattern = '/nhl/player-bio/[^"]+'
    disabled_class = 'ng-scope disabled'
    
    # Iterates through all pages of player profile links
    driver.get(main_url)
    links = []
    while True:
        links.extend(re.findall(link_pattern, driver.page_source))
        next_button = driver.find_element_by_css_selector('a.next.ng-scope')
        if next_button.find_element_by_xpath('..').get_attribute('class') == disabled_class:
            break
        next_button.click()
        
    return links

def read_player_profile(driver, player_url):
    '''Reads a player's injury, transaction, and suspension history from TSN.ca
    
    Args:
        driver (selenium webdriver): a valid webdriver
        player_url (str): url that contains player profile data
        
    Returns:
        list: player event data as a list of lists, each nested list containing
              [player name, birth date, date, event description]
    '''
    driver.get(player_url)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    roster_updates = []
    
    # Player Name is stored in list items with specific classes
    first_name = soup.find('li', {'class':'first-name ng-binding'}).text
    last_name = soup.find('li', {'class':'last-name ng-binding'}).text
    player_name = f'{first_name} {last_name}'
    
    # Birth Date is stored in a span with a specific class
    birth_date = soup.find('span', {'class':'value-desc ng-binding'}).text
    
    # Date and Event are stored in two spans under 'tr' tags with specific 'ng-repeat' values
    rows = soup.find_all('tr', {'ng-repeat':'rosterMoves in PlayerBio.RosterMoves'})
    for row in rows:
        spans = row.find_all('span')
        update = [span.text for span in spans]
        if update:
            roster_updates.append([player_name, birth_date] + update)
            
    return roster_updates

def profiles_to_dfs(player_profiles):
    '''Converts a list of player profiles into a DataFrame containing just injury data
    
    Args:
        player_urls (str list): nested lists of individual player profiles
        
    Returns:
        DataFrame: player name data with columns [Name, Birth_Date]
        DataFrame: player injury data with columns
                   [Name, Birth_Date, Date, Report, Games_Missed, Cause]
    '''
    columns = ['Name', 'Birth_Date', 'Date', 'Report']
    df = pd.DataFrame(list(itertools.chain(*player_profiles)), columns=columns)
    names_df = df[columns[:2]].drop_duplicates().reset_index(drop=True)
    
    # Use a mask to isolate reports of missed games and ignore suspensions
    df['Report'] = df['Report'].str.lower()
    report_mask = ((df['Report'].str.contains('missed \d* game')) &
                   ~(df['Report'].str.contains('suspen')))
    df = df[report_mask].reset_index(drop=True)
    df['Date'] = pd.to_datetime(df['Date'], format='%b %d, %Y')
    
    # Assume the number of missed games is the only numerical data in reports
    df['Missed'] = df['Report'].str.replace('\D', '')
    # Cause of missed games is everything between parentheses
    df['Cause'] = df['Report'].str.findall('(?<=\().*(?=\))').str.join('')
    
    return names_df, df

def var_to_pickle(var, filename):
    '''Writes the given variable to a pickle file
    
    Args:
        var (any): variable to be written to pickle file
        filename (str): path and filename of pickle file
    
    Returns:
        None
    '''
    try:
        with open(filename, 'wb') as f:
            pickle.dump(var, f)
    except:
        print(f'Failed to save pickle to \'{filename}\'')
    return

def read_pickle(filename):
    '''Reads the given pickle file
    
    Args:
        filename (str): path and filename of pickle file
    
    Returns:
        any: contents of pickle file if it exists, None if not
    '''
    output = None
    if os.path.exists(filename):
        try:
            with open(filename, 'rb') as f:
                output = pickle.load(f)
        except:
            print(f'Failed to load pickle from \'{filename}\'')
    return output

### Create a Single, Shared Webdriver

In [177]:
chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)

### Scrape Player Profile Links

In [188]:
links_pickle = 'pickle_data/player_links.pickle'
links = read_pickle(links_pickle)
if links == None:
    links = read_profile_links(driver)
    var_to_pickle(links, links_pickle)

### Scrape Each Player Profile

In [193]:
profiles_pickle = 'pickle_data/player_profiles.pickle'
bio_url = 'https://www.tsn.ca%s/bio'
save_step = 50

# Loads data from pickle if it exists, otherwise scrapes it
profiles = read_pickle(profiles_pickle)
if profiles == None:
    profiles = []
    
# If pickle data is not complete, picks up where it left off
start = len(profiles)
if start < len(links):
    for cnt,link in enumerate(links[start:]):
        profiles.append(read_player_profile(driver, bio_url % link))
        if (cnt+1) % save_step == 0:
            var_to_pickle(profiles, profiles_pickle)
    var_to_pickle(profiles, profiles_pickle)

AttributeError: 'NoneType' object has no attribute 'text'

### Convert Profiles to Injury DataFrame

In [184]:
names_df, injuries_df = profiles_to_dfs(profiles)

### Quit Webdriver

In [12]:
driver.quit()