# Scraping Player Injury Data
I scraped all player injury data from TSN.ca's player profiles. To find the URL of each player profile, I first scraped the main TSN player page for links to every available profile. I then iterated through each profile to scrape the section containing injury, transaction, and suspension data. Finally, I converted the list of profile data into a DataFrame, removed non-injury events, and parsed each report for the number of games missed and the cause of the injury. I used Selenium for scraping because both the TSN player menu and the profile pages use JavaScript to populate their table data.

In [14]:
import os
import re
import itertools
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

### Scraping and Conversion Functions

In [15]:
def read_profile_links(driver):
    '''Collects the URL of every player profile linked from TSN.ca's player page

    Args:
        driver (selenium webdriver): a valid webdriver, used to load the page

    Returns:
        list: absolute player profile URLs, one per link match, in the order
              the matches appear in the page source
    '''
    link_regex = re.compile('/nhl/player-bio/[^"]+')

    # TODO: only this single page is loaded; repeat the load-and-match step
    # for every links page if the listing spans more than one.
    driver.get('https://www.tsn.ca/nhl/players')
    relative_paths = link_regex.findall(driver.page_source)

    # Each match is a site-relative path; add the host and the '/bio' suffix.
    return ['https://www.tsn.ca%s/bio' % path for path in relative_paths]

def read_player_profile(driver, player_url):
    '''Loads one TSN.ca player profile and extracts its roster-move history

    Args:
        driver (selenium webdriver): a valid webdriver
        player_url (str): url that contains player profile data

    Returns:
        list: one nested list per roster-move row that contains any spans,
              made of the player name followed by the text of each span in
              the row (the date and the event description)
    '''
    driver.get(player_url)
    page = BeautifulSoup(driver.page_source, 'lxml')

    # The name is split across two list items identified by their classes
    first = page.find('li', {'class':'first-name ng-binding'}).text
    last = page.find('li', {'class':'last-name ng-binding'}).text
    full_name = f'{first} {last}'

    # Every roster move is a 'tr' tagged with this 'ng-repeat' value
    move_rows = page.find_all('tr', {'ng-repeat':'rosterMoves in PlayerBio.RosterMoves'})
    row_texts = ([cell.text for cell in row.find_all('span')] for row in move_rows)

    # Rows without any span carry no event and are dropped
    return [[full_name] + texts for texts in row_texts if texts]

def profiles_to_injuries_df(player_profiles):
    '''Converts a list of player profiles into a DataFrame containing just injury data

    Only events whose report mentions missed games and does not mention a
    suspension are kept; every other event is dropped.

    Args:
        player_profiles (list): one entry per player, each a list of
            [name, date, report] events

    Returns:
        DataFrame: player injury data with columns
            [Name, Date, Report, Missed, Cause]. Report is lowercased, Date is
            parsed with the format '%b %d, %Y', Missed is the digit string
            taken from "missed N game(s)", and Cause is the text between
            parentheses ('' when the report has none).
    '''
    columns = ['Name', 'Date', 'Report']
    events = list(itertools.chain.from_iterable(player_profiles))
    df = pd.DataFrame(events, columns=columns)
    # Make reports lowercase for easier matching
    df['Report'] = df['Report'].str.lower()
    # Isolate reports of missed games and ignore suspensions. na=False treats
    # a missing report as "no match" so the mask stays boolean.
    is_missed = df['Report'].str.contains(r'missed \d* game', na=False)
    is_suspension = df['Report'].str.contains('suspen', na=False)
    df = df[is_missed & ~is_suspension].reset_index(drop=True)
    df['Date'] = pd.to_datetime(df['Date'], format='%b %d, %Y')
    # Capture only the count inside "missed N game" instead of stripping all
    # non-digits: that relied on str.replace treating the pattern as a regex
    # and would also pick up digits from elsewhere in the report.
    df['Missed'] = df['Report'].str.extract(r'missed (\d*) game', expand=False)
    # Cause of missed games is everything between parentheses
    df['Cause'] = df['Report'].str.findall(r'(?<=\().*(?=\))').str.join('')
    return df

### Create a Single, Shared Webdriver

In [16]:
# Start the one Chrome webdriver that every scraping call below shares.
chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
# Also expose the executable path through the process environment.
os.environ["webdriver.chrome.driver"] = chromedriver
# NOTE(review): the path is passed positionally to webdriver.Chrome; newer
# Selenium releases may expect a Service object instead - confirm against
# the installed Selenium version.
driver = webdriver.Chrome(chromedriver)

### Scrape Player Profile Links

In [17]:
# Collect the absolute profile URLs using the shared webdriver.
profile_links = read_profile_links(driver)

In [20]:
# Show the scraped links; the captured output below is cut off mid-list.
profile_links

['https://www.tsn.ca/nhl/player-bio/mikkel-aagaard/bio',
 'https://www.tsn.ca/nhl/player-bio/spencer-abbott/bio',
 'https://www.tsn.ca/nhl/player-bio/justin-abdelkader/bio',
 'https://www.tsn.ca/nhl/player-bio/konrad-abeltshauser/bio',
 'https://www.tsn.ca/nhl/player-bio/pontus-aberg/bio',
 'https://www.tsn.ca/nhl/player-bio/cameron-abney/bio',
 'https://www.tsn.ca/nhl/player-bio/rodrigo-abols/bio',
 'https://www.tsn.ca/nhl/player-bio/vitaly-abramov/bio',
 'https://www.tsn.ca/nhl/player-bio/noel-acciari/bio',
 'https://www.tsn.ca/nhl/player-bio/sena-acolatse/bio',
 'https://www.tsn.ca/nhl/player-bio/will-acton/bio',
 'https://www.tsn.ca/nhl/player-bio/luke-adam/bio',
 'https://www.tsn.ca/nhl/player-bio/craig-adams/bio',
 'https://www.tsn.ca/nhl/player-bio/jeremiah-addison/bio',
 'https://www.tsn.ca/nhl/player-bio/kenny-agostino/bio',
 'https://www.tsn.ca/nhl/player-bio/andrew-agozzino/bio',
 'https://www.tsn.ca/nhl/player-bio/jonas-ahnelov/bio',
 'https://www.tsn.ca/nhl/player-bio/seba

### Scrape Each Player Profile

In [8]:
# read_profile_links() already returns absolute URLs ending in '/bio', so the
# links are used as-is instead of being wrapped in the site prefix again.
urls = list(profile_links)
profiles_list = []
for url in urls:
    # Appending one profile at a time keeps the partial results if a page fails.
    profiles_list.append(read_player_profile(driver, url))

### Convert Profiles to Injury DataFrame

In [11]:
# Flatten the scraped profiles into a single injuries-only DataFrame.
injuries_df = profiles_to_injuries_df(profiles_list)
# Display the resulting table (captured output follows).
injuries_df

Unnamed: 0,Name,Date,Report,Missed,Cause
0,Justin Abdelkader,2018-01-30,missed 6 games (lower body injury).,6,lower body injury
1,Justin Abdelkader,2017-11-15,missed 1 game (facial injury).,1,facial injury
2,Justin Abdelkader,2017-01-10,missed 16 games (sprained mcl).,16,sprained mcl
3,Justin Abdelkader,2016-10-27,missed 2 games (lower body injury).,2,lower body injury
4,Justin Abdelkader,2014-12-16,missed 7 games (shoulder injury).,7,shoulder injury
5,Justin Abdelkader,2014-03-30,missed 7 games (lacerated leg).,7,lacerated leg
6,Justin Abdelkader,2013-12-26,missed 5 games (head injury).,5,head injury
7,Justin Abdelkader,2012-03-10,missed 1 game (stomach virus).,1,stomach virus
8,Justin Abdelkader,2010-10-30,missed 9 games (rib injury).,9,rib injury
9,Pontus Aberg,2019-02-19,missed 8 games (lower body injury).,8,lower body injury


### Quit Webdriver

In [12]:
# Shut down the shared webdriver now that scraping is finished.
driver.quit()