In [3]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.common.exceptions import TimeoutException

from scraper import page_driver, search_options

In [11]:
url = 'http://registration.baa.org/2018/cf/Public/iframe_ResultsSearch.cfm'

# get driver and Boston Marathon search options page
options_page = page_driver(url)

In [21]:
# clean the results page to get the options in a list
divisions, gender, states, countries = search_options(options_page)
len(states)

72

In [22]:
data_storage = {'Bib': [],
             'Name': [],
             'Age': [],
             'M/F': [],
             'City':[],
             'State': [],
             'Country': [],
             'Citizen': [],
             'Blank': [],
             '5K': [],
             '10K': [],
             '15K': [],
             '20K': [],
             'Half': [],
             '25K': [],
             '30K': [],
             '35K': [],
             '40K': [],
             'Pace': [],
             'Proj. Time': [],
             'Official Time': [],
             'Overall': [],
             'Gender': [],
             'Division': []
            }

In [23]:
def scrape_results(page):
    '''This function takes the html from the page, extracts 
    the table data and stores it in an external dictionary.
    '''
    
    soup = BeautifulSoup(page, "lxml")
    
    info_headers = ['Bib', 'Name', 'Age', 'M/F', 'City', 'State', 
                    'Country', 'Citizen', 'Blank']
    
    result_headers = ['5K', '10K', '15K', '20K', 'Half', '25K', 
                     '30K', '35K', '40K', 'Pace', 'Proj. Time', 
                     'Official Time', 'Overall', 'Gender', 'Division']
    
    # scrape each participants info
    racer_info = soup.findAll("tr", {"class": "tr_header"})
    for i, racer in enumerate(racer_info):
        for key, val in zip(info_headers, racer_info[i].find_all('td')):
            data_storage[key] += [val.text.strip()]
            
    # scrape each participants results            
    race_times = soup.findAll("table", {"class": "table_infogrid"})
    for i, time in enumerate(race_times):
        for key, val in zip(result_headers, race_times[i].find_all('td')):
            data_storage[key] += [val.text.strip()]


In [24]:
def scrape_by_country(countries):
    '''Scrape the race results based on the country the runner is from.'''
    
    url = 'http://registration.baa.org/2018/cf/Public/iframe_ResultsSearch.cfm'
    driver = webdriver.Chrome()
    driver.get(url)
    max_limit = 1000
    
    for country in countries:
        
        if country == 'United States of America' or country == 'Canada':
            continue

        country_option = driver.find_element_by_name("CountryOfResID")
        country_option.send_keys(country)

        limit = driver.find_element_by_name("VarTargetCount")
        limit.send_keys(max_limit)


        try:
            xpath1 = '//*[@id="PublicSearch"]/div/div/input'
            submit_button = driver.find_elements_by_xpath(xpath1)[0]
            time.sleep(3)
            submit_button.click()
            scrape_results(driver.page_source)
        except IndexError:
            xpath4 = '/html/body/div/div/div/div/table[2]/tbody/tr/td[1]/form/input'
            search_again = driver.find_elements_by_xpath(xpath4)[0]
            time.sleep(3)
            search_again.click()
            continue

        try: 
            xpath2 = ('/html/body/div/div/div/div/table[4]/tbody/tr/td/'
                      'table/tbody/tr[51]/td/table/tbody/tr/td[2]/form/input[2]')
            next_button = driver.find_elements_by_xpath(xpath2)[0]
            time.sleep(3)
            next_button.click()
            scrape_results(driver.page_source)
        except IndexError:
            xpath4 = '/html/body/div/div/div/div/table[2]/tbody/tr/td[1]/form/input'
            search_again = driver.find_elements_by_xpath(xpath4)[0]
            time.sleep(3)
            search_again.click()
            continue

        xpath3 = ('/html/body/div/div/div/div/table[3]/tbody/tr/td/table/tbody'
                  '/tr[51]/td/table/tbody/tr/td[2]/form/input[2]')
        while True:
            try:
                next_button = driver.find_elements_by_xpath(xpath3)[0]
                time.sleep(3)
                next_button.click()
                scrape_results(driver.page_source)
            except IndexError:
                xpath4 = '/html/body/div/div/div/div/table[2]/tbody/tr/td[1]/form/input'
                search_again = driver.find_elements_by_xpath(xpath4)[0]
                time.sleep(3)
                search_again.click()
                break

In [30]:
def scrape_by_state(states, divisions):
    '''Scrape the race results based on the state runner is from.'''

    url = 'http://registration.baa.org/2018/cf/Public/iframe_ResultsSearch.cfm'
    driver = webdriver.Chrome()
    driver.get(url)
    max_limit = 1000

    for state in states:
        if state in ['Massachusetts', 'California', 'New York', 'Texas']:
            for division in divisions:
                division_option = driver.find_element_by_name("AwardsDivisionID")
                division_option.send_keys(division)
            
        state_option = driver.find_element_by_name("StateID")
        state_option.send_keys(state)

        limit = driver.find_element_by_name("VarTargetCount")
        limit.send_keys(max_limit)


        try:
            xpath1 = '//*[@id="PublicSearch"]/div/div/input'
            submit_button = driver.find_elements_by_xpath(xpath1)[0]
            time.sleep(3)
            submit_button.click()
            scrape_results(driver.page_source)
        except IndexError:
            xpath4 = '/html/body/div/div/div/div/table[2]/tbody/tr/td[1]/form/input'
            search_again = driver.find_elements_by_xpath(xpath4)[0]
            time.sleep(3)
            search_again.click()
            continue

        try: 
            xpath2 = ('/html/body/div/div/div/div/table[4]/tbody/tr/td/'
                      'table/tbody/tr[51]/td/table/tbody/tr/td[2]/form/input[2]')
            next_button = driver.find_elements_by_xpath(xpath2)[0]
            time.sleep(3)
            next_button.click()
            scrape_results(driver.page_source)
        except IndexError:
            xpath4 = '/html/body/div/div/div/div/table[2]/tbody/tr/td[1]/form/input'
            search_again = driver.find_elements_by_xpath(xpath4)[0]
            time.sleep(3)
            search_again.click()
            continue

        xpath3 = ('/html/body/div/div/div/div/table[3]/tbody/tr/td/table/tbody'
                  '/tr[51]/td/table/tbody/tr/td[2]/form/input[2]')
        while True:
            try:
                next_button = driver.find_elements_by_xpath(xpath3)[0]
                time.sleep(3)
                next_button.click()
                scrape_results(driver.page_source)
            except IndexError:
                xpath4 = '/html/body/div/div/div/div/table[2]/tbody/tr/td[1]/form/input'
                search_again = driver.find_elements_by_xpath(xpath4)[0]
                time.sleep(3)
                search_again.click()
                break


In [31]:
scrape_by_state(states, divisions)

In [36]:
scrape_by_country(countries)

In [37]:
results2018 = pd.DataFrame(data_storage)
print(results2018.shape)
results2018.head()

(19884, 24)


Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Citizen,Blank,5K,...,25K,30K,35K,40K,Pace,Proj. Time,Official Time,Overall,Gender,Division
0,189,"Ash, Alan",26,M,Atmore,AL,USA,,,0:17:58,...,1:31:16,1:50:19,2:09:22,2:28:08,0:05:58,,2:36:04,63,63,59
1,855,"Goodman, John Mark",38,M,Birmingham,AL,USA,,,0:18:43,...,1:34:33,1:53:49,2:13:27,2:33:06,0:06:11,,2:41:52,187,186,173
2,1049,"Mcclung, Joshua",34,M,Birmingham,AL,USA,,,0:18:45,...,1:35:31,1:55:14,2:15:14,2:34:45,0:06:15,,2:43:42,247,246,226
3,303,"Graham, James",26,M,Birmingham,AL,USA,,,0:18:58,...,1:33:09,1:52:29,2:13:49,2:37:17,0:06:23,,2:47:07,385,377,346
4,5267,"Pierce, Nathaniel",31,M,Huntsville,AL,USA,,,0:20:10,...,1:40:23,2:00:42,2:21:22,2:41:26,0:06:30,,2:50:19,608,593,536


In [38]:
results2018['Bib'].value_counts()[:5]

29375    2
6305     2
23270    2
29903    2
3254     2
Name: Bib, dtype: int64

In [39]:
results2018.to_csv('data/marathon_results_2018.csv')

In [None]:
        
#         gender_option = driver.find_element_by_name("GenderID")
#         gender_option.send_keys(sex)

#         division_option = driver.find_element_by_name("AwardsDivisionID")
#         division_option.send_keys(division)

#                 country_option = driver.find_element_by_name("CountryOfResID")
#                 country_option.send_keys(country)

In [4]:
df17 = pd.read_csv('data/marathon_results_2017.csv')
df17.head()

Unnamed: 0.1,Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Citizen,Unnamed: 9,...,25K,30K,35K,40K,Pace,Proj Time,Official Time,Overall,Gender,Division
0,0,11,"Kirui, Geoffrey",24,M,Keringet,,KEN,,,...,1:16:59,1:33:01,1:48:19,2:02:53,0:04:57,-,2:09:37,1,1,1
1,1,17,"Rupp, Galen",30,M,Portland,OR,USA,,,...,1:16:59,1:33:01,1:48:19,2:03:14,0:04:58,-,2:09:58,2,2,2
2,2,23,"Osako, Suguru",25,M,Machida-City,,JPN,,,...,1:17:00,1:33:01,1:48:31,2:03:38,0:04:59,-,2:10:28,3,3,3
3,3,21,"Biwott, Shadrack",32,M,Mammoth Lakes,CA,USA,,,...,1:17:00,1:33:01,1:48:58,2:04:35,0:05:03,-,2:12:08,4,4,4
4,4,9,"Chebet, Wilson",31,M,Marakwet,,KEN,,,...,1:16:59,1:33:01,1:48:41,2:05:00,0:05:04,-,2:12:35,5,5,5


In [15]:
df17[df17['Country'] == 'USA']['State'].value_counts()

MA    4586
CA    2049
NY    1324
TX    1055
PA     842
IL     771
OH     693
FL     609
VA     564
CO     534
MI     516
NC     499
NJ     476
WA     465
MN     443
UT     400
MD     396
WI     392
CT     377
NH     324
GA     317
OR     312
AZ     270
IN     267
TN     237
MO     203
ME     186
DC     156
KY     138
RI     136
SC     136
IA     111
ID      93
KS      92
LA      88
NE      85
NV      85
OK      81
VT      80
AL      80
NM      65
AR      56
AK      52
DE      47
WV      41
HI      37
MT      36
SD      34
ND      29
MS      27
WY      21
PR      21
AE       5
VI       2
AP       2
GU       1
AA       1
Name: State, dtype: int64