In [None]:
# Setting up required libraries
import pandas as pd
import numpy as np
import requests
import lxml
import cchardet
import re
from bs4 import BeautifulSoup, SoupStrainer
from multiprocessing.dummy import Pool  # This is a thread-based Pool
from multiprocessing import cpu_count

# Function Definitions:
There are two main scraping functions, `get_results_old` and `get_results_new`, one for each of the two website styles.

In [None]:
def get_results_new(url, sex, year):
    """Scrape one page of the modern Virgin London Marathon results site (2019 and 2020).

    Parameters
    ----------
    url : str
        Address of the results page to download.
    sex : str
        Value stored in the "Sex" column of every row (the page itself is not
        consulted for it).
    year : int
        Value stored in the "Year" column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty ``list-group-item`` element, minus the first one,
        which is dropped as the table header. Columns: "Place (Overall)",
        "Place (Gender)", "Place (Category)", "Name", "Sex", "Club",
        "Running Number", "Category", "Finish", "Year". The frame is empty
        (and has no columns) when the page yields no rows.

    Raises
    ------
    IndexError
        If a non-empty row has fewer than eight ``list-field`` cells.
    """
    #Use requests to get content from site
    site = requests.get(url).content
    #Soup strainer restricts parsing to the results section to speed up soup
    strainer = SoupStrainer(class_="section-main")
    #Parse the html
    soup = BeautifulSoup(site, 'lxml', parse_only=strainer)

    #Loop through each row and collect the text of its cells
    my_table = []
    for row in soup.find_all(class_='list-group-item'):
        row_data = [cell.text for cell in row.find_all(class_='list-field')]

        #Rows without any cells carry no result data
        if not row_data:
            continue

        my_table.append({"Place (Overall)": row_data[0],
                         "Place (Gender)": row_data[1],
                         "Place (Category)": row_data[2],
                         "Name": row_data[3],
                         "Sex": sex,
                         "Club": row_data[4],
                         "Running Number": row_data[5],
                         "Category": row_data[6],
                         "Finish": row_data[7],
                         "Year": year})

    #Strip table header. The frame is returned directly: DataFrame.append was
    #removed in pandas 2.0 and appending to an empty frame added nothing.
    return pd.DataFrame(my_table).iloc[1:]

In [None]:
def get_results_old(url, sex, year):
    """Scrape one page of the old Virgin London Marathon results site (2014 to 2018).

    Parameters
    ----------
    url : str
        Address of the results page to download.
    sex : str
        Value stored in the "Sex" column of every row.
    year : int
        Value stored in the "Year" column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` that contains at least one ``<td>``, minus the
        first such row, which is dropped as the table header. Columns:
        "Place (Overall)", "Place (Gender)", "Place (Category)", "Name", "Sex",
        "Club", "Running Number", "Category", "Finish", "Year". The frame is
        empty (and has no columns) when the page yields no rows.

    Raises
    ------
    IndexError
        If a non-empty row has fewer than nine ``<td>`` cells.
    """
    #Use requests to get content from site
    site = requests.get(url).content
    #Soup strainer restricts parsing to the table bodies to speed up soup
    strainer = SoupStrainer('tbody')
    #Parse the html
    soup = BeautifulSoup(site, 'lxml', parse_only=strainer)

    my_table = []
    for row in soup.find_all('tr'):
        row_data = []
        for cell in row.find_all('td'):
            #Check if cell has alt text, if so use that as data. A span without
            #a title has no alt text, so fall back to the visible cell text
            #instead of raising KeyError.
            span = cell.find('span')
            if span is not None and span.has_attr('title'):
                row_data.append(span['title'])
            else:
                row_data.append(cell.text)

        #Rows without any cells carry no result data
        if not row_data:
            continue

        #NOTE(review): cell 7 is deliberately skipped and the finish time is
        #read from cell 8 - presumably cell 7 is an intermediate split; confirm.
        my_table.append({"Place (Overall)": row_data[0],
                         "Place (Gender)": row_data[1],
                         "Place (Category)": row_data[2],
                         "Name": row_data[3],
                         "Sex": sex,
                         "Club": row_data[4],
                         "Running Number": row_data[5],
                         "Category": row_data[6],
                         "Finish": row_data[8],
                         "Year": year})

    #Strip table header. The frame is returned directly: DataFrame.append was
    #removed in pandas 2.0 and appending to an empty frame added nothing.
    return pd.DataFrame(my_table).iloc[1:]

In [None]:
def get_results(url):
    """Scrape one results page, choosing the scraper that matches the url's year.

    The year is read from the four digits that follow ".com/" in the url and
    the sex from the single character that follows "sex%5D=".

    Parameters
    ----------
    url : str
        Results page address containing both the year and the sex parameter.

    Returns
    -------
    pandas.DataFrame or None
        Output of get_results_new for years from 2019, of get_results_old for
        years 2010 to 2018, and None for any earlier year.

    Raises
    ------
    AttributeError
        If the url does not contain the year or the sex pattern.
    """
    #Raw strings keep the regex escapes (\. \d \w) out of Python's own string
    #escape handling, which warns about them on recent versions.
    year = int(re.search(r'\.com\/(\d{4})\/', url).group(1))
    sex = re.search(r'sex%5D=(\w)', url).group(1)

    if year >= 2019:
        return get_results_new(url, sex, year)
    if year >= 2010:
        return get_results_old(url, sex, year)
    #No scraper exists for earlier years
    return None

In [None]:
def get_virgin_urls(sex, pages, year):
    """Build the list of results-page urls for one sex and one year.

    Having the urls as a plain list lets the scraping function be mapped over
    them with a pool.

    Parameters
    ----------
    sex : str
        Value placed in the url's sex search parameter.
    pages : int
        Number of pages; one url is produced for each page 1..pages.
    year : int
        Race year; it selects which of the three url layouts is used.

    Returns
    -------
    list of str
        One url per page, in page order. For years before 2010 no layout is
        known and the list holds the placeholder 'NaN' once per page.
    """
    host = 'https://results.virginmoneylondonmarathon.com/'

    #Pick the pieces that surround the page number and the sex for this era
    if year >= 2019:
        page_path = '/?page='
        query = '&event=ALL&num_results=1000&pid=search&pidp=results_nav&search%5Bsex%5D='
        tail = '&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name'
    elif year >= 2014:
        page_path = '/?page='
        query = '&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D='
        tail = ''
    elif year >= 2010:
        page_path = '/index.php?page='
        query = '&event=MAS&num_results=1000&pid=search&search%5Bsex%5D='
        tail = ''
    else:
        return ['NaN'] * pages

    #Pages on the site are numbered from 1
    return [host + str(year) + page_path + str(page_no) + query + sex + tail
            for page_no in range(1, pages + 1)]

In [None]:
#The page counts below were looked up once with a technique like the snippet
#underneath; it is not part of the functions because the requests take forever!
# site_m=requests.get(url1+'1'+url2+'M').text
# site_w=requests.get(url1+'1'+url2+'W').text
# soup_m = BeautifulSoup(site_m,'lxml')
# soup_w = BeautifulSoup(site_w,'lxml')

# m_pages = int(soup_m.find(class_='pages').text[-4:-2])
# w_pages = int(soup_w.find(class_='pages').text[-4:-2])
# print(m_pages, w_pages)

#Number of 1000-result pages per year, first entry is 2010
#NOTE(review): both lists hold 11 entries but only the years 2010-2019 are
#iterated, so the last entry (presumably 2020) is never used - confirm intent.
pages_men = [23, 24, 23, 25, 23, 24, 24, 24, 24, 25, 22]
pages_women = [13, 14, 13, 13, 14, 15, 16, 16, 17, 18, 22]

#For every year add the men's pages followed by the women's pages
urls = []
for year, n_men, n_women in zip(range(2010, 2020), pages_men, pages_women):
    w_urls = get_virgin_urls('W', n_women, year)
    m_urls = get_virgin_urls('M', n_men, year)
    urls.extend(m_urls + w_urls)

In [None]:
#Shorter urls to test with
urls = []
pages_men = [22, 25]
pages_women = [22, 18]
for i, year in enumerate(range(2019, 2020)):
    w_urls = get_virgin_urls('W', pages_women[i], year)
    m_urls = get_virgin_urls('M', pages_men[i], year)
    new_urls = m_urls + w_urls
    urls = urls + new_urls

In [None]:
#Setup multiprocessing and start scraping!
#Pool here is the thread-based pool from multiprocessing.dummy
pool = Pool(8)
try:
    #Scrape every url; each call returns a DataFrame (or None for unsupported years)
    results2 = pool.map(get_results, urls)
finally:
    #Cleanup after yourself, even when one of the downloads raised
    pool.terminate()
    pool.join()

#Stack the per-page frames into a single frame
df2 = pd.concat(results2)

In [None]:
#Some quick data cleaning
#Start from a copy of the scraped frame (df2): this defines
#london_marathon_results on a fresh kernel and makes the cell safe to re-run.
london_marathon_results = df2.copy()

#Some cell texts carry the column label in front of the value
#(NOTE(review): presumably only on the newer page layout - confirm).
#Remove the label only at the start of the value, keeping any leading
#whitespace, so that a real value containing the same word (for example a
#club name containing "Club") is left intact.
for column in ['Club', 'Running Number', 'Category', 'Finish']:
    london_marathon_results[column] = london_marathon_results[column].str.replace(
        r'^(\s*)' + column, r'\1', regex=True)

In [None]:
# Let's see what we've got: summary statistics for each column of the results
london_marathon_results.describe()

In [None]:
# And quickly save them in a csv
# A path relative to the working directory keeps the notebook portable and
# avoids embedding one user's home directory in the shared file.
london_marathon_results.to_csv('London_Marathon.csv', index=False, header=True)

In [None]:
#Profiling snippet for finding bottlenecks (needs the %lprun magic from line_profiler); the parser may still need speeding up
# url = 'https://results.virginmoneylondonmarathon.com/2019/?page=1&event=ALL&'+ \
#       'num_results=1000&pid=search&pidp=results_nav&search%5Bsex%5D=M&search%5Bage_'+ \
#        'class%5D=%25&search%5Bnation%5D=%25&search_sort=name'

# %lprun -f get_results_new get_results_new(url, "M", 2019)