In [None]:
# Setting up required libraries
import pandas as pd
import numpy as np
import requests
import lxml
from bs4 import BeautifulSoup, SoupStrainer

In [None]:
def get_results_new(url1, url2, url3, pages, sex):
    """Scrape the modern Virgin London Marathon results pages (2019 and 2020).

    Each page URL is built as ``url1 + str(page) + url2 + sex + url3``.

    Parameters
    ----------
    url1, url2, url3 : str
        URL fragments placed around the page number and the sex code.
    pages : int
        Highest page number to request; pages ``0`` to ``pages`` inclusive
        are fetched.
    sex : str
        Sex code inserted into the URL and stored in the ``"Sex"`` column.

    Returns
    -------
    pandas.DataFrame
        One row per scraped result row, with the first row of every page
        (the table header) dropped. Empty if nothing was scraped.

    Raises
    ------
    IndexError
        If a non-empty row has fewer than eight ``list-field`` cells.
    """
    # Per-page frames are collected and concatenated once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every call.
    page_frames = []

    # The strainer restricts parsing to the results container to speed up the
    # soup; it does not depend on the page, so it is built once.
    strainer = SoupStrainer(id="cbox-main")

    # NOTE(review): range(pages + 1) also requests page 0 - confirm the site
    # treats page 0 as distinct from page 1, otherwise rows are duplicated.
    for i in range(pages + 1):
        # Use requests to get the raw page content
        site = requests.get(url1 + str(i) + url2 + sex + url3).content
        # Parse the html
        soup = BeautifulSoup(site, 'lxml', parse_only=strainer)

        # Turn every result row into a dict of named cells
        my_table = []
        for row in soup.find_all(class_='list-group-item'):
            row_data = [cell.text for cell in row.find_all(class_='list-field')]

            # Rows without any cells carry no result data
            if len(row_data) > 0:
                my_table.append({
                    "Place (Overall)": row_data[0],
                    "Place (Gender)": row_data[1],
                    "Place (Category)": row_data[2],
                    "Name": row_data[3],
                    "Sex": sex,
                    "Club": row_data[4],
                    "Running Number": row_data[5],
                    "Category": row_data[6],
                    "Finish": row_data[7],
                })

        # Strip the table header (first row of every page)
        df = pd.DataFrame(my_table).iloc[1:]

        # Empty pages contribute nothing, so they are left out of the concat
        if not df.empty:
            page_frames.append(df)

    # pd.concat raises on an empty list, so fall back to an empty frame
    if not page_frames:
        return pd.DataFrame()
    return pd.concat(page_frames)

In [None]:
def get_results_old(url1, url2, pages, sex):
    """Scrape the old Virgin London Marathon results pages (table layout).

    Each page URL is built as ``url1 + str(page) + url2 + sex``.

    Parameters
    ----------
    url1, url2 : str
        URL fragments placed before and after the page number.
    pages : int
        Highest page number to request; pages ``0`` to ``pages`` inclusive
        are fetched.
    sex : str
        Sex code appended to the URL and stored in the ``"Sex"`` column.

    Returns
    -------
    pandas.DataFrame
        One row per scraped ``<tr>``, with the first row of every page
        dropped. Empty if nothing was scraped.

    Raises
    ------
    IndexError
        If a non-empty row has fewer than nine ``<td>`` cells.
    """
    # Per-page frames are collected and concatenated once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every call.
    page_frames = []

    # The strainer restricts parsing to <tbody> to speed up the soup; it does
    # not depend on the page, so it is built once.
    strainer = SoupStrainer('tbody')

    # NOTE(review): range(pages + 1) also requests page 0 - confirm the site
    # treats page 0 as distinct from page 1, otherwise rows are duplicated.
    for i in range(pages + 1):
        # Use requests to get the raw page content
        site = requests.get(url1 + str(i) + url2 + sex).content
        # Parse the html
        soup = BeautifulSoup(site, 'lxml', parse_only=strainer)

        # Turn every table row into a dict of named cells
        my_table = []
        for row in soup.find_all('tr'):
            row_data = [cell.text for cell in row.find_all('td')]

            # Rows without any <td> cells carry no result data
            if len(row_data) > 0:
                my_table.append({
                    "Place (Overall)": row_data[0],
                    "Place (Gender)": row_data[1],
                    "Place (Category)": row_data[2],
                    "Name": row_data[3],
                    "Sex": sex,
                    "Club": row_data[4],
                    "Running Number": row_data[5],
                    "Category": row_data[6],
                    # Cell 7 is not captured; the finish time is read from cell 8.
                    "Finish": row_data[8],
                })

        # Strip the first row of every page
        df = pd.DataFrame(my_table).iloc[1:]

        # Empty pages contribute nothing, so they are left out of the concat
        if not df.empty:
            page_frames.append(df)

    # pd.concat raises on an empty list, so fall back to an empty frame
    if not page_frames:
        return pd.DataFrame()
    return pd.concat(page_frames)

In [None]:
def get_virgin_urls(pages, sex, year):
    """Build one results-page URL per page for the given year and sex.

    The URL layout depends on the year: 2019 and later, 2014 to 2018, and
    2010 to 2013 each use a different query string.

    Parameters
    ----------
    pages : int
        Number of result pages; URLs are generated for pages 1 to ``pages``.
    sex : str
        Sex code inserted into the query string.
    year : int
        Race year, used both in the URL path and to pick the URL layout.

    Returns
    -------
    list of str
        ``pages`` URLs in page order. For years before 2010 no layout is
        known and the list holds the placeholder string ``'NaN'`` instead.
    """
    base = 'https://results.virginmoneylondonmarathon.com/' + str(year)

    if year >= 2019:
        prefix = base + '/?page='
        suffix = ('&event=ALL&num_results=1000&pid=search&pidp=results_nav&search%5Bsex%5D='
                  + sex
                  + '&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name')
    elif year >= 2014:
        prefix = base + '/?page='
        suffix = ('&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D='
                  + sex)
    elif year >= 2010:
        prefix = base + '/index.php?page='
        suffix = '&event=MAS&num_results=1000&pid=search&search%5Bsex%5D=' + sex
    else:
        # No URL layout for this year: keep the placeholder entries.
        return ['NaN'] * pages

    # Each URL carries its own page number. Previously every entry used the
    # total page count, and two branches indexed the list with a string.
    # NOTE(review): pages are assumed to be numbered from 1 - confirm.
    return [prefix + str(page) + suffix for page in range(1, pages + 1)]

In [None]:
# Scrape the 2020 results. The URL is split into three fragments that go
# around the page number and the sex code.
url1, url2, url3 = (
    'https://results.virginmoneylondonmarathon.com/2020/?page=',
    '&event=ALL&num_results=1000&pid=search&pidp=results_nav&search%5Bsex%5D=',
    '&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name',
)
# Men first, then women, both with pages=22.
# There is no search option for any other gender/sex.
mens_2020, womens_2020 = (
    get_results_new(url1, url2, url3, pages=22, sex=sex_code)
    for sex_code in ('M', 'W')
)

In [None]:
# Small trial run against the 2020 results: same URL fragments as the full
# scrape, but only pages=2 and only the men's results.
url1, url2, url3 = (
    'https://results.virginmoneylondonmarathon.com/2020/?page=',
    '&event=ALL&num_results=1000&pid=search&pidp=results_nav&search%5Bsex%5D=',
    '&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name',
)
mens_2020_test = get_results_new(url1, url2, url3, sex='M', pages=2)

In [None]:
# Scrape the 2019 results. The URL is split into three fragments that go
# around the page number and the sex code.
url1, url2, url3 = (
    'https://results.virginmoneylondonmarathon.com/2019/?page=',
    '&event=ALL&num_results=1000&pid=search&pidp=results_nav&search%5Bsex%5D=',
    '&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name',
)
# Men: pages=25. There is no search option for any other gender/sex.
mens_2019 = get_results_new(url1, url2, url3, sex='M', pages=25)
# Women: pages=18.
womens_2019 = get_results_new(url1, url2, url3, sex='W', pages=18)

# Stack both years into one frame (2020 first, then 2019)
results_2019_2020 = pd.concat(
    [
        mens_2020,
        womens_2020,
        mens_2019,
        womens_2019,
    ]
)

In [None]:
# Get results for 2014 to 2018. For each year, page 1 is fetched once per sex
# to read the number of result pages, then every page is scraped.
yearly_frames_2014_2018 = []
for year in range(2014, 2019):
    url1 = 'https://results.virginmoneylondonmarathon.com/' + str(year) + '/?page='
    url2 = '&event=MAS&num_results=1000&pid=list&search%5Bage_class%5D=%25&search%5Bsex%5D='

    site_m = requests.get(url1 + '1' + url2 + 'M').text
    site_w = requests.get(url1 + '1' + url2 + 'W').text
    soup_m = BeautifulSoup(site_m, 'lxml')
    soup_w = BeautifulSoup(site_w, 'lxml')

    # The page count is read from a fixed slice of the pager text.
    # NOTE(review): [-4:-2] presumably holds a two-character page count
    # followed by two trailing characters - confirm against the live markup.
    m_pages = int(soup_m.find(class_='pages').text[-4:-2])
    w_pages = int(soup_w.find(class_='pages').text[-4:-2])

    # get_results_old is the scraper for the 2014 to 2018 layout; the
    # previously called get_results_2014_2018 is not defined in this notebook.
    mens = get_results_old(url1, url2, pages=m_pages, sex='M')
    womens = get_results_old(url1, url2, pages=w_pages, sex='W')

    yearly_frames_2014_2018.extend([mens, womens])

# Concatenate once at the end (DataFrame.append was removed in pandas 2.0);
# empty frames are skipped and an all-empty scrape yields an empty frame.
yearly_frames_2014_2018 = [frame for frame in yearly_frames_2014_2018 if not frame.empty]
results_2014_2018 = pd.concat(yearly_frames_2014_2018) if yearly_frames_2014_2018 else pd.DataFrame()

In [None]:
# Get results for 2010 to 2013. For each year, page 1 is fetched once per sex
# to read the number of result pages, then every page is scraped.
yearly_frames_2010_2013 = []
for year in range(2010, 2014):
    url1 = 'https://results.virginmoneylondonmarathon.com/' + str(year) + '/index.php?page='
    url2 = '&event=MAS&num_results=1000&pid=search&search%5Bsex%5D='
    site_m = requests.get(url1 + '1' + url2 + 'M').text
    site_w = requests.get(url1 + '1' + url2 + 'W').text
    soup_m = BeautifulSoup(site_m, 'lxml')
    soup_w = BeautifulSoup(site_w, 'lxml')

    # The page count is read from a fixed slice of the pager text.
    # NOTE(review): [-4:-2] presumably holds a two-character page count
    # followed by two trailing characters - confirm against the live markup.
    m_pages = int(soup_m.find(class_='pages').text[-4:-2])
    w_pages = int(soup_w.find(class_='pages').text[-4:-2])

    mens = get_results_old(url1, url2, pages=m_pages, sex='M')
    womens = get_results_old(url1, url2, pages=w_pages, sex='W')

    yearly_frames_2010_2013.extend([mens, womens])

# Concatenate once at the end (DataFrame.append was removed in pandas 2.0);
# empty frames are skipped and an all-empty scrape yields an empty frame.
yearly_frames_2010_2013 = [frame for frame in yearly_frames_2010_2013 if not frame.empty]
results_2010_2013 = pd.concat(yearly_frames_2010_2013) if yearly_frames_2010_2013 else pd.DataFrame()

# Combine every scraped period into one frame. pd.concat takes a single list
# of frames; the opening bracket was missing, which made this cell a SyntaxError.
london_marathon_results = pd.concat([results_2019_2020, results_2014_2018, results_2010_2013])

In [None]:
# Some quick data cleaning: in each of these columns, remove the column's own
# label text from the scraped values (literal match, not a regex).
# NOTE(review): every occurrence is removed, so a value that itself contains
# the label (e.g. a club name containing "Club") is altered too - confirm
# this is intended.
for label in ("Club", "Running Number", "Category", "Finish"):
    london_marathon_results[label] = london_marathon_results[label].str.replace(label, "", regex=False)

In [None]:
# Let's see what we've got: display pandas' summary statistics for the
# combined results frame as this cell's output.
london_marathon_results.describe()

In [None]:
# Write the combined results to a CSV file, keeping the header row and
# leaving out the index column.
# NOTE(review): the destination is an absolute, user-specific Windows path;
# running this elsewhere needs the path adjusted.
london_marathon_results.to_csv(
    path_or_buf=r'C:\Users\michael.walshe\Documents\Python and CAS\London_Marathon.csv',
    header=True,
    index=False,
)