In [None]:
# Setting up required libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [None]:
# Connect to website to be scraped, and get all html
# The results URL is assembled as url1 + <page> + url2 + <sex> + url3.
url1='https://results.virginmoneylondonmarathon.com/2020/?page='
url2='&event=ALL&num_results=1000&pid=search&pidp=results_nav&search%5Bsex%5D='
url3='&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name'

# Get results for men <- there is no search option for other gender/sex.
# Each request asks for 1000 rows (num_results=1000) and 22 pages are fetched.
# NOTE(review): pages are requested as 0..21 - confirm whether the site's
# pagination is 0- or 1-based, otherwise one page may be duplicated or missed.
sex = 'M'
mens_pages = []  # one DataFrame per page, concatenated once after the loop
for i in range(22):
    # A timeout stops the notebook hanging forever on a stalled connection, and
    # raise_for_status() fails loudly instead of parsing an HTTP error page.
    response = requests.get(url1+str(i)+url2+sex+url3, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text,'lxml')

    fields = soup.find(class_='section-main')
    if fields is None:
        raise ValueError("No 'section-main' element found on results page "
                         + str(i) + " for sex " + sex)

    my_table = []
    for row in fields.find_all(class_='list-group-item'):
        # Text of every field in this result row, in document order
        row_data = [cell.text for cell in row.find_all(class_='list-field')]

        if row_data:
            my_table.append({"Place (Overall)": row_data[0],
                             "Place (Gender)": row_data[1],
                             "Place (Category)": row_data[2],
                             "Name": row_data[3],
                             "Sex": sex,
                             "Club": row_data[4],
                             "Running Number": row_data[5],
                             "Category": row_data[6],
                             "Finish": row_data[7],
            })

    # Drop the first parsed row of every page
    # (presumably the column-header row - TODO confirm).
    mens_pages.append(pd.DataFrame(my_table).iloc[1:])

# DataFrame.append was removed in pandas 2.0; a single concat is also linear
# rather than quadratic in the number of pages.
mens_results = pd.concat(mens_pages)

In [None]:
# Get results for women.
# Uses the url1/url2/url3 pieces defined in the earlier cell; each request asks
# for 1000 rows and 22 pages are fetched.
# NOTE(review): pages are requested as 0..21 - confirm whether the site's
# pagination is 0- or 1-based, otherwise one page may be duplicated or missed.
sex = 'F'
womens_pages = []  # one DataFrame per page, concatenated once after the loop

for i in range(22):
    # A timeout stops the notebook hanging forever on a stalled connection, and
    # raise_for_status() fails loudly instead of parsing an HTTP error page.
    response = requests.get(url1+str(i)+url2+sex+url3, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text,'lxml')

    fields = soup.find(class_='section-main')
    if fields is None:
        raise ValueError("No 'section-main' element found on results page "
                         + str(i) + " for sex " + sex)

    my_table = []
    for row in fields.find_all(class_='list-group-item'):
        # Text of every field in this result row, in document order
        row_data = [cell.text for cell in row.find_all(class_='list-field')]

        if row_data:
            my_table.append({"Place (Overall)": row_data[0],
                             "Place (Gender)": row_data[1],
                             "Place (Category)": row_data[2],
                             "Name": row_data[3],
                             "Sex": sex,
                             "Club": row_data[4],
                             "Running Number": row_data[5],
                             "Category": row_data[6],
                             "Finish": row_data[7],
            })

    # Drop the first parsed row of every page
    # (presumably the column-header row - TODO confirm).
    womens_pages.append(pd.DataFrame(my_table).iloc[1:])

# DataFrame.append was removed in pandas 2.0; a single concat is also linear
# rather than quadratic in the number of pages.
womens_results = pd.concat(womens_pages)

In [None]:
# Concatenate results
# Every scraped page restarts its row labels, so the two inputs share many
# index values; ignore_index=True gives the combined frame a unique 0..n-1 index.
results = pd.concat([mens_results, womens_results], ignore_index=True)

In [None]:
#Some quick data cleaning
#Remove leftover titles (the column label is still glued onto each cell's text).
#Only the first occurrence is removed (n=1) so that a real value containing the
#same word, e.g. a club named "... Running Club", is not mangled.
#Assumes the label precedes the value in the cell text - TODO confirm.
for label in ('Club', 'Running Number', 'Category', 'Finish'):
    results[label] = results[label].str.replace(label, "", n=1, regex=False)

#Extract country groups, like (USA), from Name group.
#The same "3 or more capitals in brackets" pattern is used for extracting and
#for removing, so a longer code cannot be extracted yet left behind in Name.
country_group = r'\(([A-Z]{3,})\)'
#The capture group sits inside the brackets, so Country holds just the letters
results['Country'] = results['Name'].str.extract(country_group, expand=False)
#Remove country group from name column, and the whitespace it leaves behind
results['Name'] = results['Name'].str.replace(country_group, "", regex=True).str.strip()

#Split first/lastname into new columns (text before the first comma is the last name)
LastFirst = results['Name'].str.split(pat=",", n=1, expand=True)
#Strip so that FirstName does not keep the space that followed the comma
results['FirstName'] = LastFirst[1].str.strip()
results['LastName'] = LastFirst[0].str.strip()
#Remove comma from Name column ----- Must happen after splitting Name into two cols!!
results['Name'] = results['Name'].str.replace(",", "", regex=False)

#Replace non-standard '–', 'DSQ' and empty strings with NaN for missing vals
results = results.replace(['–', 'DSQ', ''], np.nan)

#Delete odd race number row - table description not actual data
results = results.loc[~results['Running Number'].isin(['RM9999', 'RF9999'])]

#Set data types
#The text columns are deliberately NOT cast with astype(str): that would turn
#the NaN markers set above into the literal string 'nan'.
integer_columns = ["Place (Overall)", "Place (Gender)", "Place (Category)", "Running Number"]
# Converting objects straight to Int64 is unreliable, so go via float64 first
results = results.astype({column: 'float64' for column in integer_columns})
results = results.astype({column: 'Int64' for column in integer_columns})
results = results.astype({"Category": 'category'})

#Finish time as a timedelta, plus the same duration in seconds
results['Finish'] = pd.to_timedelta(results['Finish'])
results['Finish (Total Seconds)'] = results['Finish'].dt.total_seconds()

In [None]:
# Print pandas' concise summary of `results` (column names, non-null counts and
# dtypes) as a quick check on the type conversions.
results.info()

In [None]:
# And quickly save them in a csv
# Written relative to the current working directory instead of an absolute,
# user-specific path, so the notebook can be run on any machine.
output_csv = 'London_2020.csv'
results.to_csv(output_csv, index=False, header=True)