In [1]:
# Setting up required libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
# Connect to website to be scraped, and get all html.
# A request URL is assembled as url1 + <page number> + url2 + <sex> + url3.
url1='https://results.virginmoneylondonmarathon.com/2020/?page='
url2='&event=ALL&num_results=1000&pid=search&pidp=results_nav&search%5Bsex%5D='
url3='&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name'

# Get results for men <- There is no search option for other gender/sex.
# NOTE(review): this comment used to say "878 pages of results", but the loop requests
# 22 pages (page=0..21) at num_results=1000 -- confirm that 22 pages cover the whole
# list and whether the site's page numbering starts at 0 or 1.
sex = 'M'
page_frames = []  # one DataFrame per page; concatenated once after the loop
for i in range(22):
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response = requests.get(url1 + str(i) + url2 + sex + url3, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    fields = soup.find(class_='section-main')
    if fields is None:
        raise RuntimeError(
            "No 'section-main' element on page {} (sex={})".format(i, sex)
        )

    my_table = []
    for row in fields.find_all(class_='list-group-item'):
        row_data = [cell.text for cell in row.find_all(class_='list-field')]

        # List items without any 'list-field' children carry no result data.
        if row_data:
            my_table.append({
                "Place (Overall)": row_data[0],
                "Place (Gender)": row_data[1],
                "Place (Category)": row_data[2],
                "Name": row_data[3],
                "Sex": sex,
                "Club": row_data[4],
                "Running Number": row_data[5],
                "Category": row_data[6],
                "Finish": row_data[7],
            })

    # The first parsed item of every page is dropped
    # (presumably the list's header row -- TODO confirm).
    page_frames.append(pd.DataFrame(my_table).iloc[1:])

# DataFrame.append was removed in pandas 2.0; a single concat is also linear, not quadratic.
# Row labels are kept per page (no ignore_index), exactly as the append chain produced them.
mens_results = pd.concat(page_frames)

In [3]:
# Get results for women.
# Uses url1 / url2 / url3 from the cell above: url1 + <page number> + url2 + <sex> + url3.
# NOTE(review): the output saved further down in this notebook shows rows labelled F with
# what look like male first names, and the last page ends at surnames starting "Ko".
# Verify that 'F' is a value the site's sex filter accepts and that 22 pages
# (page=0..21) reach the end of the list.
sex = 'F'
page_frames = []  # one DataFrame per page; concatenated once after the loop
for i in range(22):
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    response = requests.get(url1 + str(i) + url2 + sex + url3, timeout=60)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    fields = soup.find(class_='section-main')
    if fields is None:
        raise RuntimeError(
            "No 'section-main' element on page {} (sex={})".format(i, sex)
        )

    my_table = []
    for row in fields.find_all(class_='list-group-item'):
        row_data = [cell.text for cell in row.find_all(class_='list-field')]

        # List items without any 'list-field' children carry no result data.
        if row_data:
            my_table.append({
                "Place (Overall)": row_data[0],
                "Place (Gender)": row_data[1],
                "Place (Category)": row_data[2],
                "Name": row_data[3],
                "Sex": sex,
                "Club": row_data[4],
                "Running Number": row_data[5],
                "Category": row_data[6],
                "Finish": row_data[7],
            })

    # The first parsed item of every page is dropped
    # (presumably the list's header row -- TODO confirm).
    page_frames.append(pd.DataFrame(my_table).iloc[1:])

# DataFrame.append was removed in pandas 2.0; a single concat is also linear, not quadratic.
# Row labels are kept per page (no ignore_index), exactly as the append chain produced them.
womens_results = pd.concat(page_frames)

In [52]:
# Stack the men's and women's frames row-wise into a single table.
# Row labels from each input are carried over unchanged (ignore_index is not set).
results = pd.concat(objs=[mens_results, womens_results], axis=0)

In [54]:
# Some quick data cleaning.
# Remove leftover titles: the literal column label is stripped from each of these columns.
# NOTE(review): every occurrence is removed, so a value that itself contains the word
# (e.g. a club name containing "Club") loses it too -- confirm that is acceptable.
for column in ("Club", "Running Number", "Category", "Finish"):
    results[column] = results[column].str.replace(column, "", regex=False)

# Extract country groups, like (USA), from the Name column. The capture group sits inside
# the brackets, so no separate bracket-stripping step is needed.
results['Country'] = results['Name'].str.extract(r'\(([A-Z]{3,})\)', expand=False)
# Remove the country group from the name. The pattern matches the extraction above ({3,}),
# and regex=True is explicit because pandas >= 2.0 treats the pattern literally by default.
results['Name'] = (
    results['Name']
    .str.replace(r'\([A-Z]{3,}\)', "", regex=True)
    .str.strip()
)
# Split "Last, First" into new columns; strip the space that follows the comma.
name_parts = results['Name'].str.split(pat=",", n=1, expand=True)
results['FirstName'] = name_parts[1].str.strip()
results['LastName'] = name_parts[0].str.strip()
# Replace non-standard '–' with NaN for missing vals. replace() returns a new frame, so the
# result must be assigned back; otherwise the placeholders stay in `results` and the CSV.
results = results.replace('–', np.nan)

# Display the cleaned frame as the cell output.
results

Unnamed: 0,Place (Overall),Place (Gender),Place (Category),Name,Sex,Club,Running Number,Category,Finish,Country,FirstName,LastName
1,27350,15247,5416,"Abando, Jeffrey",M,,9162,18-39,07:02:57,,Jeffrey,Abando
2,4946,3859,828,"Abbas, Ahmad",M,,33403,40-44,03:54:10,,Ahmad,Abbas
3,16562,10605,3946,"Abbas, Wasim",M,,28880,18-39,05:15:03,,Wasim,Abbas
4,,,,"Abbot, Ryan",M,,38738,40-44,,,Ryan,Abbot
5,7419,5520,2134,"Abbotson, Peter",M,,16712,18-39,04:12:28,,Peter,Abbotson
...,...,...,...,...,...,...,...,...,...,...,...,...
996,,,,"Kozak, Lauren",F,,42820,18-39,,,Lauren,Kozak
997,3068,2533,1041,"Kozak, Vitalii",F,,13199,18-39,03:38:43,,Vitalii,Kozak
998,,,,"Koziol, Andy",F,,13924,45-49,,,Andy,Koziol
999,25271,14412,5151,"Kozior, Konrad",F,,41829,18-39,06:34:31,,Konrad,Kozior


In [58]:
# And quickly save them in a csv.
# The path is relative to the notebook's working directory rather than an absolute,
# user-specific location, so the notebook runs on any machine.
# NOTE(review): the previous target was a "Scraping_London_Marathon_Project" folder under
# one user's Documents -- presumably this notebook's own folder; adjust OUTPUT_CSV if not.
OUTPUT_CSV = 'London_2020.csv'
results.to_csv(OUTPUT_CSV, index=False, header=True)