In [1]:
# Setting up required libraries
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
# Connect to website to be scraped, and get all html
url1='https://results.virginmoneylondonmarathon.com/2020/?page='
url2='&event=ALL&num_results=1000&pid=search&pidp=results_nav&search%5Bsex%5D='
url3='&search%5Bage_class%5D=%25&search%5Bnation%5D=%25&search_sort=name'

#Get results for men, 878 pages of results <-There is no search option for other gender/sex
mens_results = pd.DataFrame()
for i in range(22):
    sex='M'
    website_url=requests.get(url1+str(i)+url2+sex+url3).text
    soup = BeautifulSoup(website_url,'lxml')

    fields = soup.find(class_='section-main')

    my_table = []
    for row in fields.findAll(class_='list-group-item'):
        row_data = []
        for cell in row.findAll(class_='list-field'):
            row_data.append(cell.text)

        if(len(row_data) > 0):
            data_item = {"Place (Overall)": row_data[0],
                         "Place (Gender)": row_data[1],
                         "Place (Category)": row_data[2],
                         "Name": row_data[3],
                         "Sex": sex,
                         "Club": row_data[4],
                         "Running Number": row_data[5],
                         "Category": row_data[6],
                         "Finish": row_data[7],
            }
            my_table.append(data_item)

    df = pd.DataFrame(my_table).iloc[1:]
    
    mens_results = mens_results.append(df)

In [3]:
#Get results for women
womens_results = pd.DataFrame()

for i in range(22):
    sex='F'
    website_url=requests.get(url1+str(i)+url2+sex+url3).text
    soup = BeautifulSoup(website_url,'lxml')

    fields = soup.find(class_='section-main')

    my_table = []
    for row in fields.findAll(class_='list-group-item'):
        row_data = []
        for cell in row.findAll(class_='list-field'):
            row_data.append(cell.text)

        if(len(row_data) > 0):
            data_item = {"Place (Overall)": row_data[0],
                         "Place (Gender)": row_data[1],
                         "Place (Category)": row_data[2],
                         "Name": row_data[3],
                         "Sex": sex,
                         "Club": row_data[4],
                         "Running Number": row_data[5],
                         "Category": row_data[6],
                         "Finish": row_data[7],
            }
            my_table.append(data_item)

    df = pd.DataFrame(my_table).iloc[1:]
    
    womens_results = womens_results.append(df)

In [4]:
# Concatenate results
results = pd.concat([mens_results, womens_results])

In [6]:
results

Unnamed: 0,Place (Overall),Place (Gender),Place (Category),Name,Sex,Club,Running Number,Category,Finish,Country,FirstName,LastName
1,27354,15250,5417,Abando Jeffrey,M,,9162,18-39,07:02:57,USA,Jeffrey,Abando
2,4947,3860,828,Abbas Ahmad,M,,33403,40-44,03:54:10,GBR,Ahmad,Abbas
3,16564,10607,3947,Abbas Wasim,M,,28880,18-39,05:15:03,GBR,Wasim,Abbas
4,,,,Abbot Ryan,M,,38738,40-44,,GBR,Ryan,Abbot
5,7420,5521,2135,Abbotson Peter,M,,16712,18-39,04:12:28,GBR,Peter,Abbotson
...,...,...,...,...,...,...,...,...,...,...,...,...
996,3069,2534,1042,Kozak Vitalii,F,,13199,18-39,03:38:43,GBR,Vitalii,Kozak
997,,,,Koziol Andy,F,,13924,45-49,,CAN,Andy,Koziol
998,25275,14415,5152,Kozior Konrad,F,,41829,18-39,06:34:31,GBR,Konrad,Kozior
999,,,,Kozlovskyi Oleksii,F,,7388,40-44,,UKR,Oleksii,Kozlovskyi


In [5]:
#Some quick data cleaning
#Remove leftover titles
results['Club'] = results['Club'].str.replace("Club", "", regex=False)
results['Running Number'] = results['Running Number'].str.replace("Running Number", "", regex=False)
results['Category'] = results['Category'].str.replace("Category", "", regex=False)
results['Finish'] = results['Finish'].str.replace("Finish", "", regex=False)

#Extract country groups, like (USA), from Name group
results['Country'] = results['Name'].str.extract(r'(\([A-Z]{3,}\))')
#Remove brackets in country
results['Country'] = results['Country'].str.replace(r'\(|\)', "")
#Remove country group from name column
results['Name'] = results['Name'].str.replace(r'(\([A-Z]{3}\))', "")

#Split first/lastname into new columns
LastFirst = results['Name'].str.split(pat=",", n=1, expand=True) 
results['FirstName'], results['LastName'] = LastFirst[1], LastFirst[0]
#Remove comma from Name column, so that this can be saved as a CSV ----- Must happen after splitting Name into two cols!!
results['Name'] = results['Name'].str.replace(r'(\,)', "")
#Replace non-standard '–' with NaN for missing vals
results = results.replace('–', np.nan)

#Set data types
results =  results.astype({"Place (Overall)": 'float64',
                           "Place (Gender)": 'float64',
                           "Place (Category)": 'float64',
                           "Name": str,
                           "Sex": str,
                           "Club": str,
                           "Running Number": 'float64',
                           "Category": 'category',
                           "Year": 'Int64',
                           "Country": str,
                           "FirstName": str,
                           "LastName": str})
# Due to an irritating bug with converting objects to Int64, needed to first convert to float and then to int
results =  results.astype({"Place (Overall)": 'Int64',
                           "Place (Gender)": 'Int64',
                           "Place (Category)": 'Int64',
                           "Running Number": 'Int64'})
results['Finish'] = pd.to_timedelta(results['Finish'])
results['Finish (Total Seconds)'] = results['Finish'].dt.total_seconds()

KeyError: 'Only a column name can be used for the key in a dtype mappings argument.'

In [7]:
results.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44000 entries, 1 to 1000
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Place (Overall)   38724 non-null  object
 1   Place (Gender)    38724 non-null  object
 2   Place (Category)  38649 non-null  object
 3   Name              44000 non-null  object
 4   Sex               44000 non-null  object
 5   Club              8021 non-null   object
 6   Running Number    44000 non-null  object
 7   Category          43891 non-null  object
 8   Finish            38724 non-null  object
 9   Country           43304 non-null  object
 10  FirstName         44000 non-null  object
 11  LastName          44000 non-null  object
dtypes: object(12)
memory usage: 4.4+ MB


In [None]:
# And quickly save them in a csv
results.to_csv(r'C:\Users\michael.walshe\Documents\Python Projects\Scraping_London_Marathon_Project\London_2020.csv', index=False, header=True)