# Scraping event history of a weekly 5k in KN
This script scrapes the event history of a weekly 5k running event in Konstanz. The page it scrapes only includes information about the fastest male and female finishers, not about the other runners. Since the organisation doesn't allow web scraping, the scraped data will be anonymised and the script will be adjusted such that no direct references to the 5k event can be read in it.

The scraped data will be saved as an anonymised csv file.

In [26]:
# import relevant libraries
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import hashlib

First I use the requests library to download the HTML content of the website. It is important to pass a User-Agent header with the request; otherwise the server responds with a 403 error message instead of the page.

In [40]:
URL = 'REDACTED'
headers = {'User-Agent': 'REDACTED'}
# The User-Agent header is required: without it the server answers with a
# 403 error page. The timeout keeps the request from hanging forever if the
# server never responds.
page = requests.get(URL, headers=headers, timeout=30)
# Stop right here on any HTTP error status (e.g. 403) instead of silently
# handing an error page to the parsing steps that follow.
page.raise_for_status()

I then use BeautifulSoup to parse the html content of the website.

In [41]:
# Turn the raw response body into a searchable tree using the 'html.parser' backend.
soup = BeautifulSoup(markup=page.content, features='html.parser')

From the browser's developer tools, I know that the information I want is stored in a table with the class Results-table.

In [42]:
# Locate the results table by its CSS class.
table = soup.find('table', class_='Results-table')
if table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # rather than with an AttributeError on the next use of `table`.
    raise ValueError("No <table class='Results-table'> found in the page")

In fact, I can retrieve all the information I want from the attributes of each row's table row (tr) tag. So I first create a list of all the table rows.

In [5]:
# Collect every <tr> of class Results-table-row found inside the table.
rows = table.find_all(name='tr', attrs={'class': 'Results-table-row'})

Once I have this list, I create a list for each column I want to have in my dataframe later and use regular expressions to extract a rough cut of the information. Each extracted string still includes the attribute name that describes what kind of information it is (for example data-date="...").

In [21]:
# One list per future dataframe column. Entry i of every list must describe
# the same table row, so all lists have to stay exactly the same length.
dates = []
first_f = []
time_f = []
finishers = []
first_m = []
time_m = []
event_num = []
num_vols = []

# Pair each target list with the pattern that finds its raw `name="value"`
# text in a row's HTML. Compiled once here instead of on every row.
# The two name patterns require at least two space-separated parts without
# digits and a space after the closing quote.
row_patterns = [
    (dates, re.compile(r'data-date="\d{4}-\d{2}-\d{2}"')),
    (first_f, re.compile(r'data-female="[^0-9]+(?: [^0-9]+)+\.?" ')),
    (time_f, re.compile(r'data-femaletime="\d+"')),
    (finishers, re.compile(r'data-finishers="\d+"')),
    (first_m, re.compile(r'data-male="[^0-9]+(?: [^0-9]+)+\.?" ')),
    (time_m, re.compile(r'data-maletime="\d+"')),
    (event_num, re.compile(r'data-parkrun="\d+"')),
    (num_vols, re.compile(r'data-volunteers="\d+"')),
]

for r in rows:
    row_html = str(r)  # serialise the tag once per row
    matches = [pattern.search(row_html) for _, pattern in row_patterns]

    # Take a row only if every attribute was found. Appending some values and
    # then skipping the rest would leave the lists with different lengths and
    # shift all later rows against each other.
    if any(m is None for m in matches):
        continue

    for (target, _), m in zip(row_patterns, matches):
        target.append(m.group(0))


I then clean the information so only the relevant bits remain.

In [23]:
lists = [dates, first_f, time_f, finishers, first_m, time_m, event_num, num_vols]

# Every raw entry still has the form name="value"; keep only the quoted value.
value_pattern = re.compile(r'[a-zA-Z\-]="([^"]+)"')

for column in lists:
    for pos, raw in enumerate(column):
        # Overwrite in place so the original list objects hold the clean values.
        column[pos] = value_pattern.search(raw).group(1)

Finally, the times are written as four digits (mmss), so I insert a colon to make them readable. I assume here that the fastest times are always in the format mm:ss. This is a fairly reasonable assumption, as the fastest 5k times are never below ten minutes and rarely over 1h.

In [24]:
# Insert a colon after the first two characters of every winner time,
# turning a digit string such as mmss into mm:ss.
for times in (time_f, time_m):
    for pos, digits in enumerate(times):
        times[pos] = ':'.join((digits[:2], digits[2:]))

In the last step, I create a pandas dataframe from the lists, anonymise the data and write it to a csv file.

In [39]:
# Map each output column name to its list of values; the key order here is
# the column order of the dataframe and of the CSV file.
data_dict = {
    'date': dates,
    'first_female': first_f,
    'first_female_time': time_f,
    'first_male': first_m,
    'first_male_time': time_m,
    'number_of_finishers': finishers,
    'number_of_volunteers': num_vols,
    'event_number': event_num,
}

data = pd.DataFrame(data_dict)


def _sha256_hex(name):
    """Return the hexadecimal SHA-256 digest of *name* (encoded with str.encode())."""
    return hashlib.sha256(name.encode()).hexdigest()


# Replace both name columns by their digests so that no clear-text names are
# written to disk; identical names still map to identical digests.
# NOTE(review): the hash is unsalted, so a digest can be matched by hashing
# candidate names - confirm this level of anonymisation is sufficient.
for name_column in ('first_female', 'first_male'):
    data[name_column] = data[name_column].apply(_sha256_hex)

# Write the result without the dataframe's row index.
data.to_csv('5k_KN_history.csv', index=False)