In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import pickle 
from datetime import datetime
import math 
import requests
from bs4 import BeautifulSoup
import time
import sys 

from functions.predictor import process_parkrun_data_for_models, user_input, target_time

If you have made your own model/scaler with different parkrun data then update the filepaths below:

In [6]:
model_to_use = 'models/to_use/xgb_opt_model.pkl'
scaler_to_use = 'models/to_use/minmax_scaler.pkl'
data_for_model = 'data/clean/cleaned_parkrun_no_outliers.csv'

In [3]:
df = process_parkrun_data_for_models(data_for_model)

# Automatic input (with parkrun id or results link)


Do you know your parkrun id? 
Should be an integer following A. For example A-5125087

Your link for full results should be in the form:
https://www.parkrun.org.uk/parkrunner/5125087/all

In [41]:
parkrun_id = 5125087

'Owen GEORGE (A5125087)'

In [93]:
# Loop over the range of pages from x to y
url = f'https://www.parkrun.org.uk/parkrunner/{parkrun_id}/all/'

# Set up headers to avoid blocking by the website
headers = {
     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edge/110.0.1587.56',  # Updated User-Agent for newer browsers
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'DNT': '1',
    'Cache-Control': 'max-age=0',
    'TE': 'Trailers',
    'Pragma': 'no-cache',
    'Referer': 'https://www.parkrun.org.uk/',
    'Origin': 'https://www.parkrun.org.uk',
    'X-Requested-With': 'XMLHttpRequest',
    'If-None-Match': 'W/"f0b3eb46c6c7e1f04161c38a1f041f4"'
}

# Request the page content
response = requests.get(url, headers=headers)
response.raise_for_status()  # Check for any HTTP errors
soup = BeautifulSoup(response.content, "html.parser")
soup



<!DOCTYPE html>

<html lang="en-US">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="/wp-content/themes/parkrun/favicons/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
<link href="/wp-content/themes/parkrun/favicons/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
<link href="/wp-content/themes/parkrun/favicons/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
<link href="/wp-content/themes/parkrun/favicons/site.webmanifest" rel="manifest"/>
<link color="#2b233d" href="/wp-content/themes/parkrun/favicons/safari-pinned-tab.svg" rel="mask-icon"/>
<link href="/wp-content/themes/parkrun/favicons/favicon.ico" rel="shortcut icon"/>
<meta content="#da532c" name="msapplication-TileColor"/>
<meta content="/wp-content/themes/parkrun/favicons/browserconfig.xml" name="msapplication-config"/>
<meta content="#ffffff" name="theme-color"/>
<meta content="https://images.parkrun.com/website/gener

In [137]:
table = soup.find_all('table')

dates = []
times = []

for row in table[2].find_all('tr'):
    data_point = row.find_all('td')
    if len(data_point) > 4:
        date = data_point[1].find('span', class_="format-date")
        time = data_point[4]

        if date:
            dates.append(date.text.strip())
        if time:
            times.append(time.text.strip())

date_time_df = pd.DataFrame({'Date': dates, 'Time': times})

date_time_df


<span class="format-date">26/10/2024</span> <built-in method strip of str object at 0x00000209426FC150>
<span class="format-date">26/10/2024</span> <td>30:38</td>
<span class="format-date">05/10/2024</span> <td>27:44</td>
<span class="format-date">07/09/2024</span> <td>28:43</td>
<span class="format-date">06/07/2024</span> <td>28:21</td>
<span class="format-date">01/06/2024</span> <td>29:06</td>
<span class="format-date">06/04/2024</span> <td>30:05</td>
<span class="format-date">30/03/2024</span> <td>28:51</td>
<span class="format-date">16/03/2024</span> <td>29:32</td>
<span class="format-date">09/03/2024</span> <td>30:43</td>
<span class="format-date">07/10/2023</span> <td>30:36</td>
<span class="format-date">12/08/2023</span> <td>28:23</td>
<span class="format-date">08/07/2023</span> <td>28:40</td>
<span class="format-date">27/05/2023</span> <td>28:07</td>
<span class="format-date">25/03/2023</span> <td>27:50</td>
<span class="format-date">18/03/2023</span> <td>28:39</td>
<span class

Unnamed: 0,Date,Time
0,26/10/2024,30:38
1,05/10/2024,27:44
2,07/09/2024,28:43
3,06/07/2024,28:21
4,01/06/2024,29:06
5,06/04/2024,30:05
6,30/03/2024,28:51
7,16/03/2024,29:32
8,09/03/2024,30:43
9,07/10/2023,30:36


In [115]:
for rows in table:
    data_point = row.find_all('tr')
    print(data_point)

[<tr><th>Event</th><th>Run Date</th><th>Run Number</th><th>Pos</th><th>Time</th><th>Age<br/>Grade</th><th>PB?</th></tr>, <tr><td><a href="https://www.parkrun.org.uk/brighton/results/">Brighton &amp; Hove</a></td><td><a href="https://www.parkrun.org.uk/brighton/results/820/"><span class="format-date">26/10/2024</span></a></td><td><a href="https://www.parkrun.org.uk/brighton/results/820/">820</a></td><td>295</td><td>30:38</td><td>42.22%</td><td>
                                     
                                    </td></tr>, <tr><td><a href="https://www.parkrun.org.uk/brighton/results/">Brighton &amp; Hove</a></td><td><a href="https://www.parkrun.org.uk/brighton/results/817/"><span class="format-date">05/10/2024</span></a></td><td><a href="https://www.parkrun.org.uk/brighton/results/817/">817</a></td><td>198</td><td>27:44</td><td>46.63%</td><td>
                                     
                                    </td></tr>, <tr><td><a href="https://www.parkrun.org.uk/brighton/

In [57]:
def fetch_runner_data(parkrun_id):
    """
    Fetches parkrun data for the given id and saves as a dataframe.
    
    Parameters:
    - parkrun_id: int, the parkrunner id as an integer.

    Outputs:
    - A dataframe of that runner's stats
    """
     
    # Loop over the range of pages from x to y
    url = f'https://www.parkrun.org.uk/parkrunner/{parkrun_id}/all/'
    
    # Set up headers to avoid blocking by the website
    headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edge/110.0.1587.56',  # Updated User-Agent for newer browsers
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'DNT': '1',
        'Cache-Control': 'max-age=0',
        'TE': 'Trailers',
        'Pragma': 'no-cache',
        'Referer': 'https://www.parkrun.org.uk/',
        'Origin': 'https://www.parkrun.org.uk',
        'X-Requested-With': 'XMLHttpRequest',
        'If-None-Match': 'W/"f0b3eb46c6c7e1f04161c38a1f041f4"'
    }

    try:
        # Request the page content
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Check for any HTTP errors
        soup = BeautifulSoup(response.content, "html.parser")

        # Call the function to confirm the parkrunner details
        confirm_parkrunner(soup)
        
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")
        
    except SystemExit as e:
        print(e)  # Handle exit and display the message

    # Gets age category
    age_cat = soup.find('p').text.strip().split()[-1]

    # Converts to age (approx) and gender
    age = int(age_cat[-2:])-2
    gender = age_cat[1]

    table = soup.find_all('table')

    dates = []
    times = []
    
    for row in table[2].find_all('tr'):
        data_point = row.find_all('td')
        if len(data_point) > 4:
            date = data_point[1].find('span', class_="format-date")
            time = data_point[4]
    
            if date:
                dates.append(date.text.strip())
            if time:
                times.append(time.text.strip())
    
    date_time_df = pd.DataFrame({'Date': dates, 'Time': times})
    

    return parkrun_id

def confirm_parkrunner(soup):
    """
    Confirms the parkrunner by showing the name and asking for input.
    
    Parameters:
    - soup: BeautifulSoup object of the page.
    """
    name_list = soup.find('h2').text.strip().split()
    name = " ".join(name_list)
    name_test = input(f"Found name: {name}. Press enter to continue, input a different id to try again, or 'n' to exit: ")

    if name_test == "":
        return  # Continue with the current flow
    elif name_test.isdigit():
        new_parkrun_id = int(name_test)
        fetch_runner_data(new_parkrun_id)  # Restart with the new ID
    elif name_test.lower() == 'n':
        raise SystemExit("Process cancelled")  # Exit the entire function
    else:
        print("Invalid input. Try again.")
        confirm_parkrunner(soup)  # Recurse to retry the input

In [63]:
fetch_runner_data(parkrun_id)

Found name: Owen GEORGE (A5125087). Press enter to continue, input a different id to try again, or 'n' to exit:  5782
Found name: Jacob ZEUNER-GRIFFITHS (A5782). Press enter to continue, input a different id to try again, or 'n' to exit:  


5125087

# Manual input:

In [6]:
target_time(user_input(df), df, model_to_use, scaler_to_use)

Enter expected temperature in (°C). (Leave blank for a default value of 11.8):  5
Enter expected windspeed in km/h (Leave blank for a default value of 20.6):  12
Is it likely to rain? (y/n):  n
How many parkruns have you run previously? (Leave blank for a default value of 33.0):  28
Enter the planned parkrun date (leave blank for today, format YYYY-MM-DD):  
Enter the date of your last parkrun (format YYYY-MM-DD):  05-10-2024


Invalid date format. Please enter the date in the format YYYY-MM-DD.


Enter the date of your last parkrun (format YYYY-MM-DD):  2024-10-05
Enter the rough date you started doing parkruns (leave blank for 2022-05-13, format YYYY-MM-DD):  2018-6-6
Enter your most recent parkrun time in the form 'mins, secs' (Required):  27,44
Enter your previous PB in the form 'mins, secs'. Leave blank to use your previous time 27.7 mins:  23,51
Enter your average parkrun time in the form 'mins, secs'. Leave blank to use your previous time 27.7 mins:  28,32
Enter your age:  30
Enter your gender (m/f) or leave blank:  m



Data added successfully
Target time: 28m2s


28.032959238688154

In [147]:
print(f"{(34.0246%1) * 60 :02.0f}")

01
