<a href="https://colab.research.google.com/github/lweislo/cycling-project/blob/master/OneDayRaceData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import requests
import html5lib
import pandas as pd
from bs4 import BeautifulSoup
import os
import numpy as np
import datetime
from google.colab import files
import re

In [0]:
# Seasons of interest: 1990 through 2019 inclusive (np.arange excludes the stop value)
years = np.arange(1990, 2020)
years.size

In [0]:
# Get the URLs of past editions from the race's history page
race_name = 'Tour of Flanders'
race_page = "https://www.procyclingstats.com/race/ronde-van-vlaanderen/1990/history"
# A timeout keeps the cell from hanging forever if the site never answers
response = requests.get(race_page, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page
response.raise_for_status()
page = response.content

soup = BeautifulSoup(page, "html5lib")
div = soup.find('table', class_="basic")
if div is None:
    # Without this check the next line fails with an opaque AttributeError on None
    raise ValueError(f"No <table class='basic'> found on {race_page}; the page layout may have changed.")
links = div.find_all('a')

In [0]:
# Generate a list of all the race urls
site_root = "https://www.procyclingstats.com/"
urls = []
for link in links:
    href = link.get('href')
    # <a> tags without an href yield None, which cannot be searched with `in`
    if href and 'race' in href:
        # The matching hrefs are joined to the site root; prefixing ".../race/" again
        # produced a doubled "race/race/" path segment.
        urls.append(site_root + href.lstrip('/'))

In [0]:
# Display the collected edition URLs for inspection
urls

In [0]:
# Editions to scrape, newest first (2019 down to 1990).
# Narrow the range below to trim the list to the years you want.
# NOTE(review): the path contains a doubled "race/race/" segment — confirm the site resolves it.
url_list = [
    f'https://www.procyclingstats.com/race/race/ronde-van-vlaanderen/{edition}'
    for edition in range(2019, 1989, -1)
]

In [0]:
def get_data(new_url):
    """Scrape the overview facts for one race edition.

    Parameters
    ----------
    new_url : str
        URL of the results page of a single edition.

    Returns
    -------
    dict or None
        ``None`` when the page does not answer with HTTP 200. Otherwise a dict
        with the keys 'Race', 'Date', 'Year', 'Distance', 'Speed' and 'Time'.
        Parsing stops at the first element that cannot be found; every field
        not parsed by then is left as NaN.

    Notes
    -----
    The 'Race' value is read from the module-level ``race_name``.
    """
    # A timeout keeps one unresponsive page from stalling the whole scrape
    page = requests.get(new_url, timeout=30)
    if page.status_code != 200:
        return None

    soup = BeautifulSoup(page.content, "html5lib")

    # Start every field as "missing" so a partially parsed page still yields a row
    year = distance = speed = date = tm = np.nan

    try:
        # Race year has its own class
        year = int(soup.find('span', class_="year").text)

        # Distance sits in the subheader as "(<number>k...)"; it is optional
        distance_tag = soup.find('span', class_="red distance")
        if distance_tag:
            distance = float(distance_tag.text.split('(')[1].split(')')[0].split('k')[0])

        # Winning time: first data row (index 1, after the header) of the first table
        first_table_rows = soup.find_all('table')[0].find_all('tr')
        tm = first_table_rows[1].find(class_='timeff').text

        # Other information is in a sidebar to the right, as <b>label</b>value pairs
        sidebar = soup.find('div', class_="res-right")
        for label in sidebar.find_all("b"):
            if 'Date' in label.text:
                date = label.next_sibling
            elif 'speed' in label.text:
                speed = float(label.next_sibling.split(' ')[1])
    except (AttributeError, IndexError, ValueError, TypeError) as e:
        # Missing tags surface as AttributeError on None, short tables as IndexError,
        # and malformed numbers as ValueError; keep what was parsed so far.
        print(f"Could not fully parse {new_url}: {e!r}")

    return {'Race': race_name, 'Date': date, 'Year': year, 'Distance': distance, 'Speed': speed, 'Time': tm}

In [0]:
# First get the overview data for all years for the race
overview_columns = ['Race', 'Date', 'Year', 'Distance', 'Speed', 'Time']

# Collect one record per edition and build the frame once:
# DataFrame.append was removed in pandas 2.0 and was quadratic when used in a loop.
overview_records = []
for i in url_list:
    if not i:
        continue
    print(i)
    data = get_data(i)
    # get_data returns None when the page could not be fetched
    if data is not None:
        overview_records.append(data)

df = pd.DataFrame(overview_records, columns=overview_columns)

In [0]:
# Export overview data
outfile = f'{race_name}_data.csv'
# Let pandas write the file itself: to_csv() without a path returns a str and
# ignores `encoding`, so the file used to be written in the platform default.
df.to_csv(outfile, header=True, index=False, encoding='Latin-1')

files.download(outfile)

In [0]:
def pcscrape(stage_url):
    """Scrape the full results table of one race edition.

    Parameters
    ----------
    stage_url : str
        URL of the results page; its last path segment is stored as 'Year'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Rank', 'Rider', 'Nation', 'Age', 'Team', 'Time', 'Year' and
        'Speed'. 'Time' is post-processed by ``fixtimes`` and 'Speed' is
        ``distance / Time * 3600``. An empty frame with these columns is
        returned when the page cannot be fetched or parsed.
    """
    # Local import: newer pandas wants literal HTML wrapped in a file-like object
    from io import StringIO

    result_columns = ['Rank', 'Rider', 'Nation', 'Age', 'Team', 'Time', 'Year', 'Speed']
    year = stage_url.split('/')[-1]

    # A timeout keeps one unresponsive page from stalling the whole scrape
    page = requests.get(stage_url, timeout=30)
    if page.status_code != 200:
        print(f"Could not fetch {stage_url} (HTTP {page.status_code}).")
        return pd.DataFrame(columns=result_columns)

    soup = BeautifulSoup(page.content, "html5lib")
    # Stays NaN when the page shows no distance, which makes 'Speed' NaN as well
    distance = np.nan

    try:
        div = soup.find(class_="resultCont")
        # Blank out the 'timeff' and 'teammob' spans so their text does not
        # end up in the parsed table cells
        for match in soup.find_all('span', class_='timeff'):
            match.replace_with('')
        for match in soup.find_all('span', class_='teammob'):
            match.replace_with('')
        # Put a comma after the upper-case part of each name so the rider
        # string can be split on ', ' and reordered below
        for match in soup.find_all('span', class_='uppercase'):
            match.append(',')

        distance_tag = soup.find('span', class_="red distance")
        if distance_tag:
            distance = float(distance_tag.text.split('(')[1].split(')')[0].split('k')[0])

        # Nationality is stored as a data attribute on each row; skip the header row
        table_rows = div.find_all('table')[0].find_all('tr')
        nations = [row.attrs.get('data-nation') for row in table_rows][1:]

        # Now get the data table but only columns we want
        results = pd.read_html(StringIO(str(div)))[0]
        res_df = pd.DataFrame()
        res_df['Rank'] = results['Rnk']
        res_df['Rider'] = results['Rider'].str.split(', ').str[::-1].str.join(' ')
        res_df['Nation'] = nations
        res_df['Age'] = results['Age']
        res_df['Team'] = results['Team']
        res_df['Time'] = results['Time']
        res_df['Year'] = year
    except (AttributeError, IndexError, KeyError, ValueError) as e:
        # Missing containers, tables or columns, or a row-count mismatch
        print(f"That does not appear to be a valid results URL. {e}")
        return pd.DataFrame(columns=result_columns)

    print(res_df.head(1))
    res_df = fixtimes(res_df)
    # Coerce to numbers first so any leftover non-numeric cell becomes NaN
    finish_seconds = pd.to_numeric(res_df['Time'], errors='coerce')
    res_df['Speed'] = distance / finish_seconds * 3600
    return res_df

In [0]:
def _time_to_seconds(value):
    """Turn one raw 'Time' cell into whole seconds, or NaN when it holds no clock time."""
    if not isinstance(value, str):
        # Already numeric values pass through unchanged; missing values become NaN
        return np.nan if pd.isna(value) else value
    if ':' not in value:
        # Placeholders such as '-' carry no time
        return np.nan
    try:
        fields = [int(part) for part in value.split(':')[:3]]
    except ValueError:
        return np.nan
    if len(fields) == 2:
        # 'm:ss' has no hours component
        fields.insert(0, 0)
    hours, minutes, seconds = fields
    return int(datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds).total_seconds())


def fixtimes(df19):
    """Convert the 'Time' column of a results table to seconds.

    Steps, in order:

    1. A 0 or missing 'Time' takes the raw value of the closest row above that
       has one (presumably these mark "same time as the rider above" — confirm).
    2. 'h:mm:ss' and 'm:ss' strings are converted to whole seconds; any other
       string (e.g. '-') becomes NaN.
    3. The first row's value is treated as the reference time; every value
       smaller than it is taken to be a gap and has the reference added.

    Only 'Time' is forward-filled, so missing values in other columns are not
    overwritten with the previous rider's data.

    Parameters
    ----------
    df19 : pandas.DataFrame
        Results table with a 'Time' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of the input whose 'Time' column (object dtype) holds ints or NaN.
    """
    fixed = df19.copy()
    if fixed.empty:
        # Nothing to convert, and there is no first row to use as reference
        return fixed

    carried = np.nan
    seconds = []
    for raw in fixed['Time'].tolist():
        if pd.isna(raw) or raw == 0:
            raw = carried
        else:
            carried = raw
        seconds.append(_time_to_seconds(raw))

    # Positional lookup of the first row, independent of the index labels
    reference = seconds[0]
    # Comparisons with NaN are False, so missing times stay missing
    absolute = [t + reference if t < reference else t for t in seconds]
    # Object dtype keeps the whole seconds as ints next to NaN
    fixed['Time'] = np.array(absolute, dtype=object)
    return fixed

In [0]:
# Scrape every edition and stack the per-year result tables into one frame.
# The frames are collected in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0 and was quadratic when used in a loop.
# (The former `df_temp.replace(...).ffill()` line discarded its result, so it
# had no effect and is dropped.)
result_frames = []
for url in url_list:
    df_temp = pcscrape(url)
    # Skip editions that produced no rows
    if df_temp is not None and not df_temp.empty:
        result_frames.append(df_temp)

if result_frames:
    df_master = pd.concat(result_frames, ignore_index=True)
else:
    df_master = pd.DataFrame(columns=['Rank', 'Rider', 'Nation','Age','Team', 'Time','Year'])


In [0]:
# # Test
# df19 = pcscrape('https://www.procyclingstats.com/race/race/gent-wevelgem/2019')
# df19.head()

In [0]:
# Export all results data
outfile = f'{race_name}_results.csv'
# Let pandas write the file itself: to_csv() without a path returns a str and
# ignores `encoding`, so the file's encoding depended on the platform default.
df_master.to_csv(outfile, header=True, index=False, encoding='UTF-8')

files.download(outfile)

In [0]:
# Preview the first 50 rows of the combined results
df_master.head(50)