<a href="https://colab.research.google.com/github/lweislo/CNResults/blob/master/ASO_results_scrape.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook scrapes results directly from any race website that uses the LeTour.fr (ASO) results format.


In [0]:
import requests
import html5lib
import pandas as pd
from bs4 import BeautifulSoup
import os
import json
import numpy as np
import datetime
from google.colab import files

In [0]:
# Shared lookup tables and accumulators used by every ASO race scrape.

# Classification code (as used in the site's result URLs) -> heading written to the output.
tab_dict = dict(
    ite='Stage',
    ipe='Points',
    ime='Mountains',
    ije='Young riders',
    ice='Combativity',
    ete='Teams',
    itg='General Classification',
    ipg='Points Classification',
    img='Mountains Classification',
    ijg='Young Riders Classification',
    icg='Combativity Classification',
    etg='Teams Classification',
)

# Order in which classifications are processed and written out.
# Note that 'icg' comes before 'ijg' here, unlike the key order of tab_dict.
order_list = [
    'ite', 'ipe', 'ime', 'ije', 'ice', 'ete',
    'itg', 'ipg', 'img', 'icg', 'ijg', 'etg',
]

# Stage-level classification codes for points and mountains.
points_stg = ['ipe', 'ime']

# Module-level accumulators: section headings and their result tables, kept index-aligned.
output_labels = []
output_tables = []


In [0]:
def output_file(header_list, table_list):
    """Write every heading and its table to one CSV file, then download it.

    The user is prompted for the output filename. For each position in
    ``header_list`` a blank line, the heading, and the matching table from
    ``table_list`` (without header row or index) are written. Finally the file
    is handed to ``files.download``.

    Args:
        header_list: Section headings, one per table.
        table_list: DataFrames, index-aligned with ``header_list``.
    """
    outfile = input("Enter your desired CSV filename: ")
    with open(outfile, 'w') as file:
        for position, header in enumerate(header_list):
            try:
                file.write(f'\n{header}\n')
                # NOTE(review): to_csv is called without a path and so returns a str;
                # the encoding argument presumably has no effect on that string and the
                # file is written in open()'s default encoding. Confirm whether Latin-1
                # output was actually intended.
                file.write(table_list[position].to_csv(header=False, index=False, encoding='Latin-1'))
            except Exception as err:
                # Stay best-effort (one bad section must not abort the export), but
                # report the problem instead of silently dropping it.
                print(f"Skipping section {position} ({header!r}): {err!r}")
    files.download(outfile)

In [0]:
# This cell scrapes a stage results page and extracts the raw result URLs, keyed by classification code

def scrape_urls(stage_url):
    """Find the per-classification result URLs on a stage page and scrape each one.

    The stage page is fetched and the ``tabs__link`` elements inside the
    ``ranking__tabs`` element are read. Each carries a ``data-ajax-stack``
    attribute holding a JSON-like mapping of classification code to relative
    URL. Every code in ``order_list`` that has a URL is then passed to
    ``scrape_results``, which appends to the module-level output lists.

    Args:
        stage_url: Full URL of a stage results page, including the scheme.

    Returns:
        The module-level ``(output_labels, output_tables)`` lists. They are
        returned unchanged when the page cannot be fetched or does not contain
        the expected tabs.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from urllib.parse import urlparse

    base_url = urlparse(stage_url).netloc  # Race host name
    url_dict = {}  # Classification code -> full result URL
    print(f"Getting result links for {stage_url}")

    if not base_url:
        print("That does not appear to be a valid results URL. No host name found.")
        return output_labels, output_tables

    page = requests.get(stage_url)
    if page.status_code != 200:
        print(f"That does not appear to be a valid results URL. HTTP status {page.status_code}")
        return output_labels, output_tables

    soup = BeautifulSoup(page.content, "html5lib")

    # Pull out the block holding the coded URLs. YMMV if ASO change the format!
    all_links = soup.find(class_="ranking__tabs")  # A UL tag above the table on the page
    if all_links is None:
        print("That does not appear to be a valid results URL. No ranking tabs found.")
        return output_labels, output_tables
    links = all_links.find_all(class_="tabs__link")  # Stage and GC tabs

    for link in links:
        raw_stack = link.get('data-ajax-stack')
        if not raw_stack:
            # Tab without coded URLs: nothing to extract.
            continue
        # Clean the attribute into parseable JSON: single -> double quotes, unescape slashes.
        cleaned = raw_stack.replace('\'', '"').replace('\\/', '/')
        try:
            stack = json.loads(cleaned)
        except ValueError as err:  # json.JSONDecodeError is a ValueError subclass
            print(f"Could not parse result links from tab: {err}")
            continue
        for key, value in stack.items():
            url_dict[key] = f"https://{base_url}{value}"

    # IMPORTANT: process in the order of the universal result order list.
    for code in order_list:
        if code not in url_dict:
            print(f"No results for {tab_dict[code]}")
            continue
        print(f"Scraping results from {tab_dict[code]}")
        try:
            scrape_results(code, url_dict[code])
        except KeyError as err:
            # A KeyError here comes from the table parsing (e.g. a missing column),
            # not from a missing URL, so report it as such and carry on.
            print(f"Unexpected table layout for {tab_dict[code]}: missing {err}")
    return output_labels, output_tables

In [0]:

def _format_times(times):
    """Rewrite time strings by turning 'h ' and "' " into ':' and dropping "''"."""
    return (
        times.str.replace('h ', ':', regex=False)
        .str.replace('\'\'', '', regex=False)
        .str.replace('\' ', ':', regex=False)
    )


def scrape_results(item, url):
    """Fetch one classification page and append its heading(s) and table(s) to the output lists.

    Stage points ('ipe') and mountains ('ime') pages hold several captioned
    tables; each becomes its own section. Every other code yields a single
    section headed by ``tab_dict[item]``. A heading and its table are always
    appended to ``output_labels`` / ``output_tables`` together, so the two
    lists stay index-aligned even when a table fails to parse.

    Args:
        item: Classification code (a key of ``tab_dict``).
        url: Full URL of the result page for that code.

    Raises:
        KeyError: If a parsed table lacks an expected column.
    """
    # Local import so this cell does not depend on an edit to the import cell.
    from io import StringIO

    print(f"Getting results for {item}, {url}")
    try:
        page = requests.get(url).content
        soup = BeautifulSoup(page, "html5lib")

        # Sprint and mountains stage results have multiple tables per page.
        if item in ('ipe', 'ime'):
            name = "Sprint" if item == 'ipe' else "Mountain"
            print(f"Found a {name}")
            mydiv = soup.find(class_="tabs__content")
            if mydiv is None:
                print("No table found for " + item)
                return
            captions = mydiv.find_all(class_="rankingTables__caption")
            tables = mydiv.find_all(class_="rankingTable")

            # Build the headings first; enumerate gives each caption its own number
            # even when two captions have identical text.
            headings = []
            for number, caption in enumerate(captions, start=1):
                heading = caption.text.title()
                heading = heading.replace(" Km", "").replace(" - ", " km. ")
                heading = f"{name} {number} - {heading}"
                print(heading)
                headings.append(heading)

            if len(headings) > len(tables):
                print(f"Found {len(headings)} captions but only {len(tables)} tables for {item}")

            for position, table_tag in enumerate(tables):
                # Fall back to a bare numbered heading if a table has no caption.
                if position < len(headings):
                    heading = headings[position]
                else:
                    heading = f"{name} {position + 1}"
                try:
                    # Wrap the markup in StringIO: passing literal HTML strings to
                    # read_html is deprecated in recent pandas.
                    table = pd.read_html(StringIO(str(table_tag)), index_col=None, header=None)[0]
                    out_df = pd.DataFrame()
                    out_df['Place'] = table['Rank']
                    out_df['Bib'] = table['Rider No.'].fillna('')
                    # Strip the whitespace left behind once 'PTS' is removed.
                    out_df['Points'] = table['Points'].str.replace('PTS', '', regex=False).str.strip()
                except ValueError:
                    print(f"No table found for {heading}")
                    continue
                print(out_df)
                output_labels.append(heading)
                output_tables.append(out_df)
        else:
            # Fish the tables out of the soup
            df = pd.read_html(page)[0]
            label = tab_dict[item]
            out_df = pd.DataFrame()
            if item in ['ite', 'ije', 'itg', 'ijg']:
                out_df['Place'] = df['Rank'].fillna('')
                out_df['Bib'] = df['Rider No.'].fillna('')
                out_df['Result'] = _format_times(df['Times'])
            elif item in ['ipg', 'img']:
                out_df['Place'] = df['Rank'].fillna('')
                out_df['Bib'] = df['Rider No.'].fillna('')
                out_df['Result'] = df['Points'].str.replace(' PTS', '', regex=False)
            elif item == 'ice':
                out_df['Place'] = df['Rank'].fillna('')
                out_df['Bib'] = df['Rider No.'].fillna('')
                out_df['Result'] = ''
            elif item in ['ete', 'etg']:
                out_df['Place'] = df['Rank'].fillna('')
                out_df['Bib'] = df['Team'].str.title().fillna('')
                out_df['Result'] = _format_times(df['Times'])
            # Any other code (e.g. 'icg') has no column mapping above, so an empty
            # table is emitted under its heading.
            # NOTE(review): confirm whether 'icg' should share the 'ice' layout.

            # Append heading and table together, only after the table was built, so a
            # failure above cannot leave a heading without its table.
            output_labels.append(label)
            output_tables.append(out_df)
    except ValueError:
        print("No table found for " + item)
        

In [0]:
def start():
    """Prompt for a stage results URL, scrape it, and export the tables to CSV.

    The module-level ``output_labels`` / ``output_tables`` lists are emptied in
    place first, so re-running this function in the same kernel does not append
    to (and re-export) a previous run's results.
    """
    output_labels.clear()
    output_tables.clear()
    # Strip whitespace that tends to come along when a URL is pasted.
    stage_url = input("Enter the URL to LeTour results eg. https://www.tour-de-yorkshire.co.uk/en/rankings/stage-2:").strip()
    scrape_urls(stage_url)
    output_file(output_labels, output_tables)

In [7]:
# Run the scraper: prompts for a stage results URL, then for the CSV filename to write.
start()

Enter the URL to LeTour results eg. https://www.tour-de-yorkshire.co.uk/en/rankings/stage-2:https://www.tour-de-yorkshire.co.uk/en/rankings/stage-2
Getting result links for https://www.tour-de-yorkshire.co.uk/en/rankings/stage-2
Scraping results from Stage
Getting results for ite, https://www.tour-de-yorkshire.co.uk/en/ajax/ranking/2/ite/6f21d73eddddbb74e35c0113a43935ca/none
Scraping results from Points
Getting results for ipe, https://www.tour-de-yorkshire.co.uk/en/ajax/ranking/2/ipe/b7b8a6ce4bba7b1390561280c900d179/none
Found a Sprint
Sprint 1 - Pontefract km. 20.5
Sprint 2 - A61 Harrogate km. 84.5
Sprint 3 - Bedale km. 132
   Place  Bib Points
0      1  121     5 
1      2  145     3 
2      3  165     1 
   Place  Bib Points
0      1  145     5 
1      2  165     3 
2      3  121     1 
   Place  Bib Points
0      1   45    15 
1      2  136    12 
2      3   16     9 
3      4  126     7 
4      5  181     6 
5      6  155     5 
6      7   87     4 
7      8  115     3 
8      9 