## Scrapes Volksläufe from Runner's World Website

Scrapes information from every Volkslauf listed on https://www.runnersworld.de/volkslaeufe-strassenlaeufe/

In [1]:
from autoscraper import AutoScraper

In [185]:
# Labelled training examples for AutoScraper.
# Each example is a pair (event page URL, wanted values). The wanted values are
# the texts AutoScraper should learn to locate on that page, so every entry has
# to be its own list element and has to be spelled as it appears on the page.
ex1 = (
    "https://www.runnersworld.de/laufkalender/badischemeilekarlsruhe/",
    [
        "Badische Meile Karlsruhe",
        "Stadtlauf / Citylauf",
        "Volkslauf / Straßenlauf",
        "08.05.2022",
        # Postal code as it appears in the scraped result for this page
        # (76135); the previous value 76136 did not agree with it.
        "76135 Karlsruhe, Hermann-Veit-Straße 3",
        "www.badischemeile.de",
        "info@badischemeile.de",
    ],
)

ex2 = (
    "https://www.runnersworld.de/laufkalender/placentia-half-marathon/",
    [
        # The comma after the title matters: without it Python silently joins
        # the title and the first category into a single string.
        "Placentia Half Marathon",
        "Stadtlauf / Citylauf",
        "Volkslauf / Straßenlauf",
        "08.05.2022",
        "03.05.2022",
        "IT 29121 Piacenza, Palazzo Farnese",
        "www.placentiahalfmarathon.org",
        "info@placentiahalfmarathon.it",
    ],
)

ex3 = (
    "https://www.runnersworld.de/laufkalender/wings-for-life-world-run-muenchen/",
    [
        "Wings for Life World Run München",
        "Volkslauf / Straßenlauf",
        "Landschaftslauf",
        "Spendenlauf / Charity-Lauf",
        "08.05.2022",
        "80809 München, Hans-Jochen-Vogel-Platz (Olympiapark/Nord)",
        "www.wingsforlifeworldrun.com/de/locations/munich",
    ],
)

ex4 = (
    "https://www.runnersworld.de/laufkalender/lisbon-eco-marathon/",
    [
        "Lisbon Eco Marathon: abgesagt",
        "Stadtlauf / Citylauf",
        "Volkslauf / Straßenlauf",
        "08.05.2022",
        "PT 1070-051 Lissabon, Parque Eduardo VII de Inglaterra",
        "maratonadelisboa.pt",
        "lisbonecomarathon@clubechronos.com",
    ],
)

# All examples, in the order in which they are fed to the scraper.
data_train = [
    ex1,
    ex2,
    ex3,
    ex4,
]

In [86]:
# Fit one AutoScraper on all labelled example pages.
# update=True is passed on every build() call so that the rules learned from
# earlier examples are kept (see the AutoScraper documentation).
scraper = AutoScraper()
for url, wanted_list in data_train:
    scraper.build(url=url, wanted_list=wanted_list, update=True)

In [43]:
# create a scraper for the tables...autoscraper can't handle varying tables
import requests
import csv
from bs4 import BeautifulSoup

# Try the table extraction on a single event page.
# Alternative pages used while experimenting:
# url = "https://www.runnersworld.de/laufkalender/gutenbergmarathonmainz/"
# url = "https://www.runnersworld.de/laufkalender/lissabonhalbmarathon/"
url = "https://www.runnersworld.de/laufkalender/wings-for-life-world-run-muenchen/"
response = requests.get(url)
html = response.content
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(html, "html.parser")
table = soup.find('table')

list_rows = []  # one inner list of cell texts per table row
# find() returns None when the page contains no <table>; keep the result empty
# in that case instead of failing on None.
if table is not None:
    # [1:] skips the first row (presumably the header row)
    for row in table.find_all('tr')[1:]:
        list_cells = []  # cell texts of the current row
        for cell in row.find_all('td'):
            # BeautifulSoup already decodes the &nbsp; entity to U+00A0, so the
            # character itself has to be removed; the literal "&nbsp;" is still
            # removed in case it shows up as plain text.
            text = cell.text.replace('&nbsp;', '').replace('\xa0', '')
            list_cells.append(text)
        list_rows.append(list_cells)

list_rows

[['möglichst viele km bis zum Kontakt mit Catcher Car',
  '13.00 Uhr',
  '49,00 - 59,00 Euro']]

In [95]:
def scrape_table(url: str) -> list:
    """
    Extract the first table from a runnersworld.de event page.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup.
    The first ``<tr>`` of the table is skipped (presumably the header row);
    for every remaining row the texts of its ``<td>`` cells are collected.

    Args:
        url: Address of the event page to download.

    Returns:
        A list of lists of strings. Each inner list corresponds to one row
        of the table. E.g. a table with 2 rows and 3 cols will yield:
        X = [[x11, x12, x13],
             [x21, x22, x23]]
        An empty list is returned when the page contains no table.

    Raises:
        requests.RequestException: if the page cannot be downloaded.
    """
    response = requests.get(url)
    html = response.content
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results can differ
    # between machines.
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find('table')
    if table is None:
        # Not every page has a table; report "no rows" instead of crashing
        # with an AttributeError on None.
        return []

    list_rows = []  # one inner list per table row
    for row in table.find_all('tr')[1:]:
        list_cells = []  # cell texts of the current row
        for cell in row.find_all('td'):
            # BeautifulSoup already decodes the &nbsp; entity to U+00A0, so
            # the character itself has to be removed; the literal "&nbsp;"
            # is still removed in case it shows up as plain text.
            text = cell.text.replace('&nbsp;', '').replace('\xa0', '')
            list_cells.append(text)
        list_rows.append(list_cells)

    return list_rows

In [82]:
# Extract the links of the runs from the overview page
# https://www.runnersworld.de/volkslaeufe-strassenlaeufe/
url = "https://www.runnersworld.de/volkslaeufe-strassenlaeufe/"
response = requests.get(url)
html = response.content
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(html, "html.parser")

links = []
for link in soup.find_all('a'):
    href = link.get('href')
    # anchors without href, and links that do not point into the race
    # calendar, are of no interest
    if href is None or "laufkalender" not in href:
        continue
    # clean: drop links containing "laufeintragen" and the bare calendar
    # root, which is not an individual run page
    if "laufeintragen" in href:
        continue
    if href == 'https://www.runnersworld.de/laufkalender/':
        continue
    links.append(href)
links

['https://www.runnersworld.de/laufkalender/altstadtlaufweinheim/',
 'https://www.runnersworld.de/laufkalender/lissabonhalbmarathon/',
 'https://www.runnersworld.de/laufkalender/seemeilebremerhaven/',
 'https://www.runnersworld.de/laufkalender/badischemeilekarlsruhe/',
 'https://www.runnersworld.de/laufkalender/20-km-de-lausanne/',
 'https://www.runnersworld.de/laufkalender/neheimer-citylauf-arnsberg/',
 'https://www.runnersworld.de/laufkalender/wings-for-life-world-run-schweiz/',
 'https://www.runnersworld.de/laufkalender/saarbrueckener-city-lauf/',
 'https://www.runnersworld.de/laufkalender/lisbon-eco-marathon/',
 'https://www.runnersworld.de/laufkalender/wings-for-life-world-run-muenchen/',
 'https://www.runnersworld.de/laufkalender/heilbronnertrollingermarathon/',
 'https://www.runnersworld.de/laufkalender/duathlon-krailling/',
 'https://www.runnersworld.de/laufkalender/rahlstedterwandselaufhamburg/',
 'https://www.runnersworld.de/laufkalender/barcelona-marathon/',
 'https://www.run

In [84]:
# # remove everything after the 5th "/"
# import re
# pattern = ".+"
# test_string = 'https://www.runnersworld.de/laufkalender/salzburger-frauenlauf/'
# ls = test_string.split("/")
# clean_string = "/".join(ls[:5])
# clean_string    

'https://www.runnersworld.de/laufkalender/salzburger-frauenlauf'

In [96]:
# Visit every collected run page and store both extraction results under its
# URL: position 0 holds the AutoScraper result, position 1 the rows returned
# by scrape_table.
volkslaeufe = {}
for url in links:
    info = scraper.get_result_similar(url)
    tbl = scrape_table(url)
    volkslaeufe.update({url: [info, tbl]})

In [177]:
# clean the extracted information
def remove_multiple_titles(autoscraper_result):
    """
    Remove repeated variants of the run title from an AutoScraper result.

    Sometimes the title of the run is captured 2 or 3 times in different
    variations, directly after the first entry:
    # example 1:
        # 'Bitterfelder Goitzsche-Marathon',
        # 'Goitzsche-Marathon Bitterfeld',
        # ': Goitzsche-Marathon Bitterfeld',
    # example 2
        # 'Seemeile Bremerhaven',
        # ': Seemeile Bremerhaven',
    # example 3
        # 'Wings for Life World Run Schweiz',
        # ': Wings for Life World Run Schweiz',
    # example 4
        # 'Heilbronner Trollinger-Marathon',
        # ': Heilbronner Trollinger-Marathon'

    The first entry is treated as the title. Its whitespace-separated words
    are tested one after another: if a word occurs in the third entry, the
    second and third entries are removed; otherwise, if it occurs in the
    second entry, only the second entry is removed. The search stops at the
    first hit.

    NOTE(review): matching is a plain substring test, so a title word that
    also occurs in a genuine following entry still removes that entry.

    Args:
        autoscraper_result: List of strings whose first entry is the title.
            The list is modified in place.

    Returns:
        The same list object, with the duplicate title entries removed.
        Empty lists and lists with a single entry are returned unchanged.
    """
    x = autoscraper_result
    if not x:
        # nothing scraped: there is no title to compare against
        return x
    for sub in x[0].split():
        # Tokens made only of punctuation ("/", "-", ":") also occur in
        # unrelated entries such as "Stadtlauf / Citylauf"; matching on them
        # would delete real data, so they are skipped.
        if not any(ch.isalnum() for ch in sub):
            continue
        # Length checks keep short results from raising IndexError.
        if len(x) > 2 and sub in x[2]:
            x.pop(2)
            x.pop(1)
            break
        if len(x) > 1 and sub in x[1]:
            x.pop(1)
            break
    return x

# Post-process every scraped entry: drop the first "StatischeSeiten" item if
# there is one, then remove the repeated title variants.
for key, val in volkslaeufe.items():
    autoscraper_result = val[0]
    try:
        autoscraper_result.remove("StatischeSeiten")
    except ValueError:
        pass  # this entry does not contain the marker
    # val is the list stored under key, so this writes back into volkslaeufe
    val[0] = remove_multiple_titles(autoscraper_result)

In [184]:
# Spot check: display one randomly chosen scraped entry.
# The standard-library random module is used because numpy (np) is not
# imported anywhere in this notebook.
import random

i = random.randrange(len(volkslaeufe))
volkslaeufe[list(volkslaeufe)[i]]

[['Badische Meile Karlsruhe',
  'Stadtlauf / Citylauf',
  'Volkslauf / Straßenlauf',
  '08.05.2022',
  '76135 Karlsruhe, Hermann-Veit-Straße 3',
  'www.badischemeile.de',
  'info@badischemeile.de'],
 [['8,888 km ', '10.30 Uhr', '22,00 - 25,00 Euro']]]