# World Surf League Scores Web Scraper

## Imports

In [41]:
import pandas as pd
import numpy as np
from requests import get
import json
from bs4 import BeautifulSoup
import time



## World Surf League Scores

I'm going to focus on the Pipeline Masters first. It is the marquee event of the Championship Tour and is run every year (**verify** — note that 2020 is excluded further down because all events were cancelled).

### Get HTML soup and review

Use `requests` to download, and Beautiful Soup to parse, the HTML of World Surf League event pages. Each page shows the final result and a 'carousel' that links to the previous rounds.

In [42]:
# Starting URL: the 2008 Billabong Pipeline Masters event page
pipe_2008_url = "https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters"
# Download the page; `res.content` is the raw response body
res = get(pipe_2008_url)
# Parse the body. No parser is named, so Beautiful Soup picks one of the installed parsers itself.
soup = BeautifulSoup(res.content)


From inspecting the page source, I found an unordered list containing the URLs needed to get the scores for every round of an event.

In [43]:
# Find all list items within the event rounds div.
# `soup.find` returns None when nothing matches, so this line raises AttributeError
# on a page that has no div with id="event-rounds".
carousel = soup.find('div', id='event-rounds').find_all('li')

In [44]:
# Eyeball the anchors inside each carousel item: round urls and round names/numbers
for carousel_item in carousel:
    print(carousel_item.find_all('a'))

[<a class="next" data-no-scroll="true" data-request-name="eventRound" href="/events/2008/mct/75/billabong-pipeline-masters?roundId=4">Round 2</a>]
[<a class="previous" data-no-scroll="true" data-request-name="eventRound" href="/events/2008/mct/75/billabong-pipeline-masters?roundId=3">Round 1</a>, <a class="next" data-no-scroll="true" data-request-name="eventRound" href="/events/2008/mct/75/billabong-pipeline-masters?roundId=5">Round 3</a>]
[<a class="previous" data-no-scroll="true" data-request-name="eventRound" href="/events/2008/mct/75/billabong-pipeline-masters?roundId=4">Round 2</a>, <a class="next" data-no-scroll="true" data-request-name="eventRound" href="/events/2008/mct/75/billabong-pipeline-masters?roundId=6">Round 4</a>]
[<a class="previous" data-no-scroll="true" data-request-name="eventRound" href="/events/2008/mct/75/billabong-pipeline-masters?roundId=5">Round 3</a>, <a class="next" data-no-scroll="true" data-request-name="eventRound" href="/events/2008/mct/75/billabong-pip

In [45]:
# Collect one {'round', 'url'} record per carousel link (duplicates are removed later)
round_urls = []
for carousel_item in carousel:
    for anchor in carousel_item.find_all('a'):
        print(anchor['href'], anchor.text)
        round_urls.append(
            {
                'round': anchor.text,
                # Prefix the site root so the url can be requested directly later
                'url': "https://www.worldsurfleague.com" + anchor['href'],
            }
        )

/events/2008/mct/75/billabong-pipeline-masters?roundId=4 Round 2
/events/2008/mct/75/billabong-pipeline-masters?roundId=3 Round 1
/events/2008/mct/75/billabong-pipeline-masters?roundId=5 Round 3
/events/2008/mct/75/billabong-pipeline-masters?roundId=4 Round 2
/events/2008/mct/75/billabong-pipeline-masters?roundId=6 Round 4
/events/2008/mct/75/billabong-pipeline-masters?roundId=5 Round 3
/events/2008/mct/75/billabong-pipeline-masters?roundId=7 Quarterfinal
/events/2008/mct/75/billabong-pipeline-masters?roundId=6 Round 4
/events/2008/mct/75/billabong-pipeline-masters?roundId=52 Semifinal
/events/2008/mct/75/billabong-pipeline-masters?roundId=7 Quarterfinal
/events/2008/mct/75/billabong-pipeline-masters?roundId=77 Final
/events/2008/mct/75/billabong-pipeline-masters?roundId=52 Semifinal


In [46]:
# With help from https://stackoverflow.com/questions/11092511/list-of-unique-dictionaries
# Keep only unique values: keying each record by its url collapses repeated urls
# into a single entry (a later duplicate replaces the earlier record as the value).
round_urls_unique = list({unique['url']:unique for unique in round_urls}.values())
# Show the de-duplicated list as the cell output
round_urls_unique

[{'round': 'Round 2',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId=4'},
 {'round': 'Round 1',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId=3'},
 {'round': 'Round 3',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId=5'},
 {'round': 'Round 4',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId=6'},
 {'round': 'Quarterfinal',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId=7'},
 {'round': 'Semifinal',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId=52'},
 {'round': 'Final',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId=77'}]

### Get scores for every round in an event

In [47]:
def _heat_name(athlete_tag):
    """Return the heat label for an athlete element, or None if the markup is missing.

    The label is read from the "new-heat-hd-name" span inside the element that
    immediately precedes the athlete's enclosing "bd new-heat-bd" div.
    """
    heat_body = athlete_tag.find_parent("div", class_="bd new-heat-bd")
    if heat_body is None:
        return None
    heat_header = heat_body.find_previous_sibling()
    if heat_header is None:
        return None
    label = heat_header.find("span", class_="new-heat-hd-name")
    return label.text if label is not None else None


# Get scores function takes the list of round url dicts and gets scores for each round of the event
def get_scores(url_list_of_dicts, event_year, event_name, delay=1):
    """Scrape the per-wave scores of every athlete in every round of one event.

    Parameters
    ----------
    url_list_of_dicts : list of dict
        One dict per round, each with a 'round' key (round name) and a 'url'
        key (page to request).
    event_year, event_name
        Copied unchanged into the 'year' and 'event' fields of every row.
    delay : int or float, default 1
        Seconds to sleep after each round page has been processed.

    Returns
    -------
    list of dict
        One dict per athlete element found, with keys 'year', 'event', 'round',
        'heat', 'name' and 'scores' (list of floats). 'heat' is None when the
        heat header cannot be located in the page markup.

    Raises
    ------
    IndexError
        If a page contains fewer score grids than athlete names; athletes and
        grids are paired by position.
    """
    data = []
    # Iterate through event rounds (named round_info so the builtin `round` stays usable)
    for round_info in url_list_of_dicts:
        # Get html soup
        res = get(round_info['url'])
        round_soup = BeautifulSoup(res.content)

        # Athlete name elements and score grids, paired below by position
        athletes = round_soup.find_all("span", class_="athlete-name")
        scores = round_soup.find_all("div", class_="all-waves all-waves-grid")

        for i, athlete in enumerate(athletes):
            wave_scores = []
            for wave in scores[i].find_all("span", class_="score"):
                try:
                    wave_scores.append(float(wave.text))
                except ValueError:
                    # Score cell whose text is not a number: skip it
                    continue

            data.append(
                {
                    'year': event_year,
                    'event': event_name,
                    'round': round_info['round'],
                    'heat': _heat_name(athlete),
                    'name': athlete.text,
                    'scores': wave_scores,
                }
            )

        # Pause between round pages
        time.sleep(delay)

    return data

In [49]:
# Scrape every round of the 2008 event. The year (a string here) and the event
# label are passed through as given.
pipe_2008_scores = get_scores(round_urls_unique, '2008', 'pipe')

In [50]:
# Show the scraped rows as a table (one row per athlete entry found on each round page)
pd.DataFrame(pipe_2008_scores)

Unnamed: 0,year,event,round,heat,name,scores
0,2008,pipe,Round 2,Heat 1,Kamalei Alexander,"[6.33, 3.0, 5.57, 5.07, 0.67]"
1,2008,pipe,Round 2,Heat 1,Mikael Picon,"[2.67, 1.0, 5.17, 5.77, 5.9, 4.43]"
2,2008,pipe,Round 2,Heat 2,Evan Valiere,"[2.67, 0.57, 1.57, 5.5, 1.67, 1.37, 9.07]"
3,2008,pipe,Round 2,Heat 2,Mick Campbell,"[1.5, 3.17, 5.67, 1.0, 8.67]"
4,2008,pipe,Round 2,Heat 3,Jamie O'Brien,"[5.0, 6.0, 1.33, 3.83, 1.73, 2.23, 5.73, 9.0]"
...,...,...,...,...,...,...
122,2008,pipe,Semifinal,Heat 1,Tim Reyes,"[2.07, 2.57, 6.5, 9.1, 5.67]"
123,2008,pipe,Semifinal,Heat 2,Chris Ward,"[6.83, 1.13, 1.23, 9.63, 4.5, 1.5]"
124,2008,pipe,Semifinal,Heat 2,Adrian Buchan,"[4.33, 1.0, 1.43, 1.4, 3.5, 5.83, 2.67, 0.2]"
125,2008,pipe,Final,Heat 1,Kelly Slater,"[1.4, 2.0, 7.17, 0.73, 1.1, 1.0, 6.83]"


### Get all scores for all Pipeline events in all years

In [51]:
# One events-listing url per season from 2008 to 2022.
# 2020 is left out because all events were cancelled.
yearly_urls = [
    {
        'year': season,
        'url': f'https://www.worldsurfleague.com/events/{season}/mct?all=1',
    }
    for season in range(2008, 2023)
    if season != 2020
]

In [52]:
# Collect the url of every Pipeline event across the listed years
event_urls = []
for season in yearly_urls:
    res = get(season['url'])
    soup = BeautifulSoup(res.content)
    for anchor in soup.find_all("a", class_="event-schedule-details__event-name"):
        cleaned_name = anchor.text.replace('/','')
        # Skip anything that is not a pipeline event
        if 'Pipe' not in cleaned_name:
            continue
        event_urls.append(
            {'year': season['year'], 'name': cleaned_name, 'url': anchor['href']}
        )
    # Pause between yearly listing pages
    time.sleep(1)

event_urls

[{'year': 2008,
  'name': 'Billabong Pipeline Masters',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters'},
 {'year': 2009,
  'name': 'Billabong Pipeline Masters',
  'url': 'https://www.worldsurfleague.com/events/2009/mct/138/billabong-pipeline-masters'},
 {'year': 2010,
  'name': 'Billabong Pipe Masters',
  'url': 'https://www.worldsurfleague.com/events/2010/mct/220/billabong-pipe-masters'},
 {'year': 2011,
  'name': 'Billabong Pipe Masters',
  'url': 'https://www.worldsurfleague.com/events/2011/mct/368/billabong-pipe-masters'},
 {'year': 2012,
  'name': 'Billabong Pipe Masters',
  'url': 'https://www.worldsurfleague.com/events/2012/mct/504/billabong-pipe-masters'},
 {'year': 2013,
  'name': 'Billabong Pipe Masters',
  'url': 'https://www.worldsurfleague.com/events/2013/mct/615/billabong-pipe-masters'},
 {'year': 2014,
  'name': 'Billabong Pipe Masters',
  'url': 'https://www.worldsurfleague.com/events/2014/mct/730/billabong-pipe-masters'},
 {'ye

In [53]:
# Get all scores and save them as a list of df (one DataFrame per event)
df_list = []
for event in event_urls:
    print(f"Getting {event['year']} {event['name']}...")
    res = get(event["url"])
    soup = BeautifulSoup(res.content)

    # Pages without the event rounds div cannot be scraped: report and move on.
    # An explicit None check replaces the former bare `except`, which would also
    # have swallowed unrelated errors such as a keyboard interrupt.
    rounds_div = soup.find("div", id="event-rounds")
    if rounds_div is None:
        print(f"FAILED {event['year']} {event['name']}")
        continue
    carousel = rounds_div.find_all("li")

    # Build a list of {round, url} dicts, one per carousel link; the site root is
    # prefixed so each url can be requested directly
    round_urls = [
        {
            "round": link.text,
            "url": "https://www.worldsurfleague.com" + link["href"],
        }
        for carousel_item in carousel
        for link in carousel_item.find_all("a")
    ]

    # Keep only unique values (records are keyed by url)
    round_urls_unique = list({unique["url"]: unique for unique in round_urls}.values())

    # Run get scores function and assign result to a variable
    event_scores = get_scores(
        round_urls_unique, event_year=event["year"], event_name=event["name"]
    )

    # Convert scores data to DataFrame and append to df_list
    df_list.append(pd.DataFrame(event_scores))
    print(f"... got {event['year']} {event['name']}")


Getting 2008 Billabong Pipeline Masters...
... got 2008 Billabong Pipeline Masters
Getting 2009 Billabong Pipeline Masters...
... got 2009 Billabong Pipeline Masters
Getting 2010 Billabong Pipe Masters...
... got 2010 Billabong Pipe Masters
Getting 2011 Billabong Pipe Masters...
... got 2011 Billabong Pipe Masters
Getting 2012 Billabong Pipe Masters...
... got 2012 Billabong Pipe Masters
Getting 2013 Billabong Pipe Masters...
... got 2013 Billabong Pipe Masters
Getting 2014 Billabong Pipe Masters...
... got 2014 Billabong Pipe Masters
Getting 2015 Billabong Pipe Masters...
... got 2015 Billabong Pipe Masters
Getting 2016 Billabong Pipe Masters...
... got 2016 Billabong Pipe Masters
Getting 2017 Billabong Pipe Masters...
... got 2017 Billabong Pipe Masters
Getting 2018 Billabong Pipe Masters...
... got 2018 Billabong Pipe Masters
Getting 2019 Billabong Pipe Masters...
... got 2019 Billabong Pipe Masters
Getting 2021 Billabong Pipe Masters presented by Hydro Flask...
... got 2021 Billabo

In [54]:
# Stack the per-event frames into one table for a visual check. Each frame keeps
# its own row index here, so index values repeat across events.
pd.concat(df_list)

Unnamed: 0,year,event,round,heat,name,scores
0,2008,Billabong Pipeline Masters,Round 2,Heat 1,Kamalei Alexander,"[6.33, 3.0, 5.57, 5.07, 0.67]"
1,2008,Billabong Pipeline Masters,Round 2,Heat 1,Mikael Picon,"[2.67, 1.0, 5.17, 5.77, 5.9, 4.43]"
2,2008,Billabong Pipeline Masters,Round 2,Heat 2,Evan Valiere,"[2.67, 0.57, 1.57, 5.5, 1.67, 1.37, 9.07]"
3,2008,Billabong Pipeline Masters,Round 2,Heat 2,Mick Campbell,"[1.5, 3.17, 5.67, 1.0, 8.67]"
4,2008,Billabong Pipeline Masters,Round 2,Heat 3,Jamie O'Brien,"[5.0, 6.0, 1.33, 3.83, 1.73, 2.23, 5.73, 9.0]"
...,...,...,...,...,...,...
105,2022,Billabong Pro Pipeline,Semifinals,Heat 1 Watch replay,Miguel Pupo,"[2.25, 6.33, 3.17]"
106,2022,Billabong Pro Pipeline,Semifinals,Heat 2 Watch replay,Seth Moniz,"[1.77, 5.67, 0.8, 7.83, 0.2, 1.23, 0.33]"
107,2022,Billabong Pro Pipeline,Semifinals,Heat 2 Watch replay,Caio Ibelli,"[1.23, 1.2, 4.83, 0.63, 1.5, 1.0, 0.23]"
108,2022,Billabong Pro Pipeline,Final,Heat 1 Watch replay,Kelly Slater,"[1.77, 1.37, 9.0, 7.17, 8.17, 9.77]"


In [55]:
# Save all events as a single CSV. ignore_index=True renumbers the rows 0..N-1 so the
# index column written to the file is unique instead of restarting for every event.
pd.concat(df_list, ignore_index=True).to_csv('./data/wsl/pipeline_scores_all_years.csv')

## World Surf League Dates and Times of Rounds

Using a loop similar to the one above, get the dates of each event's heats where possible.

In [24]:
# Fetch the "/results" page of one event to explore its markup.
# NOTE(review): index -4 selects the fourth-from-last entry of event_urls; which
# event that is depends on the scraped list — confirm it is the intended sample.
res = get(event_urls[-4]["url"] + "/results")
soup = BeautifulSoup(res.content)
# Div that wraps the heat grid (presumably one per results page — TODO confirm)
results_grid = soup.find(
    "div", class_="post-event-watch-heat-grid__body post-event-watch-module-body"
)
# Text of the first round header found inside the grid, surrounding whitespace removed
round_name = results_grid.find(
    "div", class_="post-event-watch-heat-grid__round-header"
).text.strip()
# Every "hot-heat__hd" div inside the grid (one header element per heat, judging by the class name)
heats = results_grid.find_all("div", class_="hot-heat__hd")


In [25]:
# Name of the first heat, read from its "heat-name" span
heats[0].find("span", class_="heat-name").text


'Heat 1'

In [26]:
# Status text of the first heat; it carries the end date (e.g. 'Ended February 15, 2022')
heats[0].find("span", class_="hot-heat__status hot-heat__status--over").text


'Ended February 15, 2022'

In [37]:
# Build a data frame with year,event,round,heat,date columns for all events all years
# NOTE: `df_list` is rebound here; it no longer holds the scores frames after this cell runs,
# so re-running the scores CSV export afterwards would write heat dates instead.
df_list = []
for event in event_urls:
    data = []
    print(f"Getting {event['year']} {event['name']}...")
    res = get(event["url"] + "/results")
    soup = BeautifulSoup(res.content)
    results_grid = soup.find(
        "div", class_="post-event-watch-heat-grid__body post-event-watch-module-body"
    )

    if results_grid is None:
        # No results grid on this page: nothing to record for the event
        print(f"FAILED {event['year']} {event['name']}")
    else:
        # The round header is looked up once per page instead of once per heat.
        # NOTE(review): only the FIRST round header in the grid is read, so every heat
        # on the page gets this one round label — confirm the grid shows a single round.
        round_name = results_grid.find(
            "div", class_="post-event-watch-heat-grid__round-header"
        ).text.strip()
        heats = results_grid.find_all("div", class_="hot-heat__hd")
        for heat in heats:
            try:
                heat_data = {
                    "year": event["year"],
                    "event": event["name"],
                    "round": round_name,
                    "heat": heat.find("span", class_="heat-name").text,
                    "date": heat.find(
                        "span", class_="hot-heat__status hot-heat__status--over"
                    ).text,
                }
            except AttributeError:
                # A span was not found (`find` returned None): report and skip this heat
                print(f"something wrong in heat data for {event['year']} {event['name']}")
                continue
            data.append(heat_data)
        print(f"... got {event['year']} {event['name']}")
        df_list.append(pd.DataFrame(data))

    # Pause between requests, including after a page that failed
    time.sleep(2)


Getting 2008 Billabong Pipeline Masters...
... got 2008 Billabong Pipeline Masters
Getting 2009 Billabong Pipeline Masters...
... got 2009 Billabong Pipeline Masters
Getting 2010 Billabong Pipe Masters...
... got 2010 Billabong Pipe Masters
Getting 2011 Billabong Pipe Masters...
... got 2011 Billabong Pipe Masters
Getting 2012 Billabong Pipe Masters...
... got 2012 Billabong Pipe Masters
Getting 2013 Billabong Pipe Masters...
... got 2013 Billabong Pipe Masters
Getting 2014 Billabong Pipe Masters...
... got 2014 Billabong Pipe Masters
Getting 2015 Billabong Pipe Masters...
... got 2015 Billabong Pipe Masters
Getting 2016 Billabong Pipe Masters...
... got 2016 Billabong Pipe Masters
Getting 2017 Billabong Pipe Masters...
... got 2017 Billabong Pipe Masters
Getting 2018 Billabong Pipe Masters...
... got 2018 Billabong Pipe Masters
Getting 2019 Billabong Pipe Masters...
... got 2019 Billabong Pipe Masters
Getting 2021 Billabong Pipe Masters presented by Hydro Flask...
... got 2021 Billabo

In [40]:
# Save the heat dates of all events as a single CSV. ignore_index=True renumbers the rows
# 0..N-1 so the index column written to the file is unique instead of restarting per event.
pd.concat(df_list, ignore_index=True).to_csv('./data/wsl/pipe_masters_heat_dates.csv')