## Imports

In [1]:
import pandas as pd
import numpy as np
from requests import get
import json
from bs4 import BeautifulSoup
import time



## World Surf League Scores

I'm going to focus on the Pipeline Masters first. It is the marquee event of the Championship Tour and is run every year (**verify**).

### Get HTML soup and review

Use Beautiful Soup to parse the HTML of World Surf League event pages. Each page shows the final result and a 'carousel' that links to the previous rounds.

In [2]:
# Starting URL: the 2008 Billabong Pipeline Masters event page
pipe_2008_url = "https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters"
# A timeout stops the cell from hanging forever if the server never answers
res = get(pipe_2008_url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page
res.raise_for_status()
# NOTE(review): no parser is named here, so bs4 chooses one from what is installed;
# consider pinning it so results are the same on every machine.
soup = BeautifulSoup(res.content)


From inspecting the page source, I found an unordered list containing the URLs needed to get all the scores from each event.

In [9]:
# Collect every <li> inside the div whose id is 'event-rounds'
carousel = soup.find(name='div', attrs={'id': 'event-rounds'}).find_all(name='li')

In [16]:
# Eyeball the links in each carousel item (round URL plus round name/number)
for carousel_item in carousel:
    print(carousel_item.find_all('a'))

[<a class="next" data-no-scroll="true" data-request-name="eventRound" href="/events/2008/mct/75/billabong-pipeline-masters?roundId=4">Round 2</a>]
[<a class="previous" data-no-scroll="true" data-request-name="eventRound" href="/events/2008/mct/75/billabong-pipeline-masters?roundId=3">Round 1</a>, <a class="next" data-no-scroll="true" data-request-name="eventRound" href="/events/2008/mct/75/billabong-pipeline-masters?roundId=5">Round 3</a>]
[<a class="previous" data-no-scroll="true" data-request-name="eventRound" href="/events/2008/mct/75/billabong-pipeline-masters?roundId=4">Round 2</a>, <a class="next" data-no-scroll="true" data-request-name="eventRound" href="/events/2008/mct/75/billabong-pipeline-masters?roundId=6">Round 4</a>]
[<a class="previous" data-no-scroll="true" data-request-name="eventRound" href="/events/2008/mct/75/billabong-pipeline-masters?roundId=5">Round 3</a>, <a class="next" data-no-scroll="true" data-request-name="eventRound" href="/events/2008/mct/75/billabong-pip

In [23]:
# Build a list of {'round', 'url'} dicts, one per link found in the carousel
round_urls = []
for carousel_item in carousel:
    for anchor in carousel_item.find_all('a'):
        href = anchor['href']
        print(href, anchor.text)
        # Prefix the site root so the URL can be requested directly later
        round_urls.append({
            'round': anchor.text,
            'url': "https://www.worldsurfleague.com" + href,
        })

/events/2008/mct/75/billabong-pipeline-masters?roundId=4 Round 2
/events/2008/mct/75/billabong-pipeline-masters?roundId=3 Round 1
/events/2008/mct/75/billabong-pipeline-masters?roundId=5 Round 3
/events/2008/mct/75/billabong-pipeline-masters?roundId=4 Round 2
/events/2008/mct/75/billabong-pipeline-masters?roundId=6 Round 4
/events/2008/mct/75/billabong-pipeline-masters?roundId=5 Round 3
/events/2008/mct/75/billabong-pipeline-masters?roundId=7 Quarterfinal
/events/2008/mct/75/billabong-pipeline-masters?roundId=6 Round 4
/events/2008/mct/75/billabong-pipeline-masters?roundId=52 Semifinal
/events/2008/mct/75/billabong-pipeline-masters?roundId=7 Quarterfinal
/events/2008/mct/75/billabong-pipeline-masters?roundId=77 Final
/events/2008/mct/75/billabong-pipeline-masters?roundId=52 Semifinal


In [24]:
# De-duplicate on URL: keying a dict by 'url' leaves exactly one entry per URL
# (the last one seen, kept in first-seen order).
# Approach from https://stackoverflow.com/questions/11092511/list-of-unique-dictionaries
round_urls_unique = list(dict((entry['url'], entry) for entry in round_urls).values())
round_urls_unique

[{'round': 'Round 2',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId=4'},
 {'round': 'Round 1',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId=3'},
 {'round': 'Round 3',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId=5'},
 {'round': 'Round 4',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId=6'},
 {'round': 'Quarterfinal',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId=7'},
 {'round': 'Semifinal',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId=52'},
 {'round': 'Final',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId=77'}]

### Get scores for every round in an event

In [31]:
# Get scores function takes the list of round dicts and gets scores for each round of the event
def get_scores(url_list_of_dicts, event_name, delay=1, timeout=30):
    """Scrape every athlete's per-wave scores for each round of one event.

    Parameters
    ----------
    url_list_of_dicts : iterable of dict
        One dict per round, each with a 'round' key (the round name) and a
        'url' key (the page to request).
    event_name
        Label stored under 'event' in every returned record.
    delay : int or float, optional
        Seconds to sleep after each round's page has been processed
        (default 1, the value that used to be hard-coded).
    timeout : int or float, optional
        Seconds to wait for each HTTP response before giving up (default 30).

    Returns
    -------
    list of dict
        One record per athlete-name element found, with keys 'event',
        'round', 'heat', 'name' and 'scores'. 'scores' is a list of floats;
        score cells whose text is not a number are skipped.

    Raises
    ------
    requests.HTTPError
        If a round page answers with a 4xx/5xx status.
    """
    data = []

    # `round_info` rather than `round`, so the builtin round() is not shadowed
    for round_info in url_list_of_dicts:
        # Fetch the round page; fail loudly instead of parsing an error page
        res = get(round_info['url'], timeout=timeout)
        res.raise_for_status()
        round_soup = BeautifulSoup(res.content)

        # Athlete name elements and the per-athlete wave grids
        athletes = round_soup.find_all("span", class_="athlete-name")
        scores = round_soup.find_all("div", class_="all-waves all-waves-grid")

        for i, athlete in enumerate(athletes):
            # The heat name sits in the element just before the heat body
            # that contains this athlete
            heat_name = (
                athlete
                .find_parent("div", class_="bd new-heat-bd")
                .find_previous_sibling()
                .find("span", class_="new-heat-hd-name")
                .text
            )

            # Assumes the i-th wave grid belongs to the i-th athlete in
            # document order — TODO confirm against the page markup
            per_wave_scores = scores[i].find_all("span", class_="score")

            wave_scores = []
            for wave in per_wave_scores:
                try:
                    wave_scores.append(float(wave.text))
                except ValueError:
                    # Only non-numeric cell text is skipped; any other error
                    # (including KeyboardInterrupt) now propagates
                    continue

            data.append({
                "event": event_name,
                "round": round_info['round'],
                "heat": heat_name,
                "name": athlete.text,
                "scores": wave_scores,
            })

        # Pause between page requests
        time.sleep(delay)

    return data

In [32]:
# Scrape every round of the 2008 Pipeline Masters collected above
pipe_2008_scores = get_scores(
    url_list_of_dicts=round_urls_unique,
    event_name='Pipe 2008',
)

In [33]:
# Show the scraped records as a table (one row per record returned by get_scores)
pd.DataFrame(data=pipe_2008_scores)

Unnamed: 0,event,round,heat,name,scores
0,Pipe 2008,Round 2,Heat 1,Kamalei Alexander,"[6.33, 3.0, 5.57, 5.07, 0.67]"
1,Pipe 2008,Round 2,Heat 1,Mikael Picon,"[2.67, 1.0, 5.17, 5.77, 5.9, 4.43]"
2,Pipe 2008,Round 2,Heat 2,Evan Valiere,"[2.67, 0.57, 1.57, 5.5, 1.67, 1.37, 9.07]"
3,Pipe 2008,Round 2,Heat 2,Mick Campbell,"[1.5, 3.17, 5.67, 1.0, 8.67]"
4,Pipe 2008,Round 2,Heat 3,Jamie O'Brien,"[5.0, 6.0, 1.33, 3.83, 1.73, 2.23, 5.73, 9.0]"
...,...,...,...,...,...
122,Pipe 2008,Semifinal,Heat 1,Tim Reyes,"[2.07, 2.57, 6.5, 9.1, 5.67]"
123,Pipe 2008,Semifinal,Heat 2,Chris Ward,"[6.83, 1.13, 1.23, 9.63, 4.5, 1.5]"
124,Pipe 2008,Semifinal,Heat 2,Adrian Buchan,"[4.33, 1.0, 1.43, 1.4, 3.5, 5.83, 2.67, 0.2]"
125,Pipe 2008,Final,Heat 1,Kelly Slater,"[1.4, 2.0, 7.17, 0.73, 1.1, 1.0, 6.83]"


### Get all scores for all events in all years

In [34]:
# One listing URL per year from 2008 through 2019 (range end is exclusive)
yearly_urls = [
    f'https://www.worldsurfleague.com/events/{year}/mct?all=1'
    for year in range(2008, 2020)
]

In [39]:
# Collect the name and URL of every event linked from each yearly listing page
event_urls = []
# NOTE: `res` and `soup` are rebound here, replacing the values set by the earlier Pipeline cell
for year_url in yearly_urls:
    # A timeout stops one unresponsive request from hanging the whole crawl
    res = get(year_url, timeout=30)
    # Fail loudly rather than silently contributing zero events for a year
    # when an error page comes back
    res.raise_for_status()
    soup = BeautifulSoup(res.content)
    event_urls.extend(
        {'name': link.text, 'url': link['href']}
        for link in soup.find_all("a", class_="event-schedule-details__event-name")
    )
    # Pause one second between page requests
    time.sleep(1)

event_urls

[{'name': 'Quiksilver Pro Gold Coast',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/4/quiksilver-pro-gold-coast'},
 {'name': 'Rip Curl Pro Bells Beach',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/8/rip-curl-pro-bells-beach'},
 {'name': 'Billabong Pro Teahupoo',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/20/billabong-pro-teahupoo'},
 {'name': 'Globe Pro Fiji',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/24/globe-pro-fiji'},
 {'name': 'Billabong Pro J-Bay',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/31/billabong-pro-j-bay'},
 {'name': 'Rip Curl Pro Search Bali',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/37/rip-curl-pro-search-bali'},
 {'name': 'Boost Mobile Pro',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/48/boost-mobile-pro'},
 {'name': 'Quiksilver Pro France',
  'url': 'https://www.worldsurfleague.com/events/2008/mct/54/quiksilver-pro-france'},
 {'name': 'Billabong Pro Mundaka',
  'ur