In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme()

#improve resolution
%config InlineBackend.figure_format ='retina'

# set plotting size parameter
plt.rcParams['figure.figsize'] = (17, 7)
plt.rcParams.update({'font.size': 16})

In [3]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime

In [164]:
def scrape_wsl():

    wsl_base = 'https://www.worldsurfleague.com/events'

    years = ['2022']

    tours = ['ct']
    
    print(f'Scraping WSL for years {years} and tours {tours}')
    print()

    for year in years:
        for tour in tours:
            # Build url from year and tour
            url = f'{wsl_base}/{year}/{tour}?all=1'

            # Get HTML and make soup
            html = requests.get(url).text
            soup = BeautifulSoup(html, 'html.parser')

            # Get year's events
            events_html = soup.find_all('a', class_ = "event-schedule-details__event-name")
            events = {}
            for a in events_html:
                event_name = a.text
                event_url = re.sub('main', 'results', a.get('href'))
                
                print(f'Scraping {year} {tour} {event_name}')
                
                if 'Finals' in event_name:
                    events[event_name] = {
                        'name' : event_name,
                        'year' : year,
                        'tour' : tour,
                        'url' : event_url,
                        # Scrape event
                        'rounds' : {'Final' : {
                            'name' : 'Final',
                            'url' : event_url,
                            'scores' : scrape_round(event_url)
                        }
                                   }
                                    
                    }
                else:   
                    events[event_name] = {
                        'name' : event_name,
                        'year' : year,
                        'tour' : tour,
                        'url' : event_url,
                        # Scrape each round in event
                        'rounds' : scrape_rounds(event_url)
                    }

    print('Done.')
    return events

def scrape_rounds(url):
    
    # Get HTML and make soup
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    
    # Get all rounds
    round_divs = soup.find("div", class_ = "post-event-watch-round-nav__items scroll-nav-wrap").find_all("div")
    rounds = {}
    for round_div in round_divs:
        round_a = round_div.find("a")
        round_name = round_a.find('span', class_ = "round-name").text
        round_href = f'https://www.worldsurfleague.com{round_a.get("href")}'
        rounds[round_name] = {
            'name' : round_name,
            'url' : round_href,
            # Get all scores for this round
            'scores' : scrape_round(round_href)
        }
    
    return rounds
    
def scrape_round(url):

    # Get HTML and make soup
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')

    # Get all surfers names and heat scores
    avatar_texts = soup.find_all("div", class_="avatar-text")
    scores = {}
    for i, avatar_text in enumerate(avatar_texts):
        # Get name
        athlete_name = avatar_text.find("div", class_="hot-heat-athlete__name hot-heat-athlete__name--full").text
        # Get country
        athlete_country = avatar_text.find("span", class_="athlete-country-flag").get("title")
        # Get score
        try:
            athlete_score = float(avatar_text.find("div", class_="hot-heat-athlete__score").text)
        except: 
            athlete_score = 0
        
        try:
            # Get number of waves
            athlete_num_waves = int(re.search(r'\d+', 
                                              avatar_text.find("div", class_="hot-heat-athlete__num-waves").text
                                             ).group())
        except:
            # Exception caused if there are no digits to be found by re
            athlete_num_waves = 0
            
        try:
            # Get wave breakdown
            athlete_waves = avatar_text.find("div", class_="hot-heat-athlete__counted-waves").text
        except:
            # If there is no wave breakdown object, the heat dit not take place. Continue for loop
            athlete_waves = ''
            
        # If there were any waves, length of string will be longer than 2
        if len(athlete_waves) > 2:
            # Get best wave score
            athlete_wave1 = float(re.search(r'^.{4}', athlete_waves).group())
            try:
                # Get second best wave score
                athlete_wave2 = float(re.search(r'(?<=\+ ).{4}', athlete_waves).group())
            except:
                # Exception caused if second best wave not found by re, therefore compute it to 0
                athlete_wave2 = 0.00
        else:
            # No waves caught but heat took place, therefore compute both waves to 0
            athlete_wave1 = 0.00
            athlete_wave2 = 0.00
            
        # Get heat outcome
        try:
            athlete_outcome = avatar_text.find("div", class_="hot-heat-athlete__abbreviated-advanced-to").text
        except:
            # Exception caused if in WSL Finals, where only win or eliminated
            athlete_outcome = 'unknown'
        
        scores[i] = {
            'name' : athlete_name,
            'country' : athlete_country,
            'score' : athlete_score,
            'num_waves' : athlete_num_waves,
            'wave1' : athlete_wave1,
            'wave2' : athlete_wave2,
            'outcome' : athlete_outcome
        }
        
    return scores

In [165]:
events = scrape_wsl()

Scraping WSL for years ['2022'] and tours ['ct']

Scraping 2022 ct Billabong Pro Pipeline
Scraping 2022 ct Hurley Pro Sunset Beach

Presented By Shiseido


Scraping 2022 ct MEO Portugal Pro

Presented By Rip Curl


Scraping 2022 ct Rip Curl Pro Bells Beach
Scraping 2022 ct Margaret River Pro
Scraping 2022 ct Quiksilver/ROXY Pro G-Land
Scraping 2022 ct Surf City El Salvador Pro

Presented By Corona


Scraping 2022 ct Oi Rio Pro

Presented By Corona


Scraping 2022 ct Corona Open J-Bay

Presented By Corona


Scraping 2022 ct Outerknown Tahiti Pro
Scraping 2022 ct Rip Curl WSL Finals
Done.


In [166]:
df_dict = {
    'event' : [],
    'year' : [],
    'tour' : [],
    'round' : [],
    'name' : [],
    'athlete_country' : [],
    'score' : [],
    'num_waves' : [],
    'wave1' : [],
    'wave2' : [],
    'outcome' : []
}

for event_name, event in events.items():
    for round_name, round_obj in event['rounds'].items():
        for score_idx, score in round_obj['scores'].items():
            df_dict['event'] += [event['name']]
            df_dict['year'] += [event['year']]
            df_dict['tour'] += [event['tour']]
            df_dict['round'] += [round_obj['name']]
            df_dict['name'] += [score['name']]
            df_dict['athlete_country'] += [score['country']]
            df_dict['score'] += [score['score']]
            df_dict['num_waves'] += [score['num_waves']]
            df_dict['wave1'] += [score['wave1']]
            df_dict['wave2'] += [score['wave2']]
            df_dict['outcome'] += [score['outcome']]

In [167]:
df = pd.DataFrame(df_dict)
df.sample(frac = 1).head()

Unnamed: 0,event,year,tour,round,name,athlete_country,score,num_waves,wave1,wave2,outcome
478,Margaret River Pro,2022,ct,Elimination Round,Jack Thomas,Australia,9.94,4,5.17,4.77,Eliminated
151,Hurley Pro Sunset Beach\n\nPresented By Shisei...,2022,ct,Elimination Round,Billy Kemper,Hawaii,11.66,2,7.93,3.73,Adv to R/32 - HT 9
353,Rip Curl Pro Bells Beach,2022,ct,Opening Round,Jackson Baker,Australia,13.77,3,7.17,6.6,Adv to R/32 - HT 3
754,Oi Rio Pro\n\nPresented By Corona\n\n,2022,ct,Semifinals,Samuel Pupo,Brazil,11.44,8,6.17,5.27,Adv to F
803,Corona Open J-Bay\n\nPresented By Corona\n\n,2022,ct,Round of 16,Kelly Slater,United States,12.87,9,6.67,6.2,Eliminated


In [171]:
df.groupby('name')['score'].mean().sort_values(ascending = False).head(20)

name
Stephanie Gilmore      15.472500
Kauli Vaast            15.052000
Tatiana Weston-Webb    14.870000
John John Florence     14.382609
Brisa Hennessy         14.330000
Jack Robinson          13.995676
Mikey Wright           13.670000
Yago Dora              13.599333
Ethan Ewing            13.434324
Nathan Hedge           13.372500
Italo Ferreira         13.348462
Luke Thompson          13.305000
Filipe Toledo          13.060465
Gabriel Medina         12.991000
Griffin Colapinto      12.975143
Koa Smith              12.770000
Kanoa Igarashi         12.546944
Barron Mamiya          12.436250
Joao Chianca           12.185385
Mick Fanning           12.167500
Name: score, dtype: float64