# Web Scraper

## Imports

In [1]:
import pandas as pd
import numpy as np
from requests import get
import json
from bs4 import BeautifulSoup
import time



## World Surf League Scores

I'm going to focus on the Pipeline Masters first. It is the marquee event of the Championship Tour and, as far as I know, is run every year (**TODO: verify**).

### Get HTML soup and review

In [2]:
# Starting URL: the roundId=3 page of the 2008 Billabong Pipeline Masters
pipe_2008_url = "https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId=3"
# A timeout keeps the cell from hanging forever on a stalled connection
res = get(pipe_2008_url, timeout=30)
# Fail here on an HTTP error rather than with an IndexError in a later cell
res.raise_for_status()
soup = BeautifulSoup(res.content)


In [3]:
# Text of the fourth score cell inside the first wave grid on the page
soup.find_all("div", attrs={"class": "all-waves all-waves-grid"})[0].find_all(
    "span", attrs={"class": "score"}
)[3].get_text()


'\xa0'

In [4]:
# Number of wave grids on the page
len(soup.find_all("div", attrs={"class": "all-waves all-waves-grid"}))


32

In [5]:
# Text of the first athlete-name tag on the page
soup.find_all("span", attrs={"class": "athlete-name"})[0].get_text()


'Kamalei Alexander'

In [19]:
# Text of the first heat-header tag on the page
soup.find_all("span", attrs={"class": "new-heat-hd-name"})[0].get_text()


'Heat 1 '

In [6]:
# Number of athlete-name tags on the page
len(soup.find_all("span", attrs={"class": "athlete-name"}))


32

### Round 1

In [7]:
# Starting URL: the roundId=3 page of the 2008 Billabong Pipeline Masters
pipe_2008_url = "https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId=3"
# A timeout keeps the cell from hanging forever on a stalled connection
res = get(pipe_2008_url, timeout=30)
# Fail here on an HTTP error rather than with an IndexError in a later cell
res.raise_for_status()
soup = BeautifulSoup(res.content)


In [8]:
# Athlete name tags and wave grids; the following cell pairs them up by position
athletes = soup.find_all("span", attrs={"class": "athlete-name"})
scores = soup.find_all("div", attrs={"class": "all-waves all-waves-grid"})


In [9]:
# Build one record per athlete: the name plus every numeric wave score.
# Reuses the `athletes` / `scores` lists instead of re-querying the soup on
# every iteration.
round_1 = []
for i, athlete in enumerate(athletes):
    wave_scores = []
    # scores[i] is the wave grid at the same position as this athlete's name tag
    for cell in scores[i].find_all("span", class_="score"):
        try:
            wave_scores.append(float(cell.text))
        except ValueError:
            # Cells that hold no number (e.g. '\xa0') are skipped
            continue
    round_1.append({"name": athlete.text, "scores": wave_scores})

round_1


[{'name': 'Kamalei Alexander', 'scores': [8.83, 6.83, 1.5]},
 {'name': 'Daniel Ross', 'scores': [9.93, 1.23]},
 {'name': 'Jesse Merle-Jones', 'scores': [2.83, 0.2, 1.57, 7.17]},
 {'name': 'Tj Barron', 'scores': [3.5, 1.83, 2.77, 0.27, 2.9, 1.8, 4.83, 0.5]},
 {'name': 'Ezra Sitt', 'scores': [1.33, 7.5, 3.83]},
 {'name': 'Daniel Jones', 'scores': [1.17, 5.67, 1.67, 1.63, 2.83, 1.4, 1.97]},
 {'name': 'Tory Barron',
  'scores': [4.33, 6.17, 0.9, 7.67, 5.67, 1.5, 3.43, 1.33]},
 {'name': 'Royden Bryson',
  'scores': [0.6, 3.33, 2.5, 1.6, 4.67, 6.17, 2.1, 4.43, 0.73]},
 {'name': 'Aritz Aranburu', 'scores': [1.33, 9.33, 3.27, 5.67, 4.33]},
 {'name': 'Kalani Chapman', 'scores': [4.17, 0.77, 4.27, 1.0, 7.33]},
 {'name': 'Dave Wassel', 'scores': [9.67, 3.67, 1.93]},
 {'name': 'Ricky Basnett', 'scores': [3.17, 0.83, 6.0]},
 {'name': 'Evan Valiere', 'scores': [1.83, 3.67, 2.57, 6.0, 1.97, 1.77, 6.57]},
 {'name': 'Olamana Eleogram', 'scores': [0.27, 7.67, 0.33, 4.67, 0.83]},
 {'name': 'John John Flo

### Generalize for all rounds

In [10]:
roundids = {
    3: "round_1",
    4: "round_2",
    5: "round_3",
    6: "round_4",
    7: "quarter",
    52: "semi",
    77: "final",
}  # roundIds do not follow a strict logical order

event_name = "2008 Pipeline"
data = []

for roundid, round_name in roundids.items():
    url = f"https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId={roundid}"
    # A timeout keeps the loop from hanging forever on a stalled connection
    res = get(url, timeout=30)
    soup = BeautifulSoup(res.content)
    athletes = soup.find_all("span", class_="athlete-name")
    scores = soup.find_all("div", class_="all-waves all-waves-grid")

    # One record per athlete; scores[i] is the wave grid at the same position
    for i, athlete in enumerate(athletes):
        wave_scores = []
        for cell in scores[i].find_all("span", class_="score"):
            try:
                wave_scores.append(float(cell.text))
            except ValueError:
                # Cells that hold no number are skipped
                continue

        data.append(
            {
                "event": event_name,
                "round": round_name,
                "name": athlete.text,
                "scores": wave_scores,
            }
        )

    # Pause between requests
    time.sleep(3)


In [11]:
# Show the scraped records as a table
pd.DataFrame(data=data)

Unnamed: 0,event,round,name,scores
0,2008 Pipeline,round_1,Kamalei Alexander,"[8.83, 6.83, 1.5]"
1,2008 Pipeline,round_1,Daniel Ross,"[9.93, 1.23]"
2,2008 Pipeline,round_1,Jesse Merle-Jones,"[2.83, 0.2, 1.57, 7.17]"
3,2008 Pipeline,round_1,Tj Barron,"[3.5, 1.83, 2.77, 0.27, 2.9, 1.8, 4.83, 0.5]"
4,2008 Pipeline,round_1,Ezra Sitt,"[1.33, 7.5, 3.83]"
...,...,...,...,...
122,2008 Pipeline,semi,Tim Reyes,"[2.07, 2.57, 6.5, 9.1, 5.67]"
123,2008 Pipeline,semi,Chris Ward,"[6.83, 1.13, 1.23, 9.63, 4.5, 1.5]"
124,2008 Pipeline,semi,Adrian Buchan,"[4.33, 1.0, 1.43, 1.4, 3.5, 5.83, 2.67, 0.2]"
125,2008 Pipeline,final,Kelly Slater,"[1.4, 2.0, 7.17, 0.73, 1.1, 1.0, 6.83]"


### Generalize to a function for all events

In [20]:
# Round labels in bracket order. Three of the 2009 entries list only six
# roundIds (no "round_4").
_SEVEN_ROUNDS = ("round_1", "round_2", "round_3", "round_4", "quarter", "semi", "final")
_SIX_ROUNDS = ("round_1", "round_2", "round_3", "quarter", "semi", "final")

# One row per event: (name, results URL ending in "?roundId=", roundIds in
# bracket order). roundIds do not follow a strict logical order.
_event_rows = [
    ("2008 Gold Coast", "https://www.worldsurfleague.com/events/2008/mct/4/quiksilver-pro-gold-coast?roundId=", (16, 36, 20, 42, 48, 69, 71)),
    ("2008 Bells Beach", "https://www.worldsurfleague.com/events/2008/mct/8/rip-curl-pro-bells-beach?roundId=", (31, 10, 50, 56, 60, 64, 68)),
    ("2008 Teahupoo", "https://www.worldsurfleague.com/events/2008/mct/20/billabong-pro-teahupoo?roundId=", (8, 35, 30, 43, 51, 65, 66)),
    ("2008 Fiji", "https://www.worldsurfleague.com/events/2008/mct/24/globe-pro-fiji?roundId=", (13, 37, 18, 49, 58, 57, 74)),
    ("2008 J-Bay", "https://www.worldsurfleague.com/events/2008/mct/31/billabong-pro-j-bay?roundId=", (22, 15, 25, 53, 59, 63, 72)),
    ("2008 Bali", "https://www.worldsurfleague.com/events/2008/mct/37/rip-curl-pro-search-bali?roundId=", (12, 28, 40, 62, 61, 75, 76)),
    ("2008 Trestles", "https://www.worldsurfleague.com/events/2008/mct/48/boost-mobile-pro?roundId=", (26, 11, 19, 24, 32, 55, 54)),
    ("2008 France", "https://www.worldsurfleague.com/events/2008/mct/54/quiksilver-pro-france?roundId=", (33, 14, 29, 39, 38, 45, 46)),
    ("2008 Mundaka", "https://www.worldsurfleague.com/events/2008/mct/55/billabong-pro-mundaka?roundId=", (1, 34, 2, 47, 41, 44, 73)),
    ("2008 Santa Catarina", "https://www.worldsurfleague.com/events/2008/mct/66/hang-loose-santa-catarina-pro?roundId=", (21, 17, 23, 9, 27, 70, 67)),
    ("2008 Pipeline", "https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId=", (3, 4, 5, 6, 7, 52, 77)),
    ("2009 Gold Coast", "https://www.worldsurfleague.com/events/2009/mct/83/quiksilver-pro-gold-coast?roundId=", (88, 104, 91, 117, 10048, 118, 130)),
    ("2009 Bells Beach", "https://www.worldsurfleague.com/events/2009/mct/91/rip-curl-pro-bells-beach?roundId=", (94, 106, 105, 108, 141, 140)),
    ("2009 Teahupoo", "https://www.worldsurfleague.com/events/2009/mct/95/billabong-pro-teahupoo?roundId=", (82, 81, 101, 102, 99, 121)),
    ("2009 Santa Catarina", "https://www.worldsurfleague.com/events/2009/mct/99/hang-loose-santa-catarina-pro?roundId=", (92, 80, 97, 107, 112, 128, 129)),
    ("2009 J-Bay", "https://www.worldsurfleague.com/events/2009/mct/104/billabong-pro-j-bay?roundId=", (89, 109, 116, 137, 138, 139)),
    ("2009 Trestles", "https://www.worldsurfleague.com/events/2009/mct/116/hurley-pro-trestles?roundId=", (84, 103, 79, 86, 95, 132, 136)),
]

# Expand each row into the {"name", "url", "rounds"} record the scraper reads;
# "rounds" maps roundId -> round label, in bracket order.
events = [
    {
        "name": name,
        "url": url,
        "rounds": dict(
            zip(round_ids, _SEVEN_ROUNDS if len(round_ids) == 7 else _SIX_ROUNDS)
        ),
    }
    for name, url, round_ids in _event_rows
]


In [21]:
def get_scores(event_dict=events):
    """Scrape per-wave scores for every round of every listed World Surf League event.

    Parameters
    ----------
    event_dict : list of dict
        Despite the name, a list of event records shaped like the entries of
        ``events``: ``"name"`` (stored in the ``event`` column), ``"url"``
        (results URL that the roundId is appended to) and ``"rounds"``
        (mapping of roundId to round label). Defaults to the module-level
        ``events`` list.

    Returns
    -------
    pandas.DataFrame
        One row per athlete per round, with columns ``event``, ``round``,
        ``name`` and ``scores`` (a list of floats).
    """
    data = []

    # Iterate the argument, not the global `events`, so callers can pass a subset
    for event in event_dict:
        event_name = event["name"]
        url = event["url"]
        for roundid, round_name in event["rounds"].items():
            req_url = url + str(roundid)
            print(req_url)
            # A timeout keeps the scrape from hanging forever on a stalled connection
            res = get(req_url, timeout=30)
            soup = BeautifulSoup(res.content)
            athletes = soup.find_all("span", class_="athlete-name")
            scores = soup.find_all("div", class_="all-waves all-waves-grid")

            # scores[i] is the wave grid at the same position as the athlete's name tag
            for i, athlete in enumerate(athletes):
                wave_scores = []
                for cell in scores[i].find_all("span", class_="score"):
                    try:
                        wave_scores.append(float(cell.text))
                    except ValueError:
                        # Cells that hold no number are skipped
                        continue

                data.append(
                    {
                        "event": event_name,
                        "round": round_name,
                        "name": athlete.text,
                        "scores": wave_scores,
                    }
                )

            # Pause between requests
            time.sleep(10)

    return pd.DataFrame(data)

In [22]:
# Scrape every hard-coded event, write the table to CSV, then display it
data = get_scores(event_dict=events)
data.to_csv(path_or_buf='./data/scores.csv', index=False)
data

https://www.worldsurfleague.com/events/2008/mct/4/quiksilver-pro-gold-coast?roundId=16
https://www.worldsurfleague.com/events/2008/mct/4/quiksilver-pro-gold-coast?roundId=36
https://www.worldsurfleague.com/events/2008/mct/4/quiksilver-pro-gold-coast?roundId=20
https://www.worldsurfleague.com/events/2008/mct/4/quiksilver-pro-gold-coast?roundId=42
https://www.worldsurfleague.com/events/2008/mct/4/quiksilver-pro-gold-coast?roundId=48
https://www.worldsurfleague.com/events/2008/mct/4/quiksilver-pro-gold-coast?roundId=69
https://www.worldsurfleague.com/events/2008/mct/4/quiksilver-pro-gold-coast?roundId=71
https://www.worldsurfleague.com/events/2008/mct/8/rip-curl-pro-bells-beach?roundId=31
https://www.worldsurfleague.com/events/2008/mct/8/rip-curl-pro-bells-beach?roundId=10
https://www.worldsurfleague.com/events/2008/mct/8/rip-curl-pro-bells-beach?roundId=50
https://www.worldsurfleague.com/events/2008/mct/8/rip-curl-pro-bells-beach?roundId=56
https://www.worldsurfleague.com/events/2008/mct

Unnamed: 0,event,round,name,scores
0,2008 Gold Coast,round_1,Bobby Martinez,"[7.33, 4.07, 2.87, 4.67, 5.6]"
1,2008 Gold Coast,round_1,Royden Bryson,"[5.17, 0.43, 5.5, 2.73, 4.57]"
2,2008 Gold Coast,round_1,Jihad Khodr,"[2.17, 6.5, 3.33, 0.3, 1.0, 0.57, 3.83]"
3,2008 Gold Coast,round_1,Dean Morrison,"[9.0, 4.33, 1.17, 5.0, 6.17]"
4,2008 Gold Coast,round_1,Kieren Perrow,"[5.5, 0.5, 4.67, 7.5, 0.87]"
...,...,...,...,...
2226,2009 Trestles,semi,Bede Durbidge,"[6.5, 5.83, 3.93, 7.73, 4.03]"
2227,2009 Trestles,semi,Mick Fanning,"[6.67, 8.33, 6.0, 1.0, 7.93, 1.5]"
2228,2009 Trestles,semi,Kelly Slater,"[1.17, 2.33, 3.4, 1.2, 1.0, 2.0, 1.0, 0.63, 0...."
2229,2009 Trestles,final,Mick Fanning,"[7.83, 3.83, 6.17, 8.5, 8.9]"


### Generalize for all years, all events per year

Use a for loop to try every roundId. This can be slow, but hard-coding the event info in a dict seems like unnecessary work.

If this gets too messy I can go back to hard coding.

In [3]:
# Events-listing URL for each season from 2008 through 2019
yearly_urls = [
    f'https://www.worldsurfleague.com/events/{year}/mct?all=1'
    for year in range(2008, 2020)
]

In [4]:
# Fetch the first yearly listing (2008); the timeout avoids hanging on a stalled connection
res = get(yearly_urls[0], timeout=30)
# Fail here on an HTTP error rather than with an IndexError in a later cell
res.raise_for_status()
soup = BeautifulSoup(res.content)

In [39]:
# Print the URL of every event linked from the listing page. Iterating the
# matches directly avoids the hard-coded count of 11 and the repeated
# find_all call per link.
for event_link in soup.find_all("a", class_="event-schedule-details__event-name"):
    print(event_link["href"])

https://www.worldsurfleague.com/events/2008/mct/4/quiksilver-pro-gold-coast
https://www.worldsurfleague.com/events/2008/mct/8/rip-curl-pro-bells-beach
https://www.worldsurfleague.com/events/2008/mct/20/billabong-pro-teahupoo
https://www.worldsurfleague.com/events/2008/mct/24/globe-pro-fiji
https://www.worldsurfleague.com/events/2008/mct/31/billabong-pro-j-bay
https://www.worldsurfleague.com/events/2008/mct/37/rip-curl-pro-search-bali
https://www.worldsurfleague.com/events/2008/mct/48/boost-mobile-pro
https://www.worldsurfleague.com/events/2008/mct/54/quiksilver-pro-france
https://www.worldsurfleague.com/events/2008/mct/55/billabong-pro-mundaka
https://www.worldsurfleague.com/events/2008/mct/66/hang-loose-santa-catarina-pro
https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters


In [6]:
# Display text of the first event link on the listing page
soup.find_all("a", attrs={"class": "event-schedule-details__event-name"})[0].get_text()

'Quiksilver Pro Gold Coast'

In [7]:
# Create a url for each year that shows the events page of the mens world tour
for year in range(2008, 2020):
    year_url = f"https://www.worldsurfleague.com/events/{year}/mct?all=1"
    data = []

    # Get html content for each year (timeout: don't hang on a stalled connection)
    result = get(year_url, timeout=30)
    soup = BeautifulSoup(result.content)

    # Get a list of event info for each event in that year
    event_info = soup.find_all("a", class_="event-schedule-details__event-name")

    # Cycle through events and get their URLs
    for event in event_info:
        url = event["href"]

        # Cycle through roundId numbers and try to get scores.
        # This is difficult because roundIds do not appear to be ordered in any way
        for round_id in range(20_000):
            try:
                # Append roundId integer to try
                res = get(f'{url}?roundId={round_id}', timeout=30)
                print(f'Got roundId: {round_id} for {event.text}')

                # Get html soup
                event_soup = BeautifulSoup(res.content)

                # Get list of athlete names
                athletes = event_soup.find_all("span", class_="athlete-name")

                # Get list of scores lists
                scores = event_soup.find_all("div", class_="all-waves all-waves-grid")

                # Go through athletes; a separate index name keeps the roundId
                # counter from being overwritten
                for idx, athlete in enumerate(athletes):
                    wave_scores = []

                    # Append scores, skipping cells that hold no number
                    for cell in scores[idx].find_all("span", class_="score"):
                        try:
                            wave_scores.append(float(cell.text))
                        except ValueError:
                            continue

                    # NOTE(review): rows carry no event, round or heat field, so
                    # records from different pages cannot be told apart in the CSV
                    data.append({"name": athlete.text, "scores": wave_scores})

                # Timeout
                time.sleep(2)

            except Exception as err:
                # Best effort: report and move on to the next roundId. Catching
                # Exception (not a bare except) lets Ctrl-C stop the run.
                print(f'Skipped roundId: {round_id} for {event.text}: {err!r}')
                time.sleep(2)
                continue

    # Save yearly csvs
    pd.DataFrame(data).to_csv(f'./data/{year}.csv')


Got roundId: 0 for Quiksilver Pro Gold Coast
Got roundId: 1 for Quiksilver Pro Gold Coast
Got roundId: 2 for Quiksilver Pro Gold Coast
Got roundId: 3 for Quiksilver Pro Gold Coast
Got roundId: 4 for Quiksilver Pro Gold Coast
Got roundId: 5 for Quiksilver Pro Gold Coast
Got roundId: 6 for Quiksilver Pro Gold Coast
Got roundId: 7 for Quiksilver Pro Gold Coast
Got roundId: 8 for Quiksilver Pro Gold Coast
Got roundId: 9 for Quiksilver Pro Gold Coast


### Selenium

In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
# Headless alternative, currently disabled:
# op = webdriver.ChromeOptions()
# op.add_argument('headless')
# driver = webdriver.Chrome(options=op)
driver = webdriver.Chrome()
# Element lookups wait up to 10 seconds before failing
driver.implicitly_wait(10)

driver.get('https://www.worldsurfleague.com/events/2008/mct/4/quiksilver-pro-gold-coast')

# driver.find_element(By.ID, 'onetrust-accept-btn-handler').click()

# Click the element through JavaScript instead of the direct .click() left
# commented out above; presumably this is the cookie-consent button (confirm)
element = driver.find_element(By.ID, 'onetrust-accept-btn-handler')
driver.execute_script("arguments[0].click();", element)

# Get html soup
event_soup = BeautifulSoup(driver.page_source)

#Get list of athlete names
athletes = event_soup.find_all("span", class_="athlete-name")
# (mid-cell expression: not displayed, only the cell's last expression is)
athletes

# Click the first element with this class; presumably a carousel arrow (confirm which)
driver.find_element(By.CLASS_NAME, 'flickity-button-icon').click()

# Re-parse the page after the click
event_soup = BeautifulSoup(driver.page_source)

#Get list of athlete names
athletes = event_soup.find_all("span", class_="athlete-name")
athletes
# driver.find_element(By.CLASS_NAME, 'flickity-button-icon')
# driver.page_source

In [8]:
# Reload the event page in the already-open browser session
driver.get('https://www.worldsurfleague.com/events/2008/mct/4/quiksilver-pro-gold-coast')

# driver.find_element(By.ID, 'onetrust-accept-btn-handler').click()

# Click the element through JavaScript instead of the direct .click() left
# commented out above; presumably this is the cookie-consent button (confirm)
element = driver.find_element(By.ID, 'onetrust-accept-btn-handler')
driver.execute_script("arguments[0].click();", element)

# Get html soup
event_soup = BeautifulSoup(driver.page_source)

#Get list of athlete names
athletes = event_soup.find_all("span", class_="athlete-name")
# (mid-cell expression: not displayed, only the cell's last expression is)
athletes

# Click the first element with this class; presumably a carousel arrow (confirm which)
driver.find_element(By.CLASS_NAME, 'flickity-button-icon').click()

# NOTE(review): page_source is read immediately after the click here, while the
# next cell sleeps 2 seconds first; the page may not have updated yet - confirm
event_soup = BeautifulSoup(driver.page_source)

#Get list of athlete names
athletes = event_soup.find_all("span", class_="athlete-name")
athletes
# driver.find_element(By.CLASS_NAME, 'flickity-button-icon')
# driver.page_source

[<span class="athlete-name" target="_blank">Kelly Slater</span>,
 <span class="athlete-name" target="_blank">Mick Fanning</span>]

In [10]:
# Click the first element with this class again, then pause before parsing
driver.find_element(By.CLASS_NAME, 'flickity-button-icon').click()
# 2-second pause before page_source is read; presumably gives the page time to
# update (the previous cell parsed right after its click)
time.sleep(2)

event_soup = BeautifulSoup(driver.page_source)

#Get list of athlete names
athletes = event_soup.find_all("span", class_="athlete-name")
athletes

[<span class="athlete-name" target="_blank">Bede Durbidge</span>,
 <span class="athlete-name" target="_blank">Joel Parkinson</span>,
 <span class="athlete-name" target="_blank">Mick Fanning</span>,
 <span class="athlete-name" target="_blank">Dean Morrison</span>,
 <span class="athlete-name" target="_blank">Jeremy Flores</span>,
 <span class="athlete-name" target="_blank">Adrian Buchan</span>,
 <span class="athlete-name" target="_blank">Kelly Slater</span>,
 <span class="athlete-name" target="_blank">Andy Irons</span>]

In [5]:
def get_scores(url):
    """Scrape athlete names and per-wave scores from a single results page.

    NOTE(review): this definition rebinds ``get_scores`` and so shadows the
    earlier ``get_scores(event_dict=events)`` in this notebook, which has a
    different signature and return type.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    list of dict
        One record per athlete-name tag, with keys ``"name"`` (tag text) and
        ``"scores"`` (list of floats parsed from the score cells of the wave
        grid at the same position).

    Raises
    ------
    IndexError
        If the page has fewer wave grids than athlete-name tags.
    """
    # A timeout keeps the call from hanging forever on a stalled connection
    res = get(url, timeout=30)

    # Get html soup
    event_soup = BeautifulSoup(res.content)

    # Get list of athlete names
    athletes = event_soup.find_all("span", class_="athlete-name")

    # Get list of scores lists
    scores = event_soup.find_all("div", class_="all-waves all-waves-grid")

    data = []
    for idx, athlete in enumerate(athletes):
        wave_scores = []
        for cell in scores[idx].find_all("span", class_="score"):
            try:
                wave_scores.append(float(cell.text))
            except ValueError:
                # Cells that hold no number are skipped; any other error is
                # a real problem and is allowed to propagate
                continue

        # NOTE(review): records carry no event, round or heat field
        data.append({"name": athlete.text, "scores": wave_scores})

    return data

In [None]:
for year in range(2008, 2020):
    year_url = f"https://www.worldsurfleague.com/events/{year}/mct?all=1"
    data = []

    # Get html content for each year
    result = get(year_url)
    soup = BeautifulSoup(result.content)

    # Get a list of event info for each event in that year
    event_info = soup.find_all("a", class_="event-schedule-details__event-name")

    # Cycle through events and get their URLs
    for event in event_info:
        # Get event url
        url = event["href"]
        
        # Get initial scores
        data = get_scores(url)