# Web Scraper

## Imports

In [11]:
import pandas as pd
import numpy as np
from requests import get
import json
from bs4 import BeautifulSoup
import time



## World Surf League Scores

I'm going to focus on the Pipeline Masters first. It is the marquee event of the Championship Tour and is held every year (**TODO: verify**).

### Get HTML soup and review

In [None]:
# Round 1 page (roundId=3) of the 2008 Billabong Pipeline Masters, used to explore the markup.
pipe_2008_url = (
    "https://www.worldsurfleague.com/events/2008/mct/75/"
    "billabong-pipeline-masters?roundId=3"
)
res = get(pipe_2008_url)
# No parser is named here, so BeautifulSoup chooses one from whatever is installed.
soup = BeautifulSoup(res.content)


In [None]:
# Text of the fourth "score" span inside the first wave-grid container on the page.
soup.find_all("div", class_="all-waves all-waves-grid")[0].find_all(
    "span", class_="score"
)[3].text


'1.73'

In [None]:
# Count the wave-grid containers on the page.
len(soup.find_all("div", class_="all-waves all-waves-grid"))


32

In [None]:
# Text of the first "athlete-name" span on the page.
soup.find_all("span", class_="athlete-name")[0].text


'Kamalei Alexander'

In [None]:
# Count the "athlete-name" spans on the page (compare with the wave-grid count).
len(soup.find_all("span", class_="athlete-name"))


32

### Round 1

In [3]:
# Round 1 of the 2008 Billabong Pipeline Masters is served under roundId=3.
pipe_2008_url = (
    "https://www.worldsurfleague.com/events/2008/mct/75/"
    "billabong-pipeline-masters?roundId=3"
)
res = get(pipe_2008_url)
# No parser is named here, so BeautifulSoup chooses one from whatever is installed.
soup = BeautifulSoup(res.content)


In [None]:
# All athlete-name spans and all wave-grid containers, each in page order.
# NOTE(review): later cells pair the two lists by position — presumably the i-th grid
# belongs to the i-th athlete; confirm against the page markup.
athletes = soup.find_all("span", class_="athlete-name")
scores = soup.find_all("div", class_="all-waves all-waves-grid")


In [None]:
# Build one record per athlete: the name plus every wave score that parses as a number.
# Athletes and wave grids are paired by position in their respective lists.
round_1 = []
for i, athlete in enumerate(athletes):
    wave_scores = []
    # Search this athlete's wave grid once rather than once per score.
    for score_span in scores[i].find_all("span", class_="score"):
        try:
            wave_scores.append(float(score_span.text))
        except ValueError:
            # Span text is not a number; leave it out of the list.
            continue
    round_1.append({"name": athlete.text, "scores": wave_scores})

round_1


[{'name': 'Kamalei Alexander', 'scores': [8.83, 6.83, 1.5]},
 {'name': 'Daniel Ross', 'scores': [9.93, 1.23]},
 {'name': 'Jesse Merle-Jones', 'scores': [2.83, 0.2, 1.57, 7.17]},
 {'name': 'Tj Barron', 'scores': [3.5, 1.83, 2.77, 0.27, 2.9, 1.8, 4.83, 0.5]},
 {'name': 'Ezra Sitt', 'scores': [1.33, 7.5, 3.83]},
 {'name': 'Daniel Jones', 'scores': [1.17, 5.67, 1.67, 1.63, 2.83, 1.4, 1.97]},
 {'name': 'Tory Barron',
  'scores': [4.33, 6.17, 0.9, 7.67, 5.67, 1.5, 3.43, 1.33]},
 {'name': 'Royden Bryson',
  'scores': [0.6, 3.33, 2.5, 1.6, 4.67, 6.17, 2.1, 4.43, 0.73]},
 {'name': 'Aritz Aranburu', 'scores': [1.33, 9.33, 3.27, 5.67, 4.33]},
 {'name': 'Kalani Chapman', 'scores': [4.17, 0.77, 4.27, 1.0, 7.33]},
 {'name': 'Dave Wassel', 'scores': [9.67, 3.67, 1.93]},
 {'name': 'Ricky Basnett', 'scores': [3.17, 0.83, 6.0]},
 {'name': 'Evan Valiere', 'scores': [1.83, 3.67, 2.57, 6.0, 1.97, 1.77, 6.57]},
 {'name': 'Olamana Eleogram', 'scores': [0.27, 7.67, 0.33, 4.67, 0.83]},
 {'name': 'John John Flo

### Generalize for all rounds

In [16]:
# Map each roundId used in the event URL to a readable round label.
roundids = {
    3: "round_1",
    4: "round_2",
    5: "round_3",
    6: "round_4",
    7: "quarter",
    52: "semi",
    77: "final",
}  # roundIds do not follow a strict logical order

event_name = "2008 Pipeline"
data = []  # one dict per athlete per round

for roundid, round_name in roundids.items():
    url = f"https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId={roundid}"
    res = get(url)
    # Fail loudly on an HTTP error instead of parsing an error page as an empty round.
    res.raise_for_status()
    soup = BeautifulSoup(res.content)
    athletes = soup.find_all("span", class_="athlete-name")
    scores = soup.find_all("div", class_="all-waves all-waves-grid")

    # Athletes and wave grids are paired by position in their respective lists.
    for i, athlete in enumerate(athletes):
        wave_scores = []
        # Search this athlete's wave grid once rather than once per score.
        for score_span in scores[i].find_all("span", class_="score"):
            try:
                wave_scores.append(float(score_span.text))
            except ValueError:
                # Span text is not a number; leave it out of the list.
                continue

        data.append(
            {
                "event": event_name,
                "round": round_name,
                "name": athlete.text,
                "scores": wave_scores,
            }
        )

    # Pause between requests to avoid hammering the site.
    time.sleep(3)


In [17]:
# Show the collected records as a table, one row per entry in `data`.
pd.DataFrame(data)

Unnamed: 0,event,round,name,scores
0,2008 Pipeline,round_1,Kamalei Alexander,"[8.83, 6.83, 1.5]"
1,2008 Pipeline,round_1,Daniel Ross,"[9.93, 1.23]"
2,2008 Pipeline,round_1,Jesse Merle-Jones,"[2.83, 0.2, 1.57, 7.17]"
3,2008 Pipeline,round_1,Tj Barron,"[3.5, 1.83, 2.77, 0.27, 2.9, 1.8, 4.83, 0.5]"
4,2008 Pipeline,round_1,Ezra Sitt,"[1.33, 7.5, 3.83]"
...,...,...,...,...
122,2008 Pipeline,semi,Tim Reyes,"[2.07, 2.57, 6.5, 9.1, 5.67]"
123,2008 Pipeline,semi,Chris Ward,"[6.83, 1.13, 1.23, 9.63, 4.5, 1.5]"
124,2008 Pipeline,semi,Adrian Buchan,"[4.33, 1.0, 1.43, 1.4, 3.5, 5.83, 2.67, 0.2]"
125,2008 Pipeline,final,Kelly Slater,"[1.4, 2.0, 7.17, 0.73, 1.1, 1.0, 6.83]"


### Generalize to a function for all events

In [30]:
# Scrape targets, one entry per 2008 event:
#   "name"   - label copied into every output row
#   "url"    - event page URL ending in "roundId=", ready for a round id to be appended
#   "rounds" - roundId -> round label, listed from round 1 through the final
#              (the ids themselves are not sequential)
events = [
    {
        "name": "2008 Pipeline",
        "url": "https://www.worldsurfleague.com/events/2008/mct/75/billabong-pipeline-masters?roundId=",
        "rounds": {
            3: "round_1", 4: "round_2", 5: "round_3", 6: "round_4",
            7: "quarter", 52: "semi", 77: "final",
        },
    },
    {
        "name": "2008 Gold Coast",
        "url": "https://www.worldsurfleague.com/events/2008/mct/4/quiksilver-pro-gold-coast?roundId=",
        "rounds": {
            16: "round_1", 36: "round_2", 20: "round_3", 42: "round_4",
            48: "quarter", 69: "semi", 71: "final",
        },
    },
    {
        "name": "2008 Bells Beach",
        "url": "https://www.worldsurfleague.com/events/2008/mct/8/rip-curl-pro-bells-beach?roundId=",
        "rounds": {
            31: "round_1", 10: "round_2", 50: "round_3", 56: "round_4",
            60: "quarter", 64: "semi", 68: "final",
        },
    },
    {
        "name": "2008 Teahupoo",
        "url": "https://www.worldsurfleague.com/events/2008/mct/20/billabong-pro-teahupoo?roundId=",
        "rounds": {
            8: "round_1", 35: "round_2", 30: "round_3", 43: "round_4",
            51: "quarter", 65: "semi", 66: "final",
        },
    },
]


def _parse_wave_scores(wave_grid):
    """Return the numeric wave scores found in one athlete's wave-grid element."""
    wave_scores = []
    for score_span in wave_grid.find_all("span", class_="score"):
        try:
            wave_scores.append(float(score_span.text))
        except ValueError:
            # Span text is not a number; leave it out of the list.
            continue
    return wave_scores


def get_scores(event_dict=None):
    """Scrape per-athlete wave scores for every round of every listed World Surf League event.

    Parameters
    ----------
    event_dict : list of dict, optional
        Event descriptions. Each entry needs "name" (label copied into the output),
        "url" (event page URL ending in ``roundId=``) and "rounds" (mapping of
        roundId to round label). When omitted, the notebook-level ``events`` list
        is used.

    Returns
    -------
    pandas.DataFrame
        One row per athlete per round, with columns "event", "round", "name" and
        "scores" (a list of floats).

    Raises
    ------
    requests.HTTPError
        If a round page responds with an HTTP error status.
    """
    if event_dict is None:
        # Resolved at call time so the function picks up the current `events` list.
        event_dict = events

    data = []

    for event in event_dict:
        event_name = event["name"]
        base_url = event["url"]
        for roundid, round_name in event["rounds"].items():
            # Build each round URL from the untouched base. Appending to one running
            # string would request roundId=3, then roundId=34, roundId=345, ...
            res = get(f"{base_url}{roundid}")
            # Fail loudly on an HTTP error instead of parsing an error page as an empty round.
            res.raise_for_status()
            soup = BeautifulSoup(res.content)
            athletes = soup.find_all("span", class_="athlete-name")
            scores = soup.find_all("div", class_="all-waves all-waves-grid")

            # Athletes and wave grids are paired by position in their respective lists.
            for i, athlete in enumerate(athletes):
                data.append(
                    {
                        "event": event_name,
                        "round": round_name,
                        "name": athlete.text,
                        "scores": _parse_wave_scores(scores[i]),
                    }
                )

            # Pause between requests to avoid hammering the site.
            time.sleep(5)

    return pd.DataFrame(data)


In [31]:
# Scrape every event listed in `events`, write the table to CSV without the index
# column, then display it.
data = get_scores(event_dict=events)
# NOTE(review): assumes the ./data directory already exists — confirm before a fresh run.
data.to_csv('./data/scores_2008.csv', index=False)
data

### Round 2

In [None]:
# Round 2 of the 2008 Billabong Pipeline Masters is served under roundId=4.
pipe_2008_url_2 = (
    "https://www.worldsurfleague.com/events/2008/mct/75/"
    "billabong-pipeline-masters?roundId=4"
)
res = get(pipe_2008_url_2)
# No parser is named here, so BeautifulSoup chooses one from whatever is installed.
soup = BeautifulSoup(res.content)


In [None]:
# All athlete-name spans and all wave-grid containers, each in page order.
# NOTE(review): the next cell pairs the two lists by position — presumably the i-th grid
# belongs to the i-th athlete; confirm against the page markup.
athletes = soup.find_all("span", class_="athlete-name")
scores = soup.find_all("div", class_="all-waves all-waves-grid")


In [None]:
# Build one record per athlete: the name plus every wave score that parses as a number.
# Athletes and wave grids are paired by position in their respective lists.
round_2 = []
for i, athlete in enumerate(athletes):
    wave_scores = []
    # Search this athlete's wave grid once rather than once per score.
    for score_span in scores[i].find_all("span", class_="score"):
        try:
            wave_scores.append(float(score_span.text))
        except ValueError:
            # Span text is not a number; leave it out of the list.
            continue
    round_2.append({"name": athlete.text, "scores": wave_scores})

round_2


[{'name': 'Kamalei Alexander', 'scores': [6.33, 3.0, 5.57, 5.07, 0.67]},
 {'name': 'Mikael Picon', 'scores': [2.67, 1.0, 5.17, 5.77, 5.9, 4.43]},
 {'name': 'Evan Valiere', 'scores': [2.67, 0.57, 1.57, 5.5, 1.67, 1.37, 9.07]},
 {'name': 'Mick Campbell', 'scores': [1.5, 3.17, 5.67, 1.0, 8.67]},
 {'name': "Jamie O'Brien",
  'scores': [5.0, 6.0, 1.33, 3.83, 1.73, 2.23, 5.73, 9.0]},
 {'name': 'Roy Powers', 'scores': [4.17, 6.67, 2.2]},
 {'name': 'Chris Ward',
  'scores': [4.5, 2.83, 9.93, 5.17, 1.07, 0.93, 2.33, 9.97, 1.83, 1.77]},
 {'name': 'Tory Barron', 'scores': [1.17, 8.33, 0.93, 1.9, 3.4]},
 {'name': 'Damien Hobgood', 'scores': [4.83, 2.5, 5.5, 5.57, 4.5, 8.83, 6.4]},
 {'name': 'Dave Wassel', 'scores': [1.5, 0.2, 2.2, 0.53, 3.83, 6.07]},
 {'name': 'Dusty Payne',
  'scores': [1.17, 1.6, 0.73, 2.17, 1.73, 8.83, 2.93, 6.33, 8.33, 6.83]},
 {'name': 'Heitor Alves', 'scores': [1.07, 0.43, 2.0, 3.5, 0.83, 8.37]},
 {'name': 'Daniel Wills', 'scores': [7.83, 0.43, 1.8, 8.43, 5.17, 6.67, 0.97]},

### Round 3

In [None]:
# Round 3 of the 2008 Billabong Pipeline Masters is served under roundId=5.
pipe_2008_url_3 = (
    "https://www.worldsurfleague.com/events/2008/mct/75/"
    "billabong-pipeline-masters?roundId=5"
)
res = get(pipe_2008_url_3)
# No parser is named here, so BeautifulSoup chooses one from whatever is installed.
soup = BeautifulSoup(res.content)


In [None]:
# All athlete-name spans and all wave-grid containers, each in page order.
# NOTE(review): the next cell pairs the two lists by position — presumably the i-th grid
# belongs to the i-th athlete; confirm against the page markup.
athletes = soup.find_all("span", class_="athlete-name")
scores = soup.find_all("div", class_="all-waves all-waves-grid")


In [None]:
# Build one record per athlete: the name plus every wave score that parses as a number.
# Athletes and wave grids are paired by position in their respective lists.
round_3 = []
for i, athlete in enumerate(athletes):
    wave_scores = []
    # Search this athlete's wave grid once rather than once per score.
    for score_span in scores[i].find_all("span", class_="score"):
        try:
            wave_scores.append(float(score_span.text))
        except ValueError:
            # Span text is not a number; leave it out of the list.
            continue
    round_3.append({"name": athlete.text, "scores": wave_scores})

round_3


[{'name': 'Marcus Hickman', 'scores': [7.33, 3.17, 7.23, 1.1]},
 {'name': 'Mick Fanning', 'scores': [2.17, 0.4, 5.5, 8.33, 0.23]},
 {'name': 'Luke Stedman', 'scores': [0.33, 4.67, 2.67]},
 {'name': 'Damien Hobgood', 'scores': [1.83, 0.83, 3.67, 0.87]},
 {'name': 'Joel Parkinson',
  'scores': [2.83, 8.67, 10.0, 0.23, 1.67, 10.0, 2.07]},
 {'name': 'Dusty Payne', 'scores': [1.17, 5.83, 9.33, 1.23, 0.57]},
 {'name': 'Tim Reyes', 'scores': [2.5, 7.0, 3.17, 7.83]},
 {'name': 'Freddy Patacchia Jr.', 'scores': [3.83, 6.73, 1.27, 0.97]},
 {'name': 'Kelly Slater',
  'scores': [1.33, 1.1, 5.0, 8.17, 1.9, 0.23, 8.67, 9.1]},
 {'name': 'Ezra Sitt', 'scores': [1.23, 0.73, 1.2, 0.27, 6.67]},
 {'name': 'Kieren Perrow',
  'scores': [0.5, 0.83, 2.0, 0.8, 1.7, 0.57, 8.0, 6.83, 1.53]},
 {'name': 'Taylor Knox', 'scores': [1.43, 0.7, 5.0, 5.5]},
 {'name': 'Aritz Aranburu', 'scores': [3.67, 6.17, 5.33, 1.83]},
 {'name': 'Bobby Martinez', 'scores': [1.0, 1.27, 2.5, 1.13, 4.1, 1.07]},
 {'name': "Jamie O'Brien",

### Round 4

In [5]:
# Round 4 of the 2008 Billabong Pipeline Masters is served under roundId=6.
pipe_2008_url_4 = (
    "https://www.worldsurfleague.com/events/2008/mct/75/"
    "billabong-pipeline-masters?roundId=6"
)
res = get(pipe_2008_url_4)
# No parser is named here, so BeautifulSoup chooses one from whatever is installed.
soup = BeautifulSoup(res.content)


In [6]:
# All athlete-name spans and all wave-grid containers, each in page order.
# NOTE(review): the next cell pairs the two lists by position — presumably the i-th grid
# belongs to the i-th athlete; confirm against the page markup.
athletes = soup.find_all("span", class_="athlete-name")
scores = soup.find_all("div", class_="all-waves all-waves-grid")


In [7]:
# Build one record per athlete: the name plus every wave score that parses as a number.
# Athletes and wave grids are paired by position in their respective lists.
round_4 = []
for i, athlete in enumerate(athletes):
    wave_scores = []
    # Search this athlete's wave grid once rather than once per score.
    for score_span in scores[i].find_all("span", class_="score"):
        try:
            wave_scores.append(float(score_span.text))
        except ValueError:
            # Span text is not a number; leave it out of the list.
            continue
    round_4.append({"name": athlete.text, "scores": wave_scores})

round_4


[{'name': 'Luke Stedman', 'scores': [4.67, 1.23, 8.0, 1.73, 0.77]},
 {'name': 'Marcus Hickman', 'scores': [1.67, 1.6, 1.63, 2.6, 9.6, 2.3, 0.73]},
 {'name': 'Tim Reyes', 'scores': [4.33, 8.33]},
 {'name': 'Joel Parkinson', 'scores': [5.33, 1.17, 0.67, 6.5]},
 {'name': 'Kelly Slater', 'scores': [1.7, 3.5, 8.17, 1.8, 9.63, 9.77, 0.93]},
 {'name': 'Kieren Perrow', 'scores': [1.5, 1.1, 1.63, 0.2, 1.07, 6.33, 0.3]},
 {'name': "Jamie O'Brien", 'scores': [4.5, 5.83, 1.67, 1.0]},
 {'name': 'Aritz Aranburu', 'scores': [0.93, 2.17, 4.5, 1.33, 1.27]},
 {'name': 'Adrian Buchan', 'scores': [2.0, 3.67, 6.67, 0.23, 9.1, 1.4]},
 {'name': 'Adriano de Souza', 'scores': [4.33, 4.83, 6.73]},
 {'name': 'Kamalei Alexander', 'scores': [6.0, 1.83, 4.17]},
 {'name': 'Tom Whitaker', 'scores': [7.0, 0.9, 1.67, 2.4, 0.93, 1.3]},
 {'name': 'Chris Ward', 'scores': [6.0, 0.5, 2.17, 1.5, 1.83, 3.23, 7.43]},
 {'name': 'Bede Durbidge', 'scores': [2.73, 6.83, 1.27, 1.87]},
 {'name': 'Andy Irons', 'scores': [8.83, 0.9, 0