In this notebook we scrape the MoonBoard climbing route database and save it in a pandas DataFrame. We leave the cleaning and inspection of the data to the [Data cleaning](Data-cleaning-and-analysis.ipynb) notebook.

In [1]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import pandas as pd
import json
from lxml import html
import os
from tqdm import tnrange, tqdm_notebook, tqdm

# Load the username and password from a local JSON file. The login cell reads
# the "username" and "password" keys from this dictionary.
login_file = "login_details.json"

with open(login_file, "r") as read_file:
    login_details = json.load(read_file)

base_url = "https://www.moonboard.com/Problems/Index"
problems_url = "https://www.moonboard.com/Problems/GetProblems"
login_url = "https://www.moonboard.com/Account/Login?ReturnUrl=%2FProblems%2FIndex"

# Create a session and use our own transport adapter to automatically retry
# with backoff.
session = requests.Session()

# urllib3's Retry only retries idempotent methods by default, which excludes
# POST. Every data request in this notebook is a POST, so POST has to be
# allowed explicitly or the status-based retries below would never trigger.
retry_options = dict(total=5, backoff_factor=0.3,
                     status_forcelist=[500, 502, 503, 504])
retry_methods = frozenset(["GET", "POST"])
try:
    retries = Retry(allowed_methods=retry_methods, **retry_options)
except TypeError:
    # Older urllib3 releases (< 1.26) use the previous keyword name.
    retries = Retry(method_whitelist=retry_methods, **retry_options)
session.mount('https://www.moonboard.com', HTTPAdapter(max_retries=retries))

# Browser-like headers sent with every request made through the session.
# NOTE(review): advertising "br" presumably requires a brotli decoder to be
# installed for responses to be decoded - confirm in the target environment.
spoof_header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-GB,en;q=0.9",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}
session.headers.update(spoof_header)

In [2]:
# Fetch the login page, extract the request verification token from its form
# and post it back together with the credentials to log in.
r = session.get(login_url, timeout=10)
r.raise_for_status()
soup = BeautifulSoup(r.content, "lxml")
tag = soup.find('input', attrs={'name': '__RequestVerificationToken'})
if tag is None:
    # Without the token the login POST cannot succeed, so fail with a clear
    # message instead of a TypeError on the subscript below.
    raise RuntimeError(
        "Could not find the __RequestVerificationToken input on the login page")
authenticity_token = tag['value']

login_payload = {"Login.Username": login_details['username'],
                 "Login.Password": login_details['password'],
                 "__RequestVerificationToken": authenticity_token}

# Same 10 s timeout as the bulk download loop uses.
r = session.post(login_url, data=login_payload,
                 headers=dict(referer=base_url), timeout=10)

# Stop on an HTTP error status (the original printed an undefined name here,
# which raised NameError instead of reporting the failure).
r.raise_for_status()

Now let's try to scrape the problems data. We can request it via a POST request and it is returned to us in JSON form. Usually only 15 problems are requested at a time. We first request page 1 and use it to create a DataFrame and to find the total number of entries in the database.

In [5]:
# Request the first page of problems. The JSON response carries the rows for
# this page under 'Data' and the size of the whole database under 'Total'.
page = 1
# Website didn't like me requesting large page sizes
pageSize = 15

problems_payload = {"page": page, "pageSize": pageSize,
                    "sort": '', "group": '', 'filter': ''}
# Same 10 s timeout as the bulk download loop uses.
r = session.post(problems_url, data=problems_payload,
                 timeout=10, headers=dict(referer=base_url))
# Abort on an HTTP error status rather than trying to parse an error body
# as JSON.
r.raise_for_status()

data = r.json()
total_entries = data['Total']
df = pd.DataFrame(data['Data'])
print(f"Total entries {total_entries}")

Total entries 13569


In [6]:
# Work out how many pages the database spans and download the remaining ones.

# Ceiling division: a partially filled last page still counts as a page.
total_pages, leftover = divmod(total_entries, pageSize)
if leftover:
    total_pages += 1

# Collect the raw records in a plain list and build the DataFrame once at the
# end. Growing a DataFrame inside the loop copies it on every iteration, and
# DataFrame.append no longer exists in pandas 2.0.
remaining_records = []
for page_number in tnrange(2, total_pages + 1):
    # Build a fresh payload per request instead of mutating the shared dict.
    page_payload = dict(problems_payload, page=page_number)
    r = session.post(problems_url, data=page_payload,
                     timeout=10, headers=dict(referer=base_url))
    # Abort on an HTTP error status rather than parsing an error body as JSON.
    r.raise_for_status()

    remaining_records.extend(r.json()['Data'])

# Page 1 is still held in `data` from the first request. Rebuilding from it
# keeps this cell idempotent: re-running it does not duplicate rows.
df = pd.DataFrame(data['Data'] + remaining_records)

HBox(children=(IntProgress(value=0, max=904), HTML(value='')))




In [7]:
# Save the dataframe
directory = "data"
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(directory, exist_ok=True)
path_pickle = os.path.join(directory, 'database.zip')
path_json = os.path.join(directory, 'database.json')

# df.to_json(path_json)
df.to_pickle(path_pickle)

In [8]:
# Summary statistics of the scraped table, as a quick sanity check.
df.describe()

Unnamed: 0,ApiId,Attempts,Id,MoonBoardConfigurationId,Rating,Repeats,UserRating
count,13570.0,13570.0,13570.0,13570.0,13570.0,13570.0,13570.0
mean,0.0,0.0,319500.482756,0.0,0.0,20.145173,1.896242
std,0.0,0.0,6045.95704,0.0,0.0,133.964718,1.105394
min,0.0,0.0,305445.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,315438.25,0.0,0.0,1.0,2.0
50%,0.0,0.0,319151.5,0.0,0.0,3.0,2.0
75%,0.0,0.0,324666.75,0.0,0.0,9.0,3.0
max,0.0,0.0,330118.0,0.0,0.0,4616.0,3.0


In [9]:
# List the fields returned for each problem.
df.columns

Index(['ApiId', 'Attempts', 'DateDeleted', 'DateInserted', 'DateTimeString',
       'DateUpdated', 'FirstAscender', 'Grade', 'Holdsets', 'Holdsetup', 'Id',
       'IsAssessmentProblem', 'IsBenchmark', 'Locations', 'Method',
       'MoonBoardConfiguration', 'MoonBoardConfigurationId', 'Moves', 'Name',
       'NameForUrl', 'NumberOfTries', 'ProblemType', 'Rating', 'RepeatText',
       'Repeats', 'Setter', 'UserGrade', 'UserRating'],
      dtype='object')

In [10]:
# Look at the 'Moves' entry of the first problem to see how it is structured.
df['Moves'][0]

[{'Id': 1820187, 'Description': 'C5', 'IsStart': True, 'IsEnd': False},
 {'Id': 1820188, 'Description': 'F6', 'IsStart': True, 'IsEnd': False},
 {'Id': 1820189, 'Description': 'D9', 'IsStart': False, 'IsEnd': False},
 {'Id': 1820190, 'Description': 'G12', 'IsStart': False, 'IsEnd': False},
 {'Id': 1820191, 'Description': 'C14', 'IsStart': False, 'IsEnd': False},
 {'Id': 1820192, 'Description': 'C16', 'IsStart': False, 'IsEnd': False},
 {'Id': 1820193, 'Description': 'A18', 'IsStart': False, 'IsEnd': True}]

In [11]:
# Inspect the values stored in the 'Method' column.
df['Method']

0                    Feet follow hands
1                    Feet follow hands
2                    Feet follow hands
3        Feet follow hands + screw ons
4                 Footless + kickboard
5                    Feet follow hands
6                    Feet follow hands
7                    Feet follow hands
8                    Feet follow hands
9        Feet follow hands + screw ons
10                   Feet follow hands
11                   Feet follow hands
12                   Feet follow hands
13                   Feet follow hands
14                   Feet follow hands
15                   Feet follow hands
16                   Feet follow hands
17                   Feet follow hands
18                   Feet follow hands
19                   Feet follow hands
20                   Feet follow hands
21                   Feet follow hands
22                   Feet follow hands
23                   Feet follow hands
24                   Feet follow hands
25                   Feet

In the [next notebook](Data-cleaning-and-analysis.ipynb) we clean the data, perform some analysis on it and prepare it for machine learning.