In this notebook we scrape the moonboard climbing route database and save it in a pandas dataframe.  We leave the cleaning and inspection of the data to the [Data cleaning](Data-cleaning-and-analysis.ipynb) notebook.

In [1]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import pandas as pd
import json
from lxml import html
import os
from tqdm import tnrange, tqdm_notebook, tqdm

# load username and password
# login_file = "login_details.json"
# 
# with open(login_file, "r") as read_file:
#     login_details = json.load(read_file)

base_url = "https://www.moonboard.com/Dashboard/Index"
problems_url = "https://www.moonboard.com/Problems/GetProblems"
login_url = "https://www.moonboard.com/Account/Login?ReturnUrl=%2FProblems%2FIndex"

# Create a session and use own transport adapter to automatically retry with backoff
session = requests.session()

retries = Retry(total=5, backoff_factor=0.3,
                status_forcelist=[500, 502, 503, 504])
session.mount('https://www.moonboard.com', HTTPAdapter(max_retries=retries))

spoof_header = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "en-GB,en;q=0.9",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
}
session.headers.update(spoof_header)

In [2]:
# get the authenticity token and login
r = session.get(login_url)
soup = BeautifulSoup(r.content, "lxml")
tag = soup.find('input', attrs={'name': '__RequestVerificationToken'})
authenticity_token = tag['value']

login_payload = {"Login.Username": "???",
                 "Login.Password": "???",
                 "__RequestVerificationToken": authenticity_token}

r = session.post(login_url, data=login_payload,
                 headers=dict(referer=base_url), timeout=1)

# check if error
if r.status_code != requests.codes.ok:
    print(result)

Now let's try to scrape the problems data.  We can request it via post and it's returned to us in JSON form.  Usually only 15 problems are requested at a time.  We first request page 1 and use it to create a dataframe and find the total number of entries in the database

In [3]:
page = 1
# Website didn't like me requesting large page sizes
pageSize = 15

problems_payload = {"page": page, "pageSize": pageSize,
                    "sort": '', "group": '', 'filter': ''}
r = session.post(problems_url, data=problems_payload,
                 timeout=1, headers=dict(referer=base_url))
# check for error
if r.status_code != requests.codes.ok:
    print(r)

data = r.json()
total_entries = data['Total']
df = pd.DataFrame(data['Data'])
print(f"Total entries {total_entries}")

<Response [403]>


JSONDecodeError: [Errno Expecting value] <!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>403 Forbidden</title>
</head><body>
<h1>Forbidden</h1>
<p>You don't have permission to access this resource.</p>
<p>Additionally, a 404 Not Found
error was encountered while trying to use an ErrorDocument to handle the request.</p>
</body></html>
: 0

In [None]:
# Find the total size of the database and request all of it

total_pages = total_entries // pageSize
if total_entries % pageSize != 0:
    total_pages += 1

# loop through the rest of the requests
for i in tnrange(2, total_pages + 1):
    problems_payload['page'] = i
    r = session.post(problems_url, data=problems_payload,
                     timeout=10, headers=dict(referer=base_url))
    # check if error
    if r.status_code != requests.codes.ok:
        print(r)

    df = df.append(r.json()['Data'], ignore_index=True)

In [None]:
# Save the datafram
directory = "data"
if not os.path.exists(directory):
    os.makedirs(directory)
path_pickle = os.path.join(directory, 'database.zip')
path_json = os.path.join(directory, 'database.json')

# df.to_json(path_json)
df.to_pickle(path_pickle)

In [None]:
df.describe()

In [None]:
df.columns

In [None]:
df['Moves'][0]

In [None]:
df['Method']

In the [next notebook](Data-cleaning-and-analysis.ipynb) we clean the data, perform some analysis on it and prepare it for the machine learning.