# Mountainproject.com Rock Climbing Route Recommender System
## Mike Bell
### http://github.io/mikebell7

## 1. Data Collection 

This notebook contains the code used to scrape area/route data from mountainproject.com (MP).

We use the requests and BeautifulSoup packages to recursively scrape MP's route/area directory tree
structure. 

Some things to note:


1. The mountainproject directory is organized as a tree of 'area' pages. Each area page has a sidebar containing either links to the smaller subareas within that area, or links to the climbing routes found in that area. It is important to note that area pages which contain climbing routes do NOT contain links to any other subareas, so such areas are leaf nodes in the area directory tree. Hence, when recursively scraping the directory tree, we can easily identify leaves by the presence of route links in the sidebar.
2. Area Pages have URL: www.mountainproject.com/area/AREA_ID/AREA_NAME
3. Route Pages have URL: www.mountainproject.com/route/ROUTE_ID/ROUTE_NAME
4. Areas and routes are identified by unique 9-digit IDs, appearing as AREA_ID or ROUTE_ID in their respective URLs shown above.
5. Registered mountainproject.com users have a unique userID, which is a string of integers of variable length.
6. Mountainproject.com users are able to 'rate' climbing routes on a scale of 0 - 4 stars. 
7. For each route, we collect its average rating along with a list of individual userIDs and ratings. 

- The 'root' nodes for what I call 'area trees' on mountainproject.com are each of the 50 states ('Alabama', ..., 'Wyoming'), 'International', and '* In Progress'. For this project we ignore the 'International' and '* In Progress' pages, and recursively scrape all areas and routes in all 50 US states.


- The recursive scraping algorithm is as follows:
For the current area page, we collect all relevant area information (areaID, area name, GPS coordinates, elevation, description etc.) as well as all sidebar links. If the sidebar links contain /route/ URLs, then this area page is a leaf node and we iterate through the list of routes on this page and scrape and store all relevant information from these pages (routeID, route name, description, average star rating, list of users and ratings). 


- Else, if the current sidebar links are to area pages (subareas), we add each of these subarea URLs to a queue and recursively scrape each of these subareas as above. 

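- To scrape most efficiently, we first traverse the area/route tree of a state and collect a list of URLs for each area and route, tracking parent information. Only once a state's tree has been collected completely do we run through its list of route URLs and scrape the route info and ratings.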



In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

In [3]:
# extracts the 9-digit area/route id from a given url
def get_id(url):
    """Return the numeric area/route ID embedded in a Mountain Project URL.

    The URL is split on ``/`` and the fifth piece (index 4) is converted to
    an int, which matches absolute URLs of the form
    ``scheme://host/<kind>/<ID>/<name>``.

    Raises:
        IndexError: if the URL has fewer than five ``/``-separated pieces.
        ValueError: if the piece in the ID position is not an integer.
    """
    pieces = url.split('/')
    return int(pieces[4])

# checks if a given url is of 'route' type, all route pages are of the form 
# www.mountainproject.com/route/ROUTE_ID/ROUTE_NAME
def is_route(url):
    """Tell whether an absolute Mountain Project URL points at a route page.

    The page kind is the fourth ``/``-separated piece of the URL (index 3),
    e.g. ``'route'`` in ``https://www.mountainproject.com/route/ID/NAME``.

    Raises:
        IndexError: if the URL has fewer than four ``/``-separated pieces.
    """
    page_kind = url.split('/')[3]
    return page_kind == 'route'

# Extract all relevant information from an area page
def read_area(url):
    """Scrape one area page and return its basic metadata.

    Args:
        url: Absolute URL of an area page (``.../area/AREA_ID/AREA_NAME``).

    Returns:
        A dict with the keys ``name``, ``id``, ``parent_id``, ``elevation``,
        ``latitude`` and ``longitude``, inserted in that order.
        ``parent_id`` is None when the breadcrumb holds at most one link;
        ``elevation``, ``latitude`` and ``longitude`` are strings taken from
        the page, or None when the page does not list them.
        Any failure (network, HTTP status, unexpected page layout) is
        printed together with the URL and an empty dict is returned.
    """
    try:
        # A timeout keeps a single stalled connection from hanging the crawl.
        res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
        # Report HTTP errors as such instead of failing later while parsing
        # an error page.
        res.raise_for_status()
        soup = BeautifulSoup(res.content, 'lxml')

        # Keys are inserted in a fixed order: it determines the column order
        # when the collected dicts are turned into a DataFrame.
        area = {}
        area['name'] = soup.find('h1').contents[0].strip()
        area['id'] = get_id(url)

        # The last breadcrumb link is used as the parent area; a breadcrumb
        # with a single link is treated as "no parent".
        breadcrumb_links = soup.find('div', {'class': 'mb-half small text-warm'}).find_all('a')
        if len(breadcrumb_links) > 1:
            area['parent_id'] = get_id(breadcrumb_links[-1]['href'])
        else:
            area['parent_id'] = None

        # Elevation and GPS coordinates (if present) are found in the
        # 'description-details' table; not all areas have this information.
        cells = soup.find('table', {'class': 'description-details'}).find_all('td')

        area['elevation'] = None
        area['latitude'] = None
        area['longitude'] = None

        # Walk (label cell, following cell) pairs. Pairing with zip means a
        # label in the very last cell is simply ignored rather than raising
        # and discarding the whole area.
        for label, value in zip(cells, cells[1:]):
            if 'Elevation' in label.text:
                area['elevation'] = value.text.strip().split('ft')[0].strip()
            if 'GPS' in label.text:
                gps = value.contents[0].strip().split(',')
                # Strip each part so the longitude does not keep the space
                # that follows the comma.
                area['latitude'] = gps[0].strip()
                area['longitude'] = gps[1].strip()
        return area

    except Exception as e:
        # Best effort: report the failure and let the caller carry on.
        print(e)
        print(url)
        return {}
    
# Extract all relevant information from a route page    
def read_route(url):
    """Scrape one route page and return its details and user ratings.

    Only routes whose type list contains 'Trad', 'Sport' or 'Boulder' and
    none of 'Ice', 'Aid', 'Mixed', 'Alpine' are kept.

    Args:
        url: Absolute URL of a route page (``.../route/ROUTE_ID/ROUTE_NAME``).

    Returns:
        A dict with the keys ``name``, ``id``, ``type``, ``area_id``,
        ``grade``, ``height``, ``pitches``, ``score``, ``votes`` and
        ``description`` (inserted in that order), plus ``star_ratings``
        (a ``{user_id: stars}`` dict) when the route has at least one vote.
        An empty dict is returned for routes of an ignored type and whenever
        anything fails (the error and URL are printed in that case).
    """
    try:
        # A timeout keeps a single stalled connection from hanging the crawl.
        res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
        # Report HTTP errors as such instead of failing while parsing an
        # error page.
        res.raise_for_status()
        soup = BeautifulSoup(res.content, 'lxml')

        description_details = soup.find('table', {'class': 'description-details'}).find_all('td')

        # The second cell is split on commas into tokens; presumably it is
        # the "Type" line (e.g. type names, height, pitch count) -- confirm
        # against the live page layout.
        descr = [token.strip() for token in description_details[1].text.split(',')]

        # We are only interested in routes of type trad/sport/boulder.
        if any(ignored in descr for ignored in ('Ice', 'Aid', 'Mixed', 'Alpine')):
            return {}
        route_type = next((kind for kind in ('Trad', 'Sport', 'Boulder') if kind in descr), None)
        if route_type is None:
            return {}

        # Keys are inserted in a fixed order: it determines the column order
        # when the collected dicts are turned into a DataFrame.
        route = {}
        route_id = get_id(url)
        route['name'] = soup.find('h1').text.strip()
        route['id'] = route_id
        route['type'] = route_type

        # The last breadcrumb link is the parent area.
        breadcrumb = soup.find('div', {'class': 'mb-half small text-warm'})
        route['area_id'] = get_id(breadcrumb.find_all('a')[-1]['href'])

        # Defaults used when the type line does not mention them.
        route_pitches = 1
        route_height = 0
        for token in descr:
            if 'ft' in token:
                route_height = int(token.split('ft')[0])
            if 'pitches' in token:
                route_pitches = int(token.split('pitches')[0])

        route['grade'] = soup.find('span', {'class': "rateYDS"}).text.split()[0]
        route['height'] = route_height
        route['pitches'] = route_pitches

        # The summary text is split on whitespace; word 1 is read as the
        # average score and word 3 as the vote count.
        route_score_str = soup.find('span', {'id': f'starsWithAvgText-{route_id}'}).text.split()
        route['score'] = float(route_score_str[1])
        route['votes'] = int(route_score_str[3].replace(',', ''))
        route['description'] = soup.find('div', {'class': 'fr-view'}).text

        # Link to the page listing the individual users and their ratings.
        stats_url = soup.find('span', {'id': 'route-star-avg'}).find('a')['href']

        if route['votes'] > 0:
            route['star_ratings'] = get_star_ratings(stats_url)
        else:
            # NOTE(review): a zero vote count is replaced by None and no
            # 'star_ratings' key is set. This may have been meant to set
            # 'star_ratings' instead -- kept as-is so existing data stays
            # comparable; confirm the intent.
            route['votes'] = None

        return route

    except Exception as e:
        # Best effort: report the failure and let the caller carry on.
        print(e)
        print(url)
        return {}
    
# Extract the star ratings for a given route, tracking the unique userID and corresponding rating
def get_star_ratings(url):
    """Scrape a route's stats page and return every user's star rating.

    Each table row is expected to hold a link to the user's page followed by
    star images; the user ID is taken from the link and the rating is the
    number of ``starBlue.svg`` images in the row.

    Args:
        url: Absolute URL of the route's stats page.

    Returns:
        A dict of the form ``{user_id: star_rating}``. Any failure is
        printed together with the URL and an empty dict is returned.
    """
    try:
        # A timeout keeps a single stalled connection from hanging the crawl.
        res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
        # Report HTTP errors as such instead of failing while parsing an
        # error page.
        res.raise_for_status()
        soup = BeautifulSoup(res.content, 'lxml')

        # The first striped table on the page is used as the ratings table.
        stats = soup.find('table', {'class': 'table table-striped'})

        star_ratings = {}
        for row in stats.find_all('tr'):
            user_link = row.find('a')
            if user_link is None:
                # A row without a user link has no rating we can attribute.
                # Skipping it keeps one such row from raising and throwing
                # away every other rating on the page.
                continue
            stars = row.find_all('img', {'src': "/img/stars/starBlue.svg"})
            star_ratings[get_id(user_link['href'])] = len(stars)
        return star_ratings

    except Exception as e:
        # Best effort: report the failure and let the caller carry on.
        print(e)
        print(url)
        return {}


# This is the main recursive function which traverses the area tree
# The parameter start can be used after a disconnection during scraping to resume
# from a given top-level sidebar entry; n tracks the current recursion depth.
#
# Note that this function does not yet navigate the route pages
# it simply logs the area-tree structure encountered, along with the URLs of any routes contained in 
# node area pages. 
#
# Once an area tree has been collected completely, we then run through the list of routes and 
# scrape the route info / ratings.
def get_areas_route_list(url, areas, route_list, name, n = 0, start = 0):
    """Recursively walk an area tree, collecting area info and route URLs.

    Route pages are not fetched here: their URLs are only recorded so they
    can be scraped in a separate pass.

    Args:
        url: Absolute URL of the area (or route) page to process.
        areas: List that receives one dict per successfully read area
            (mutated in place).
        route_list: List that receives every route URL found (mutated in
            place).
        name: Label used in the backup file names under ``./data/``.
        n: Current recursion depth; 0 for the root call. Also used to indent
            the progress output.
        start: Index of the first sidebar entry to process at the root level
            (``n == 0``); earlier entries are skipped. Ignored deeper down.
    """
    # A route URL is only recorded. Every 500 routes both lists are written
    # to backup files so an interrupted crawl does not lose everything.
    if is_route(url):
        route_list.append(url)
        if len(route_list) % 500 == 0:
            pd.DataFrame(areas).to_csv(f'./data/{name}_areas_backup.csv', index = False)
            pd.DataFrame(route_list).to_csv(f'./data/{name}_route_list_backup.csv', index = False)
        return

    print('-'*n + f'AREA: {url}')
    area = read_area(url)
    # read_area returns an empty dict on failure; storing it would only add
    # an all-empty row to the areas table.
    if area:
        areas.append(area)

    try:
        # A timeout keeps a single stalled connection from hanging the crawl.
        res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
        # Report HTTP errors as such instead of failing while parsing an
        # error page.
        res.raise_for_status()
        soup = BeautifulSoup(res.content, 'lxml')

        sidebar = soup.find('div', {'class' : 'mp-sidebar'})

        # Areas which contain further sub-areas have divs of class
        # 'lef-nav-row' in their sidebar, one per sub-area.
        urls = sidebar.find_all('div', {'class':'lef-nav-row'})

        # No 'lef-nav-row' means we have reached a leaf area: look for the
        # route table instead and walk its rows.
        if not urls:
            route_table = sidebar.find('table', {'id':'left-nav-route-table'})
            if route_table is not None:
                urls = route_table.find_all('tr')

        # Recurse into every linked sub-area or route page.
        # NOTE: a time.sleep() could be added in this loop to throttle requests.
        for i, row in enumerate(urls):
            # 'start' only applies to the root level, to resume a crawl.
            if n == 0 and i < start:
                continue
            link = row.find('a')
            if link is not None:
                get_areas_route_list(link['href'], areas, route_list, name, n+1)

    except Exception as e:
        # Best effort: report the failure and carry on with the siblings.
        print(e)
        print(url)

# This function runs through the list of routes collected by the above function
# and reads the route info/ratings. 
def read_route_list(name, routes, route_list, count, start = 0):
    """Scrape every route in a list of URLs, saving progress periodically.

    Args:
        name: Label used in the output file name.
        routes: List that receives one dict per successfully scraped route
            (mutated in place).
        route_list: Sequence of URLs; entries that are not route URLs are
            skipped.
        count: Initial value of the running counter used for progress
            reporting; it is incremented once per URL.
        start: Index in ``route_list`` at which to begin.

    Side effects:
        Writes ``./data/{name}_routes_{start}.csv`` whenever the counter is
        a multiple of 100 (on a route URL) and once more at the end.
    """
    for url in route_list[start:]:
        if is_route(url):
            # Fetch each route exactly once and reuse the result. Fetching
            # again for the append doubled the requests and could store an
            # empty dict if the second attempt failed.
            route = read_route(url)
            if route:
                routes.append(route)

            if count % 100 == 0:
                print(f'count: {count}, routes: {len(routes)-1}')
                pd.DataFrame(routes).to_csv(f'./data/{name}_routes_{start}.csv', index = False)
        count += 1

    pd.DataFrame(routes).to_csv(f'./data/{name}_routes_{start}.csv', index = False)
                        

In [3]:
# Retrieve the list of climbing grades for roped routes and boulders, in order according to difficulty.
# Sport/Trad climbing routes and Bouldering routes are graded according to different difficulty scales:
# The Yosemite Decimal System (YDS) and V-Scale, respectively.
# 
# See https://www.mountainproject.com/international-climbing-grades for a complete list.
def get_grades():
    """Download the grade tables and save them as ranked CSV files.

    The first two striped tables on the international-grades page are read.
    For each, the text of the first cell of every row that has one is taken
    as a grade and ranked by its position in the table (0, 1, 2, ...).

    Side effects:
        Writes ``./data/climb_grades.csv`` (from the first table) and
        ``./data/boulder_grades.csv`` (from the second), each with the
        columns ``grade`` and ``rank``.
    """
    grades_url = 'https://www.mountainproject.com/international-climbing-grades'
    # Request grades_url itself: the previous code requested `url`, a name
    # this function never defines, so it either failed or fetched whatever
    # page a global `url` happened to point at.
    res = requests.get(grades_url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'lxml')

    tables = soup.find_all('table', {'class':'table table-condensed table-striped'})

    def _ranked_grades(table):
        # Map each row's first-cell text to its running rank, then reshape
        # the mapping into a two-column (grade, rank) frame.
        ranks = {}
        rank = 0
        for row in table.find_all('tr'):
            cell = row.find('td')
            if cell is not None:
                ranks[cell.text] = rank
                rank += 1
        frame = pd.DataFrame.from_dict(ranks, orient = 'index', columns = ['rank'])
        return frame.reset_index().rename(columns = {'index' : 'grade'})

    # Build both frames before writing so a missing table leaves no partial
    # output behind.
    climb_grades = _ranked_grades(tables[0])
    boulder_grades = _ranked_grades(tables[1])

    climb_grades.to_csv('./data/climb_grades.csv', index = False)
    boulder_grades.to_csv('./data/boulder_grades.csv', index = False)

## Sorted state scraper by number of routes

In [4]:
# Get state and state area URL data
# This gives a list of the 'root' nodes for our area-tree traversal process
# We order the list of states by the number of routes contained in each state
# In this way, we can organize our scraping from smallest to largest
#
url = 'http://www.mountainproject.com/'
# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status surfaces HTTP errors before any parsing is attempted.
res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')

# One dict per state: its name, the URL of its root area page and the number
# shown next to its name on the front page (used as the state's size).
states = []
ignore = ['International', '* In Progress']
for col in soup.find('div', {'id' : 'route-guide'}).find_all('div' , {'class' : 'col-sm-3 hidden-md-down'}):
    for row in col.find_all('strong'):
        link = row.find('a')
        # Skip ignored entries before parsing their size.
        if link.text in ignore:
            continue
        state = {}
        state['state'] = link.text
        state['url'] = link.attrs['href']
        state['size'] = int(row.find('small').text.replace(',',''))
        states.append(state)

In [5]:
# (state name, root URL, size) triples, ordered from the smallest state to
# the largest so scraping can proceed in increasing order of size.
sorted_states = sorted(
    ((entry['state'], entry['url'], entry['size']) for entry in states),
    key = lambda triple: triple[2],
)

In [7]:
# The following can be used to ignore certain states, and start other states at different 
# levels of the recursion tree, in case of interrupted scraping / loading backups.
# Names of states to skip entirely.
completed = list()
# Scraped route dicts, keyed by state name.
routes = dict()
# One resume value per entry of sorted_states, all initially zero.
starts = [0 for _ in sorted_states]

In [None]:
# read all areas and route lists from states from smallest to largest 
import os

# For every state not marked as completed: obtain its area table and route
# URL list (reusing the saved files when both exist, otherwise crawling the
# state's area tree), then scrape every route on the list.
for i, state in enumerate(sorted_states):
    state_name, state_url, state_size = state
    if state_name in completed:
        continue

    areas = []
    route_list = []
    print('Starting ', state_name)

    areas_csv = f'./data/{state_name}_areas.csv'
    route_list_csv = f'./data/{state_name}_route_list.csv'

    # See if the route/area lists already exist for the current state. If
    # they do, read them in from the existing files; otherwise recursively
    # scrape the area tree for that state and save the area/route URL info.
    if os.path.exists(route_list_csv) and os.path.exists(areas_csv):
        route_list = pd.read_csv(route_list_csv)
        areas = pd.read_csv(areas_csv)
        print(f'{state_name} RL and Areas found')
    else:
        print("Route list not find, generating.")
        get_areas_route_list(state_url, areas, route_list, state_name)
        pd.DataFrame(areas).to_csv(areas_csv, index = False)
        pd.DataFrame(route_list).to_csv(route_list_csv, index = False)

    # Now read in the route list for the current state (which either existed
    # already, or has just been generated).
    rl = list(pd.read_csv(route_list_csv).values.flatten())
    print(f"Reading {state_name}'s {state_size} routes.")

    # For each route, scrape the route info and user ratings.
    # starts[i] is passed both as the progress counter and as `start`.
    # Previously it was only passed as the counter, so `start` stayed 0 and
    # a resumed run re-scraped the whole list from the beginning.
    routes[state_name] = []
    read_route_list(state_name, routes[state_name], rl, starts[i], start = starts[i])
    pd.DataFrame(routes[state_name]).to_csv(f"./data/{state_name}_routes.csv", index = False)