#Traversing Mountain Project

In [None]:
def is_climb(page):
    # Decide whether a scraped page is a climb (route) page rather than an area page.
    # NOTE: placeholder -- no detection is implemented yet; the function currently returns the
    # builtin `bool` type object itself, not a True/False value.
    # Planned approach: if you find a span with class="rateYDS" inside an h3 header, then the page is a climb.
    # Alternatively, on a climb page you will be able to find the words "You & This Route" inside a div with
    # id="youContainer", whereas on an area page you will instead be able to find the words "You & This Area".
    return bool

In [None]:
def get_all_links_down_hierarchy(page):
    # Collect the links on an area page that lead one level further down the hierarchy.
    # NOTE: placeholder -- `list_of_links` is never assigned in this function yet.
    # The links are found inside the div with id="viewerLeftNavColContent".
    # The first div inside this div should have id="mpbox########", where #### will be different for every page.
    # Every link in id="mpbox########" should be a link going further down the hierarchy, with these exceptions:
        # The link associated with <img src="/img/up.gif" .../> should be excluded.
        # Any links to mountainproject.com/scripts/... should be excluded.
    return list_of_links

In [None]:
def scrape(url):
    # Fetch the page at `url` and return it in parsed form.
    # NOTE: placeholder -- `page` is never assigned in this function yet.
    # Beautiful Soup (or whatever) code goes here to get the page from the url.
    # Ideally, we will do things such that we pass this data on and don't request the link again to
    # extract other information from the page.
    return page

In [None]:
def get_number_of_ratings(page):
    # Count how many users rated the climb shown on `page`.
    # NOTE: placeholder -- `number_of_ratings` is never assigned in this function yet.
    # Each review on a climb page is in its own table of class="comvis".
    # Inside each table are 2 <td> cells; everything we need is in the first. It is probably best to restrict
    # the search to that first cell, because the second is user input and could contain weird stuff that
    # throws us off.
    # We can get the link to the user by looking for href="/u/...".
    # We can tell whether the user rated the route by seeing if the <img> with
    # src="http://www.mountainproject.com/img/stars/star_b.gif" is present in the table.
    return number_of_ratings

In [None]:
def extract_climb_info(page):
    # Pull the climb details out of a climb page.
    # NOTE: placeholder -- `climb_info` is never assigned in this function yet.
    # Will be copied and adapted from "Climb Page Scraping.ipynb".
    return climb_info

In [None]:
def extract_links_to_user_pages(page):
    # Collect the links to the pages of the users who rated the climb shown on `page`.
    # NOTE: placeholder -- `list_of_links` is never assigned in this function yet.
    # These are the user links described in get_number_of_ratings above. We only want the links to users who
    # actually rated the route, not those who just submitted a comment with no rating.
    return list_of_links

In [None]:
def extract_user_info(url):
    # Pull the user details out of the user page at `url`.
    # NOTE: placeholder -- `user_info` is never assigned in this function yet.
    # Will be copied and adapted from "User Page Scraping.ipynb".
    return user_info

In [None]:
# Let's do the traversing recursively

def traverse(url, min_climb_ratings, climbs_info, user_links):
    """Recursively walk the page hierarchy below `url`, collecting climb data.

    A page that is not a climb is expanded by visiting every link returned by
    get_all_links_down_hierarchy. A climb page is recorded only when it has at
    least `min_climb_ratings` ratings.

    Parameters:
        url: address of the page to visit.
        min_climb_ratings: minimum number of ratings a climb needs to be kept.
        climbs_info: list extended in place with extract_climb_info(page) for
            every climb that is kept.
        user_links: list extended in place with one (user_link, 1) tuple per
            link returned by extract_links_to_user_pages for a kept climb.

    Returns:
        None. Results accumulate in `climbs_info` and `user_links`.
    """
    page = scrape(url)

    if not is_climb(page):
        for link in get_all_links_down_hierarchy(page):
            # Hand the accumulators down so deeper levels add to the same lists.
            traverse(link, min_climb_ratings, climbs_info, user_links)
        return

    if get_number_of_ratings(page) >= min_climb_ratings:
        climbs_info.append(extract_climb_info(page))  # or .update() if we want climbs_info to be a dict
        # The constant 1 is a per-occurrence count, to be summed per user afterwards.
        user_links.extend((link, 1) for link in extract_links_to_user_pages(page))

# Thresholds: a climb is kept only with at least this many ratings, and a user is kept only
# after rating at least this many of the kept climbs.
min_climb_ratings = 10
min_user_ratings = 10

climbs_info = []  # or {} if we want climbs_info to be a dict
user_links = []
top_page = scrape("http://www.mountainproject.com/destinations/")
# NOTE(review): get_all_links_to_state_pages is not defined in the visible part of this notebook.
state_links = get_all_links_to_state_pages(top_page)
for link in state_links:
    # traverse fills climbs_info and user_links in place, so both must be passed along.
    traverse(link, min_climb_ratings, climbs_info, user_links)

# user_links_reduced is expected to be a list of tuples (link_to_user, num_ratings).
# NOTE(review): map_reduce is not defined in the visible part of this notebook.
user_links_reduced = map_reduce(user_links)

# Keep only the links of users with enough ratings.
user_links_filtered = [user[0] for user in user_links_reduced if user[1] >= min_user_ratings]

# or {} if we want users_info to be a dict
users_info = [extract_user_info(link) for link in user_links_filtered]

In [1]:
# The %... is an iPython thing, and is not part of the Python language.
# In this case we're just telling the plotting library to draw things on
# the notebook, instead of on a separate window.
%matplotlib inline
# See all the "as ..." constructs? They're just aliasing the package names.
# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
from bs4 import BeautifulSoup
# The "requests" library makes working with HTTP requests easier
# than the built-in urllib libraries.
import requests

The query tool from Mountain Project allows us to traverse the climbs along three dimensions: location (state), type of climb, and page number. The URL of each query takes the form:

"http://www.mountainproject.com ... &selectedIds=%[state id]&type=[type string] ... &page=[page #]"

Each page contains a table where each row is a different climb, and contains the link to each climb.

First, we need to design a state dictionary which maps each state to its state_id, which will allow us to traverse by state.

In [3]:
# The destinations page lists every state together with a link that ends in that state's id.
states_req = requests.get("http://www.mountainproject.com/destinations/")
states_soup = BeautifulSoup(states_req.text, "html.parser")

# The state entries live in the table identified by these three attributes.
states_table = states_soup.find(
    "table",
    attrs={"align": "center", "cellspacing": "5", "cellpadding": "0"},
)
# The last two "destArea" spans are dropped -- presumably they are not states; TODO confirm.
states_entries = states_table.find_all("span", attrs={"class": "destArea"})[:-2]

# states_dict maps the text of each entry (the state name) to the final path segment of its
# link (the state id).
states_dict = dict(
    (entry.get_text(), entry.find("a").get("href").split('/')[-1])
    for entry in states_entries
)

Each page returned by the query is highly standardized, so a single function can be written to get the URLs of all the routes present on a single page.

In [4]:
def get_route_url_loc(page_soup):
    """Return the route URLs and location URLs listed on one query result page.

    page_soup is the parsed result page. Two parallel lists are returned,
    (route_urls, loc_urls), with one entry per table row after the header row.
    """
    site_root = 'http://www.mountainproject.com'

    # The second table of class "objectList" is the one that holds the links of interest.
    results_table = page_soup.find_all("table", attrs={"class": "objectList"})[1]

    route_urls = []
    loc_urls = []
    # Skip the first row: it only holds the column titles.
    for row in results_table.find_all("tr")[1:]:
        anchors = row.find_all("a")
        # The first link in a row is the route itself; the last link is the most specific
        # location associated with that route.
        route_urls.append(site_root + anchors[0].get("href"))
        loc_urls.append(site_root + anchors[-1].get("href"))

    return route_urls, loc_urls

In [5]:
#The routes can be queried along 5 different types.
types = ['mixed','rock','boulder','ice','aid']

#Note, the code below currently only runs for NH.
nh_url_start = "http://www.mountainproject.com/scripts/Search.php?searchType=routeFinder&minVotes=10&selectedIds=%27105872225&type="
nh_url_end = "&diffMinrock=1800&diffMinboulder=20000&diffMinaid=70000&diffMinice=30000&diffMinmixed=50000&diffMaxrock=5500&diffMaxboulder=21400&diffMaxaid=75260&diffMaxice=38500&diffMaxmixed=60000&is_trad_climb=1&is_sport_climb=1&is_top_rope=1&stars=0&pitches=0&sort1=area&sort2=quality_stars+desc&page=1" 

route_urls = []
location_urls = []

for t in types:
    #Str conctenation is prefered to string interpolation because the strings are long, causing the interpolation to take long.
    nh_url = nh_url_start + t + nh_url_end
    nh_request = requests.get(nh_url)
    nh_soup = BeautifulSoup(nh_request.text, "html.parser")
    
    #The query displays 50 routes/page, so the total number of routes can be used to determine how many pages must be scraped.
    #There is an edge case where the limit on number of routes is 1000, which still needs to be dealt with.
    num_routes = int(nh_soup.find("div", attrs ={"id": "navBox"}).get_text().split(' ')[-2])
    num_pages = num_routes/50 + 1
    
    
    a,b = get_route_url_loc(nh_soup)
    route_urls.extend(a)
    location_urls.extend(b)
    time.sleep(1)
    
    #If there are more than one pages, then we need to iterate over subsequent pages.
    if num_pages > 1:
        for i in range(2,num_pages + 1):
            iter_url = nh_url[:-1] + str(i) #Replace the page number at the end of the url with the next one.
            
            iter_request = requests.get(iter_url)
            iter_soup = BeautifulSoup(iter_request.text, "html.parser")
            
            a,b = get_route_url_loc(iter_soup)
            route_urls.extend(a)
            location_urls.extend(b)
            
            time.sleep(1)

In [21]:
def get_location(url):
    """Return the (latitude, longitude) found on the location page at `url`.

    The page is downloaded and searched for a link labelled 'View Map'; the
    coordinates are read out of that link's address. When the page has no
    location block or no such link, the default (0, 0) is returned. If several
    'View Map' links are present, the last one wins. The function sleeps for
    one second before returning.
    """
    numerical_coords = (0, 0)
    loc_req = requests.get(url)
    loc_soup = BeautifulSoup(loc_req.text, "html.parser")

    # This brings up the table near the top of the page that has location information, if available.
    loc_table = loc_soup.find("div", attrs={"class": "rspCol", "style": "max-width:500px;"})

    # find() returns None when the block is missing; treat that as "no map links" so the
    # default coordinates are returned instead of raising AttributeError.
    if loc_table is None:
        table_urls = []
    else:
        # The most direct way to find the location is with the links: a location is always
        # followed by a map link.
        table_urls = loc_table.find_all("a", attrs={"target": "_blank"})

    for u in table_urls:
        # Each map link is labelled 'View Map', which allows us to select the location row.
        if u.get_text() == 'View Map':
            # The coordinates sit between the first '=' and the following '&' of the link
            # address, separated by a comma.
            coords = u.get("href").split('=')[1].split('&')[0].split(',')
            numerical_coords = (float(coords[0]), float(coords[1]))
    time.sleep(1)
    return numerical_coords

In [22]:
# Look up coordinates for the first 120 location URLs only; get_location sleeps one second per call.
lat_long = [get_location(loc) for loc in location_urls[0:120]] #

In [27]:
location_urls[108:120]

[u'http://www.mountainproject.com/v/the-parking-lot-wall/105898430',
 u'http://www.mountainproject.com/v/the-prudential/105907474',
 u'http://www.mountainproject.com/v/the-prudential/105907474',
 u'http://www.mountainproject.com/v/the-prudential/105907474',
 u'http://www.mountainproject.com/v/the-prudential/105907474',
 u'http://www.mountainproject.com/v/no-money-down-left/105907981',
 u'http://www.mountainproject.com/v/no-money-down-left/105907981',
 u'http://www.mountainproject.com/v/no-money-down-left/105907981',
 u'http://www.mountainproject.com/v/no-money-down-left/105907981',
 u'http://www.mountainproject.com/v/no-money-down-left/105907981',
 u'http://www.mountainproject.com/v/no-money-down-left/105907981',
 u'http://www.mountainproject.com/v/no-money-down-left/105907981']

In [33]:
len(set(location_urls))

151

In [8]:
# Take the first collected location URL for the step-by-step cells that follow.
loc_url = location_urls[0]

In [9]:
# Download the page at loc_url and parse its HTML.
loc_req = requests.get(loc_url)
loc_soup = BeautifulSoup(loc_req.text, "html.parser")

In [10]:
# Collect every table with cellpadding="0" and cellspacing="0".
# NOTE: loc_table is rebound by the div lookup in the next cell.
loc_table = loc_soup.find_all("table", attrs ={"cellpadding":"0","cellspacing":"0"} )

In [11]:
# Select the div of class "rspCol" with style "max-width:500px;", then every link inside it
# that has target="_blank".
loc_table = loc_soup.find("div", attrs ={"class":"rspCol", "style":"max-width:500px;"})
urls = loc_table.find_all("a", attrs ={"target":"_blank"})

In [12]:
# Find the link labelled 'View Map' and read the coordinates out of its address.
for u in urls:
    if u.get_text() == 'View Map':
        # The coordinates sit between the first '=' and the following '&', separated by a comma.
        coords = u.get("href").split('=')[1].split('&')[0].split(',')
        # Convert the coordinate strings to floats (kept as a list here).
        numerical_coords = [float(x) for x in coords]

In [13]:
numerical_coords

[44.15516, -71.36736]