#Traversing Mountain Project For Climbs

In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
from bs4 import BeautifulSoup
import requests

##Extracting State Data

The query tool from mountain project allows us to traverse the climbs along three dimensions: location (state), type of climb, and page. The url of each query takes the form:

"http://www.mountainproject.com ... &selectedIds=%[state id]&type=[type string] ... &page=[page #]"

Each page contains a table where each row is a different climb, and contains the link to each climb.

First, we need to design a state dictionary which maps each state to its state_id, which will allow us to traverse by state.

In [3]:
#It is fairly easy to get the state id's from the destinations page.
states_req = requests.get("http://www.mountainproject.com/destinations/")
states_soup = BeautifulSoup(states_req.text, "html.parser")

#State information is countained in the table with the following attributes:
states_table = states_soup.find("table", attrs = {"align":"center","cellspacing":"5", "cellpadding":"0"})
states_entries = states_table.find_all("span", attrs = {"class":"destArea"})[:-2]

#The state id is at the end of the url associated with the state. states_dict is a dictionary where the keys are the
#name of the state and the values are the id values.
states_dict = {entry.get_text():entry.find("a").get("href").split('/')[-1] for entry in states_entries}

Each page from the query is highly standardized, so a single function can be written to get the url's for each route present on a single page.

##Getting route urls

The route urls will be fetched and put into a dataframe where every row is a url. These url's can be passed into notebook 2 for further data scraping of the quality ratings and user information.

In [4]:
def get_route_url_loc(page_soup, state):
    #The second table with the following attributes is the table containing the url's of interest.
    table = page_soup.find_all("table", attrs = {"class" : "objectList"})[1]
    rows = table.find_all("tr")
    
    #Ignore the first row as it is just the column titles.
    all_urls = [r.find_all("a") for r in rows[1:]]
    
    #The first link present will always be the link for the route. The last link will be the most specific location
    #associated with that route.
    route_urls = ['http://www.mountainproject.com' + u[0].get("href") for u in all_urls]
    loc_urls = ['http://www.mountainproject.com' + u[-1].get("href") for u in all_urls]
    
    state_list = [state] * len(route_urls)
    #route_ids = [url.split('/')[-1] for url in route_urls]
    
    return route_urls, loc_urls, state_list

In [5]:
import math

#The routes can be queried along 5 different types.
types = ['mixed','rock','boulder','ice','aid']

url_start = "http://www.mountainproject.com/scripts/Search.php?searchType=routeFinder&minVotes=10&selectedIds=%27"
url_end = "&diffMinrock=1800&diffMinboulder=20000&diffMinaid=70000&diffMinice=30000&diffMinmixed=50000&diffMaxrock=5500&diffMaxboulder=21400&diffMaxaid=75260&diffMaxice=38500&diffMaxmixed=60000&is_trad_climb=1&is_sport_climb=1&is_top_rope=1&stars=0&pitches=0&sort1=area&sort2=quality_stars+desc&page=1" 

route_urls = []
location_urls = []
large_states = {}
state_list = []

expected_length = 0

#Iterate over each state and each type of climb.
for state_name, state_id in states_dict.iteritems():
    for t in types:
        #Str conctenation is prefered to string interpolation because the strings are long, causing the interpolation to take long.
        
        url = url_start + state_id + "&type=" + t + url_end
        request = requests.get(url)
        soup = BeautifulSoup(request.text, "html.parser")

        #The query displays 50 routes/page, so the total number of routes can be used to determine how many pages must be scraped.
        num_routes_str = soup.find("div", attrs ={"id": "navBox"}).get_text().split(' ')[-2]
        
        #There is an edge case where the limit on number of routes is 1000. This is stored in a dictionary to be dealt with later in this notebook.
        if num_routes_str == "1,000":
            large_states[state_name] = (state_id, t)
        else:
            num_routes = int(num_routes_str)
            expected_length += num_routes
            
            #print "Number of routes is: " + str(num_routes)
            
            if num_routes > 0:
                num_pages = int(math.ceil(float(num_routes)/50.0))
                
                #print "num of pages is: " + str(num_pages)
                a,b,c = get_route_url_loc(soup, state_name)
                
                #print "length of route urls page 1 is: " + str(len(a))
                
                route_urls.extend(a)
                location_urls.extend(b)
                state_list.extend(c)
                
                time.sleep(1)

                #If there are more than one pages, then we need to iterate over subsequent pages.
                if num_pages > 1:
                    for i in range(2,num_pages + 1):
                        iter_url = url[:-1] + str(i) #Replace the page number at the end of the url with the next one.

                        iter_request = requests.get(iter_url)
                        iter_soup = BeautifulSoup(iter_request.text, "html.parser")

                        a,b,c = get_route_url_loc(iter_soup, state_name)
                        
                        #print "length of route urls page"+ str(i) + " is: " + str(len(a))
                        
                        route_urls.extend(a)
                        location_urls.extend(b)
                        state_list.extend(c)

                        time.sleep(1)

In [6]:
len(route_urls)

9595

In [7]:
len(state_list)

9595

In [8]:
expected_length

9595

We can see above that all the available routes were successfully placed in the correct lists since they all contain the same lengths. The length of the state_list matches the length of the route urls, indicating that the states were successfully stored for each route which will alow for state-based identification of different routes.

In [9]:
print large_states

{u'California': (u'105708959', 'rock'), u'Arizona': (u'105708962', 'rock'), u'Utah': (u'105708957', 'rock'), u'Colorado': (u'105708956', 'rock')}


However, there are still routes missing for 4 "large states" whose query reached the limit of 1,000 routes. All of the large results were associeted with rock climbs.

###Extracting States Above the Query Limit

The routes on the Mountain Climb website are organized into a hierarchy. In order to bypass the query results limit, the code will dive down the hierarchy and be run on a per-area basis, where the area is the sublevel below the state.

In [10]:
def area_url_state(state):
    state_url = "http://mountainproject.com/v/" + state.lower() +'/' + large_states[state][0]
    state_req = requests.get(state_url)
    state_soup = BeautifulSoup(state_req.text,"html.parser")
    
    #This table includes all the areas found in a state
    area_table = state_soup.find("div", attrs ={"class": "roundedTop"})
    
    #The first and last two "a" html tags are not areas. They are related to a drop down menu, and google maps links
    #at the beginning and end respecitvley. These entries can be dropped safely due to the uniformity in page 
    list_of_areas = area_table.find_all("a")[1:-2]
    area_url = [x.get("href") for x in list_of_areas]
    area_id = [x.split('/')[-1] for x in area_url]
    
    return area_url, area_id

In [11]:
url_start = "http://www.mountainproject.com/scripts/Search.php?searchType=routeFinder&minVotes=10&selectedIds=%27"
url_end = "&type=rock&diffMinrock=1800&diffMinboulder=20000&diffMinaid=70000&diffMinice=30000&diffMinmixed=50000&diffMaxrock=5500&diffMaxboulder=21400&diffMaxaid=75260&diffMaxice=38500&diffMaxmixed=60000&is_trad_climb=1&is_sport_climb=1&is_top_rope=1&stars=0&pitches=0&sort1=area&sort2=quality_stars+desc&page=1" 

area_route_urls = []
area_location_urls = []
large_areas = {}
area_state_list = []

#expected length is a debugging variable to keep track of what the array size should be at the end of running this code.
expected_length_area = 0

for state in large_states:
    
    area_url_list, area_id_list = area_url_state(state)
    
    #Now the iteration runs only over area and not type, since all the missing climbs are rock type.
    for area_id in area_id_list:
        area_query_url = url_start + area_id + url_end
        area_req = requests.get(area_query_url)
        area_soup = BeautifulSoup(area_req.text, "html.parser")
        
        #The query displays 50 routes/page, so the total number of routes can be used to determine how many pages must be scraped.
        #There is an edge case where the limit on number of routes is 1000, which still needs to be dealt with.
        num_routes_str = area_soup.find("div", attrs ={"id": "navBox"}).get_text().split(' ')[-2]
        
        #As a precaution, the same large query edge case is included to determine if further traversing down the hierarchy will be needed.
        if num_routes_str == "1,000":
            large_areas[area_id] = state
        else:
            num_routes = int(num_routes_str)
            expected_length_area += num_routes
            
            if num_routes > 0:
                num_pages = int(math.ceil(float(num_routes)/50.0))

                a,b,c = get_route_url_loc(area_soup, state)
                area_route_urls.extend(a)
                area_location_urls.extend(b)
                area_state_list.extend(c)
                
                time.sleep(1)

                #If there are more than one pages, then we need to iterate over subsequent pages.
                if num_pages > 1:
                    for i in range(2,num_pages + 1):
                        iter_url = area_query_url[:-1] + str(i) #Replace the page number at the end of the url with the next one.

                        iter_request = requests.get(iter_url)
                        iter_soup = BeautifulSoup(iter_request.text, "html.parser")

                        a,b,c = get_route_url_loc(iter_soup, state)
                        area_route_urls.extend(a)
                        area_location_urls.extend(b)
                        area_state_list.extend(c)

                        time.sleep(1)

In [12]:
len(area_route_urls)

8300

In [13]:
expected_length_area

8300

Once again, the code got all the expected routes based on the length of the output array. Additionally, we can see that without the edge case many routes would have been, these four "large" states contribute almost as many routes on their own as all the other "smaller" states. If the limiting edge case had not been caught, these routes would only have contributed 4,000 routes.

In [14]:
large_areas

{u'105739213': u'Utah'}

One of the states includes large sub areas. This is linked to an identifying sub-area id, which can be put in the url for further traversal to the next sub-area level.

### Down the Hierarchy Once More: Large Sub-Area

The code below runs very similarly to all the code above for similar scraping, just one level further down the hierarchy.

In [15]:
sub_state = large_areas.values()[0]
sub_area_url = "http://www.mountainproject.com/v/wasatch-range/105739213"

sub_area_req = requests.get(sub_area_url)
sub_area_soup = BeautifulSoup(sub_area_req.text,"html.parser")
sub_area_table = sub_area_soup.find("div", attrs ={"class": "roundedTop"})
list_of_sub_areas = sub_area_table.find_all("a")[1:-2]
    
sub_area_url = [x.get("href") for x in list_of_sub_areas]
sub_area_id = [x.split('/')[-1] for x in sub_area_url]


url_start = "http://www.mountainproject.com/scripts/Search.php?searchType=routeFinder&minVotes=10&selectedIds=%27"
url_end = "&type=rock&diffMinrock=1800&diffMinboulder=20000&diffMinaid=70000&diffMinice=30000&diffMinmixed=50000&diffMaxrock=5500&diffMaxboulder=21400&diffMaxaid=75260&diffMaxice=38500&diffMaxmixed=60000&is_trad_climb=1&is_sport_climb=1&is_top_rope=1&stars=0&pitches=0&sort1=area&sort2=quality_stars+desc&page=1" 

sub_area_route_urls = []
sub_area_location_urls = []
large_sub_areas = {}
sub_area_state_list = []

expected_length_sub_area = 0

for area_id in sub_area_id:
    area_query_url = url_start + area_id + url_end

    area_req = requests.get(area_query_url)
    area_soup = BeautifulSoup(area_req.text, "html.parser")

    #The query displays 50 routes/page, so the total number of routes can be used to determine how many pages must be scraped.
    #There is an edge case where the limit on number of routes is 1000, which still needs to be dealt with.
    num_routes_str = area_soup.find("div", attrs ={"id": "navBox"}).get_text().split(' ')[-2]
    
    if num_routes_str == "1,000":
        large_sub_areas[area_id] = state
    else:
        num_routes = int(num_routes_str)
        expected_length_sub_area += num_routes
        
        if num_routes > 0:
            num_pages = int(math.ceil(float(num_routes)/50.0))

            a,b,c = get_route_url_loc(area_soup, sub_state)
            sub_area_route_urls.extend(a)
            sub_area_location_urls.extend(b)
            sub_area_state_list.extend(c)

            time.sleep(1)

            #If there are more than one pages, then we need to iterate over subsequent pages.
            if num_pages > 1:
                for i in range(2,num_pages + 1):
                    iter_url = area_query_url[:-1] + str(i) #Replace the page number at the end of the url with the next one.

                    iter_request = requests.get(iter_url)
                    iter_soup = BeautifulSoup(iter_request.text, "html.parser")

                    a,b,c = get_route_url_loc(iter_soup, sub_state)
                    sub_area_route_urls.extend(a)
                    sub_area_location_urls.extend(b)
                    sub_area_state_list.extend(c)

                    time.sleep(1)

In [16]:
len(sub_area_route_urls)

1006

In [17]:
expected_length_sub_area

1006

Finally, all the route urls are fully scrapped. The sub-area was just above the query limit of 1,000. There were no sub-areas of length over 1,000 at this level.

In [18]:
# All the different lists are concatenated together.
all_route_urls = route_urls + area_route_urls + sub_area_route_urls
all_location_urls = location_urls + area_location_urls + sub_area_location_urls
all_states = state_list + area_state_list + sub_area_state_list

print len(all_route_urls)
print len(all_location_urls)
print len(all_states)

#routes_dict = {'route_url': route_urls, 'location_url': location_urls, 'state':state_list}
#df_routes = pd.DataFrame(routes_dict)
#df_routes.head()

18901
18901
18901


##Creating DataFrames and Data Files

In [63]:
# All of the data is placed into a dataframe.
routes_dict = {'route_url': all_route_urls, 'location_url': all_location_urls, 'state': all_states}
df_routes = pd.DataFrame(routes_dict)
df_routes.head()

Unnamed: 0,location_url,route_url,state
0,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/the-hobbit/10...,Oklahoma
1,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/accidents-wil...,Oklahoma
2,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/super-slide/1...,Oklahoma
3,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/whos-got-the-...,Oklahoma
4,http://www.mountainproject.com/v/headwall/1058...,http://www.mountainproject.com/v/last-of-the-g...,Oklahoma


In [20]:
# One of the large states, Utah, is accounted for.
df_routes[df_routes['state'] == 'Utah'].head(10)

Unnamed: 0,location_url,route_url,state
2416,http://www.mountainproject.com/v/chaos-boulder...,http://www.mountainproject.com/v/circus-trick/...,Utah
2417,http://www.mountainproject.com/v/the-black-box...,http://www.mountainproject.com/v/arete-left-si...,Utah
2418,http://www.mountainproject.com/v/the-black-box...,http://www.mountainproject.com/v/arete-right-s...,Utah
2419,http://www.mountainproject.com/v/the-black-box...,http://www.mountainproject.com/v/slots-of-fun/...,Utah
2420,http://www.mountainproject.com/v/the-black-box...,http://www.mountainproject.com/v/center-start/...,Utah
2421,http://www.mountainproject.com/v/flat-top-boul...,http://www.mountainproject.com/v/silly-wabbit/...,Utah
2422,http://www.mountainproject.com/v/flat-top-boul...,http://www.mountainproject.com/v/the-pregnancy...,Utah
2423,http://www.mountainproject.com/v/flat-top-boul...,http://www.mountainproject.com/v/mr-trujillos-...,Utah
2424,http://www.mountainproject.com/v/flat-top-boul...,http://www.mountainproject.com/v/leftover-love...,Utah
2425,http://www.mountainproject.com/v/flat-top-boul...,http://www.mountainproject.com/v/kick-start-lo...,Utah


In [21]:
# All of the routes data from scrapping are saved to a csv to avoid
df_routes.to_csv("df_routes.csv")

In [3]:
df_routes = pd.read_csv("df_routes.csv")
df_routes.head()

Unnamed: 0.1,Unnamed: 0,location_url,route_url,state
0,0,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/the-hobbit/10...,Oklahoma
1,1,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/accidents-wil...,Oklahoma
2,2,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/super-slide/1...,Oklahoma
3,3,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/whos-got-the-...,Oklahoma
4,4,http://www.mountainproject.com/v/headwall/1058...,http://www.mountainproject.com/v/last-of-the-g...,Oklahoma


##Scraping Geographic Data: Latitude and Longitude

In order to make a map showing climb destinations, we wish to get the coordinates of the climbs. Climb pages themselves do not have the information, but each climb page has a navigation box, which allows one to see the area hierarchy leading to the climb. This will be exploited, as the area pages do contain the latitude/longitude coordinates.



In [4]:
def get_location(url):
    numerical_coords = [0,0]
    loc_req = requests.get(url)
    loc_soup = BeautifulSoup(loc_req.text, "html.parser")
    
    #This brings up the table near the top of the page that has location information, if available
    loc_table = loc_soup.find("div", attrs ={"class":"rspCol", "style":"max-width:500px;"})
    #The most direct way to find the location is with the url's: a location is always followed by a map url
    table_urls = loc_table.find_all("a", attrs ={"target":"_blank"})
    
    for u in table_urls:
        #Each map url is labeled by View Map, which allows us to select the location row.
        if u.get_text() == 'View Map':
            #The most direct way to get the coordinates of the location is by using the coordinates present in the url.
            #To access these locations, the url string is split - This splitting is consistent across all map url's.
            coords = u.get("href").split('=')[1].split('&')[0].split(',')
            numerical_coords = (float(coords[0]), float(coords[1])) #convert unicode coordinates to floats in a tuple.
    time.sleep(0.5)
    
    #Some very specific area pages lack latitude/longitude data. As such, we can check against the default value.
    #If the default value persists, we can proceed one level up the area hierarchy and attempt to find the coordiantes.
    if numerical_coords == [0,0]:
        
        #The nav box object contains the hierarchy of subareas. The subareas go from least specific to most specific
        less_specific_area = loc_soup.find("div", attrs ={"id":"navBox"}).find_all("a")
        less_specific_url = [x.get("href") for x in less_specific_area]
        i= 1
        n = len(less_specific_url)
        
        # The while loop it
        while numerical_coords == [0,0] and i < n:
            #Negative indexing as the goal of the loop is to traverse up the hierarchy to find the most specific
            #coordinate available.
            numerical_coords = get_location("http://www.mountainproject.com" + less_specific_url[-i])
            i += 1
            time.sleep(0.5)
    
    
    return numerical_coords

In [5]:
#initialize the latitude/longitude columns
df_routes['latitude'] = 0
df_routes['longitude'] = 0
df_routes.head()

Unnamed: 0.1,Unnamed: 0,location_url,route_url,state,latitude,longitude
0,0,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/the-hobbit/10...,Oklahoma,0,0
1,1,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/accidents-wil...,Oklahoma,0,0
2,2,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/super-slide/1...,Oklahoma,0,0
3,3,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/whos-got-the-...,Oklahoma,0,0
4,4,http://www.mountainproject.com/v/headwall/1058...,http://www.mountainproject.com/v/last-of-the-g...,Oklahoma,0,0


In [6]:
#To lower run time, we wish to loop over only the unique locations (some routes share the same location.)
len(df_routes['location_url'].unique())

4786

In [8]:
#This loop takes a long time to run and produces the coordinates. The length of the run is due to the wait times per
#url scraped as the hierarchy of areas is climbed.
for location in df_routes['location_url'].unique():
    num_coords = get_location(location)
    df_routes.loc[df_routes['location_url'] == location,'longitude'] = num_coords[1]
    df_routes.loc[df_routes['location_url'] == location,'latitude'] = num_coords[0]

In [10]:
check = df_routes[df_routes['latitude'] == 0]
len(check)

0

All routes now have an associated coordinate, as shown by having NO routes where the default value of 0 was maintained.

In [19]:
df_routes.head()

Unnamed: 0,location_url,route_url,state,latitude,longitude
0,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/the-hobbit/10...,Oklahoma,34.8913,-99.3014
1,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/accidents-wil...,Oklahoma,34.8913,-99.3014
2,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/super-slide/1...,Oklahoma,34.8913,-99.3014
3,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/whos-got-the-...,Oklahoma,34.8913,-99.3014
4,http://www.mountainproject.com/v/headwall/1058...,http://www.mountainproject.com/v/last-of-the-g...,Oklahoma,34.8913,-99.3014


In [20]:
#Finally, all the data is saved to a csv so the scrapping need not be run again.
df_routes.to_csv("df_routes_lat_long.csv", index = False)