#Traversing Mountain Project

In [None]:
def is_climb(page):
    # If you a span with class="rateYDS" inside an h3 header, then the page is a climb
    # Alternative, on a climb page you will be able to find the words "You & This Route" inside a div with ...
    # id="youContainer" whereas on an area page you will instead be able ot find the words "You & This Area"
    return bool

In [None]:
def get_all_links_down_hierarchy(page):
    # The links are found inside of div with id="viewerLeftNavColContent"
    # The first div inside this div should have id="mpbox########" where #### will be different for every page
    # Every link in id="mpbox########" should be a link going further down the hierarchy with the exception:
        # The link associated with <img src="/img/up.gif" .../> should be excluded
        # Any links to mountainproject.com/scripts/... should be excluded
    return list_of_links

In [None]:
def scrape(url):
    # Beautiful soup (or whatever) code here to get page from url
    # Ideally, we will do things such that we pass this data on and don't request the link again to ...
    # exact other information from the page
    return page

In [None]:
def get_number_of_ratings(page):
    # Each review on a climb page is in its own table of class="comvis"
    # Inside each table are 2 <td> cells; everything we need is in the first; probably best to code this in ...
    # because the second is user input and could have weird stuff that throws us off.
    # We can get the link to the user by looking for href="/u/...".
    # We can get where the user rated the route by see if the <img> with ... 
    # src="http://www.mountainproject.com/img/stars/star_b.gif" is present in the table.
    return number_of_ratings

In [None]:
def extract_climb_info(page):
    # Will be copied and adapted from "Climb Page Scraping.ipynb"
    return climb_info

In [None]:
def extract_links_to_user_pages(page):
    # The links from get_number_of_ratings above.  *We only want the links to users who actually rated the
    # route and didn't just submit a comment with no rating.
    return list_of_links

In [None]:
def extract_user_info(url):
    # Will be copied and adapted from "User Page Scraping.ipynb"
    return user_info

In [None]:
# Let's do the traversing recursively

def traverse(url, min_climb_ratings, climbs_info, user_links):
    page = scrape(url)
    
    if !is_climb(page):
        links = get_all_links_down_hierarchy(page)
        for link in links:
            traverse(link, min_climb_ratings)
    else:
        num_ratings = get_number_of_ratings(page)
        if num_ratings >= min_climb_ratings
            climbs_info.append(extract_climb_info(page)) # or .update() if we want climbs_info to be a dict
            user_links.extend([(link, 1) for link in extract_links_to_user_pages(page)])

min_climb_ratings = 10
min_user_ratings = 10
            
climbs_info = [] # or {} if we want climbs_info to be a dict
user_links = []
top_page = scrape("http://www.mountainproject.com/destinations/")
state_links = get_all_links_to_state_pages(top_page)
for link in state_links:
    traverse(link, min_climb_ratings)
    
user_links_reduced = map_reduce(user_links) # user_links is now a list of tuples (link_to_user, num_ratings)

user_links_filtered = [user[0] for user in user_links_filtered if user[1] >= min_user_ratings]

users_info = [] # or {} if we want users_info to be a dict
for link in user_links_filtered:
    users_info.append(extract_user_info(link))

In [1]:
# The %... is an iPython thing, and is not part of the Python language.
# In this case we're just telling the plotting library to draw things on
# the notebook, instead of on a separate window.
%matplotlib inline
# See all the "as ..." contructs? They're just aliasing the package names.
# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
from bs4 import BeautifulSoup
# The "requests" library makes working with HTTP requests easier
# than the built-in urllib libraries.
import requests

The query tool from mountain project allows us to traverse the climbs along three dimensions: location (state), type of climb, and page. The url of each query takes the form:

"http://www.mountainproject.com ... &selectedIds=%[state id]&type=[type string] ... &page=[page #]"

Each page contains a table where each row is a different climb, and contains the link to each climb.

First, we need to design a state dictionary which maps each state to its state_id, which will allow us to traverse by state.

In [3]:
#It is fairly easy to get the state id's from the destinations page.
states_req = requests.get("http://www.mountainproject.com/destinations/")
states_soup = BeautifulSoup(states_req.text, "html.parser")

#State information is countained in the table with the following attributes/
states_table = states_soup.find("table", attrs = {"align":"center","cellspacing":"5", "cellpadding":"0"})
states_entries = states_table.find_all("span", attrs = {"class":"destArea"})[:-2]

#The state id is at the end of the url associated with the state. states_dict is a dictionary where the keys are the
#name of the state and the values are the id values.
states_dict = {entry.get_text():entry.find("a").get("href").split('/')[-1] for entry in states_entries}

Each page from the query is highly standardized, so a single function can be written to get the url's for each route present on a single page.

In [4]:
def get_route_url_loc(page_soup, state):
    #The second table with the following attributes is the table containing the url's of interest.
    table = page_soup.find_all("table", attrs = {"class" : "objectList"})[1]
    rows = table.find_all("tr")
    
    #Ignore the first row as it is just the column titles.
    all_urls = [r.find_all("a") for r in rows[1:]]
    
    #The first link present will always be the link for the route. The last link will be the most specific location
    #associated with that route.
    route_urls = ['http://www.mountainproject.com' + u[0].get("href") for u in all_urls]
    loc_urls = ['http://www.mountainproject.com' + u[-1].get("href") for u in all_urls]
    
    state_list = [state] * len(route_urls)
    #route_ids = [url.split('/')[-1] for url in route_urls]
    
    return route_urls, loc_urls, state_list

In [43]:
import math

#The routes can be queried along 5 different types.
types = ['mixed','rock','boulder','ice','aid']

url_start = "http://www.mountainproject.com/scripts/Search.php?searchType=routeFinder&minVotes=10&selectedIds=%27"
url_end = "&diffMinrock=1800&diffMinboulder=20000&diffMinaid=70000&diffMinice=30000&diffMinmixed=50000&diffMaxrock=5500&diffMaxboulder=21400&diffMaxaid=75260&diffMaxice=38500&diffMaxmixed=60000&is_trad_climb=1&is_sport_climb=1&is_top_rope=1&stars=0&pitches=0&sort1=area&sort2=quality_stars+desc&page=1" 

route_urls = []
location_urls = []
large_states = {}
state_list = []

expected_length = 0

for state_name, state_id in states_dict.iteritems():
    for t in types:
        #Str conctenation is prefered to string interpolation because the strings are long, causing the interpolation to take long.
        
        url = url_start + state_id + "&type=" + t + url_end
        request = requests.get(url)
        soup = BeautifulSoup(request.text, "html.parser")

        #The query displays 50 routes/page, so the total number of routes can be used to determine how many pages must be scraped.
        #There is an edge case where the limit on number of routes is 1000, which still needs to be dealt with.
        num_routes_str = soup.find("div", attrs ={"id": "navBox"}).get_text().split(' ')[-2]
        
        if num_routes_str == "1,000":
            large_states[state_name] = (state_id, t)
        else:
            num_routes = int(num_routes_str)
            expected_length += num_routes
            
            #print "Number of routes is: " + str(num_routes)
            
            if num_routes > 0:
                num_pages = int(math.ceil(float(num_routes)/50.0))
                
                #print "num of pages is: " + str(num_pages)
                a,b,c = get_route_url_loc(soup, state_name)
                
                #print "length of route urls page 1 is: " + str(len(a))
                
                route_urls.extend(a)
                location_urls.extend(b)
                state_list.extend(c)
                
                time.sleep(1)

                #If there are more than one pages, then we need to iterate over subsequent pages.
                if num_pages > 1:
                    for i in range(2,num_pages + 1):
                        iter_url = url[:-1] + str(i) #Replace the page number at the end of the url with the next one.

                        iter_request = requests.get(iter_url)
                        iter_soup = BeautifulSoup(iter_request.text, "html.parser")

                        a,b,c = get_route_url_loc(iter_soup, state_name)
                        
                        #print "length of route urls page"+ str(i) + " is: " + str(len(a))
                        
                        route_urls.extend(a)
                        location_urls.extend(b)
                        state_list.extend(c)

                        time.sleep(1)

In [46]:
len(route_urls)

9595

In [44]:
len(state_list)

9595

In [45]:
expected_length

9595

In [47]:
print large_states

{u'California': (u'105708959', 'rock'), u'Arizona': (u'105708962', 'rock'), u'Utah': (u'105708957', 'rock'), u'Colorado': (u'105708956', 'rock')}


In [48]:
def area_url_state(state):
    state_url = "http://mountainproject.com/v/" + state.lower() +'/' + large_states[state][0]
    state_req = requests.get(state_url)
    state_soup = BeautifulSoup(state_req.text,"html.parser")
    area_table = state_soup.find("div", attrs ={"class": "roundedTop"})

    list_of_areas = area_table.find_all("a")[1:-2]
    area_url = [x.get("href") for x in list_of_areas]
    area_id = [x.split('/')[-1] for x in area_url]
    
    return area_url, area_id

In [51]:
url_start = "http://www.mountainproject.com/scripts/Search.php?searchType=routeFinder&minVotes=10&selectedIds=%27"
url_end = "&type=rock&diffMinrock=1800&diffMinboulder=20000&diffMinaid=70000&diffMinice=30000&diffMinmixed=50000&diffMaxrock=5500&diffMaxboulder=21400&diffMaxaid=75260&diffMaxice=38500&diffMaxmixed=60000&is_trad_climb=1&is_sport_climb=1&is_top_rope=1&stars=0&pitches=0&sort1=area&sort2=quality_stars+desc&page=1" 

area_route_urls = []
area_location_urls = []
large_areas = {}
area_state_list = []

expected_length_area = 0

for state in large_states:
    
    area_url_list, area_id_list = area_url_state(state)
    
    #state_area_url_dict[state] = a
    #state_area_id_dict[state] = b
    
    for area_id in area_id_list:
        area_query_url = url_start + area_id + url_end
        area_req = requests.get(area_query_url)
        area_soup = BeautifulSoup(area_req.text, "html.parser")
        
        #The query displays 50 routes/page, so the total number of routes can be used to determine how many pages must be scraped.
        #There is an edge case where the limit on number of routes is 1000, which still needs to be dealt with.
        num_routes_str = area_soup.find("div", attrs ={"id": "navBox"}).get_text().split(' ')[-2]
        
        if num_routes_str == "1,000":
            large_areas[area_id] = state
        else:
            num_routes = int(num_routes_str)
            expected_length_area += num_routes
            
            if num_routes > 0:
                num_pages = int(math.ceil(float(num_routes)/50.0))

                a,b,c = get_route_url_loc(area_soup, state)
                area_route_urls.extend(a)
                area_location_urls.extend(b)
                area_state_list.extend(c)
                
                time.sleep(1)

                #If there are more than one pages, then we need to iterate over subsequent pages.
                if num_pages > 1:
                    for i in range(2,num_pages + 1):
                        iter_url = area_query_url[:-1] + str(i) #Replace the page number at the end of the url with the next one.

                        iter_request = requests.get(iter_url)
                        iter_soup = BeautifulSoup(iter_request.text, "html.parser")

                        a,b,c = get_route_url_loc(iter_soup, state_name)
                        area_route_urls.extend(a)
                        area_location_urls.extend(b)
                        area_state_list.extend(c)

                        time.sleep(1)

In [52]:
len(area_route_urls)

8300

In [53]:
expected_length_area

8300

In [54]:
large_areas

{u'105739213': u'Utah'}

In [55]:
sub_state = large_areas.values()[0]
sub_area_url = "http://www.mountainproject.com/v/wasatch-range/105739213"

sub_area_req = requests.get(sub_area_url)
sub_area_soup = BeautifulSoup(sub_area_req.text,"html.parser")
sub_area_table = sub_area_soup.find("div", attrs ={"class": "roundedTop"})
list_of_sub_areas = sub_area_table.find_all("a")[1:-2]
    
sub_area_url = [x.get("href") for x in list_of_sub_areas]
sub_area_id = [x.split('/')[-1] for x in sub_area_url]


url_start = "http://www.mountainproject.com/scripts/Search.php?searchType=routeFinder&minVotes=10&selectedIds=%27"
url_end = "&type=rock&diffMinrock=1800&diffMinboulder=20000&diffMinaid=70000&diffMinice=30000&diffMinmixed=50000&diffMaxrock=5500&diffMaxboulder=21400&diffMaxaid=75260&diffMaxice=38500&diffMaxmixed=60000&is_trad_climb=1&is_sport_climb=1&is_top_rope=1&stars=0&pitches=0&sort1=area&sort2=quality_stars+desc&page=1" 

sub_area_route_urls = []
sub_area_location_urls = []
large_sub_areas = {}
sub_area_state_list = []

expected_length_sub_area = 0

for area_id in sub_area_id:
    area_query_url = url_start + area_id + url_end

    area_req = requests.get(area_query_url)
    area_soup = BeautifulSoup(area_req.text, "html.parser")

    #The query displays 50 routes/page, so the total number of routes can be used to determine how many pages must be scraped.
    #There is an edge case where the limit on number of routes is 1000, which still needs to be dealt with.
    num_routes_str = area_soup.find("div", attrs ={"id": "navBox"}).get_text().split(' ')[-2]
    
    if num_routes_str == "1,000":
        large_sub_areas[area_id] = state
    else:
        num_routes = int(num_routes_str)
        expected_length_sub_area += num_routes
        
        if num_routes > 0:
            num_pages = int(math.ceil(float(num_routes)/50.0))

            a,b,c = get_route_url_loc(area_soup, sub_state)
            sub_area_route_urls.extend(a)
            sub_area_location_urls.extend(b)
            sub_area_state_list.extend(c)

            time.sleep(1)

            #If there are more than one pages, then we need to iterate over subsequent pages.
            if num_pages > 1:
                for i in range(2,num_pages + 1):
                    iter_url = area_query_url[:-1] + str(i) #Replace the page number at the end of the url with the next one.

                    iter_request = requests.get(iter_url)
                    iter_soup = BeautifulSoup(iter_request.text, "html.parser")

                    a,b,c = get_route_url_loc(iter_soup, state_name)
                    sub_area_route_urls.extend(a)
                    sub_area_location_urls.extend(b)
                    sub_area_state_list.extend(c)

                    time.sleep(1)

In [59]:
len(sub_area_route_urls)

10601

In [58]:
expected_length_sub_area

1006

In [62]:
all_route_urls = route_urls + area_route_urls + sub_area_route_urls
all_location_urls = location_urls + area_location_urls + sub_area_location_urls
all_states = state_list + area_state_list + sub_area_state_list

print len(all_route_urls)
print len(all_location_urls)
print len(all_states)

#routes_dict = {'route_url': route_urls, 'location_url': location_urls, 'state':state_list}
#df_routes = pd.DataFrame(routes_dict)
#df_routes.head()

18901
18901
18901


In [67]:
routes_dict = {'route_url': all_route_urls, 'location_url': all_location_urls, 'state': all_states}
df_routes = pd.DataFrame(routes_dict)
df_routes.head()

Unnamed: 0,location_url,route_url,state
0,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/the-hobbit/10...,Oklahoma
1,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/accidents-wil...,Oklahoma
2,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/super-slide/1...,Oklahoma
3,http://www.mountainproject.com/v/sea-of-scream...,http://www.mountainproject.com/v/whos-got-the-...,Oklahoma
4,http://www.mountainproject.com/v/headwall/1058...,http://www.mountainproject.com/v/last-of-the-g...,Oklahoma


In [68]:
df_routes[df_routes['state'] == 'Utah'].head(10)

Unnamed: 0,location_url,route_url,state
2416,http://www.mountainproject.com/v/chaos-boulder...,http://www.mountainproject.com/v/circus-trick/...,Utah
2417,http://www.mountainproject.com/v/the-black-box...,http://www.mountainproject.com/v/arete-left-si...,Utah
2418,http://www.mountainproject.com/v/the-black-box...,http://www.mountainproject.com/v/arete-right-s...,Utah
2419,http://www.mountainproject.com/v/the-black-box...,http://www.mountainproject.com/v/slots-of-fun/...,Utah
2420,http://www.mountainproject.com/v/the-black-box...,http://www.mountainproject.com/v/center-start/...,Utah
2421,http://www.mountainproject.com/v/flat-top-boul...,http://www.mountainproject.com/v/silly-wabbit/...,Utah
2422,http://www.mountainproject.com/v/flat-top-boul...,http://www.mountainproject.com/v/the-pregnancy...,Utah
2423,http://www.mountainproject.com/v/flat-top-boul...,http://www.mountainproject.com/v/mr-trujillos-...,Utah
2424,http://www.mountainproject.com/v/flat-top-boul...,http://www.mountainproject.com/v/leftover-love...,Utah
2425,http://www.mountainproject.com/v/flat-top-boul...,http://www.mountainproject.com/v/kick-start-lo...,Utah


In [69]:
df_routes.to_csv("df_routes.csv")

In [147]:
def get_location(url):
    numerical_coords = (0,0)
    loc_req = requests.get(url)
    loc_soup = BeautifulSoup(loc_req.text, "html.parser")
    
    #This brings up the table near the top of the page that has location information, if available
    loc_table = loc_soup.find("div", attrs ={"class":"rspCol", "style":"max-width:500px;"})
    #The most direct way to find the location is with the url's: a location is always followed by a map url
    table_urls = loc_table.find_all("a", attrs ={"target":"_blank"})
    
    for u in table_urls:
        #Each map url is labeled by View Map, which allows us to select the location row.
        if u.get_text() == 'View Map':
            #The most direct way to get the coordinates of the location is by using the coordinates present in the url.
            #To access these locations, the url string is split - This splitting is consistent across all map url's.
            coords = u.get("href").split('=')[1].split('&')[0].split(',')
            numerical_coords = (float(coords[0]), float(coords[1])) #convert unicode coordinates to floats in a tuple.
    time.sleep(1)
    
    
    return numerical_coords

The following code takes a long time to run since it includes a 1 second break with each url.

For debugging, I take a sample of first 120 url's

In [145]:
#lat_long = [get_location(loc) for loc in location_urls[0:120]]

numerical_coords = [(0,0)] * 121

for loc in location_urls[0:120]:
    
    


In [34]:
lat_long

[(44.15516, -71.36736),
 (44.15601, -71.68723),
 (44.06349, -71.16594),
 (44.19325, -71.39333),
 (43.80386, -71.84091),
 (43.80386, -71.84091),
 (43.80386, -71.84091),
 (43.80386, -71.84091),
 (43.8025, -71.83593),
 (43.8025, -71.83593),
 (43.8025, -71.83593),
 (43.8025, -71.83593),
 (43.8025, -71.83593),
 (43.8025, -71.83593),
 (43.8025, -71.83593),
 (43.8025, -71.83593),
 (43.8025, -71.83593),
 (43.8025, -71.83593),
 (43.8025, -71.83593),
 (43.8025, -71.83593),
 (43.8025, -71.83593),
 (43.8025, -71.83593),
 (43.80417, -71.84037),
 (43.80417, -71.84037),
 (43.80417, -71.84037),
 (43.80417, -71.84037),
 (43.80417, -71.84037),
 (43.80417, -71.84037),
 (43.80417, -71.84037),
 (43.80417, -71.84037),
 (43.80417, -71.84037),
 (43.80417, -71.84037),
 (43.80417, -71.84037),
 (43.80417, -71.84037),
 (43.80288, -71.83926),
 (43.80288, -71.83926),
 (43.80288, -71.83926),
 (43.80288, -71.83926),
 (43.80288, -71.83926),
 (43.80288, -71.83926),
 (43.80288, -71.83926),
 (43.80288, -71.83926),
 (43.8

(0,0) entries above indicate locations without data. This edge case still needs to be dealt with, but will likely involve going to the next less specific location.

In [33]:
len(set(location_urls)) # There are much fewer unique locations, this can be a time-saving avenue. Likely requires a dictionary.

151

Code below was an example used to determine how to get locations out.

In [85]:
loc_url = location_urls[0]

In [86]:
loc_req = requests.get(loc_url)
loc_soup = BeautifulSoup(loc_req.text, "html.parser")

In [87]:
loc_table = loc_soup.find_all("table", attrs ={"cellpadding":"0","cellspacing":"0"} )

In [88]:
loc_table = loc_soup.find("div", attrs ={"class":"rspCol", "style":"max-width:500px;"})
urls = loc_table.find_all("a", attrs ={"target":"_blank"})

In [95]:
urls

[<a href="http://forecast.weather.gov/MapClick.php?textField1=34.89&amp;textField2=-99.3" target="_blank"><img class="nws" src="/images/weather/httpforecastweathergovnewimagesmediumskcpng.png"/><br/>55° | 30°</a>,
 <a href="http://forecast.weather.gov/MapClick.php?textField1=34.89&amp;textField2=-99.3" target="_blank"><img class="nws" src="/images/weather/httpforecastweathergovnewimagesmediumfewpng.png"/><br/>58° | 34°</a>,
 <a href="http://forecast.weather.gov/MapClick.php?textField1=34.89&amp;textField2=-99.3" target="_blank"><img class="nws" src="/images/weather/httpforecastweathergovnewimagesmediumfewpng.png"/><br/>57° | 36°</a>,
 <a href="http://forecast.weather.gov/MapClick.php?textField1=34.89&amp;textField2=-99.3" target="_blank"><img class="nws" src="/images/weather/httpforecastweathergovnewimagesmediumsctpng.png"/><br/>55° | 36°</a>,
 <a href="http://forecast.weather.gov/MapClick.php?textField1=34.89&amp;textField2=-99.3" target="_blank"><img class="nws" src="/images/weather/

In [138]:
numerical_coords = [0,0]
for u in urls:
    if u.get_text() == 'View Map':
        coords = u.get("href").split('=')[1].split('&')[0].split(',')
        numerical_coords = [float(x) for x in coords]
if numerical_coords == [0,0]:
    less_specific_area = loc_soup.find("div", attrs ={"id":"navBox"}).find_all("a")
    less_specific_url = [x.get("href") for x in less_specific_area]
    i= 1
   # while numerical_coords == [0,0]:
    less_specific_req = requests.get("http://www.mountainproject.com"+ less_specific_url[-i])
    less_specific_soup = BeautifulSoup(less_specific_req.text,"html.parser")
    
    less_specific_table = less_specific_soup.find("div", attrs ={"class":"rspCol", "style":"max-width:500px;"})
    new_urls = less_specific_table.find_all("a", attrs ={"target":"_blank"})
    
    for u in new_urls:
        if u.get_text() == 'View Map':
            coords = u.get("href").split('=')[1].split('&')[0].split(',')
            numerical_coords = [float(x) for x in coords]

In [139]:
numerical_coords

[34.8913, -99.3014]

In [137]:
new_urls

[<a href="http://maps.google.com/maps?q=34.8913,-99.3014&amp;t=h&amp;hl=en" target="_blank">View Map</a>,
 <a href="http://forecast.weather.gov/MapClick.php?textField1=34.89&amp;textField2=-99.3" target="_blank"><img class="nws" src="/images/weather/httpforecastweathergovnewimagesmediumskcpng.png"/><br/>55° | 30°</a>,
 <a href="http://forecast.weather.gov/MapClick.php?textField1=34.89&amp;textField2=-99.3" target="_blank"><img class="nws" src="/images/weather/httpforecastweathergovnewimagesmediumfewpng.png"/><br/>58° | 34°</a>,
 <a href="http://forecast.weather.gov/MapClick.php?textField1=34.89&amp;textField2=-99.3" target="_blank"><img class="nws" src="/images/weather/httpforecastweathergovnewimagesmediumfewpng.png"/><br/>57° | 36°</a>,
 <a href="http://forecast.weather.gov/MapClick.php?textField1=34.89&amp;textField2=-99.3" target="_blank"><img class="nws" src="/images/weather/httpforecastweathergovnewimagesmediumsctpng.png"/><br/>55° | 36°</a>,
 <a href="http://forecast.weather.gov/

In [135]:
less_specific_table.find_all("a", attrs = {"target": "_blank"})

[<a href="http://maps.google.com/maps?q=34.8913,-99.3014&amp;t=h&amp;hl=en" target="_blank">View Map</a>,
 <a href="http://forecast.weather.gov/MapClick.php?textField1=34.89&amp;textField2=-99.3" target="_blank"><img class="nws" src="/images/weather/httpforecastweathergovnewimagesmediumskcpng.png"/><br/>55° | 30°</a>,
 <a href="http://forecast.weather.gov/MapClick.php?textField1=34.89&amp;textField2=-99.3" target="_blank"><img class="nws" src="/images/weather/httpforecastweathergovnewimagesmediumfewpng.png"/><br/>58° | 34°</a>,
 <a href="http://forecast.weather.gov/MapClick.php?textField1=34.89&amp;textField2=-99.3" target="_blank"><img class="nws" src="/images/weather/httpforecastweathergovnewimagesmediumfewpng.png"/><br/>57° | 36°</a>,
 <a href="http://forecast.weather.gov/MapClick.php?textField1=34.89&amp;textField2=-99.3" target="_blank"><img class="nws" src="/images/weather/httpforecastweathergovnewimagesmediumsctpng.png"/><br/>55° | 36°</a>,
 <a href="http://forecast.weather.gov/

In [113]:
loc_url

u'http://www.mountainproject.com/v/sea-of-screams/105866044'

In [120]:
"http://www.mountainproject.com" + less_specific_url[-i]

u'http://www.mountainproject.com/v/quartz-mountain/105854470'

In [106]:
i = 1
-i

-1

In [108]:
less_specific_url

[u'/destinations/', u'/v/oklahoma/105854466', u'/v/quartz-mountain/105854470']

In [83]:
state = 'California'

#  id="mpbox1077091319" 

state_url = "http://mountainproject.com/v/" + state.lower() +'/' + large_states[state][0]
state_req = requests.get(state_url)
state_soup = BeautifulSoup(state_req.text,"html.parser")
area_table = state_soup.find("div", attrs ={"class": "roundedTop"})

list_of_areas = area_table.find_all("a")[1:-2]
area_url = [x.get("href") for x in list_of_areas]
area_id = [x.split('/')[-1] for x in area_url]


print area_id

[u'105796934', u'109078641', u'105865039', u'105791817', u'105903783', u'105720495', u'105904319', u'105733968', u'105846963', u'105793881', u'105790243', u'105733851', u'105991052', u'108147148', u'105798288', u'107098288', u'105798291', u'105788020', u'105833381']
