In [13]:
%matplotlib inline
import collections 
import numpy as np
import scipy as sp
import pandas as pd
import time
from datetime import date, timedelta
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import re
import urllib2
#import regex
#import locale
#import usaddress
#import geograpy
#import nltk
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests

In [66]:
# Module-level accumulators shared by the scraping functions.
# Each one is a dict of dicts whose inner dict is created on first access:
#   climbsjson -> one record per climb
#   usersjson  -> one record per user
#   ucjson     -> one record per (climb, user) pair
climbsjson, usersjson, ucjson = (
    collections.defaultdict(dict) for _ in range(3)
)

 # Helper functions and shared constants used by the user and climb scraping functions

def parse_climb_link(href_link):
    """Split a climb link into ``(climb_id, climb_name)``.

    The id is the last '/'-separated segment of the link and the name is
    the segment before it, e.g. ``.../v/intertwine/105905517`` gives
    ``('105905517', 'intertwine')``.

    Trailing slashes are ignored so that ``.../intertwine/105905517/`` yields
    the same result instead of an empty id.

    Raises IndexError if the link has fewer than two segments.
    """
    # Split once and reuse the pieces; strip trailing '/' first so the last
    # segment is never the empty string.
    segments = href_link.rstrip('/').split('/')
    return segments[-1], segments[-2]

def parse_user_link(href_link):
    """Return ``(last_segment, third_from_last_segment)`` of a '/'-separated link.

    Callers unpack the result as ``(user_id, user_name)``.
    """
    pieces = href_link.split('/')
    user_id = pieces[-1]
    user_name = pieces[-3]
    return user_id, user_name

# 16. get star info from td pairs for climb parser
def star_quality_func(td1, td2, climb_id):
    """Record one user's star vote for a climb in the global ``ucjson``.

    Args:
        td1: PyQuery selection of the table cell holding the user link.
        td2: PyQuery selection of the table cell holding the stars script.
        climb_id: id string of the climb the vote belongs to.

    The vote is read from the ``starsHtml(N,`` call in the cell's script and
    stored as ``N - 1`` under ``ucjson[climb_id + "_" + user_id]``.

    Cells without a user link or without a parseable vote are skipped, so a
    single malformed row does not abort the whole scrape.
    """
    href = td1('a').attr('href')
    if not href:
        # No anchor in this cell (e.g. padding/header cell): nothing to record.
        return
    user_id, _ = parse_user_link(href)

    vote = re.search(r'starsHtml\(([0-5]),', td2('script').text())
    if vote is None:
        # The rating cell does not contain a stars script.
        return

    entry = ucjson[climb_id + "_" + user_id]
    entry["user_id"] = user_id
    entry["climb_id"] = climb_id
    # Stored one lower than the value embedded in the page script.
    entry["star_rating"] = int(vote.group(1)) - 1

# 19. get sugg_rate info from td pairs
def sugg_rating_func(td1, td2, climb_id):
    """Record one user's suggested difficulty ratings in the global ``ucjson``.

    Args:
        td1: PyQuery selection of the table cell holding the user link.
        td2: PyQuery selection of the table cell holding the rating text.
        climb_id: id string of the climb the suggestion belongs to.

    For every grade system in ``diff_regexes`` the entry
    ``ucjson[climb_id + "_" + user_id]["sugg_" + key]`` is set to the first
    match found in the rating text, or to 0 when that system is absent.

    Cells without a user link are skipped.
    """
    href = td1('a').attr('href')
    if not href:
        # No anchor in this cell: there is no user to attribute the rating to.
        return
    user_id, _ = parse_user_link(href)

    entry = ucjson[climb_id + "_" + user_id]
    entry["user_id"] = user_id
    entry["climb_id"] = climb_id

    # The cell text does not change between iterations, so read it once.
    rating_text = td2.text()
    # .items() works on both Python 2 and 3 (unlike .iteritems()).
    for key, regex in diff_regexes.items():
        found = re.search(regex, rating_text)
        entry["sugg_" + key] = found.group() if found is not None else 0

# Grade patterns, one per grading system.
#
# The optional suffix class is written [-+] on purpose: inside a character
# class a comma is a literal character, so the former [-,+] also swallowed a
# trailing comma ("5.9, V2" was captured as "5.9,").

# Aid: A0-A5 or C0-C5. A single character class keeps the optional suffix
# attached to both letters; with `(A..)|(C..)[-+]?` it only applied to "C".
aid_re = r'[AC][0-5][-+]?'
# Bouldering V-scale: V0 .. V99.
boulder_re = r'V\d\d?[-+]?'
# Water ice: WI1 .. WI6.
ice_re = r'WI[1-6][-+]?'
# Mixed: M0 .. M99.
mixed_re = r'M\d\d?[-+]?'
# Rock (YDS): 5.N, optionally followed by a letter or letter pair ("a", "a/b").
# The dot is escaped so that plain numbers such as "500" no longer match.
rock_re = r'5\.\d\d?(?:[a-d](?:/[a-d])?)?[-+]?'

# Grade key -> pattern; iterated by the scraping functions to fill the
# "guide_*", "concensus_*" and "sugg_*" fields.
diff_regexes = {"aid_grade": aid_re, "boulder_grade": boulder_re,
                "ice_grade": ice_re, "mixed_grade": mixed_re,
                "rock_grade": rock_re}

# Climb-type label -> pattern of the grading system used for that type.
user_type_regexes = {"Trad": rock_re, "Sport": rock_re,
                     "Boulders": boulder_re, "Aid": aid_re,
                     "Ice": ice_re, "Mixed": mixed_re}

# Climb-type keywords searched (case-insensitively) in the "Type:" text.
# NOTE(review): "TR," keeps its trailing comma, presumably so that it does not
# match the "Tr" of "Trad"; as a consequence a "TR" that ends the text is not
# detected. The key is part of the emitted field name ("type_TR,"), so it is
# left unchanged here.
typesdict = {"trad": 0, "alpine": 0, "ice": 0, "sport": 0,
             "boulder": 0, "aid": 0, "mixed": 0, "TR,": 0}



In [67]:
def _grade_fields(prefix, text):
    """Return {prefix + grade_key: first match in `text`, or 0} for each entry of diff_regexes."""
    fields = {}
    for key, regex in diff_regexes.items():
        found = re.search(regex, text)
        fields[prefix + key] = found.group() if found else 0
    return fields


def _next_cell_text(table, label):
    """Return the text of the cell that follows the <td> containing `label`, or None if no such <td>."""
    cell = table("td:contains('%s')" % label)
    if not cell:
        return None
    return cell.next().text()


def _record_vote_rows(rows, record, climb_id):
    """Call record(user_cell, value_cell, climb_id) for each (user, value) cell pair in `rows`."""
    for i in range(len(rows)):
        tds = rows.eq(i).find('td')
        # A pair needs two cells; rows with fewer are headers or padding.
        if len(tds) > 1:
            record(tds.eq(0), tds.eq(1), climb_id)
            # Rows may hold two votes side by side.
            if len(tds) == 4:
                record(tds.eq(2), tds.eq(3), climb_id)


def climb_scrape_func(climb_link, state='undef', lat=0, lng=0):
    """Scrape one climb page and its stats page into the global stores.

    Args:
        climb_link: URL of the climb page; its last two path segments are
            used as the climb id and climb name.
        state: state label stored with the climb.
        lat: latitude stored with the climb.
        lng: longitude stored with the climb.

    Side effects:
        * Performs two HTTP GET requests (the climb page and its
          ShowObjectStats page).
        * Stores the climb record in ``climbsjson[climb_id]``.
        * Adds star votes, suggested ratings and ticks to ``ucjson`` under
          the key ``climb_id + "_" + user_id``.

    Returns None. Does nothing if the climb already has a non-empty record
    in ``climbsjson``.
    """
    # 1. get keys
    climb_id, climb_name = parse_climb_link(climb_link)

    # Climb already scraped, then abort. .get() is used because indexing a
    # defaultdict would insert an empty record as a side effect of the check.
    if climbsjson.get(climb_id):
        return

    # initialize local climbdict (avg_stars/pitches/feet default to 0)
    climbdict = {"climb_id": climb_id, "climb_name": climb_name,
                 "avg_stars": 0, "pitches": 0, "feet": 0,
                 "climb_link": climb_link,
                 "latitude": lat, "longitude": lng, "state": state}

    # get page into pyquery object
    climb_page = pq(requests.get(climb_link).text)

    # 0. get areas: one "area_<i>" entry per link in the navigation box
    area_links = climb_page("#navBox").find('a')
    for i in range(len(area_links)):
        climbdict["area_" + str(i)] = area_links.eq(i).attr('href')

    # 2./3. climb div and the summary block inside it
    summary = climb_page('div#rspCol800')('div.rspCol[style="max-width:500px;"]')

    # 4. get guidebook grade ratings from the subtitle
    climbdict.update(_grade_fields("guide_", summary('h3').text()))

    # 5. get average star rating; .html() is None when the script element is
    # missing, so fall back to an empty string instead of crashing re.search.
    stars_script = summary('span#routeStars')('script').html()
    stars_match = re.search(r'[1-5]\.\d\d?\d?\d?', stars_script or '')
    if stars_match:
        # Stored one lower than the value embedded in the page script.
        climbdict["avg_stars"] = float(stars_match.group()) - 1

    # 6. summary table (label cell followed by value cell)
    table = summary('table')

    # 7.-10. everything derived from the "Type:" row
    type_text = _next_cell_text(table, 'Type:')
    if type_text is not None:
        # 7. climb types: 1/0 flag per keyword
        for key in typesdict:
            hit = re.search(r'{}'.format(key), type_text, flags=re.I)
            climbdict["type_" + key] = 1 if hit else 0

        # 8. number of pitches
        pitches = re.search(r'(\d\d?) pitch(es)?', type_text, flags=re.I)
        climbdict["pitches"] = int(pitches.group(1)) if pitches else None

        # 9. number of feet
        feet = re.search(r" (\d\d?\d?\d?\d?)'", type_text, flags=re.I)
        climbdict["feet"] = int(feet.group(1)) if feet else None

        # 10. commitment grade. An alternation is required here: the former
        # [I,II,III,IV,V,VI] was a character class and captured one letter only.
        grade = re.search(r'Grade (VI|V|IV|III|II|I)\b', type_text, flags=re.I)
        climbdict["grade"] = grade.group(1) if grade else None

    # 11. get consensus rating (the "concensus_" key spelling is kept because
    # it is part of the stored record)
    consensus_text = _next_cell_text(table, 'Consensus')
    if consensus_text is not None:
        climbdict.update(_grade_fields("concensus_", consensus_text))

    # 12. get FA year: a four-digit year starting with 18, 19 or 20. The
    # former [18,19,20] character class could capture text such as ",197",
    # which made int() raise.
    fa_text = _next_cell_text(table, 'FA:')
    if fa_text is not None:
        fa_year = re.search(r"\b((?:18|19|20)\d\d)\b", fa_text)
        climbdict["fa_year"] = int(fa_year.group(1)) if fa_year else None

    # 13. get page views: a run of digits with optional thousands separators,
    # so single-digit and seven-digit-plus counts are read correctly.
    views_text = _next_cell_text(table, 'Page Views')
    if views_text is not None:
        views = re.search(r"\d[\d,]*", views_text)
        climbdict["page_views"] = (int(views.group().replace(',', ''))
                                   if views else None)

    # 14. get detailed users-climb page
    stats_link = ("http://mountainproject.com/scripts/ShowObjectStats.php?id=%s"
                  % climbdict['climb_id'])
    stats_page = pq(requests.get(stats_link).text)

    # 15.-17. star quality votes -> ucjson
    star_rows = stats_page("span:contains('Star Quality Votes')").next().next().find('tr')
    _record_vote_rows(star_rows, star_quality_func, climb_id)

    # 18.-20. suggested ratings -> ucjson
    sugg_rows = stats_page("span:contains('Suggested Ratings')").next().next().find('tr')
    _record_vote_rows(sugg_rows, sugg_rating_func, climb_id)

    # 21./22. ticks -> ucjson
    # this might require a tweak if dates are ever listed as ("\d\d? days ago")
    tick_rows = stats_page("span:contains('Ticks')").next().next().find('tr')
    for i in range(len(tick_rows)):
        tds = tick_rows.eq(i).find('td')
        if len(tds) != 3:
            continue
        user_id, _ = parse_user_link(tds.eq(0)('a').attr('href'))
        entry = ucjson[climb_id + "_" + user_id]
        entry["user_id"] = user_id
        entry["climb_id"] = climb_id
        try:
            tick_date = time.strptime(tds.eq(1).text(), "%b %d, %Y")
        except ValueError:
            # Date not in "Mon DD, YYYY" form: keep the user/climb pair but
            # do not mark it as ticked.
            continue
        entry["tick_date"] = tick_date
        entry["ticked"] = 1

    # add climbdict to climbsjson
    climbsjson[climb_id] = climbdict

In [68]:
# Smoke test: scrape one known route. This issues live HTTP requests and
# fills the global climbsjson / ucjson stores.
climb_scrape_func("https://www.mountainproject.com/v/intertwine/105905517")

In [69]:
# Display the accumulated climb records to check the result of the scrape.
climbsjson

defaultdict(dict,
            {'105905517': {'area_0': '/destinations/',
              'area_1': '/v/massachusetts/105908062',
              'area_2': '/v/leominster-area/108175250',
              'area_3': '/v/crow-hill/105905492',
              'area_4': '/v/main-face/105905514',
              'avg_stars': 3.2916999999999996,
              'climb_id': '105905517',
              'climb_link': 'https://www.mountainproject.com/v/intertwine/105905517',
              'climb_name': 'intertwine',
              'concensus_aid_grade': 0,
              'concensus_boulder_grade': 0,
              'concensus_ice_grade': 0,
              'concensus_mixed_grade': 0,
              'concensus_rock_grade': u'5.8',
              'fa_year': None,
              'feet': 45,
              'grade': None,
              'guide_aid_grade': 0,
              'guide_boulder_grade': 0,
              'guide_ice_grade': 0,
              'guide_mixed_grade': 0,
              'guide_rock_grade': u'5.8',
             

In [19]:
# Exploratory, module-level walk-through of the summary-parsing steps for one
# hard-coded route page. The same steps appear inside climb_scrape_func.
# Relies on diff_regexes and typesdict having been defined by another cell.
climbdict = {}
# get page into pyquery object
climb_req=requests.get("https://www.mountainproject.com/v/intertwine/105905517")
climb_=pq(climb_req.text)

# 0. get areas: one "area_<i>" entry per link in the navigation box
climb_loc = climb_("#navBox").find('a')
for i in range(0, len(climb_loc)):
    climbdict["area_" + str(i)] = (climb_loc.eq(i).attr('href'))

# 2. separate climb div
climb_div = climb_('div#rspCol800')

# 3. get climb summary div
climb_summstats = climb_div('div.rspCol[style="max-width:500px;"]')

# 4. get grade ratings from the subtitle; 0 marks "grade system not present"
climb_summstats_subtitle = climb_summstats('h3')

for key, regex in diff_regexes.items():
    temp = re.search(regex, climb_summstats_subtitle.text())
    if  temp != None:
        climbdict["guide_" + key] = temp.group()
    else:
        climbdict["guide_" + key] = 0

# 5. get average star rating (stored one lower than the value in the script)
# NOTE(review): .html() may return None when the element is missing, which
# would make re.search raise -- confirm against the PyQuery docs.
average_stars_script_html = climb_summstats('span#routeStars')('script').html()
average_stars_regexobj = re.search(r'[1-5]\.\d\d?\d?\d?', 
                                   average_stars_script_html)
if average_stars_regexobj != None:     
    avg_stars = float(average_stars_regexobj.group()) - 1
    climbdict["avg_stars"] = avg_stars
else:
    avg_stars = 0

# 6. get rows of table
climb_summstats_table = climb_summstats('table')

# 7. get climb types: 1/0 flag per keyword in typesdict
# NOTE(review): the recorded output of the next cell shows this selector
# matching nothing ([]), so every type flag ends up 0 here.
climb_summstats_type_row = climb_summstats_table("tr:contains('Type')")
for key in typesdict:
    if re.search(r'{}'.format(key), 
                climb_summstats_type_row.text(), 
                flags = re.I) != None:
        climbdict["type_" + key] = 1
    else:
        climbdict["type_" + key] = 0

# 8. get the number of pitches
# NOTE(review): the pattern requires the plural "pitches"; the type text
# recorded further down reads "1 pitch", which this does not match.
pitches_regobj = re.search(r'(\d\d?) pitches', 
                climb_summstats_table.eq(0).text(), 
                flags = re.I)
if pitches_regobj:
    climbdict["pitches"] = int(pitches_regobj.group(1))

# 9. get number of feet (digits preceded by a space and followed by ')
feet_regobj = re.search(r" (\d\d?\d?\d?\d?)'", 
                climb_summstats_table.eq(0).text(), 
                flags = re.I)
if feet_regobj:
    climbdict["feet"] = int(feet_regobj.group(1))

# 10. get commitment grade
# NOTE(review): the square brackets form a character class, so the group
# captures a single character (I, V or a comma), not a full roman numeral.
grade_regobj = re.search(r'Grade ([I,II,III,IV,V,VI])', 
                climb_summstats_table.eq(0).text(), 
                flags = re.I)
if grade_regobj:
    climbdict["grade"] = grade_regobj.group(1)

# 11. get consensus rating (stored under the "concensus_" key prefix)
# NOTE(review): .iteritems() exists only on Python 2 dicts; step 4 uses .items().
for key, regex in diff_regexes.iteritems():
    temp = re.search(regex,climb_summstats_table.eq(1).text())
    if  temp != None:
        climbdict["concensus_" + key] = temp.group()
    else:
        climbdict["concensus_" + key] = 0

# 12. get FA year
# NOTE(review): [18,19,20] is a character class (one of 1 8 9 2 0 or a comma),
# so this matches that one character followed by any three digits.
fa_regobj = re.search(r"([18,19,20]\d\d\d)",climb_summstats_table.eq(2).text())
if fa_regobj:
    climbdict["fa_year"] = int(fa_regobj.group(1))

# 13. get page views (thousands separator stripped before conversion)
views_regobj = re.search(r"\d?\d?\d?,?\d?\d\d",climb_summstats_table.eq(3).text())
if views_regobj:
    climbdict["page_views"] = int(views_regobj.group().replace(',',''))

In [21]:
# Debug check: inspect what the "tr:contains('Type')" selector matched.
climb_summstats_type_row

[]

In [28]:
# Alternative lookup: find the "Type:" label cell and read the cell after it.
climb_summstats_table("td:contains('Type:')").next().text()

"Trad, TR, 1 pitch, 45'"