Below, we begin implementing the pseudo code bit by bit, with the goal of determining best methods for specific steps. Later, we will combine the code and pay additional attention to its organization. 

In [1]:
%matplotlib inline
import collections
import numpy as np
import scipy as sp
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import re
import regex
import locale
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests

In [2]:
climbsjson = collections.defaultdict(dict) # double nested dict
usersjson = collections.defaultdict(dict) # double nested dict
ucjson = collections.defaultdict(dict) #double nested dict

In [4]:
# CLIMB VARIABLES - all initialized to undefined:
climb_id = 0 
climb_name = 'undef'
climbdict = {"climb_id": 0, "climb_name": 'undef',
             "avg_stars": 0, "pitches": 0, "feet": 0}

In [5]:
def parse_climb_link(href_link):
    return href_link.split('/')[-1], href_link.split('/')[-2]

def parse_user_link(href_link):
    return href_link.split('/')[-1], href_link.split('/')[-3]

In [6]:
wg_link = "http://mountainproject.com/v/whitney-gilman-ridge/105872668"
link_parsed = wg_link.split('/')

# 1. get keys
climb_id = link_parsed[-1]
climbdict["climb_id"] = climb_id
climb_name = link_parsed[-2]
climbdict["climb_name"] = climb_name

In [7]:
climb_req=requests.get(wg_link)
climb_=pq(climb_req.text)

In [8]:
# 2. separate climb div
climb_div = climb_('div#rspCol800')

In [9]:
# 3. get climb summary div
climb_summstats = climb_div('div.rspCol[style="max-width:500px;"]')

In [10]:
# 4. get grade ratings
climb_summstats_subtitle = climb_summstats('h3')

In [11]:
# 4. get grade ratings continued 
guide_grades_dict = {}
diff_regexes = {"aid_grade": r'(A[0-5])|(C[0-5])', 
               "boulder_grade": r'V\d\d?[-,+]?', 
               "ice_grade": r'WI[1-6][+,-]?',
               "mixed_grade": r'M\d\d?[+,-]?', 
               "rock_grade": r'5.\d\d?(([a-d]/[a-d])|([a-d]))?[+,-]?'}

for key, regex in diff_regexes.iteritems():
    temp = re.search(regex, climb_summstats_subtitle.text())
    if  temp != None:
        guide_grades_dict[key] = temp.group()
    else:
        guide_grades_dict[key] = 0

guide_grades_dict

{'aid_grade': 0,
 'boulder_grade': 0,
 'ice_grade': 0,
 'mixed_grade': 0,
 'rock_grade': u'5.7'}

In [12]:
# 5. get average star rating
average_stars_script_html = climb_summstats('span#routeStars')('script').html()
average_stars_regexobj = re.search(r'[1-5]\.\d\d?\d?\d?', 
                                   average_stars_script_html)
if average_stars_regexobj != None:     
    avg_stars = float(average_stars_regexobj.group()) - 1
    climbdict["avg_stars"] = avg_stars
else:
    avg_stars = 0
    

In [13]:
# 6. get rows of table
climb_summstats_table = climb_summstats('table')('tr')

In [17]:
# 7. get climb types
typesdict = {"trad": 0, "alpine": 0, "ice": 0, "sport": 0,
        "boulder": 0, "aid": 0, "mixed": 0}
for key in typesdict:
    if re.search(r'{}'.format(key), 
                climb_summstats_table.eq(0).text(), 
                flags = re.I) != None:
        typesdict[key] = 1

typesdict

{'aid': 0,
 'alpine': 1,
 'boulder': 0,
 'ice': 0,
 'mixed': 0,
 'sport': 0,
 'trad': 1}

In [20]:
# 8. get the number of pitches
pitches_regobj = re.search(r'(\d\d?) pitches', 
                climb_summstats_table.eq(0).text(), 
                flags = re.I)
if pitches_regobj:
    climbdict["pitches"] = int(temp.group(1))

TypeError: int() argument must be a string or a number, not 'NoneType'

In [21]:
# 9. get number of feet
feet_regobj = re.search(r" (\d\d?\d?\d?\d?)'", 
                climb_summstats_table.eq(0).text(), 
                flags = re.I)
if feet_regobj:
    climbdict["feet"] = int(feet_regobj.group(1))

In [22]:
# 10. get number of feet
grade_regobj = re.search(r'Grade ([I,II,III,IV,V,VI])', 
                climb_summstats_table.eq(0).text(), 
                flags = re.I)
if grade_regobj:
    climbdict["grade"] = grade_regobj.group(1)

In [23]:
# 11. get concensus rating
concensus_grades_dict = {}

for key, regex in diff_regexes.iteritems():
    temp = re.search(regex,climb_summstats_table.eq(1).text())
    if  temp != None:
        concensus_grades_dict["concensus_" + key] = temp.group()
    else:
        concensus_grades_dict["concensus_" + key] = 0

concensus_grades_dict

{'concensus_aid_grade': 0,
 'concensus_boulder_grade': 0,
 'concensus_ice_grade': 0,
 'concensus_mixed_grade': 0,
 'concensus_rock_grade': u'5.7'}

In [24]:
# 12. get FA year
fa_regobj = re.search(r"[18,19,20]\d\d\d",climb_summstats_table.eq(2).text())
if fa_regobj:
    climbdict["fa_year"] = int(fa_regobj.group())

In [25]:
# 13. get page views
views_regobj = re.search(r"\d?\d?\d?,?\d?\d\d",climb_summstats_table.eq(3).text())
if views_regobj:
    climbdict["page_views"] = int(views_regobj.group().replace(',',''))

In [26]:
# 14. get detailed users-climb page
stats_link = ("http://mountainproject.com/scripts/ShowObjectStats.php?id=%s"
              % climbdict['climb_id'])
climb_stats_req=requests.get(stats_link)
stats_=pq(climb_stats_req.text)


In [27]:
# 15. get star quality votes tables
stats_stars_table = stats_("span:contains('Star Quality Votes')").next().next().find('tr')

In [28]:
user_id, user_name = parse_user_link(stats_stars_table.eq(0).find('td').eq(0)('a').attr('href'))  
print stats_stars_table.eq(0).find('td').eq(1)('script').text()
int(re.search(r'starsHtml\(([0-5]),', stats_stars_table.eq(0).find('td').eq(1)('script').text()).group(1))-1

$(function(){$('#stars_1331829086').html(starsHtml(5, ''));});


4

In [29]:
# 16.
def star_quality_func(td1, td2):
    user_id, user_name = parse_user_link(td1('a').attr('href'))
    rating = int(re.search(r'starsHtml\(([0-5]),', 
                       td2('script').text()).group(1)) - 1
    ucjson[climb_id + "_" + user_id]["user_id"] = user_id
    ucjson[climb_id + "_" + user_id]["climb_id"] = climb_id
    ucjson[climb_id + "_" + user_id]["star_rating"] = rating

In [30]:
for i in range(0, len(stats_stars_table)):
    tds = stats_stars_table.eq(i).find('td')
    star_quality_func(tds.eq(0),tds.eq(1))
    if len(tds) == 4:
        star_quality_func(tds.eq(2),tds.eq(3))


In [31]:
ucjson

defaultdict(dict,
            {'105872668_10186': {'climb_id': '105872668',
              'star_rating': 4,
              'user_id': '10186'},
             '105872668_10359': {'climb_id': '105872668',
              'star_rating': 2,
              'user_id': '10359'},
             '105872668_105790750': {'climb_id': '105872668',
              'star_rating': 4,
              'user_id': '105790750'},
             '105872668_105798761': {'climb_id': '105872668',
              'star_rating': 4,
              'user_id': '105798761'},
             '105872668_105801390': {'climb_id': '105872668',
              'star_rating': 3,
              'user_id': '105801390'},
             '105872668_105806132': {'climb_id': '105872668',
              'star_rating': 4,
              'user_id': '105806132'},
             '105872668_105814360': {'climb_id': '105872668',
              'star_rating': 2,
              'user_id': '105814360'},
             '105872668_105832216': {'climb_id': '105872668',
     