Below, we begin implementing the pseudocode piece by piece, with the goal of determining the best method for each step. Later, we will combine the code and pay additional attention to its organization.

In [1]:
%matplotlib inline
import collections 
import numpy as np
import scipy as sp
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import re
import regex
import locale
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests

In [12]:
# Three independent top-level stores. Each one maps a key to an inner dict
# that is created automatically on first access (double nested dict).
climbsjson, usersjson, ucjson = (collections.defaultdict(dict) for _ in range(3))

In [13]:
# CLIMB VARIABLES - all initialized to undefined:
# Placeholders that are overwritten once a climb page has been parsed.
climb_id, climb_name = 0, 'undef'
climbdict = dict(climb_id=0, climb_name='undef',
                 avg_stars=0, pitches=0, feet=0)

In [14]:
def parse_climb_link(href_link):
    """Split a climb URL into its id and name segments.

    The id is the last '/'-separated segment and the name is the one before
    it. Trailing slashes are ignored, so ".../name/123/" gives the same
    result as ".../name/123" instead of an empty id.

    Returns a ``(climb_id, climb_name)`` tuple of strings.
    Raises IndexError when the link has fewer than two segments.
    """
    segments = href_link.rstrip('/').split('/')
    return segments[-1], segments[-2]

def parse_user_link(href_link):
    """Return ``(user_id, user_name)`` taken from a user profile link.

    The id is the last '/'-separated segment; the name is the third segment
    from the end.
    NOTE(review): skipping the second-to-last segment suggests links shaped
    like "/u/<name>//<id>" -- confirm against the scraped pages.
    """
    segments = href_link.split('/')
    user_id = segments[-1]
    user_name = segments[-3]
    return user_id, user_name

In [15]:
wg_link = "https://www.mountainproject.com/v/pick-o-the-vic/106086237"
link_parsed = wg_link.split('/')

# 1. get keys
# The last two '/'-separated segments are the climb name and the climb id,
# in that order; both are stored on climbdict.
climb_name, climb_id = link_parsed[-2:]
climbdict.update(climb_id=climb_id, climb_name=climb_name)

In [16]:
# Download the climb page and parse the response body with PyQuery so that
# the following cells can query it with CSS selectors.
climb_req=requests.get(wg_link)
climb_=pq(climb_req.text)

In [17]:
# 2. separate climb div: select the <div> whose id is "rspCol800"
climb_div = climb_('div#rspCol800')

In [18]:
# 3. get climb summary div: inside climb_div, the <div> with class "rspCol"
#    whose style attribute is exactly "max-width:500px;"
climb_summstats = climb_div('div.rspCol[style="max-width:500px;"]')

In [19]:
# 4. get grade ratings: select the <h3> element(s) of the summary div; their
#    text is what the grade patterns are matched against
climb_summstats_subtitle = climb_summstats('h3')

In [20]:
# 4. get grade ratings continued
# One regex per grading system. The optional modifier class is [+-] only:
# a comma written inside the brackets is matched literally, which would let
# "WI4," through with its comma. The dot in rock grades is escaped so that
# "5.10" matches but a plain number such as "550" does not.
guide_grades_dict = {}
diff_regexes = {"aid_grade": r'(A[0-5])|(C[0-5])',
                "boulder_grade": r'V\d\d?[-+]?',
                "ice_grade": r'WI[1-6][+-]?',
                "mixed_grade": r'M\d\d?[+-]?',
                "rock_grade": r'5\.\d\d?(([a-d]/[a-d])|([a-d]))?[+-]?'}

# The subtitle text is the same for every pattern, so read it once.
subtitle_text = climb_summstats_subtitle.text()

# The loop variable is `pattern`, not `regex`, so that the imported `regex`
# module is not rebound to a string. .items() runs on Python 2 and Python 3.
for key, pattern in diff_regexes.items():
    match = re.search(pattern, subtitle_text)
    # 0 marks "no grade of this kind found".
    guide_grades_dict[key] = match.group() if match is not None else 0

guide_grades_dict

{'aid_grade': 0,
 'boulder_grade': 0,
 'ice_grade': 'WI4',
 'mixed_grade': 0,
 'rock_grade': 0}

In [21]:
# 5. get average star rating
# The number is read from the inline <script> under span#routeStars.
# .html() gives None when that element is absent; fall back to '' so that
# re.search does not raise TypeError on a page without the star widget.
average_stars_script_html = climb_summstats('span#routeStars')('script').html() or ''
average_stars_regexobj = re.search(r'[1-5]\.\d\d?\d?\d?',
                                   average_stars_script_html)
if average_stars_regexobj is not None:
    # The scraped number is reduced by 1 before it is stored.
    # NOTE(review): presumably the page encodes stars with a +1 offset -- confirm.
    avg_stars = float(average_stars_regexobj.group()) - 1
else:
    avg_stars = 0
# Written on both paths so climbdict never keeps a value from an earlier climb.
climbdict["avg_stars"] = avg_stars
    

In [22]:
# 6. get rows of table: every <tr> under the <table> element(s) of the
#    summary div; individual rows are then read by position with .eq(n)
climb_summstats_table = climb_summstats('table')('tr')

In [26]:
# 7. get climb types
# Flags are 0/1. Each key is searched case-insensitively as a substring of
# the first table row, except top-rope: a bare "TR" would also match inside
# "Trad", so it is matched as a whole word instead of relying on a trailing
# comma. The dictionary key stays "TR," so code reading typesdict is
# unaffected, but TR is now also detected when it is the last entry.
typesdict = {"trad": 0, "alpine": 0, "ice": 0, "sport": 0,
        "boulder": 0, "aid": 0, "mixed": 0, "TR,": 0}
type_patterns = {"TR,": r'\bTR\b'}

# The row text does not change inside the loop, so read it once.
types_text = climb_summstats_table.eq(0).text()
for key in typesdict:
    pattern = type_patterns.get(key, re.escape(key))
    if re.search(pattern, types_text, flags=re.I) is not None:
        typesdict[key] = 1

typesdict

{'TR,': 1,
 'aid': 0,
 'alpine': 0,
 'boulder': 0,
 'ice': 1,
 'mixed': 0,
 'sport': 0,
 'trad': 1}

In [27]:
# 8. get the number of pitches
# "pitch(?:es)?" accepts the singular too, so a row reading "1 pitch" is
# parsed instead of leaving climbdict["pitches"] at its default.
pitches_regobj = re.search(r'(\d\d?) pitch(?:es)?',
                climb_summstats_table.eq(0).text(),
                flags = re.I)
if pitches_regobj is not None:
    climbdict["pitches"] = int(pitches_regobj.group(1))

In [28]:
# 9. get number of feet
# The length is matched as a space, one to five digits, then an apostrophe
# in the first table row; only the digits are kept.
feet_text = climb_summstats_table.eq(0).text()
feet_regobj = re.search(r" (\d\d?\d?\d?\d?)'", feet_text, flags=re.I)
if feet_regobj is not None:
    climbdict["feet"] = int(feet_regobj.group(1))

In [29]:
# 10. get the Roman-numeral grade (I to VI) that follows the word "Grade"
# The numerals are written as an alternation: inside square brackets they
# would form a character class that captures a single character, turning
# "Grade III" and "Grade IV" into "I". The trailing \b makes sure the whole
# numeral is taken.
grade_regobj = re.search(r'Grade (VI|IV|V|III|II|I)\b',
                climb_summstats_table.eq(0).text(),
                flags = re.I)
if grade_regobj is not None:
    climbdict["grade"] = grade_regobj.group(1)

In [30]:
# 11. get consensus rating
# Same patterns as the guide grades, applied to the second table row.
# The "concensus" spelling in the variable name and the dictionary keys is
# kept on purpose so that code reading them keeps working.
concensus_grades_dict = {}

# The row text is the same for every pattern, so read it once.
consensus_text = climb_summstats_table.eq(1).text()

# The loop variable is `pattern`, not `regex`, so that the imported `regex`
# module is not rebound to a string. .items() runs on Python 2 and Python 3.
for key, pattern in diff_regexes.items():
    match = re.search(pattern, consensus_text)
    # 0 marks "no grade of this kind found".
    concensus_grades_dict["concensus_" + key] = match.group() if match is not None else 0

concensus_grades_dict

{'concensus_aid_grade': 0,
 'concensus_boulder_grade': 0,
 'concensus_ice_grade': u'WI4',
 'concensus_mixed_grade': 0,
 'concensus_rock_grade': 0}

In [31]:
# 12. get FA year
# A four-digit year starting with 18, 19 or 20. The prefixes are an
# alternation: inside square brackets they would form a character class, so
# any four-digit run starting with 0, 1, 2, 8, 9 (or a comma) would match.
# The lookarounds stop the match from being cut out of a longer number.
fa_regobj = re.search(r"(?<!\d)(?:18|19|20)\d\d(?!\d)",
                      climb_summstats_table.eq(2).text())
if fa_regobj is not None:
    climbdict["fa_year"] = int(fa_regobj.group())

In [32]:
# 13. get page views
# Either a comma-grouped number ("1,234,567") or a plain run of digits.
# This covers single-digit counts and counts of a million or more, which a
# fixed "up to six digits, one comma" pattern truncates or misses.
views_regobj = re.search(r"\d{1,3}(?:,\d{3})+|\d+",
                         climb_summstats_table.eq(3).text())
if views_regobj is not None:
    climbdict["page_views"] = int(views_regobj.group().replace(',',''))

In [33]:
# 14. get detailed users-climb page
# Fill the climb id into the stats URL, download that page and parse the
# response body with PyQuery.
stats_url_pattern = "http://mountainproject.com/scripts/ShowObjectStats.php?id=%s"
stats_link = stats_url_pattern % climbdict['climb_id']
climb_stats_req = requests.get(stats_link)
stats_ = pq(climb_stats_req.text)


In [34]:
# 15. get star quality votes tables: start at the <span> containing the text
#     'Star Quality Votes', step two siblings forward, and collect every
#     <tr> found beneath that element
stats_stars_table = stats_("span:contains('Star Quality Votes')").next().next().find('tr')

In [35]:
# 16. get star info from td pairs
def star_quality_func(td1, td2):
    """Record one user's star vote for the current climb in ``ucjson``.

    td1 -- cell whose <a> element links to the user's profile
    td2 -- cell whose <script> text contains a ``starsHtml(N,`` call

    The entry is keyed "<climb_id>_<user_id>", using the module-level
    ``climb_id``, and receives "user_id", "climb_id" and "star_rating".
    A pair without a link or without a rating (for example an empty filler
    cell) is skipped without writing anything, instead of raising
    AttributeError. Returns None.
    """
    href = td1('a').attr('href')
    rating_match = re.search(r'starsHtml\(([0-5]),', td2('script').text())
    if href is None or rating_match is None:
        return

    user_id, _user_name = parse_user_link(href)
    # Build the key once and fill the entry through a single reference.
    entry = ucjson[climb_id + "_" + user_id]
    entry["user_id"] = user_id
    entry["climb_id"] = climb_id
    # The scraped number is reduced by 1 before it is stored.
    entry["star_rating"] = int(rating_match.group(1)) - 1

In [38]:
# 17. iterate through stars table and populate ucjson
# A row holds one or two (user, stars) cell pairs. A pair needs two cells,
# so rows with fewer than two <td> are skipped -- the same guard the
# suggested-ratings loop uses -- rather than passing an empty second cell.
for i in range(len(stats_stars_table)):
    tds = stats_stars_table.eq(i).find('td')
    if len(tds) > 1:
        star_quality_func(tds.eq(0), tds.eq(1))
        if len(tds) == 4:
            star_quality_func(tds.eq(2), tds.eq(3))


In [39]:
# 18. get suggested ratings rows: start at the <span> containing the text
#     'Suggested Ratings', step two siblings forward, and collect every
#     <tr> found beneath that element
stats_sugg_table = stats_("span:contains('Suggested Ratings')").next().next().find('tr')  

In [40]:
# 19. get sugg_rate info from td pairs
def sugg_rating_func(td1, td2):
    """Record one user's suggested grades for the current climb in ``ucjson``.

    td1 -- cell whose <a> element links to the user's profile
    td2 -- cell whose text holds the suggested rating

    The entry is keyed "<climb_id>_<user_id>", using the module-level
    ``climb_id``. For every pattern in ``diff_regexes`` a "sugg_<key>" field
    is written: the matched grade string, or 0 when the pattern does not
    match. A pair without a link is skipped instead of raising
    AttributeError. Returns None.
    """
    href = td1('a').attr('href')
    if href is None:
        return

    user_id, _user_name = parse_user_link(href)
    # Build the key once and fill the entry through a single reference.
    entry = ucjson[climb_id + "_" + user_id]
    entry["user_id"] = user_id
    entry["climb_id"] = climb_id

    # The cell text is the same for every pattern, so read it once.
    rating_text = td2.text()
    # .items() runs on Python 2 and Python 3.
    for key, pattern in diff_regexes.items():
        match = re.search(pattern, rating_text)
        entry["sugg_" + key] = match.group() if match is not None else 0
    

In [41]:
# 20. iterate through suggested ratings table and populate ucjson
# A row holds one or two (user, rating) cell pairs; rows with fewer than two
# cells are skipped, and the second pair is read only from four-cell rows.
for i in range(len(stats_sugg_table)):
    tds = stats_sugg_table.eq(i).find('td')
    n_cells = len(tds)
    if n_cells <= 1:
        continue
    sugg_rating_func(tds.eq(0), tds.eq(1))
    if n_cells == 4:
        sugg_rating_func(tds.eq(2), tds.eq(3))

In [42]:
# 21. get ticks rows: start at the <span> containing the text 'Ticks', step
#     two siblings forward, and collect every <tr> found beneath that element
stats_ticks_table = stats_("span:contains('Ticks')").next().next().find('tr')  


In [43]:
# 22. iterate through the ticks table and populate ucjson
# Only rows with exactly three cells are used: cell 0 holds the user link
# and cell 1 the tick date. dates_wrong counts the dates that do not parse
# with the "%b %d, %Y" format; those entries get no "tick_date".
dates_wrong = 0
for i in range(len(stats_ticks_table)):
    tds = stats_ticks_table.eq(i).find('td')
    if len(tds) != 3:
        continue
    user_id, user_name = parse_user_link(tds.eq(0)('a').attr('href'))
    uc_key = climb_id + "_" + user_id
    ucjson[uc_key]["user_id"] = user_id
    ucjson[uc_key]["climb_id"] = climb_id
    try:
        date = time.strptime(tds.eq(1).text(), "%b %d, %Y")
    except ValueError:
        dates_wrong += 1
    else:
        ucjson[uc_key]["tick_date"] = date
                

In [44]:
# Show the accumulated user-climb records; keys have the form "<climb_id>_<user_id>".
ucjson

defaultdict(dict,
            {'106086237_10359': {'climb_id': '106086237',
              'star_rating': 3,
              'user_id': '10359'},
             '106086237_105084578': {'climb_id': '106086237',
              'star_rating': 4,
              'user_id': '105084578'},
             '106086237_105801316': {'climb_id': '106086237',
              'tick_date': time.struct_time(tm_year=2008, tm_mon=2, tm_mday=8, tm_hour=0, tm_min=0, tm_sec=0, tm_wday=4, tm_yday=39, tm_isdst=-1),
              'user_id': '105801316'},
             '106086237_105811645': {'climb_id': '106086237',
              'star_rating': 4,
              'user_id': '105811645'},
             '106086237_105815147': {'climb_id': '106086237',
              'star_rating': 3,
              'user_id': '105815147'},
             '106086237_105855048': {'climb_id': '106086237',
              'star_rating': 4,
              'tick_date': time.struct_time(tm_year=2008, tm_mon=1, tm_mday=28, tm_hour=0, tm_min=0, tm_sec=0, tm_wd