In [2]:
%matplotlib inline
import collections 
import numpy as np
import scipy as sp
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import re
#import regex
#import locale
#import usaddress
#import geograpy
#import nltk
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests

In [29]:
# Global data storage: three independent two-level mappings. Each one is a
# defaultdict(dict), so reading a missing outer key creates an empty inner dict.
climbsjson, usersjson, ucjson = (collections.defaultdict(dict) for _ in range(3))

In [30]:
# Helper functions and shared constants for user_scrape_func and climb_scrape_func

def parse_climb_link(href_link):
    """Split a link on '/' and return (last piece, second-to-last piece).

    Raises IndexError when the link contains no '/' at all.
    """
    pieces = href_link.split('/')
    last_piece = pieces[-1]
    previous_piece = pieces[-2]
    return last_piece, previous_piece

def parse_user_link(href_link):
    """Split a link on '/' and return (last piece, third-from-last piece).

    For a link ending in ".../<name>//<id>" the empty piece between the two
    slashes is skipped, so the result is (id, name).
    Raises IndexError when the link has fewer than three '/'-separated pieces.
    """
    pieces = href_link.split('/')
    last_piece = pieces[-1]
    third_from_last = pieces[-3]
    return last_piece, third_from_last

# Climbing-grade patterns, intended for re.search against scraped text.
# Capture groups stay in their original positions so any .group(n) lookup
# written against the previous patterns still works.
#
# Suffix classes are [-+] / [+-]: inside a character class a comma is a literal
# character, not a separator, so the old [-,+] also swallowed a trailing ','.

# Aid: A0-A5 or C0-C5. The alternation sits in a non-capturing group so the
# optional +/- suffix applies to both letters, not only to the C branch.
aid_re = r'(?:(A[0-5])|(C[0-5]))[-+]?'
# Boulder: V followed by one or two digits.
boulder_re = r'V\d\d?[-+]?'
# Ice: WI1-WI6.
ice_re = r'WI[1-6][+-]?'
# Mixed: M followed by one or two digits.
mixed_re = r'M\d\d?[+-]?'
# Rock: "5." plus one or two digits, then an optional letter or letter/letter.
# The dot is escaped so that digit runs such as "512" are not read as a grade.
rock_re = r'5\.\d\d?(([a-d]/[a-d])|([a-d]))?[+-]?'

# Grade pattern to use for each grade field name.
diff_regexes = {
    "aid_grade": aid_re,
    "boulder_grade": boulder_re,
    "ice_grade": ice_re,
    "mixed_grade": mixed_re,
    "rock_grade": rock_re,
}

# Grade pattern to use for each climbing-type label; trad and sport both use
# the rock pattern.
user_type_regexes = {
    "Trad": rock_re,
    "Sport": rock_re,
    "Boulders": boulder_re,
    "Aid": aid_re,
    "Ice": ice_re,
    "Mixed": mixed_re,
}

# Zero-initialised table with one entry per type token. The last key is the
# literal string "TR," including its trailing comma.
typesdict = dict.fromkeys(
    ["trad", "alpine", "ice", "sport", "boulder", "aid", "mixed", "TR,"],
    0,
)

In [33]:
# Seconds to wait on the profile request before giving up instead of hanging.
_USER_REQUEST_TIMEOUT = 30


def _first_grade(cell, grade_re):
    """Return the first match of grade_re in the cell's inner HTML, or None."""
    # .html() yields None when the selection is empty (e.g. there is no sibling
    # cell); re.search raises TypeError on None, so fall back to "".
    found = re.search(grade_re, cell.html() or "")
    return found.group() if found else None


def user_scrape_func(userlink):
    """Fetch one user profile page and extract its fields into a dict.

    Parameters
    ----------
    userlink : str
        Profile URL. It is fetched with requests.get and also handed to
        parse_user_link to obtain the id and name.

    Returns
    -------
    dict
        Always present: "user_id", "user_name", "location_place",
        "location_state", "age", "gender", "member_date" (placeholders
        "undef" / 0 when a field is not found), plus a 0/1 flag "climbs_<t>"
        for t in trad, sport, aid, ice, mixed, boulder.
        Present only when "climbs_<t>" is 1 (all types except boulder):
        "diff_lead_<t>" and "diff_follow_<t>", holding the matched grade or
        the string '0' when no grade is found.
        Present only when a boulder grade is found: "boulder_diff".

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from requests.get (including a timeout after
        _USER_REQUEST_TIMEOUT seconds).
    ValueError
        From time.strptime when the join-date text is not in "%b %d, %Y" form.
    """
    # Local record with placeholder values for everything that may be missing.
    userdict = {"user_id": 0, "user_name": "undef", "location_place": "undef",
                "location_state": "undef", "age": 0, "gender": "undef",
                "member_date": 0}

    # 1. user id and name come straight from the link
    user_id, user_name = parse_user_link(userlink)
    userdict["user_id"], userdict["user_name"] = user_id, user_name

    # Fetch the page and wrap it in a pyquery object.
    user_req = requests.get(userlink, timeout=_USER_REQUEST_TIMEOUT)
    user_ = pq(user_req.text)

    # Personal-data box and the <em> text inside its "Personal:" div.
    user_main = user_('div.personalData')
    user_pers = user_main("div:contains('Personal:')")('em')
    # Older pyquery releases return None from .text() on an empty selection.
    personal_text = user_pers.text() or ""

    # 2. place of residence, if given
    if re.search('Lives in', personal_text):
        fields = personal_text.split(',')
        userdict["location_place"] = fields[0].replace('Lives in ', "")

        # A state may follow after a comma (still needs cleaning downstream).
        # Guard the index: "Lives in X" on its own has no second field.
        if len(fields) > 1:
            maybe_state = fields[1]
            # Case-insensitive so a capitalised "Male" is not stored as a state.
            if re.search(r'(years old)|(male)|(female)', maybe_state, re.I) is None:
                userdict["location_state"] = maybe_state

    # 3. age
    temp_age = re.search(r'(\d\d?) years old', personal_text)
    if temp_age:
        userdict["age"] = int(temp_age.group(1))

    # 4. gender
    temp_gender = re.search(r'(male)|(female)', personal_text, re.I)
    if temp_gender:
        userdict["gender"] = temp_gender.group()

    # 5. climbing types and grades, read from the table in the personal box.
    # Each type is found by the cell containing its label; the next two
    # sibling cells are read as lead and follow grades.
    type_rows = user_main('table').find("tr")
    for label in ("Trad", "Sport", "Aid", "Ice", "Mixed"):
        key = label.lower()
        grade_re = user_type_regexes[label]
        tdwithkey = type_rows.children(":contains('%s')" % label)
        if tdwithkey:
            userdict["climbs_" + key] = 1
            lead_cell = tdwithkey.next()
            userdict["diff_lead_" + key] = _first_grade(lead_cell, grade_re) or '0'
            userdict["diff_follow_" + key] = _first_grade(lead_cell.next(), grade_re) or '0'
        else:
            userdict["climbs_" + key] = 0

    # Bouldering has a single grade cell and no '0' placeholder.
    boulder_cell = type_rows.children(":contains('Boulders')")
    if boulder_cell:
        userdict["climbs_boulder"] = 1
        boulder_grade = _first_grade(boulder_cell.next(), user_type_regexes["Boulders"])
        if boulder_grade:
            userdict["boulder_diff"] = boulder_grade
    else:
        userdict["climbs_boulder"] = 0

    # 6. member date: first <b> inside the 190px-wide cell
    user_left_box = user_("td[width='190']")('b')
    if user_left_box:
        userdict["member_date"] = time.strptime(user_left_box.eq(0).text(),
                                                "%b %d, %Y")

    return userdict

    

In [34]:
# Try the scraper on a single profile; the cell's last expression displays the
# returned dict. The link has an empty path piece ("//") between the user name
# and the id, which is the shape parse_user_link indexes into ([-1] and [-3]).
userlink = "http://mountainproject.com/u/rajko-radovanovic//108972429"
user_scrape_func(userlink)

{'age': 22,
 'climbs_aid': 0,
 'climbs_boulder': 0,
 'climbs_ice': 1,
 'climbs_mixed': 1,
 'climbs_sport': 1,
 'climbs_trad': 1,
 'diff_follow_ice': 'WI5',
 'diff_follow_mixed': 'M5',
 'diff_follow_sport': u'5.10a',
 'diff_follow_trad': u'5.9',
 'diff_lead_ice': u'WI3',
 'diff_lead_mixed': u'M1',
 'diff_lead_sport': u'5.9',
 'diff_lead_trad': u'5.6',
 'gender': 'Male',
 'location_place': 'Cambridge',
 'location_state': ' MA',
 'member_date': time.struct_time(tm_year=2014, tm_mon=5, tm_mday=13, tm_hour=0, tm_min=0, tm_sec=0, tm_wday=1, tm_yday=133, tm_isdst=-1),
 'user_id': '108972429',
 'user_name': 'rajko-radovanovic'}