In [2]:
%matplotlib inline
import collections 
import numpy as np
import scipy as sp
import pandas as pd
import time
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import re
#import regex
#import locale
#import usaddress
#import geograpy
#import nltk
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests

In [29]:
# Module-level accumulators shared by the scraping functions.
# Each one is an independent defaultdict whose missing keys start out as an
# empty dict, so entries can be filled as store[outer_key][inner_key] = value
# without creating the inner dict first.
climbsjson, usersjson, ucjson = (collections.defaultdict(dict) for _ in range(3))

In [35]:
# helper functions and shared initializations for the user/climb scrape functions

def parse_climb_link(href_link):
    """Split a climb URL on '/' and return (last segment, second-to-last segment).

    Raises IndexError when the link contains no '/' at all.
    """
    segments = href_link.split('/')
    last_segment = segments[-1]
    preceding_segment = segments[-2]
    return last_segment, preceding_segment

def parse_user_link(href_link):
    """Split a user URL on '/' and return (last segment, third-from-last segment).

    The third-from-last segment is used because the profile URLs handled here
    carry a doubled slash before the trailing id, e.g. '.../u/<name>//<id>'.
    Raises IndexError when the link has fewer than three '/'-separated parts.
    """
    segments = href_link.split('/')
    trailing_id = segments[-1]
    name_segment = segments[-3]
    return trailing_id, name_segment

# Grade patterns, one per grading system. Each accepts an optional trailing
# '+' or '-' modifier. Inside a character class a comma is a literal
# character, not a separator, so the modifier class is written [-+]
# (a class like [-,+] would also swallow the comma in text like "V5, V6").

# Aid grades A0-A5 / C0-C5. The two letter branches are wrapped in a
# non-capturing group so the optional modifier applies to both of them;
# capture groups 1 (A-grade) and 2 (C-grade) are numbered as before.
aid_re = r'(?:(A[0-5])|(C[0-5]))[-+]?'
# Bouldering grades: 'V' followed by one or two digits.
boulder_re = r'V\d\d?[-+]?'
# Water-ice grades WI1-WI6.
ice_re = r'WI[1-6][-+]?'
# Mixed grades: 'M' followed by one or two digits.
mixed_re = r'M\d\d?[-+]?'
# Rock grades: '5.' + one or two digits, then an optional letter (a-d) or
# letter pair such as 'a/b'. The dot is escaped so it only matches a literal
# '.', not an arbitrary character.
rock_re = r'5\.\d\d?(([a-d]/[a-d])|([a-d]))?[-+]?'

# Output-field name -> grade pattern.
diff_regexes = {"aid_grade": aid_re, "boulder_grade": boulder_re, 
               "ice_grade": ice_re, "mixed_grade": mixed_re, 
               "rock_grade": rock_re}

# Row label in a user's profile grade table -> pattern for that row's grades
# (Trad and Sport both use the rock pattern).
user_type_regexes = {"Trad": rock_re, "Sport": rock_re, 
                     "Boulders": boulder_re, "Aid": aid_re, 
                     "Ice": ice_re, "Mixed": mixed_re}

# Climb-type labels, all initialised to 0.
# NOTE(review): the "TR," key keeps its trailing comma verbatim; presumably it
# mirrors how the label appears in scraped text -- confirm against its user.
typesdict = {"trad": 0, "alpine": 0, "ice": 0, "sport": 0,
        "boulder": 0, "aid": 0, "mixed": 0, "TR,": 0}



In [36]:
def _find_grade(pattern, cell_html, flags=0):
    """Return the first substring of cell_html matching pattern, or None.

    cell_html may be None or empty (pyquery's .html() yields None for an
    empty selection); in that case there is nothing to search.
    """
    if not cell_html:
        return None
    found = re.search(pattern, cell_html, flags)
    return found.group() if found else None


def user_scrape_func(userlink):
    """Fetch one user profile page and extract its fields into a flat dict.

    Parameters
    ----------
    userlink : str
        Profile URL; it is parsed by parse_user_link for the id and name and
        then requested over HTTP.

    Returns
    -------
    dict
        Always contains "user_id", "user_name", "location_place",
        "location_state", "age", "gender", "member_date" (fields not found on
        the page keep the placeholders "undef" / 0) and one "climbs_<type>"
        flag (0/1) per entry of user_type_regexes. When a non-boulder type is
        listed, "lead_diff_<type>" and "follow_diff_<type>" are added ('0' if
        no grade could be parsed); "boulder_diff" is added only when a boulder
        grade is found.

    Errors raised by requests.get are not caught here and reach the caller.
    """
    # local record; placeholders mark fields that were not found on the page
    userdict = {"user_id": 0, "user_name": "undef", "location_place": "undef", 
                "location_state": "undef", "age": 0, "gender": "undef", 
                "member_date": 0}

    # 1. user id and name come from the URL itself
    user_id, user_name = parse_user_link(userlink)
    userdict["user_id"], userdict["user_name"] = user_id, user_name

    # fetch the page and wrap it in a pyquery object
    user_req = requests.get(userlink)
    user_ = pq(user_req.text)

    # personal-data section; read its text once and normalise a missing
    # section (None/empty) to "" so the regex searches below cannot fail
    user_main = user_('div.personalData')
    personal_text = user_main("div:contains('Personal:')")('em').text() or ""

    # 2. place of residence, if present
    if re.search('Lives in', personal_text) is not None:
        parts = personal_text.split(',')
        userdict["location_place"] = parts[0].replace('Lives in ', "")

        # The second comma-separated piece is the state unless it is really
        # the age or gender field. The check is case-insensitive because the
        # gender is matched case-insensitively below as well, and it is
        # skipped entirely when there is no second piece.
        # (value is stored as-is, surrounding whitespace included)
        if len(parts) > 1:
            maybe_state = parts[1]
            if re.search(r'\b(?:years old|male|female)\b', maybe_state, re.I) is None:
                userdict["location_state"] = maybe_state

    # 3. age
    temp_age = re.search(r'(\d\d?) years old', personal_text)
    if temp_age:
        userdict["age"] = int(temp_age.group(1))

    # 4. gender; word boundaries keep place names that merely contain
    # "male" (e.g. "Maleny") from being read as a gender
    temp_gender = re.search(r'\b(?:male|female)\b', personal_text, re.I)
    if temp_gender:
        userdict["gender"] = temp_gender.group()

    # 5. climbing types and difficulties from the grade table
    grade_rows = user_main('table').find("tr")
    for key, pattern in user_type_regexes.items():
        label_cell = grade_rows.children("td:contains('{}')".format(key))

        if key == "Boulders":
            # boulders have a single grade column right after the label
            if label_cell:
                userdict["climbs_boulder"] = 1
                boulder_grade = _find_grade(pattern, label_cell.next().html())
                if boulder_grade is not None:
                    userdict["boulder_diff"] = boulder_grade
            else:
                userdict["climbs_boulder"] = 0
            continue

        type_name = key.lower()
        if not label_cell:
            userdict["climbs_" + type_name] = 0
            continue

        # lead grade is in the cell after the label, follow grade in the next
        userdict["climbs_" + type_name] = 1
        lead_grade = _find_grade(pattern, label_cell.next().html(), re.I)
        follow_grade = _find_grade(pattern, label_cell.next().next().html(), re.I)
        userdict["lead_diff_" + type_name] = lead_grade if lead_grade is not None else '0'
        userdict["follow_diff_" + type_name] = follow_grade if follow_grade is not None else '0'

    # 6. membership date: text of the first <b> inside the 190px-wide cell
    user_left_box = user_("td[width='190']")('b')
    if user_left_box:
        try:
            userdict["member_date"] = time.strptime(user_left_box.eq(0).text(), 
                                                    "%b %d, %Y")
        except (TypeError, ValueError):
            # text is missing or not in "Mon DD, YYYY" form: keep the 0
            # placeholder, like every other field that could not be parsed
            pass

    return userdict

    

In [37]:
# Example run against a single profile; this performs a live HTTP request.
# The doubled slash before the numeric id matters: parse_user_link takes the
# user name from the third-from-last '/'-separated segment of the URL.
userlink = "http://mountainproject.com/u/rajko-radovanovic//108972429"
# last expression of the cell, so the returned dict is displayed as output
user_scrape_func(userlink)

{'age': 22,
 'climbs_aid': 0,
 'climbs_boulder': 0,
 'climbs_ice': 1,
 'climbs_mixed': 1,
 'climbs_sport': 1,
 'climbs_trad': 1,
 'follow_diff_ice': 'WI5',
 'follow_diff_mixed': 'M5',
 'follow_diff_sport': u'5.10a',
 'follow_diff_trad': u'5.9',
 'gender': 'Male',
 'lead_diff_ice': u'WI3',
 'lead_diff_mixed': u'M1',
 'lead_diff_sport': u'5.9',
 'lead_diff_trad': u'5.6',
 'location_place': 'Cambridge',
 'location_state': ' MA',
 'member_date': time.struct_time(tm_year=2014, tm_mon=5, tm_mday=13, tm_hour=0, tm_min=0, tm_sec=0, tm_wday=1, tm_yday=133, tm_isdst=-1),
 'user_id': '108972429',
 'user_name': 'rajko-radovanovic'}