In [2]:
%matplotlib inline
import collections 
import numpy as np
import scipy as sp
import pandas as pd
import time
from datetime import date, timedelta
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import re
import urllib2
#import regex
#import locale
#import usaddress
#import geograpy
#import nltk
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import requests

In [3]:
# global data storage: three independent id -> field-dict stores; looking up
# a missing id creates an empty inner dict for it (double nested dicts)
climbsjson, usersjson, ucjson = (collections.defaultdict(dict) for _ in range(3))

In [4]:
# sub-functions and initializations for user/climb_scrape_func-s

def parse_climb_link(href_link):
    """Return the (last, second-to-last) '/'-separated pieces of a link.

    Callers in this notebook use the first element as the climb id.
    Raises IndexError when the link contains no '/'.
    """
    segments = href_link.split('/')
    return segments[-1], segments[-2]

def parse_user_link(href_link):
    """Return the (last, third-to-last) '/'-separated pieces of a link.

    For profile links shaped like ".../u/<name>//<id>" this yields
    (id, name); the empty piece between the two slashes is skipped.
    Raises IndexError when the link has fewer than three pieces.
    """
    segments = href_link.split('/')
    return segments[-1], segments[-3]



# Grade patterns, one per climbing discipline.  Each matches a base grade
# followed by at most one '+' or '-' modifier.
#
# Fixes relative to the previous patterns:
#   * aid_re: the alternation is now wrapped in a non-capturing group so the
#     modifier applies to both the A and the C branch (before, "A2+" matched
#     only "A2").  Capture groups 1 (A grade) and 2 (C grade) are unchanged.
#   * rock_re: the dot is escaped, so "5x10" no longer counts as a grade.
#   * all patterns: the modifier class no longer contains ',' -- inside
#     [...] a comma is a literal character, not a separator, so a trailing
#     comma used to be swallowed into the match ("V5," instead of "V5").
aid_re = r'(?:(A[0-5])|(C[0-5]))[-+]?'
boulder_re = r'V\d\d?[-+]?'
ice_re = r'WI[1-6][-+]?'
mixed_re = r'M\d\d?[-+]?'
rock_re = r'5\.\d\d?(([a-d]/[a-d])|([a-d]))?[-+]?'

# Output field name -> pattern used to find that grade.
diff_regexes = {"aid_grade": aid_re, "boulder_grade": boulder_re,
               "ice_grade": ice_re, "mixed_grade": mixed_re,
               "rock_grade": rock_re}

# Row label on a user's profile grade table -> pattern for that row.
# Trad and Sport are both graded on the rock scale.
user_type_regexes = {"Trad": rock_re, "Sport": rock_re,
                     "Boulders": boulder_re, "Aid": aid_re,
                     "Ice": ice_re, "Mixed": mixed_re}

# Zero-initialised per-type template.  NOTE(review): presumably keyed by the
# type tokens as they appear in page text (hence "TR," with its comma) --
# confirm against the climb scraper, which is not part of this cell.
typesdict = {"trad": 0, "alpine": 0, "ice": 0, "sport": 0,
        "boulder": 0, "aid": 0, "mixed": 0, "TR,": 0}



In [7]:
# CSS selector for the listing tables on the "stars" and "ticks" pages.
_OBJECT_LIST_SELECTOR = "table[width*='100%'][border='0'][class='objectList']"


def _count_pages(pager_text):
    """Return Y from a 'Page X of Y' label, or 1 when no such label is found."""
    # \d+ rather than \d\d?: with the two-digit form "Page 1 of 120" was read
    # as 12 pages and the remaining pages were silently never fetched.
    found = re.search(r'Page \d+ of (\d+)', pager_text or "")
    return int(found.group(1)) if found else 1


def _first_grade(grade_re, cell_html, flags=0):
    """Return the first text in cell_html matching grade_re, or None."""
    # pyquery's .html() returns None for an empty selection; treat as no text.
    found = re.search(grade_re, cell_html or "", flags)
    return found.group() if found else None


def _listing_rows(main_url, pager_text):
    """Yield the <td> cells of every data row on every page of a listing."""
    for page in range(1, _count_pages(pager_text) + 1):
        time.sleep(1)  # pause before each page request
        # main_url already ends in '&', giving the site's '...&&page=N' form.
        listing = pq(requests.get("{}&page={}".format(main_url, page)).text)
        rows = listing(_OBJECT_LIST_SELECTOR).eq(1).find('tr')
        # The first two rows are skipped, as before (presumably header rows).
        for i in range(2, len(rows)):
            yield rows.eq(i).find('td')


def user_scrape_func(userlink):
    """Scrape one user's profile, star ratings and ticks into the global stores.

    Parameters
    ----------
    userlink : str
        Profile URL of the form ".../u/<user-name>//<user-id>".

    Returns
    -------
    None.  Results are written as side effects:
      * usersjson[user_id] receives the profile dict (identity, residence,
        age, gender, per-type grades, member date);
      * ucjson["<climb_id>_<user_id>"] receives "user_id", "climb_id" and,
        where available, "star_rating", "ticked" and "ticked_date".

    A user already present in usersjson is skipped without any request.
    The function sleeps one second before every listing page it fetches.
    """
    # 1. get user_name and id-s
    user_id, user_name = parse_user_link(userlink)

    # Already scraped.  .get() is used so that merely checking does not make
    # the defaultdict insert an empty placeholder for this user.
    if usersjson.get(user_id):
        return

    # local user dict, stored in usersjson at the very end
    userdict = {"user_link": userlink, "user_id": user_id, "user_name": user_name,
                "location_place": "undef", "location_state": "undef",
                "age": 0, "gender": "undef", "member_date": 0}

    # get page, convert to pyquery obj and extract the personal data text
    user_ = pq(requests.get(userlink).text)
    user_main = user_('div.personalData')
    pers_text = user_main("div:contains('Personal:')")('em').text() or ""

    # 2. if exists, get place of residence
    if re.search('Lives in', pers_text):
        parts = pers_text.split(',')
        userdict["location_place"] = parts[0].replace('Lives in ', "")
        # The piece after the first comma is the state unless it is already
        # the age/gender part.  It is stored uncleaned (leading space kept).
        # The length check covers "Lives in X" with no comma at all, which
        # used to raise IndexError.
        if len(parts) > 1 and not re.search(r'(years old)|(male)|(female)', parts[1]):
            userdict["location_state"] = parts[1]

    # 3. get age
    temp_age = re.search(r'(\d\d?) years old', pers_text)
    if temp_age:
        userdict["age"] = int(temp_age.group(1))

    # 4. get gender
    temp_gender = re.search(r'(male)|(female)', pers_text, re.I)
    if temp_gender:
        userdict["gender"] = temp_gender.group()

    # 5. get liked types and difficulties
    grade_rows = user_main('table').find("tr")
    for key, grade_re in user_type_regexes.items():
        label_cell = grade_rows.children("td:contains('{}')".format(key))

        if key == "Boulders":
            # Boulders have a single grade cell and no '0' fallback value.
            if label_cell:
                userdict["climbs_boulder"] = 1
                grade = _first_grade(grade_re, label_cell.next().html())
                if grade:
                    userdict["boulder_diff"] = grade
            else:
                userdict["climbs_boulder"] = 0
            continue

        kind = key.lower()
        if not label_cell:
            userdict["climbs_" + kind] = 0
            continue

        # The two cells after the label hold the lead and follow grades.
        userdict["climbs_" + kind] = 1
        lead = _first_grade(grade_re, label_cell.next().html(), re.I)
        follow = _first_grade(grade_re, label_cell.next().next().html(), re.I)
        userdict["lead_diff_" + kind] = lead or '0'
        userdict["follow_diff_" + kind] = follow or '0'

    # 6. get date
    user_left_box = user_("td[width='190']")('b')
    if user_left_box:
        try:
            userdict["member_date"] = time.strptime(user_left_box.eq(0).text() or "",
                                                    "%b %d, %Y")
        except ValueError:
            # Not a "Mon DD, YYYY" date: keep the default of 0 instead of
            # aborting the whole user.
            pass

    # might be good to add a bgcolor attribute ref to find only populated trs
    # 7. get stars
    stars_main = "http://www.mountainproject.com/u/{}?action=contribs&what=SCORE&".format(user_id)
    stars_pager = pq(requests.get(stars_main).text)("h1").next().next().find('tr').eq(1).text()

    for tds in _listing_rows(stars_main, stars_pager):
        climb_link = tds.eq(0).find('a').attr('href')
        rating = re.search(r'starsHtml\(([0-5]),', tds.eq(1)('script').text() or "")
        if not climb_link or not rating:
            continue  # row without a climb link or a rating script: skip it
        climb_id, _ = parse_climb_link(climb_link)
        entry = ucjson[climb_id + "_" + user_id]
        entry["user_id"] = user_id
        entry["climb_id"] = climb_id
        # Stored as the starsHtml() argument minus one, as before.
        entry["star_rating"] = int(rating.group(1)) - 1

    # 8. ticks
    ticks_main = "http://www.mountainproject.com/u/{}?action=ticks&".format(user_id)
    ticks_pager = pq(requests.get(ticks_main).text)("table#stats").next().next().find('tr').eq(0).text()

    for tds in _listing_rows(ticks_main, ticks_pager):
        # get climb id, user id and mark ticked
        climb_link = tds.eq(0).find('a').attr('href')
        if not climb_link:
            continue
        climb_id, _ = parse_climb_link(climb_link)
        entry = ucjson[climb_id + "_" + user_id]
        entry["user_id"] = user_id
        entry["climb_id"] = climb_id
        entry["ticked"] = 1

        # get date: either relative ("N days ago") or absolute ("Mon DD, YYYY")
        date_text = tds.eq(4)('p').text() or ""
        # \d+ rather than \d\d?: "120 days ago" used to be read as 20 days.
        days_ago = re.search(r'(\d+) days? ago', date_text)
        if days_ago:
            entry["ticked_date"] = date.today() - timedelta(days=int(days_ago.group(1)))
        else:
            # NOTE(review): this branch stores a time.struct_time while the
            # branch above stores a datetime.date; left as-is because code
            # consuming ucjson is not visible here.
            try:
                entry["ticked_date"] = time.strptime(date_text, "%b %d, %Y")
            except ValueError:
                # some ticks carry dates that cannot be parsed (e.g. year
                # -0001); such ticks are recorded without a ticked_date
                pass

    # add userdict to usersjson
    usersjson[user_id] = userdict

In [8]:
# Sample profile URLs (".../u/<user-name>//<user-id>") used to try out the scraper.
userlinks_test = list(
    "http://www.mountainproject.com/u/" + profile
    for profile in (
        "claire-stolz//108141508",
        "nick-grant//107851075",
        "aaron-carney//110105933",
        "aaron-conway//108046170",
        "alan-margolies//108119044",
        "amelia-litz//108635482",
        "aaron-chambers//108225696",
        "richardo//107499279",
        "ray-shader//106642411",
        "rob-albert//106146571",
        "robbie-mize//107316636",
        "ross-keller//11047",
        "ross-merridock//108379018",
    )
)

In [9]:
# Scrape each test profile in turn.  The one-second pause between users is
# in addition to the per-page sleeps inside user_scrape_func.
for userlink in userlinks_test:
    user_scrape_func(userlink)
    time.sleep(1)

In [14]:
# Sanity check: number of users scraped and the number of fields stored for each.
print "Users: ", len(usersjson)
for key, value in usersjson.items():
    print "Userinfo: " + key + ": ", len(value)

# Same summary for the user/climb pairs; keys are "<climb_id>_<user_id>".
print "User_climb_combos: ", len(ucjson)

# One line per pair, so this prints as many lines as there are combos.
for key, value in ucjson.items():
    print "user_climb_data: " + key + ": ", len(value)

Users:  13
Userinfo: 108225696:  14
Userinfo: 108141508:  20
Userinfo: 110105933:  14
Userinfo: 108379018:  14
Userinfo: 108635482:  17
Userinfo: 11047:  20
Userinfo: 107851075:  18
Userinfo: 107499279:  19
Userinfo: 107316636:  21
Userinfo: 108119044:  14
Userinfo: 106146571:  14
Userinfo: 108046170:  14
Userinfo: 106642411:  14
User_climb_combos:  898
user_climb_data: 105956679_107851075:  5
user_climb_data: 105748792_11047:  3
user_climb_data: 106033945_107851075:  5
user_climb_data: 106254851_107851075:  5
user_climb_data: 106269732_106146571:  5
user_climb_data: 105821304_11047:  3
user_climb_data: 106029351_11047:  3
user_climb_data: 105909580_106146571:  5
user_climb_data: 105717568_11047:  3
user_climb_data: 105750361_11047:  3
user_climb_data: 105763881_11047:  3
user_climb_data: 105896493_107851075:  3
user_climb_data: 105867858_107851075:  5
user_climb_data: 105722347_106146571:  5
user_climb_data: 106388170_107851075:  5
user_climb_data: 105799687_106146571:  5
user_climb_d