In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import re
import pandas as pd

pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")

# Usersdf EDA and Cleaning

In [2]:
usersdf = pd.DataFrame.from_csv('finalproject-users.csv')

In [3]:
print len(usersdf)
usersdf.head(200)

11323


Unnamed: 0,location_state,user_id,climbs_trad,gender,age,climbs_aid,climbs_ice,climbs_boulder,user_link,climbs_mixed,location_place,climbs_sport,member_date,user_name,lead_diff_sport,boulder_diff,follow_diff_sport,lead_diff_ice,lead_diff_trad,follow_diff_ice,follow_diff_trad,follow_diff_mixed,lead_diff_mixed,follow_diff_aid,lead_diff_aid
10101,WA,10101,0,Male,37,0,0,0,https://www.mountainproject.com/u/michael-koma...,0,Seattle,0,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",michael-komarnitsky,,,,,,,,,,,
10102,undef,10102,1,Male,0,1,1,0,https://www.mountainproject.com/u/ben-mottinge...,1,undef,1,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",ben-mottinger,5.10d,,5.11c,WI4,5.10a,WI6,5.11a,M8,M5,C1,C3
10106,NM,10106,1,Male,35,1,0,1,https://www.mountainproject.com/u/patrick-vern...,0,Albuquerque,1,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",patrick-vernon,5.13a,V8,5.13a,,5.12d,,5.12d,,,C2,C2
10107,undef,10107,0,undef,0,0,0,0,https://www.mountainproject.com/u/matt-bauman/...,0,undef,0,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",matt-bauman,,,,,,,,,,,
10108,CO,10108,0,Male,36,0,0,0,https://www.mountainproject.com/u/quinn-steven...,0,Denver,0,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",quinn-stevens,,,,,,,,,,,
10109,WI,10109,0,Male,0,0,0,0,https://www.mountainproject.com/u/ramin-jamshi...,0,Wauwatosa,0,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",ramin-jamshidi,,,,,,,,,,,
10112,undef,10112,0,undef,0,0,0,0,https://www.mountainproject.com/u/nate-weitzel...,0,undef,0,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",nate-weitzel,,,,,,,,,,,
10113,undef,10113,0,undef,0,0,0,0,https://www.mountainproject.com/u/walt-wehner/...,0,undef,0,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",walt-wehner,,,,,,,,,,,
10116,AZ,10116,1,Male,34,0,0,1,https://www.mountainproject.com/u/peter-franze...,0,Phoenix,1,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",peter-franzen,5.13a,V9,5.13a,,5.11c,,5.12a,,,,
10117,undef,10117,0,undef,0,0,0,0,https://www.mountainproject.com/u/andrew-wellm...,0,undef,0,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",andrew-wellman,,,,,,,,,,,


In [4]:
usersdf.describe()

Unnamed: 0,user_id,climbs_trad,age,climbs_aid,climbs_ice,climbs_boulder,climbs_mixed,climbs_sport
count,11323.0,11323.0,11323.0,11323.0,11323.0,11323.0,11323.0,11323.0
mean,101918600.0,0.481939,15.086726,0.081162,0.129383,0.366864,0.055816,0.542436
std,24036210.0,0.499696,17.085963,0.273096,0.335638,0.48197,0.229575,0.498218
min,10101.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,106347400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,107273700.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,108211800.0,1.0,30.0,0.0,0.0,1.0,0.0,1.0
max,111385600.0,1.0,99.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# fix age such that where there was no age, it is set to NaN and not 0
usersdf.age.replace(0,np.nan, inplace=1)

In [6]:
print "Total User Profiles Scraped: ", len(usersdf)
print "Undeclared Gender: ", len(usersdf[usersdf.gender == 'undef'])
print "Declared Men: ", len(usersdf[usersdf.gender == 'Male'])
print "Declared Women: ", len(usersdf[usersdf.gender == 'Female'])

Total User Profiles Scraped:  11323
Undeclared Gender:  4309
Declared Men:  6222
Declared Women:  792


^^^^ YIKES!

points to note:
- women might be reporting gender at a lower rate
- it is generally well known that the climbing community is predominantly male, though the usual anecdotal estimate is about a 30-70 ratio...

In [7]:
# convert male,female to binary, numeric and add numpy.nan
usersdf.gender.replace(['Male','Female','undef'],[0,1,np.nan], inplace=1)

In [8]:
# check/replace if any 'None' or 'undef' strings
usersdf.replace(['None','undef'], [np.nan,np.nan], inplace=1)


In [9]:
usersdf.head(10)

Unnamed: 0,location_state,user_id,climbs_trad,gender,age,climbs_aid,climbs_ice,climbs_boulder,user_link,climbs_mixed,location_place,climbs_sport,member_date,user_name,lead_diff_sport,boulder_diff,follow_diff_sport,lead_diff_ice,lead_diff_trad,follow_diff_ice,follow_diff_trad,follow_diff_mixed,lead_diff_mixed,follow_diff_aid,lead_diff_aid
10101,WA,10101,0,0.0,37.0,0,0,0,https://www.mountainproject.com/u/michael-koma...,0,Seattle,0,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",michael-komarnitsky,,,,,,,,,,,
10102,,10102,1,0.0,,1,1,0,https://www.mountainproject.com/u/ben-mottinge...,1,,1,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",ben-mottinger,5.10d,,5.11c,WI4,5.10a,WI6,5.11a,M8,M5,C1,C3
10106,NM,10106,1,0.0,35.0,1,0,1,https://www.mountainproject.com/u/patrick-vern...,0,Albuquerque,1,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",patrick-vernon,5.13a,V8,5.13a,,5.12d,,5.12d,,,C2,C2
10107,,10107,0,,,0,0,0,https://www.mountainproject.com/u/matt-bauman/...,0,,0,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",matt-bauman,,,,,,,,,,,
10108,CO,10108,0,0.0,36.0,0,0,0,https://www.mountainproject.com/u/quinn-steven...,0,Denver,0,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",quinn-stevens,,,,,,,,,,,
10109,WI,10109,0,0.0,,0,0,0,https://www.mountainproject.com/u/ramin-jamshi...,0,Wauwatosa,0,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",ramin-jamshidi,,,,,,,,,,,
10112,,10112,0,,,0,0,0,https://www.mountainproject.com/u/nate-weitzel...,0,,0,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",nate-weitzel,,,,,,,,,,,
10113,,10113,0,,,0,0,0,https://www.mountainproject.com/u/walt-wehner/...,0,,0,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",walt-wehner,,,,,,,,,,,
10116,AZ,10116,1,0.0,34.0,0,0,1,https://www.mountainproject.com/u/peter-franze...,0,Phoenix,1,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",peter-franzen,5.13a,V9,5.13a,,5.11c,,5.12a,,,,
10117,,10117,0,,,0,0,0,https://www.mountainproject.com/u/andrew-wellm...,0,,0,"(2001, 1, 1, 0, 0, 0, 0, 1, -1)",andrew-wellman,,,,,,,,,,,


In [10]:
usersdf.describe()

Unnamed: 0,user_id,climbs_trad,gender,age,climbs_aid,climbs_ice,climbs_boulder,climbs_mixed,climbs_sport
count,11323.0,11323.0,7014.0,5291.0,11323.0,11323.0,11323.0,11323.0,11323.0
mean,101918600.0,0.481939,0.112917,32.286335,0.081162,0.129383,0.366864,0.055816,0.542436
std,24036210.0,0.499696,0.316514,8.33009,0.273096,0.335638,0.48197,0.229575,0.498218
min,10101.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,106347400.0,0.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0
50%,107273700.0,0.0,0.0,31.0,0.0,0.0,0.0,0.0,1.0
75%,108211800.0,1.0,0.0,36.0,0.0,0.0,1.0,0.0,1.0
max,111385600.0,1.0,1.0,99.0,1.0,1.0,1.0,1.0,1.0


In [19]:
girls_for_ian = usersdf[(usersdf.lead_diff_sport_number > 12) &
        (usersdf.gender == 0)]

len(girls_for_ian)

1930

## Climbing Grade Conversion

In order to be able to analyze the climbing grades, we need to map them to some sort of numeric scale. Functions for doing so below!

In [12]:
# rock letter to value conversion
def letter_to_num(letter):
    if letter == 'a':
        return 0.
    if letter == 'b':
        return .5
    if letter == 'c':
        return 1.
    if letter == 'd':
        return 1.5

# double digit +,- conversion
def dd_sign_to_num(sign):
    if sign == '-':
        return -.2
    elif sign == '+':
        return .2

# single digit +,- conversion
def sd_sign_to_num(sign):
    if sign == '-':
        return -.3
    elif sign == '+':
        return .3
    
def rock_grade_convert(string):
    #def rock_grade_conversion(gradestr):
    if isinstance(string, str):
        if string == '0':
            return np.nan
        number = 0
        searchobj = re.search(r'5.(\d)(\d)?([a-d])?([+,-])?', string)
    
        # if double digit grade
        if searchobj.group(2):
            number = 10 + int(searchobj.group(2)) * 2

            # check for letter 
            if searchobj.group(3):
                number += letter_to_num(searchobj.group(3))

            if searchobj.group(4):
                number += dd_sign_to_num(searchobj.group(4))

        # if single digit grade
        elif searchobj.group(1):
            number = int(searchobj.group(1))

            if searchobj.group(4):
                number += sd_sign_to_num(searchobj.group(4))

        return number
    else:
        return np.nan

In [13]:
usersdf.columns

Index([u'location_state', u'user_id', u'climbs_trad', u'gender', u'age', u'climbs_aid', u'climbs_ice', u'climbs_boulder', u'user_link', u'climbs_mixed', u'location_place', u'climbs_sport', u'member_date', u'user_name', u'lead_diff_sport', u'boulder_diff', u'follow_diff_sport', u'lead_diff_ice', u'lead_diff_trad', u'follow_diff_ice', u'follow_diff_trad', u'follow_diff_mixed', u'lead_diff_mixed', u'follow_diff_aid', u'lead_diff_aid'], dtype='object')

In [14]:
for column in [u'lead_diff_sport',u'lead_diff_trad',u'follow_diff_trad', u'follow_diff_sport']:
    usersdf[column + "_number"] = usersdf[column].apply(rock_grade_convert)

ERROR: No traceback has been produced, nothing to debug.


In [15]:
usersdf.describe()

Unnamed: 0,user_id,climbs_trad,gender,age,climbs_aid,climbs_ice,climbs_boulder,climbs_mixed,climbs_sport,lead_diff_sport_number,lead_diff_trad_number,follow_diff_trad_number,follow_diff_sport_number
count,11323.0,11323.0,7014.0,5291.0,11323.0,11323.0,11323.0,11323.0,11323.0,5992.0,5249.0,5216.0,5897.0
mean,101918600.0,0.481939,0.112917,32.286335,0.081162,0.129383,0.366864,0.055816,0.542436,11.370577,9.217375,11.300613,12.384518
std,24036210.0,0.499696,0.316514,8.33009,0.273096,0.335638,0.48197,0.229575,0.498218,2.863089,2.805223,2.701004,2.884386
min,10101.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,106347400.0,0.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,10.0,8.0,10.0,11.5
50%,107273700.0,0.0,0.0,31.0,0.0,0.0,0.0,0.0,1.0,11.5,9.0,11.5,12.5
75%,108211800.0,1.0,0.0,36.0,0.0,0.0,1.0,0.0,1.0,13.5,11.0,13.0,14.0
max,111385600.0,1.0,1.0,99.0,1.0,1.0,1.0,1.0,1.0,19.5,19.5,19.5,19.5


# ClimbUserDF EDA and Cleaning

In [None]:
#ucdf = pd.DataFrame.from_csv('finalproject-sorted_climbuserdf.csv')


In [None]:
ucdf.describe()

In [None]:
#climbsdf = pd.DataFrame.from_csv('finalproject-sorted_climbs.csv')