## Execute the properties file so we have all necessary imports and variables

In [None]:
# A __future__ import has to be the first statement of the module/cell;
# placed after other statements it is a SyntaxError outside of IPython.
from __future__ import division

# Run the shared properties and helper scripts inside this namespace so the
# names they define become available to the cells below.
for _script in ("props.py", "helpers.py"):
    # Read the source explicitly: exec() on a file object only works on
    # Python 2, and the context manager guarantees the handle is closed.
    with open(_script) as _fh:
        # Passing the filename keeps tracebacks pointing at the script, and
        # dont_inherit=True (last argument) stops the __future__ division
        # flag above from leaking into the executed scripts.
        exec(compile(_fh.read(), _script, "exec", 0, True))

## Info to manipulate the data

- Column names we don't want to keep can be added here
- New column names to apply, so the data is easier to work with in a dataframe

**MAKE SURE YOUR DATA ALWAYS COMES IN WITH THESE COLUMNS IN THIS ORDER!**

In [None]:
# Friendly names for the raw CSV columns, listed in the order the columns are
# expected to appear in the input file.
colNames = [
    # trip timing
    'tripduration',
    'starttime',
    'stoptime',
    # start station
    'startstationid',
    'startstationname',
    'startstationlatitude',
    'startstationlongitude',
    # end station
    'endstationid',
    'endstationname',
    'endstationlatitude',
    'endstationlongitude',
    # bike and rider
    'bikeid',
    'usertype',
    'birthyear',
    'gender',
]

# Columns (by their friendly name) to discard right after loading.
toDrop = [
    'bikeid',
]

## Load the data

### Load using pandas

In [None]:
# Read the June 2017 trip file from the data folder.
Jun17 = pd.read_csv(
    os.path.join(dataFolder, '201706_tripdata.csv')
)

# Swap the file's own headers for the friendly names, then discard the
# columns listed in toDrop (modifying the frame in place).
Jun17.columns = colNames
Jun17.drop(labels=toDrop, axis=1, inplace=True)

# Every later cell iterates over this list of monthly frames.
months = [
    Jun17,
]

### Make necessary modifications

In [103]:
for m in months:
    # usertype is a string column: encode it as integers
    # (0 for missing values, 1 for 'Subscriber', 2 for 'Customer').
    # NOTE(review): reassignNans / reassignVals come from helpers.py and are
    # presumed to update the frame in place -- confirm against the helpers.
    reassignNans(m, 'usertype', 0)
    for _label, _code in (('Subscriber', 1), ('Customer', 2)):
        reassignVals(m, 'usertype', _label, _code)
    m['usertype'] = m['usertype'].astype('int64')

    # Turn both timestamp columns into pandas datetimes so the datetime
    # utilities can be used on them later.
    for _col in ('starttime', 'stoptime'):
        m[_col] = pd.to_datetime(m[_col])

### Generate derived columns

#### age from birth year

In [104]:
# Calendar year the trip data belongs to; ages are computed relative to it.
yearOfData = 2017

for m in months:
    # Age is simply the data year minus the rider's birth year.
    birthYears = m['birthyear']
    m['age'] = yearOfData - birthYears

In [105]:
# let's see what the data looks like
# we'll say any age greater than 85 is a bogus birth year that someone entered
# also there are some nans with no birth year data
# Summarise how many ages are unusable: anything above 85 is treated as a
# bogus birth year, and rows without a birth year show up as NaN ages.
for m in months:
    ages = m['age']
    greater85 = len(ages[ages > 85])
    unk = len(ages[ages.isnull()])
    total = len(ages)
    percentUnk = (greater85 + unk) / total * 100

    # One print call per item, in the same order as the report has always
    # been written.
    report = (
        'There are ',
        greater85,
        'fake ages \n',
        'There are',
        unk,
        'previously unknown ages \n',
        'Total unknown ages is ',
        greater85 + unk,
        'out of ',
        total,
        'total ages',
        'for percentage',
        percentUnk,
    )
    for item in report:
        print(item)

There are 
884
fake ages 

There are
192432
previously unknown ages 

Total unknown ages is 
193316
out of 
1731594
total ages
for percentage
11.1640488475


In [106]:
# reassign >85 people to be unknown (nan)
for m in months:
    # Mark every age above 85 as unknown (NaN).
    # Boolean-mask assignment through .loc replaces the old
    # DataFrame.set_value call: set_value is a scalar setter (deprecated in
    # pandas 0.21, removed in 1.0) and was being handed a whole Index here.
    m.loc[m['age'] > 85, 'age'] = np.nan

    # Check that we did it correctly: no ages above 85 should remain, and the
    # share of NaNs should equal the combined percentage obtained before.
    greater85 = m.age[m.age > 85].size
    unk = m.age[m.age.isnull()].size
    total = m.age.size
    percentUnk = (greater85 + unk) / total * 100
    print('There are ')
    print(greater85)
    print('fake ages \n')
    print('There are')
    print(unk)
    print('unknown ages \n')
    print('Total unknown ages is ')
    print(greater85 + unk)
    print('out of ')
    print(total)
    print('total ages')
    print('for percentage')
    print(percentUnk)

There are 
0
fake ages 

There are
193316
previously unknown ages 

Total unknown ages is 
193316
out of 
1731594
total ages
for percentage
11.1640488475


#### Day of month, Day of week, Hour of day from starttime/stoptime

In [None]:
for m in months:
    # Derive calendar fields from the trip timestamps with the vectorised
    # .dt accessor instead of a per-row Python loop: this avoids millions of
    # scalar writes, does not rely on the removed DataFrame.set_value, and
    # works for any index rather than assuming labels 0..n-1.
    # Requires starttime/stoptime to already be datetime columns.
    start = m['starttime']
    stop = m['stoptime']

    # The columns used to be pre-filled with np.zeros and were therefore
    # float64; the cast keeps that dtype for any downstream code.
    m['dayOfMonth'] = start.dt.day.astype('float64')
    m['dayOfWeek'] = start.dt.dayofweek.astype('float64')  # Monday == 0
    m['startHour'] = start.dt.hour.astype('float64')
    m['stopHour'] = stop.dt.hour.astype('float64')

## Check to make sure everything looks good

In [None]:
# There are 1,731,594 rides in this month!
# Display the (rows, columns) dimensions of the processed frame.
Jun17.shape

In [109]:
# Preview the first rows to eyeball the renamed and derived columns.
Jun17.head()

Unnamed: 0,tripduration,starttime,stoptime,startstationid,startstationname,startstationlatitude,startstationlongitude,endstationid,endstationname,endstationlatitude,endstationlongitude,usertype,birthyear,gender,age
0,1397,2017-06-01 00:00:02,2017-06-01 00:23:19,515,W 43 St & 10 Ave,40.760094,-73.994618,3285,W 87 St & Amsterdam Ave,40.78839,-73.9747,1,1967.0,1,50.0
1,1103,2017-06-01 00:00:13,2017-06-01 00:18:37,488,W 39 St & 9 Ave,40.756458,-73.993722,297,E 15 St & 3 Ave,40.734232,-73.986923,1,1981.0,1,36.0
2,1810,2017-06-01 00:00:20,2017-06-01 00:30:31,461,E 20 St & 2 Ave,40.735877,-73.98205,465,Broadway & W 41 St,40.755136,-73.98658,1,1982.0,1,35.0
3,1760,2017-06-01 00:00:24,2017-06-01 00:29:45,2009,Catherine St & Monroe St,40.711174,-73.996826,527,E 33 St & 2 Ave,40.744023,-73.976056,1,1973.0,2,44.0
4,2165,2017-06-01 00:00:33,2017-06-01 00:36:38,360,William St & Pine St,40.707179,-74.008873,474,5 Ave & E 29 St,40.745168,-73.986831,1,1985.0,1,32.0


In [27]:
# Preview the last rows of the frame.
# NOTE(review): the saved output for this cell still shows a 'birth year'
# column and no 'age' column, so it appears to come from an earlier run --
# re-execute to refresh it.
Jun17.tail()

Unnamed: 0,tripduration,starttime,stoptime,startstationid,startstationname,startstationlatitude,startstationlongitude,endstationid,endstationname,endstationlatitude,endstationlongitude,usertype,birth year,gender
1731589,394,2017-06-30 23:59:30,2017-07-01 00:06:05,539,Metropolitan Ave & Bedford Ave,40.715348,-73.960241,3107,Bedford Ave & Nassau Ave,40.723117,-73.952123,1,1986.0,2
1731590,1171,2017-06-30 23:59:40,2017-07-01 00:19:12,195,Liberty St & Broadway,40.709056,-74.010434,438,St Marks Pl & 1 Ave,40.727791,-73.985649,1,1986.0,1
1731591,1272,2017-06-30 23:59:47,2017-07-01 00:20:59,477,W 41 St & 8 Ave,40.756405,-73.990026,3314,W 95 St & Broadway,40.79377,-73.971888,1,1982.0,1
1731592,918,2017-06-30 23:59:45,2017-07-01 00:15:03,328,Watts St & Greenwich St,40.724055,-74.00966,146,Hudson St & Reade St,40.71625,-74.009106,1,1998.0,1
1731593,1398,2017-06-30 23:59:53,2017-07-01 00:23:12,3244,University Pl & E 8 St,40.731437,-73.994903,3002,South End Ave & Liberty St,40.711512,-74.015756,1,1987.0,2
