In [1]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from datetime import datetime

# 1.0 Dask Processing
The other information for the project is small enough to store in memory, but with 13M individual rides, I need to use dask to process and break up the rides

In [2]:
columns = ['trip_id', 'start_time', 'end_time', 'bikeid', 'tripduration',
       'from_station_id', 'from_station_name', 'to_station_id',
       'to_station_name', 'usertype', 'gender', 'birthyear']

In [3]:
rides = dd.read_csv('/home/michael/Documents/Projects/divvydataproject/data/Divvy Trips/Divvy_Trips_*.csv',
                    header = 0, names = columns, parse_dates = ['start_time','end_time'], dtype={'gender': 'object'})

In [4]:
rides.head()

Unnamed: 0,trip_id,start_time,end_time,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,birthyear
0,4118,2013-06-27 12:11:00,2013-06-27 12:16:00,480,316,85,Michigan Ave & Oak St,28,Larrabee St & Menomonee St,Customer,,
1,4275,2013-06-27 14:44:00,2013-06-27 14:45:00,77,64,32,Racine Ave & Congress Pkwy,32,Racine Ave & Congress Pkwy,Customer,,
2,4291,2013-06-27 14:58:00,2013-06-27 15:05:00,77,433,32,Racine Ave & Congress Pkwy,19,Loomis St & Taylor St,Customer,,
3,4316,2013-06-27 15:06:00,2013-06-27 15:09:00,77,123,19,Loomis St & Taylor St,19,Loomis St & Taylor St,Customer,,
4,4342,2013-06-27 15:13:00,2013-06-27 15:27:00,77,852,19,Loomis St & Taylor St,55,Halsted St & James M Rochford St,Customer,,


In [5]:
rides.dtypes

trip_id                       int64
start_time           datetime64[ns]
end_time             datetime64[ns]
bikeid                        int64
tripduration                  int64
from_station_id               int64
from_station_name            object
to_station_id                 int64
to_station_name              object
usertype                     object
gender                       object
birthyear                   float64
dtype: object

## 1.1 Aggregating by Day

In [12]:
rides['startdate'] = rides['start_time'].dt.date

In [14]:
dayagg = rides.groupby(['startdate'])[['trip_id']].count().compute()

In [15]:
dayagg

Unnamed: 0_level_0,trip_id
startdate,Unnamed: 1_level_1
2013-06-27,95
2013-06-28,897
2013-06-29,1201
2013-06-30,1812
2013-07-01,1559
2013-07-02,1108
2013-07-03,1007
2013-07-04,2956
2013-07-05,2463
2013-07-06,3001


In [16]:
dayagg.to_pickle('/home/michael/Documents/Projects/divvydataproject/data/dateagg.pkl')

## 1.2 Aggregating by date and location

In [11]:
rides['starthour'] = rides['start_time'].dt.round('H')
houragg = rides.groupby(['starthour','to_station_id'])[['trip_id']].count().compute()

In [9]:
wrigleystations = [114, 165, 304, 240, 256]
#wrigley = rides[rides['to_station_id'].isin(wrigleystations)]

In [16]:
houragg2 = houragg.reset_index(level = ['starthour','to_station_id'])

In [18]:
houragg2.to_pickle('/home/michael/Documents/Projects/divvydataproject/data/houraggsmall.pkl')

In [28]:
houragg2.shape

(4739356, 3)

In [19]:
houraggfull = rides.groupby(['starthour','to_station_id','from_station_id'])[['trip_id']].count().compute()

In [24]:
houraggfull2 = houraggfull.reset_index(level = ['starthour','to_station_id','from_station_id'])

In [25]:
houraggfull2.to_pickle('/home/michael/Documents/Projects/divvydataproject/data/houraggfull.pkl')

In [26]:
houraggfull2.shape

(11200222, 4)