## Chicago Divvy Bicycle Sharing Data


### Goal:
- To predict the daily need of bikes for each divvy bike station

### Step 1. Data collection
- Collect chicago divvy bicycle sharing data from [website](https://www.divvybikes.com/system-data).
- 
Other data to be considered:
- crimes reports within a week
- weather 
- other transportation (bus, metra, cta and etc.)

In [1]:
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count
import gc
import time
gc.enable()

In [2]:
station_file = 'Divvy_Stations_2017_Q3Q4.csv'
trip_file = 'Divvy_Trips_2017_Q4.csv'
output_file = 'Divvy_data_2017_Q4.feather'

In [3]:
# Load data
s = time.time()

station = pd.read_csv(f'data/{station_file}')
print(f'station data loaded! {time.time()-s:.2f} seconds')

trip = pd.read_csv(f'data/{trip_file}')
print(f'trip data loaded! {time.time()-s:.2f} seconds')

station data loaded! 0.01 seconds
trip data loaded! 1.22 seconds


In [4]:
def parse_start_time(arg):
    """
    Function to parse `start_time` data into:
    - year
    - day
    - month
    - hour
    - minute
    - second
    - dayofweek
    """
    raw = pd.to_datetime(arg['start_time'])
    time_df = pd.DataFrame()
    time_df['trip_id'] = arg['trip_id']
    for t in ['year', 'day', 'month', 'hour', 'minute', 'second', 'dayofweek']:
        time_df[t] = raw.apply(lambda x: getattr(x, t))
    return time_df

In [5]:
# Process start_time data with mutiple processors
n_thread = cpu_count()
time_df_raw = trip[['trip_id', 'start_time']]
args = np.array_split(time_df_raw, n_thread)

with Pool(processes=n_thread) as p:
    result = p.map(parse_start_time, args)

time_df = pd.concat(list(result), ignore_index=True)

# Merge postprocessed data into main dataframe
trip = pd.merge(left=trip, right=time_df, on='trip_id', how='left')

print(f'start_time data processed! {time.time()-s:.2f} seconds')

start_time data processed! 25.35 seconds


In [6]:
# Convert birthyear into age of each customer
trip['Age'] = trip['year'] - trip['birthyear']

# Merge station data into trip data (start)
trip = (pd.merge(left=trip, right=station, left_on='from_station_id', right_on='id', how='left')
          .drop(['id', 'name', 'online_date'], axis=1)
          .rename(columns={'city': 'from_city', 
                           'latitude': 'from_latitude',
                           'longitude': 'from_longitude',
                           'dpcapacity': 'from_dpcapacity'})
    )

# Merge station data into trip data (end)
trip = (pd.merge(left=trip, right=station, left_on='to_station_id', right_on='id', how='left')
          .drop(['id', 'name', 'online_date'], axis=1)
          .rename(columns={'city': 'to_city', 
                           'latitude': 'to_latitude',
                           'longitude': 'to_longitude',
                           'dpcapacity': 'to_dpcapacity'})
    )

# Drop useless columns
trip = trip.drop(['start_time', 'end_time', 'birthyear'], axis=1)

print(f'data process done! {time.time()-s:.2f} seconds')

data process done! 27.20 seconds


In [7]:
# Saving to feather file
trip.to_feather(f'data/{output_file}')
print(f'Data saved to feather file! {time.time()-s:.2f} seconds')

Data saved to feather file! 32.13 seconds


In [8]:
trip

Unnamed: 0,trip_id,bikeid,tripduration,from_station_id,from_station_name,to_station_id,to_station_name,usertype,gender,year,...,from_city,from_latitude,from_longitude,from_dpcapacity,Unnamed: 7_x,to_city,to_latitude,to_longitude,to_dpcapacity,Unnamed: 7_y
0,17536701,3304,284,159,Claremont Ave & Hirsch St,69,Damen Ave & Pierce Ave,Subscriber,Male,2017,...,Chicago,41.907781,-87.685854,11,,Chicago,41.909396,-87.677692,19,
1,17536700,5975,1402,145,Mies van der Rohe Way & Chestnut St,145,Mies van der Rohe Way & Chestnut St,Customer,,2017,...,Chicago,41.898587,-87.621915,19,,Chicago,41.898587,-87.621915,19,
2,17536699,4906,1441,145,Mies van der Rohe Way & Chestnut St,145,Mies van der Rohe Way & Chestnut St,Customer,,2017,...,Chicago,41.898587,-87.621915,19,,Chicago,41.898587,-87.621915,19,
3,17536698,5667,315,340,Clark St & Wrightwood Ave,143,Sedgwick St & Webster Ave,Subscriber,Male,2017,...,Chicago,41.929546,-87.643118,15,,Chicago,41.922167,-87.638888,15,
4,17536697,5353,272,240,Sheridan Rd & Irving Park Rd,245,Clarendon Ave & Junior Ter,Subscriber,Male,2017,...,Chicago,41.954245,-87.654406,27,,Chicago,41.961004,-87.649603,23,
5,17536696,5840,589,93,Sheffield Ave & Willow St,343,Racine Ave & Wrightwood Ave,Subscriber,Male,2017,...,Chicago,41.913688,-87.652855,15,,Chicago,41.928887,-87.658971,15,
6,17536695,6351,301,337,Clark St & Chicago Ave,182,Wells St & Elm St,Subscriber,Male,2017,...,Chicago,41.896544,-87.630931,19,,Chicago,41.903222,-87.634324,23,
7,17536694,2562,141,226,Racine Ave & Belmont Ave,117,Wilton Ave & Belmont Ave,Subscriber,Male,2017,...,Chicago,41.939743,-87.658865,15,,Chicago,41.940180,-87.653040,23,
8,17536693,2471,615,49,Dearborn St & Monroe St,26,McClurg Ct & Illinois St,Subscriber,Male,2017,...,Chicago,41.881320,-87.629521,39,,Chicago,41.890359,-87.617532,31,
9,17536692,6462,743,196,Cityfront Plaza Dr & Pioneer Ct,255,Indiana Ave & Roosevelt Rd,Subscriber,Male,2017,...,Chicago,41.890573,-87.622072,23,,Chicago,41.867888,-87.623041,39,


In [9]:
trip.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 669239 entries, 0 to 669238
Data columns (total 27 columns):
trip_id              669239 non-null int64
bikeid               669239 non-null int64
tripduration         669239 non-null int64
from_station_id      669239 non-null int64
from_station_name    669239 non-null object
to_station_id        669239 non-null int64
to_station_name      669239 non-null object
usertype             669239 non-null object
gender               590659 non-null object
year                 669239 non-null int64
day                  669239 non-null int64
month                669239 non-null int64
hour                 669239 non-null int64
minute               669239 non-null int64
second               669239 non-null int64
dayofweek            669239 non-null int64
Age                  590412 non-null float64
from_city            669239 non-null object
from_latitude        669239 non-null float64
from_longitude       669239 non-null float64
from_dpcapacity   

In [11]:
trip[['year', 'month', 'day', 'hour']]

Unnamed: 0,year,month,day,hour
0,2017,12,31,23
1,2017,12,31,23
2,2017,12,31,23
3,2017,12,31,23
4,2017,12,31,23
5,2017,12,31,23
6,2017,12,31,23
7,2017,12,31,23
8,2017,12,31,23
9,2017,12,31,22


In [12]:
24*90

2160

In [13]:
700000/2160

324.0740740740741