## Chicago Divvy Bicycle Sharing Data

Other data to be considered:
- crimes reports within a week
- weather 
- other transportation (bus, metra, cta and etc.)

In [1]:
import numpy as np
import pandas as pd
from multiprocessing import Pool, cpu_count
import gc
import time
gc.enable()

In [2]:
station_file = 'Divvy_Stations_2017_Q3Q4.csv'
trip_file = 'Divvy_Trips_2017_Q4.csv'
output_file = 'Divvy_data_2017_Q4.feather'

In [3]:
# Load data
s = time.time()

station = pd.read_csv(f'data/{station_file}')
print(f'station data loaded! {time.time()-s:.2f} seconds')

trip = pd.read_csv(f'data/{trip_file}')
print(f'trip data loaded! {time.time()-s:.2f} seconds')

station data loaded! 0.01 seconds
trip data loaded! 1.94 seconds


In [4]:
def parse_start_time(arg):
    """
    Function to parse `start_time` data into:
    - year
    - day
    - month
    - hour
    - minute
    - second
    - dayofweek
    """
    raw = pd.to_datetime(arg['start_time'])
    time_df = pd.DataFrame()
    time_df['trip_id'] = arg['trip_id']
    for t in ['year', 'day', 'month', 'hour', 'minute', 'second', 'dayofweek']:
        time_df[t] = raw.apply(lambda x: getattr(x, t))
    return time_df

In [5]:
# Process start_time data with mutiple processors
n_thread = cpu_count()
time_df_raw = trip[['trip_id', 'start_time']]
args = np.array_split(time_df_raw, n_thread)

with Pool(processes=n_thread) as p:
    result = p.map(parse_start_time, args)

time_df = pd.concat(list(result), ignore_index=True)

# Merge postprocessed data into main dataframe
trip = pd.merge(left=trip, right=time_df, on='trip_id', how='left')

print(f'start_time data processed! {time.time()-s:.2f} seconds')

start_time data processed! 30.85 seconds


In [6]:
# Convert birthyear into age of each customer
trip['Age'] = trip['year'] - trip['birthyear']

# Merge station data into trip data (start)
trip = (pd.merge(left=trip, right=station, left_on='from_station_id', right_on='id', how='left')
          .drop(['id', 'name', 'online_date'], axis=1)
          .rename(columns={'city': 'from_city', 
                           'latitude': 'from_latitude',
                           'longitude': 'from_longitude',
                           'dpcapacity': 'from_dpcapacity'})
    )

# Merge station data into trip data (end)
trip = (pd.merge(left=trip, right=station, left_on='to_station_id', right_on='id', how='left')
          .drop(['id', 'name', 'online_date'], axis=1)
          .rename(columns={'city': 'to_city', 
                           'latitude': 'to_latitude',
                           'longitude': 'to_longitude',
                           'dpcapacity': 'to_dpcapacity'})
    )

# Drop useless columns
trip = trip.drop(['start_time', 'end_time', 'birthyear'], axis=1)

print(f'data process done! {time.time()-s:.2f} seconds')

data process done! 33.86 seconds


In [7]:
# Saving to feather file
trip.to_feather(f'data/{output_file}')
print(f'Data saved to feather file! {time.time()-s:.2f} seconds')

Data saved to feather file! 37.89 seconds
