## Preprocessing

- Clean up divvy bike trip data from 2013 to 2018

In [1]:
import numpy as np
import pandas as pd
import re
from multiprocessing import Pool, cpu_count
import gc
import os
import time
gc.enable()

In [17]:
# Deal with year data with different filenames

TRIP_FILE = 'data/Divvy_Trips'
YEAR = 2018

raw_file_name = {
    '2013': ['all'],
    '2014': ['Q1Q2', 'Q3-07', 'Q3-0809', 'Q4'],
    '2015': ['Q1', 'Q2', '07', '08', '09', 'Q4'],
    '2016': ['Q1', '04', '05', '06', 'Q3', 'Q4'],
    '2017': ['Q1', 'Q2', 'Q3', 'Q4'],
    '2018': ['Q1', 'Q2', 'Q3', 'Q4']
}

In [18]:
%%time
# Load data

df_lst = []
for q in raw_file_name[str(YEAR)]:
    print(f'Loading from {TRIP_FILE}_{YEAR}_{q}.csv...')
    df_tmp = pd.read_csv(TRIP_FILE+'_'+str(YEAR)+'_'+q+'.csv')
    print(f'{TRIP_FILE}_{YEAR}_{q}.csv loaded!')
    df_lst.append(df_tmp)
    
trip = pd.concat(df_lst, ignore_index=True)

# Clean up
del df_lst
del df_tmp
gc.collect()

Loading from data/Divvy_Trips_2018_Q1.csv...
data/Divvy_Trips_2018_Q1.csv loaded!
Loading from data/Divvy_Trips_2018_Q2.csv...
data/Divvy_Trips_2018_Q2.csv loaded!
Loading from data/Divvy_Trips_2018_Q3.csv...
data/Divvy_Trips_2018_Q3.csv loaded!
Loading from data/Divvy_Trips_2018_Q4.csv...
data/Divvy_Trips_2018_Q4.csv loaded!


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.


CPU times: user 14.9 s, sys: 1.77 s, total: 16.7 s
Wall time: 14.6 s


In [19]:
if 'starttime' in trip.columns:
    trip.rename(columns={"starttime": "start_time"}, inplace=True)

In [20]:
trip.head()

Unnamed: 0,01 - Rental Details Bike ID,01 - Rental Details Duration In Seconds Uncapped,01 - Rental Details Local End Time,01 - Rental Details Local Start Time,01 - Rental Details Rental ID,02 - Rental End Station ID,02 - Rental End Station Name,03 - Rental Start Station ID,03 - Rental Start Station Name,05 - Member Details Member Birthday Year,...,end_time,from_station_id,from_station_name,gender,start_time,to_station_id,to_station_name,trip_id,tripduration,usertype
0,3304.0,323.0,2018-01-01 00:17:23,2018-01-01 00:12:00,17536702.0,159.0,Claremont Ave & Hirsch St,69.0,Damen Ave & Pierce Ave,1988.0,...,,,,,,,,,,
1,5367.0,377.0,2018-01-01 00:47:52,2018-01-01 00:41:35,17536703.0,325.0,Clark St & Winnemac Ave (Temp),253.0,Winthrop Ave & Lawrence Ave,1984.0,...,,,,,,,,,,
2,4599.0,2904.0,2018-01-01 01:33:10,2018-01-01 00:44:46,17536704.0,509.0,Troy St & North Ave,98.0,LaSalle St & Washington St,1989.0,...,,,,,,,,,,
3,2302.0,747.0,2018-01-01 01:05:37,2018-01-01 00:53:10,17536705.0,364.0,Larrabee St & Oak St,125.0,Rush St & Hubbard St,1983.0,...,,,,,,,,,,
4,3696.0,183.0,2018-01-01 00:56:40,2018-01-01 00:53:37,17536706.0,205.0,Paulina St & 18th St,129.0,Blue Island Ave & 18th St,1989.0,...,,,,,,,,,,


In [21]:
trip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3603082 entries, 0 to 3603081
Data columns (total 24 columns):
01 - Rental Details Bike ID                         float64
01 - Rental Details Duration In Seconds Uncapped    object
01 - Rental Details Local End Time                  object
01 - Rental Details Local Start Time                object
01 - Rental Details Rental ID                       float64
02 - Rental End Station ID                          float64
02 - Rental End Station Name                        object
03 - Rental Start Station ID                        float64
03 - Rental Start Station Name                      object
05 - Member Details Member Birthday Year            float64
Member Gender                                       object
User Type                                           object
bikeid                                              float64
birthyear                                           float64
end_time                                            ob

In [10]:
def _get_time(string, tp):
    index_dict = {
        'month': 0,
        'day': 1,
        'hour': 2
    }
    if YEAR == 2013:
        return int(re.match(r'[0-9]+-([0-9]+)-([0-9])+ ([0-9]+):', string).groups()[index_dict[tp]])
    else:
        return int(re.match(r'([0-9]+)/([0-9]+)/[0-9]+ ([0-9]+):', string).groups()[index_dict[tp]])
    
def parse_start_time(args):
    """
    Function to parse `start_time` data into:
    - day
    - month
    - hour
    """
    args['year'] = YEAR
    for t in ['day', 'month', 'hour']:
        args[t] = args.apply(lambda x: _get_time(x.start_time, t), axis=1)
    return args.drop('start_time', axis=1)

In [11]:
%%time
# Process start_time data with mutiple processors
n_thread = 4
time_df_raw = trip[['trip_id', 'start_time']]
args = np.array_split(time_df_raw, n_thread)

with Pool(processes=n_thread) as p:
    result = p.map(parse_start_time, args)

time_df = pd.concat(list(result), ignore_index=True)

# Merge postprocessed data into main dataframe
trip = pd.merge(left=trip, right=time_df, on='trip_id', how='left')

print(f'start_time data processed!')

start_time data processed!
CPU times: user 1.01 s, sys: 349 ms, total: 1.36 s
Wall time: 30.3 s


In [14]:
# Convert birthyear into age of each customer
if 'birthyear' in trip.columns:
    trip['Age'] = trip['year'] - trip['birthyear']
else:
    trip['Age'] = np.nan

# # Merge station data into trip data (start)
# trip = (pd.merge(left=trip, right=station, left_on='from_station_id', right_on='id', how='left')
#           .drop(['id', 'name', 'online_date'], axis=1)
#           .rename(columns={'city': 'from_city', 
#                            'latitude': 'from_latitude',
#                            'longitude': 'from_longitude',
#                            'dpcapacity': 'from_dpcapacity'})
#     )

# # Merge station data into trip data (end)
# trip = (pd.merge(left=trip, right=station, left_on='to_station_id', right_on='id', how='left')
#           .drop(['id', 'name', 'online_date'], axis=1)
#           .rename(columns={'city': 'to_city', 
#                            'latitude': 'to_latitude',
#                            'longitude': 'to_longitude',
#                            'dpcapacity': 'to_dpcapacity'})
#     )

# Drop useless columns
for uc in ['start_time', 'end_time', 'stoptime', 'birthyear']:
    if uc in trip.columns:
        trip = trip.drop(uc, axis=1)
    
for c in trip.columns:   
    if 'Unnamed' in c:
        trip = trip.drop([c], axis=1)

print(f'data process done!')

data process done!


In [15]:
trip.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 759788 entries, 0 to 759787
Data columns (total 15 columns):
trip_id              759788 non-null int64
bikeid               759788 non-null int64
tripduration         759788 non-null int64
from_station_id      759788 non-null int64
from_station_name    759788 non-null object
to_station_id        759788 non-null int64
to_station_name      759788 non-null object
usertype             759788 non-null object
gender               403046 non-null object
birthday             402909 non-null float64
year                 759788 non-null int64
day                  759788 non-null int64
month                759788 non-null int64
hour                 759788 non-null int64
Age                  0 non-null float64
dtypes: float64(2), int64(9), object(4)
memory usage: 92.7+ MB


In [16]:
# Saving to feather file
trip.to_feather(f'data/Divvy_data_{YEAR}.feather')
print(f'Data saved to feather file!')

Data saved to feather file!
