## Preprocessing

- Clean up divvy bike trip data from 2013 to 2018

In [1]:
import numpy as np
import pandas as pd
import re
from multiprocessing import Pool, cpu_count
import gc
import os
import time
gc.enable()

In [2]:
# Deal with year data with different filenames

TRIP_FILE = 'data/Divvy_Trips'
YEAR = 2018

raw_file_name = {
    '2013': ['all'],
    '2014': ['Q1Q2', 'Q3-07', 'Q3-0809', 'Q4'],
    '2015': ['Q1', 'Q2', '07', '08', '09', 'Q4'],
    '2016': ['Q1', '04', '05', '06', 'Q3', 'Q4'],
    '2017': ['Q1', 'Q2', 'Q3', 'Q4'],
    '2018': ['Q1', 'Q2', 'Q3', 'Q4']
}

In [3]:
%%time
# Load data

df_lst = []
for q in raw_file_name[str(YEAR)]:
    print(f'Loading from {TRIP_FILE}_{YEAR}_{q}.csv...')
    df_tmp = pd.read_csv(TRIP_FILE+'_'+str(YEAR)+'_'+q+'.csv')
    
    if YEAR == 2018 and q == 'Q1':
        df_tmp.rename(
            columns={
                "01 - Rental Details Rental ID": "trip_id",
                "01 - Rental Details Local Start Time": "start_time",
                "01 - Rental Details Local End Time": "end_time",
                "01 - Rental Details Bike ID": "bikeid",
                "01 - Rental Details Duration In Seconds Uncapped": "tripduration",
                "03 - Rental Start Station ID": "from_station_id",
                "03 - Rental Start Station Name": "from_station_name", 
                "02 - Rental End Station ID": "to_station_id",
                "02 - Rental End Station Name": "to_station_name", 
                "User Type": "usertype",
                "Member Gender": "gender",
                "05 - Member Details Member Birthday Year": "birthyear"
                    }, 
            inplace=True
        )
    print(f'{TRIP_FILE}_{YEAR}_{q}.csv loaded!')
    df_lst.append(df_tmp)
    
trip = pd.concat(df_lst, ignore_index=True)

# Clean up
del df_lst
del df_tmp
gc.collect()

Loading from data/Divvy_Trips_2018_Q1.csv...
data/Divvy_Trips_2018_Q1.csv loaded!
Loading from data/Divvy_Trips_2018_Q2.csv...
data/Divvy_Trips_2018_Q2.csv loaded!
Loading from data/Divvy_Trips_2018_Q3.csv...
data/Divvy_Trips_2018_Q3.csv loaded!
Loading from data/Divvy_Trips_2018_Q4.csv...
data/Divvy_Trips_2018_Q4.csv loaded!
CPU times: user 12.9 s, sys: 1.09 s, total: 14 s
Wall time: 12.6 s


In [4]:
# Rename column names from `startime` to `start_time` for consistency
if 'starttime' in trip.columns:
    trip.rename(columns={"starttime": "start_time"}, inplace=True)

In [5]:
trip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3603082 entries, 0 to 3603081
Data columns (total 12 columns):
trip_id              int64
start_time           object
end_time             object
bikeid               int64
tripduration         object
from_station_id      int64
from_station_name    object
to_station_id        int64
to_station_name      object
usertype             object
gender               object
birthyear            float64
dtypes: float64(1), int64(4), object(7)
memory usage: 329.9+ MB


In [6]:
def _get_time(string, tp):
    index_dict = {
        'month': 0,
        'day': 1,
        'hour': 2
    }
    if '-' in string:
        return int(re.match(r'[0-9]+-([0-9]+)-([0-9]+) ([0-9]+):', string).groups()[index_dict[tp]])
    else:
        return int(re.match(r'([0-9]+)/([0-9]+)/[0-9]+ ([0-9]+):', string).groups()[index_dict[tp]])
    
def parse_start_time(args):
    """
    Function to parse `start_time` data into:
    - day
    - month
    - hour
    """
    args['year'] = YEAR
    for t in ['day', 'month', 'hour']:
        args[t] = args.apply(lambda x: _get_time(x.start_time, t), axis=1)
    return args.drop('start_time', axis=1)

In [7]:
%%time
# Process start_time data with mutiple processors
n_thread = 4
time_df_raw = trip[['trip_id', 'start_time']]
args = np.array_split(time_df_raw, n_thread)

with Pool(processes=n_thread) as p:
    result = p.map(parse_start_time, args)

time_df = pd.concat(list(result), ignore_index=True)

# Merge postprocessed data into main dataframe
trip = pd.merge(left=trip, right=time_df, on='trip_id', how='left')

print(f'start_time data processed!')

start_time data processed!
CPU times: user 5.32 s, sys: 2.79 s, total: 8.11 s
Wall time: 1min 57s


In [8]:
# Convert birthyear into age of each customer
if 'birthyear' in trip.columns:
    trip['Age'] = trip['year'] - trip['birthyear']
else:
    trip['Age'] = np.nan

# # Merge station data into trip data (start)
# trip = (pd.merge(left=trip, right=station, left_on='from_station_id', right_on='id', how='left')
#           .drop(['id', 'name', 'online_date'], axis=1)
#           .rename(columns={'city': 'from_city', 
#                            'latitude': 'from_latitude',
#                            'longitude': 'from_longitude',
#                            'dpcapacity': 'from_dpcapacity'})
#     )

# # Merge station data into trip data (end)
# trip = (pd.merge(left=trip, right=station, left_on='to_station_id', right_on='id', how='left')
#           .drop(['id', 'name', 'online_date'], axis=1)
#           .rename(columns={'city': 'to_city', 
#                            'latitude': 'to_latitude',
#                            'longitude': 'to_longitude',
#                            'dpcapacity': 'to_dpcapacity'})
#     )

# Drop useless columns
for uc in ['start_time', 'end_time', 'stoptime', 'birthyear']:
    if uc in trip.columns:
        trip = trip.drop(uc, axis=1)
    
for c in trip.columns:   
    if 'Unnamed' in c:
        trip = trip.drop([c], axis=1)

print(f'data process done!')

data process done!


In [9]:
trip.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3603082 entries, 0 to 3603081
Data columns (total 14 columns):
trip_id              int64
bikeid               int64
tripduration         object
from_station_id      int64
from_station_name    object
to_station_id        int64
to_station_name      object
usertype             object
gender               object
year                 int64
day                  int64
month                int64
hour                 int64
Age                  float64
dtypes: float64(1), int64(8), object(5)
memory usage: 412.3+ MB


In [10]:
# Saving to feather file
trip.to_feather(f'data/Divvy_data_{YEAR}.feather')
print(f'Data saved to feather file!')

Data saved to feather file!
