In [1]:
import numpy as np
import pandas as pd
import os
import requests, zipfile, io, glob
from datetime import datetime

### Download the data directly from the Indego website

In [2]:
# Base URL of the uploads directory that hosts the quarterly trip archives.
url_head = "http://u626n26h74f16ig1p3pt0f2g-wpengine.netdna-ssl.com/wp-content/uploads/"

# Archive paths relative to url_head, one per quarter, newest quarter first.
file_list = [
    "2018/04/indego-trips-2018-q1.csv.zip",
    "2018/01/indego-trips-2017-q4.csv.zip",
    "2015/12/indego-trips-2017-q3.csv.zip",
    "2017/07/indego_gbfs_trips_Q2_2017.csv.zip",
    "2017/04/indego_gbfs_trips_Q1_2017.zip",
    "2017/01/Indego_trips_Q4_2016.zip",
    "2016/10/Q3_2016_trips.zip",
    "2016/07/Indego_Trips_2016Q2.zip",
    "2016/07/Indego_Trips_2016Q1.zip",
    "2016/01/Indego_Trips_2015Q4.zip",
    "2016/01/Indego_Trips_2015Q3.zip",
    "2016/01/Indego_Trips_2015Q2.zip",
]

* Note that the CSV file for 2017 Q4 is named "data/indego-quarter-echo.bicycletransit.com-2018-01-19-9-57 AM.csv", which does not follow the naming pattern of the other quarters.

In [3]:
# Directory (relative to the working directory) that receives the extracted CSV files.
data_path = "data/"
# exist_ok=True keeps the cell idempotent and avoids the race between
# checking for the directory and creating it.
os.makedirs(data_path, exist_ok=True)

In [4]:
# Browser-like User-Agent header sent with each download request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0",
}

In [5]:
# Download every quarterly archive and unpack its contents into data_path.
for file in file_list:
    url = url_head + file
    # The body is read into memory in one go below, so streaming buys nothing;
    # the timeout keeps a stalled connection from hanging the notebook.
    r = requests.get(url, headers=headers, timeout=60)
    # Fail loudly on an HTTP error instead of handing an error page to
    # zipfile, which would surface as an unhelpful BadZipFile.
    r.raise_for_status()
    # The context manager closes the archive once extraction is done.
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall(data_path)

### Data Format

https://www.rideindego.com/about/data/

Each .csv file contains data for one quarter of the year. Each file contains the following data points:

- trip_id: Locally unique integer that identifies the trip
- duration: Length of trip in minutes (some quarterly files record it in seconds instead; these are converted to minutes when the files are merged below)
- start_time: The date/time when the trip began, presented in ISO 8601 format in local time
- end_time: The date/time when the trip ended, presented in ISO 8601 format in local time
- start_station: The station ID where the trip originated (for station name and more information on each station see the Station Table)
- start_lat: The latitude of the station where the trip originated
- start_lon: The longitude of the station where the trip originated
- end_station: The station ID where the trip terminated (for station name and more information on each station see the Station Table)
- end_lat: The latitude of the station where the trip terminated
- end_lon: The longitude of the station where the trip terminated
- bike_id:  Locally unique integer that identifies the bike
- plan_duration: The number of days that the plan the passholder is using entitles them to ride; 0 is used for a single ride plan (Walk-up)
- trip_route_category: “Round Trip” for trips starting and ending at the same station or “One Way” for all other trips
- passholder_type: The name of the passholder’s plan


### Check the csv files

In [3]:
%%bash
# Total number of lines across all extracted CSV files (header lines included).
cat data/*.csv | wc -l
# Print the first two lines (header and first data row) of every CSV file.
head -2 data/*.csv

 1972680
==> data/Indego_Trips_2015Q2.csv <==
trip_id,duration,start_time,end_time,start_station_id,start_lat,start_lon,end_station_id,end_lat,end_lon,bike_id,plan_duration,trip_route_category,passholder_type
4001912,60,4/23/15 7:44,4/23/15 7:45,3046,39.95012,-75.14472,3046,39.95012,-75.14472,3350,30,Round Trip,Indego30

==> data/Indego_Trips_2015Q3.csv <==
trip_id,duration,start_time,end_time,start_station_id,start_lat,start_lon,end_station_id,end_lat,end_lon,bike_id,plan_duration,trip_route_category,passholder_type
4150104,660,7/1/15 0:06,7/1/15 0:17,3068,39.93549,-75.16711,3028,39.94061,-75.14958,3712,30,One Way,Indego30

==> data/Indego_Trips_2015Q4.csv <==
trip_id,duration,start_time,end_time,start_station_id,start_lat,start_lon,end_station_id,end_lat,end_lon,bike_id,plan_duration,trip_route_category,passholder_type
4376433,1140,10/1/15 0:04,10/1/15 0:23,3041,39.96849,-75.13546,3026,39.94138,-75.14564,3302,0,One Way,Walk-up

==> data/Indego_Trips_2016Q1.csv <==
trip_id,durat

### Merge all csv files into a single data frame

In [7]:
# Unified column names applied to every quarterly CSV when it is loaded.
col_names = [
    "trip_id",
    "duration",
    "start_time",
    "end_time",
    "start_station",
    "start_lat",
    "start_lon",
    "end_station",
    "end_lat",
    "end_lon",
    "bike_id",
    "plan_duration",
    "trip_route_category",
    "passholder_type",
]

#### The datetime formats are also normalized to a single standard format

In [73]:
# Upper bound, in minutes, used to tell minute-based files from second-based
# ones: every per-file summary printed by this cell tops out at 1440 minutes
# (24 hours), so a larger maximum means the file records seconds.
MAX_TRIP_MINUTES = 24 * 60


def _datetime_format_for(csv_path):
    """Return the strptime format of the timestamps in the file at ``csv_path``.

    The format is chosen from the file name: 2015 files use two-digit years,
    2016 and Q1 2017 files use four-digit years, and all remaining files use
    ISO-style ``YYYY-MM-DD HH:MM:SS`` timestamps.
    """
    if "2015" in csv_path:
        return "%m/%d/%y %H:%M"
    if "2016" in csv_path or "Q1_2017" in csv_path:
        return "%m/%d/%Y %H:%M"
    return "%Y-%m-%d %H:%M:%S"


# Sort the paths so the merge order (and therefore the row order of df_total)
# is the same on every run; glob itself returns files in arbitrary order.
all_files = sorted(glob.glob(data_path + "*.csv"))
row_num = 0
frames = []
for csv_path in all_files:
    # header=0 together with names= replaces each file's own header row with
    # the unified column names. Malformed lines are skipped; on_bad_lines
    # replaces the error_bad_lines flag that pandas 2.0 removed.
    df = pd.read_csv(csv_path, index_col=None, header=0, names=col_names,
                     na_values="\\N", low_memory=False, on_bad_lines="skip")

    time_format = _datetime_format_for(csv_path)
    for time_col in ("start_time", "end_time"):
        df[time_col] = pd.to_datetime(df[time_col], format=time_format)

    # Some files record durations in seconds. Testing for "longer than a day
    # in minutes" also catches files whose longest trip is not exactly 86400 s.
    if df["duration"].max() > MAX_TRIP_MINUTES:
        df["duration"] = df["duration"] / 60

    print(csv_path, df.shape)
    print(df["duration"].describe())
    row_num += df.shape[0]
    frames.append(df)

# Concatenate once instead of appending inside the loop, which re-copied the
# growing frame each time (and DataFrame.append no longer exists in pandas 2.x).
# Row labels are kept as read from each file, as with the repeated append.
df_total = pd.concat(frames) if frames else pd.DataFrame()

print("The total number of rows are %d." % row_num)

data/Q3_2016_trips.csv (234946, 14)
count    234946.000000
mean         22.392197
std          58.963784
min           1.000000
25%           8.000000
50%          12.000000
75%          20.000000
max        1440.000000
Name: duration, dtype: float64
data/indego-trips-2018-q1.csv (98993, 14)
count    98993.000000
mean        20.656754
std         77.190572
min          1.000000
25%          7.000000
50%         10.000000
75%         16.000000
max       1440.000000
Name: duration, dtype: float64
data/indego-quarter-echo.bicycletransit.com-2018-01-19-9-57 AM.csv (183909, 14)
count    183909.000000
mean         19.846054
std          63.786005
min           1.000000
25%           7.000000
50%          11.000000
75%          17.000000
max        1440.000000
Name: duration, dtype: float64
data/indego_gbfs_trips_Q1_2017.csv (107772, 14)
count    107772.000000
mean         20.217496
std          70.158835
min           1.000000
25%           7.000000
50%          10.000000
75%          16.000

#### Check dataframe basic information

In [74]:
# Show column dtypes and total size; memory_usage='deep' makes pandas measure
# the actual memory held by object columns instead of estimating it.
df_total.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1972668 entries, 0 to 119558
Data columns (total 14 columns):
trip_id                int64
duration               float64
start_time             datetime64[ns]
end_time               datetime64[ns]
start_station          float64
start_lat              float64
start_lon              float64
end_station            int64
end_lat                float64
end_lon                float64
bike_id                object
plan_duration          int64
trip_route_category    object
passholder_type        object
dtypes: datetime64[ns](2), float64(6), int64(3), object(3)
memory usage: 495.3 MB


In [75]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_total.describe()

Unnamed: 0,trip_id,duration,start_station,start_lat,start_lon,end_station,end_lat,end_lon,plan_duration
count,1972668.0,1972668.0,1972658.0,1972481.0,1972481.0,1972668.0,1962227.0,1962227.0,1972668.0
mean,86347790.0,22.96095,3050.59,39.95139,-75.167,3090.188,39.83508,-74.94811,32.30585
std,78388650.0,68.37574,178.7423,0.2054352,0.386294,1882.986,2.15882,4.061713,50.65907
min,4001912.0,1.0,3000.0,0.0,-75.22399,3000.0,0.0,-75.22399,0.0
25%,4603069.0,8.0,3023.0,39.94561,-75.17952,3022.0,39.94527,-75.17939,30.0
50%,68783240.0,12.0,3045.0,39.95112,-75.16757,3045.0,39.95071,-75.16711,30.0
75%,155348400.0,20.0,3066.0,39.95923,-75.15813,3064.0,39.95662,-75.15716,30.0
max,242690400.0,1440.0,90018.0,39.99179,0.0,90255.0,39.99179,0.0,365.0


#### Check NA values

In [76]:
# Count the missing values in each column.
df_total.isnull().sum()

trip_id                    0
duration                   0
start_time                 0
end_time                   0
start_station             10
start_lat                187
start_lon                187
end_station                0
end_lat                10441
end_lon                10441
bike_id                  910
plan_duration              0
trip_route_category        0
passholder_type            0
dtype: int64

#### Save the dataframe to a Python pickle file

In [77]:
# Write the merged dataframe to 'indego_df.pkl' in the working directory.
# NOTE(review): pickle files should only be loaded back from trusted sources.
df_total.to_pickle('indego_df.pkl') 