In [1]:
import numpy as np
import pandas as pd
import math
import scipy.stats
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import Point, Polygon
pd.options.display.float_format = '{:.3f}'.format



In [2]:
def print_nulls(df):
    
    return df.isnull().sum()*100/df.isnull().isnull().count()

## What we know about the data already

I merged the data from the indego website https://www.rideindego.com/about/data/

The data ranges from the start of the program in _start_date_ 2015 through to _end date_2020_. Per the website above trips shorter than 1 minute have been removed and trip length has been capped at 24 hours. I will keep this in mind as I consider outliers. 

We do not have any information other than start and end points, about the routes that riders took.

We also must consider that this is a biased dataset. These data represents a specific population of bikers. 

In [3]:
raw_data = pd.read_csv(r'data/indego/indego-trips-all.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [4]:
df = raw_data.copy()

In [5]:
# Type errors let us know that we may have some problems with missing or improperly typed data. 
# We already know this from merging our data.

In [6]:
# Drop the unique identifier
df.drop('trip_id', axis=1, inplace=True)

## Data Cleaning

In [7]:
# Encode trip type
mask = df['trip_route_category'] == 'One Way'
df.loc[mask, 'trip_route_category'] = 0

mask = df['trip_route_category'] == 'Round Trip'
df.loc[mask, 'trip_route_category'] = 1

df['trip_route_category'] = pd.to_numeric(df['trip_route_category'])

In [8]:
# Check how ballanced our level are in some of our categorical variables
df['passholder_type'].value_counts()

Indego30        2860633
Walk-up          365475
Indego365        299087
Day Pass         215584
IndegoFlex        33713
One Day Pass       6767
Two Day Pass       1603
Name: passholder_type, dtype: int64

In [9]:
df['plan_duration'].value_counts()

30.000     2860504
0.000       362334
365.000     332819
1.000       225500
2.000         1619
180.000        114
Name: plan_duration, dtype: int64

In [10]:
df[df['passholder_type'] == 'Indego30']['plan_duration'].value_counts()

30.000     2860499
365.000        134
Name: plan_duration, dtype: int64

In [11]:
# We propably only need one of these plan type variables, both have decently distributed levels
df.drop('passholder_type', axis=1, inplace=True)

In [12]:
print_nulls(df)

Unnamed: 0             0.000
duration               0.000
start_time             0.000
end_time               0.000
start_station_id      63.496
start_lat              0.017
start_lon              0.017
end_station_id        63.496
end_lat                0.828
end_lon                0.828
bike_id                0.024
plan_duration          0.000
trip_route_category    0.000
start_station         36.504
end_station           36.504
bike_type             62.456
dtype: float64

In [13]:
# First the location data
# Luckily the problem is missing values in a small proportion of columns
 
cols = ['start_lat', 'start_lon', 'end_lat', 'end_lon']

for col in cols:
    df[df[col] == r'\N'] = np.nan
    
    df[col] = pd.to_numeric(df[col])

# entries that have 0 listed for both end coordinates
index = df[df['end_lat'] == 0].index
df.drop(index, inplace=True)

# entries that have 0 listed for both start coordinates 
index = df[df['start_lat'] == 0].index
df.drop(index, inplace=True)

#Two entries have negative latitude values for the start and end, they can be retrieved
mask = df['start_lat'] < 0
df.loc[mask, 'start_lat'] = df.loc[mask, 'start_lat'] * -1


# Three entries where only the end latitude is negative
mask = df['end_lat'] < 0
df.loc[mask, 'end_lat'] = df.loc[mask, 'end_lat'] *-1

In [14]:
# The formatting on this column changed part way through the dataset
# Fill into one column from the other
mask = df['start_station'].isna()
df.loc[mask, 'start_station'] = df['start_station_id']

# and for the end stations
mask = df['end_station'].isna()
df.loc[mask, 'end_station'] = df['end_station_id']

# drop the other columns
df.drop(['start_station_id', 'end_station_id'], axis=1, inplace=True)

In [15]:
# Bike type column
# This column is introduced in 2018-q3 report. This is also when a second bike type was introduced
# Therefore we can assume that all the trips before that were on standard bikes. Nulls only appear for this 
# time before there were electric bikes so it is safe to fill all of them
df['bike_type'] = df['bike_type'].fillna('standard')

In [16]:
# Bike_id values
# Here there are just a few badly formated values that are easily retrieved
index = df[df['bike_id'] == '03556A'].index
df.loc[index, 'bike_id'] = 3556

# 4 values asking to be deleted; we oblidge
index = df[df['bike_id'] == 'delete me'].index
df.drop(index, inplace=True)

df['bike_id'] = pd.to_numeric(df['bike_id'])

In [27]:
# There are a small number of trips that appear to be duplicates
df[['start_time', 'bike_id', 'start_station']].duplicated().value_counts()

False    3552275
dtype: int64

In [18]:
df = df.drop_duplicates(['start_time', 'bike_id', 'start_station'])

In [19]:
# What about trips that end at the same time at the same station?
df[['end_time', 'bike_id', 'end_station']].duplicated().value_counts()

False    3584862
True         214
dtype: int64

In [20]:
df = df.drop_duplicates(['end_time', 'bike_id', 'end_station'])

In [21]:
df.duplicated().value_counts()

False    3584862
dtype: int64

In [22]:
print_nulls(df)

Unnamed: 0            0.000
duration              0.000
start_time            0.000
end_time              0.000
start_lat             0.016
start_lon             0.016
end_lat               0.872
end_lon               0.872
bike_id               0.025
plan_duration         0.000
trip_route_category   0.000
start_station         0.000
end_station           0.000
bike_type             0.000
dtype: float64

In [23]:
df = df.dropna()

In [24]:
print_nulls(df)

Unnamed: 0            0.000
duration              0.000
start_time            0.000
end_time              0.000
start_lat             0.000
start_lon             0.000
end_lat               0.000
end_lon               0.000
bike_id               0.000
plan_duration         0.000
trip_route_category   0.000
start_station         0.000
end_station           0.000
bike_type             0.000
dtype: float64

In [25]:
print('Percentage of data retained after cleaning: ', len(df)/len(raw_data) * 100)

Percentage of data retained after cleaning:  93.90356121247817


### Transforming Timeseries and Location Data

In [28]:
# Next the timestamp values
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])

# Simply recalulate the duration
# The result will be in minutes because that is what all of our start and stop times are rounded to
df['duration'] = (pd.to_timedelta(df['end_time'] - df['start_time']).dt.seconds / 60)

In [None]:
df['start_geometry'] = [Point(xy) for xy in zip(df['start_lon'], df['start_lat'])]

df['end_geometry'] = [Point(xy) for xy in zip(df['end_lon'], df['end_lat'])]

### Outliers

First we should recall that trip duration has been treated already. While the data contain no entries greater than 24 hours as expected there are still some trips with a duration less than one minute. I will remove those as that is one of the assumptions of our dataset.

In [29]:
df['duration'].describe()

count   3552275.000
mean         21.542
std          53.186
min           0.000
25%           7.650
50%          12.000
75%          20.000
max        1439.000
Name: duration, dtype: float64

In [30]:
index = df[df['duration'] < 1].index
print('Dropping {} observations, {}% of the data'.format(len(index), round(len(index)/len(df), 2)))
df.drop(index, inplace=True)

Dropping 2488 observations, 0.0% of the data


### Closing notes on data cleaning

Overall the vast majority of the data was retained. However still more of it might be retained with more metadata or more cleaning. 

The differences between the end coordinates for trips with the same end station should be considered. Perhaps these differences arise from abnormal trips that are outliers in some way?

Lastly issues like duplicate trips or extrememly short trips could be errors in the logging system or user errors. These appear to make up only a small portion of the data but it is still worth considering how to better account for these.

## Aggregation

In [31]:
outdf = df.loc[:, ['duration', 'start_station', 'trip_route_category', 'start_time']]

indf = df.loc[:, ['duration', 'end_station', 'trip_route_category', 'end_time']]

In [32]:
indf['flow'] = 1
outdf['flow'] = -1

In [33]:
indf = indf.rename(columns={'duration' : 'incoming_duration', 'end_station' : 'station', 'end_time' : 'datetime'})

In [34]:
outdf = outdf.rename(columns={'duration' : 'outgoing_duration', 'start_station' : 'station', 'start_time' : 'datetime'})

In [35]:
stapled = pd.concat([indf, outdf])

In [None]:
# create a list of the stations
stations = stapled['station'].unique()

# create a dict to create and hold the dataframes
station_frames = {}

# loop through the data for each station
for station in stations:
    
    #select the data for that station
    mask = stapled['station'] == station
    small = stapled.loc[mask, :]
    
    # aggregate it
    small.set_index('datetime', inplace=True)
    grouped = small.resample('H').agg({'flow' : 'sum'
                         , 'incoming_duration' : 'mean'
                        , 'outgoing_duration' : 'mean'
                        , 'trip_route_category' : 'mean'})
    
    # preserves the int type better than .agg('mean')
    grouped['station'] = station
    
    station_frames[station] = grouped

In [None]:
unified = pd.DataFrame()
for station in stations:
    unified = pd.concat([unified, station_frames[station]])


In [None]:
unified.info()

In [None]:
unified.head()

In [None]:
# nulls can safely be filled with 0
unified.fillna(0, inplace=True)

In [None]:
unified['ds'] = unified.index

In [None]:
unified.head()

## Exploritory Data Analysis



In [None]:
unified.plot('ds', 'flow')

In [None]:
station_3056 = unified[unified['station']  == 3056].copy()

In [None]:
plot('ds', 'flow')

In [None]:
plt.figure(figsize=(20,20))
for key in station_frames:
    value = station_frames[key]
    value['hour'] = value.index.hour
    value.groupby('hour')['flow'].agg('mean').plot()
plt.xticks(np.arange(24))
plt.show()