In [1]:
import numpy as np
import pandas as pd
import math
import scipy.stats
import matplotlib.pyplot as plt
#import geopandas as gpd
#from shapely.geometry import Point, Polygon
pd.options.display.float_format = '{:.3f}'.format

In [2]:
def print_nulls(df):
    """Return the percentage of missing values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Indexed by column name; each value is the share of rows (0-100)
        for which that column is null.
    """
    null_counts = df.isna().sum()
    n_rows = len(df.index)
    return null_counts * 100 / n_rows

## What we know about the data already

I merged the quarterly trip data published on the Indego website: https://www.rideindego.com/about/data/

The data range from the start of the program (_start date_, 2015) through _end date_, 2020. Per the website above, trips shorter than 1 minute have been removed and trip length has been capped at 24 hours. I will keep this in mind as I consider outliers. 

Other than the start and end points, we do not have any information about the routes that riders took.

We must also consider that this is a biased dataset. These data represent a specific population of bikers. 

In [3]:
# Load the merged Indego trip export into a single DataFrame.
# NOTE(review): the truncated output under this cell looks like the tail of a
# pandas DtypeWarning (columns with mixed types) — confirm. If so, passing an
# explicit `dtype=` mapping for the affected columns would make the parse
# deterministic.
raw_data = pd.read_csv('indego-trips-all.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Type errors let us know that we may have some problems with missing or improperly typed data. 
# We already know this from merging our data.

## Data Cleaning

In [4]:
raw_data.columns

Index(['trip_id', 'duration', 'start_time', 'end_time', 'start_station_id',
       'start_lat', 'start_lon', 'end_station_id', 'end_lat', 'end_lon',
       'bike_id', 'plan_duration', 'trip_route_category', 'passholder_type',
       'start_station', 'end_station', 'bike_type'],
      dtype='object')

In [5]:
# Retain every column that the cleaning and analysis cells below rely on.
# Only `trip_id` is left out. The previous selection also removed
# `passholder_type`, `plan_duration`, the four coordinate columns, `bike_id`
# and `bike_type`, all of which are indexed by later cells, so a fresh
# top-to-bottom run failed with KeyError.

cols = ['duration', 'start_time', 'end_time',
        'start_station_id', 'start_lat', 'start_lon',
        'end_station_id', 'end_lat', 'end_lon',
        'bike_id', 'plan_duration', 'trip_route_category', 'passholder_type',
        'start_station', 'end_station', 'bike_type']

# Selecting with a list of labels returns a new frame, so raw_data is untouched
df = raw_data.loc[:, cols]

In [None]:
# Check how ballanced our level are in some of our categorical variables
df['passholder_type'].value_counts()

In [None]:
df['plan_duration'].value_counts()

In [None]:
df[df['passholder_type'] == 'Indego30']['plan_duration'].value_counts()

In [None]:
# We probably only need one of these plan type variables; both have decently
# distributed levels, so passholder_type is the one removed.
df.drop(columns=['passholder_type'], inplace=True)

In [None]:
print_nulls(df)

In [None]:
# First the location data
# Luckily the problem is missing values in a small proportion of columns

cols = ['start_lat', 'start_lon', 'end_lat', 'end_lon']

for col in cols:
    # r'\N' is the missing-value marker found in these columns. Blank out only
    # the affected cell: assigning through `df[boolean_mask] = np.nan`
    # overwrote *every* column of the matching rows.
    df.loc[df[col] == r'\N', col] = np.nan

    df[col] = pd.to_numeric(df[col])

# Entries that have 0 listed for the coordinates (start or end) carry no
# usable location, so they are dropped. NaN latitudes compare False here and
# are left for the later null handling.
zero_lat = (df['end_lat'] == 0) | (df['start_lat'] == 0)
df.drop(index=df.index[zero_lat], inplace=True)

# A handful of entries have a negative start and/or end latitude; they can be
# retrieved by flipping the sign. abs() leaves positive values and NaN as is.
for col in ('start_lat', 'end_lat'):
    df[col] = df[col].abs()

In [None]:
# The formatting of the station columns changed part way through the dataset,
# so a trip has its station in either `<side>_station` or `<side>_station_id`.
# Fill the former from the latter, for the start and then the end of the trip.
for side in ('start', 'end'):
    target = side + '_station'
    source = side + '_station_id'
    mask = df[target].isna()
    # the right-hand side is aligned on the index, so only masked rows are written
    df.loc[mask, target] = df[source]

# the *_station_id columns are no longer needed
df.drop(columns=['start_station_id', 'end_station_id'], inplace=True)

In [None]:
# Bike type column
# This column first appears in the 2018-q3 report, which is also when a second
# bike type was introduced. Nulls only occur before that point, when every
# bike was a standard one, so it is safe to fill all of them.
has_bike_type = df['bike_type'].notna()
df['bike_type'] = df['bike_type'].where(has_bike_type, 'standard')

In [None]:
# Bike_id values
# Here there are just a few badly formatted values that are easily retrieved
malformed = df.index[df['bike_id'] == '03556A']
df.loc[malformed, 'bike_id'] = 3556

# 4 values asking to be deleted; we oblige
index = df.index[df['bike_id'] == 'delete me']
df.drop(index=index, inplace=True)

# everything left should parse as a number
df['bike_id'] = pd.to_numeric(df['bike_id'])

In [None]:
# There are a small number of trips that appear to be duplicates
df[['start_time', 'bike_id', 'start_station']].duplicated().value_counts()

In [None]:
# Keep the first record of each (start time, bike, start station) combination
start_key = ['start_time', 'bike_id', 'start_station']
df = df.drop_duplicates(subset=start_key)

In [None]:
# What about trips that end at the same time at the same station?
df[['end_time', 'bike_id', 'end_station']].duplicated().value_counts()

In [None]:
# Keep the first record of each (end time, bike, end station) combination
end_key = ['end_time', 'bike_id', 'end_station']
df = df.drop_duplicates(subset=end_key)

In [None]:
df.duplicated().value_counts()

In [None]:
print_nulls(df)

In [None]:
df = df.dropna()

In [None]:
print_nulls(df)

In [None]:
print('Percentage of data retained after cleaning: ', len(df)/len(raw_data) * 100)

### Transforming Timeseries and Location Data

In [None]:
# Next the timestamp values
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])

# Simply recalculate the duration.
# The result is in minutes because that is what all of our start and stop
# times are rounded to.
# total_seconds() is required here: `.dt.seconds` returns only the seconds
# component (0-86399) of each timedelta and ignores whole days, so a 24-hour
# trip came out as 0 minutes and an end time before the start time wrapped
# around to a large positive duration instead of a negative one.
df['duration'] = (df['end_time'] - df['start_time']).dt.total_seconds() / 60

In [None]:
# Build one point per trip for the start and for the end location.
# Each tuple is (lon, lat), i.e. x = longitude, y = latitude.
# NOTE(review): `Point` is not defined in this notebook as it stands — the
# `from shapely.geometry import Point, Polygon` line in the first cell is
# commented out, so this cell raises NameError on a fresh kernel until that
# import is restored.
df['start_geometry'] = [Point(xy) for xy in zip(df['start_lon'], df['start_lat'])]

df['end_geometry'] = [Point(xy) for xy in zip(df['end_lon'], df['end_lat'])]

### Outliers

First we should recall that trip duration has been treated already. While the data contain no entries greater than 24 hours, as expected, there are still some trips with a duration of less than one minute. I will remove those, as their absence is one of the assumptions of our dataset.

In [None]:
df['duration'].describe()

In [None]:
# Drop trips shorter than one minute and report how much data that removes.
index = df[df['duration'] < 1].index

# The message is labelled with '%', so the share must be scaled by 100
# (it was previously printed as a 0-1 fraction). Guard against an empty frame.
pct_dropped = round(len(index) / len(df) * 100, 2) if len(df) else 0.0
print('Dropping {} observations, {}% of the data'.format(len(index), pct_dropped))
df.drop(index, inplace=True)

## Closing notes on data cleaning

Overall the vast majority of the data was retained. However still more of it might be retained with more metadata or more cleaning. 

The differences between the end coordinates for trips with the same end station should be considered. Perhaps these differences arise from abnormal trips that are outliers in some way?

Lastly, issues like duplicate trips or extremely short trips could be errors in the logging system or user errors. These appear to make up only a small portion of the data, but it is still worth considering how to better account for them.

## Exploratory Data Analysis



In [None]:
df.info()

In [None]:
df['trip_route_category'].value_counts()

In [None]:
# What is the average trip duration for each category of trip?
route_category = df['trip_route_category']
df['duration'].groupby(route_category).mean()

In [None]:
df['start_station'].nunique()

In [None]:
# Trips whose end station never appears anywhere as a start station
start_stations = df['start_station']
mask = ~df['end_station'].isin(start_stations)
df[mask]

In [None]:
df[df['end_station'] == 3000]

In [None]:
# Do all of the stations connect with the others via at least one trip, one trip a year, a month?

In [None]:
from itertools import combinations

# Every station that occurs as either the start or the end of a trip.
stations = pd.unique(pd.concat([df['start_station'], df['end_station']]))

# Each unordered pair of distinct stations, yielded lazily exactly once.
# The previous call passed the two-element list [start column, end column],
# which produced a single pair of whole columns rather than station pairs.
combs = combinations(stations, 2)

Duration could likely benefit from a natural log transformation.

In [None]:
# Side-by-side histograms: raw duration (left) and its natural log (right)
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(10, 5))

ax_raw.hist(df['duration'])
ax_raw.set_title('Histogram of Trip Duration')

ax_log.hist(np.log(df['duration']))
ax_log.set_title('Histogram of Log Normal Trip Duration')

plt.show()

In [None]:
df.groupby(by='start_station').describe()

In [None]:
# Wrap the trips in a GeoDataFrame using WGS84 lon/lat (EPSG:4326).
# `df` has no column literally named 'geometry', so the active geometry must
# be named explicitly; the trip start points are used. Without it the CRS has
# nothing to attach to and the later `gdf.plot()` / `.geometry` calls have no
# geometry to work on.
# NOTE(review): `gpd` requires the `import geopandas as gpd` line in the first
# cell, which is currently commented out.
gdf = gpd.GeoDataFrame(df,
                       geometry='start_geometry',
                       crs='epsg:4326')

In [None]:
gdf.info()

In [None]:
gdf.plot()

In [None]:
# Load the zip-code polygon shapefile.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that directory exists. The neighborhoods shapefile further down
# is read from a path relative to the notebook — consider doing the same here.
zip_map = gpd.read_file(r'/home/owen/Jupyter/Capstone 2: Supervised Learning/Zipcodes_Poly-shp/Zipcodes_Poly.shp')

In [None]:
# First 50,000 trips with the zip-code polygons drawn semi-transparently on top
fig, ax = plt.subplots(figsize=(12, 12))
gdf[:50000].plot(ax=ax)
zip_map.plot(alpha=.4, ax=ax)
plt.show()

In [None]:
zip_map.info()

In [None]:
neighborhoods = gpd.read_file(r'Neighborhoods_Philadelphia.shp')

In [None]:
neighborhoods = neighborhoods.to_crs('EPSG:4326')

In [None]:
neighborhoods.cx

In [None]:
sgdf = gdf[1000000:1050000] 

In [None]:
# Two expressions were fused on a single line, which is a SyntaxError; they
# are split into separate statements here.
# NOTE(review): when given another GeoSeries, `contains` compares row by row
# after aligning on the index; it does not test every neighborhood against
# every point — confirm this is the intended check.
display(neighborhoods.contains(sgdf.geometry))
neighborhoods.iloc[1]

In [None]:
neighborhoods.info()

In [None]:
# Neighborhood polygons in grey with the sampled trips drawn over them
fig, ax = plt.subplots(figsize=(15, 15))
neighborhoods.plot(alpha=.4, color='grey', ax=ax)
sgdf.plot(ax=ax)

In [None]:
# Keep only the neighborhoods intersecting the bounding box of interest.
# `.cx` is indexed as [xmin:xmax, ymin:ymax], i.e. longitude range first and
# latitude range second. The previous slices paired a longitude with a
# latitude in each position (-75.224:39.890, -75.130:39.992).
subset = neighborhoods.cx[-75.224:-75.130, 39.890:39.992]

In [None]:
gdf[['start_lon','start_lat']].describe()

In [None]:
subset.info()

In [None]:
pwd