In [13]:
#use Pandas to import data
import pandas as pd
import zipfile
import requests
import io
import numpy as np
from datetime import datetime

In [14]:
## Pieces used to build each monthly Citi Bike archive URL / filename
year = 2015
citidir = "https://s3.amazonaws.com/tripdata/"
citi_extension = "-citibike-tripdata"
# zero-padded month codes '01' .. '12'
months = [str(month_number).zfill(2) for month_number in range(1, 13)]

In [15]:
## store all data first in dictionary format using months as keys
d = {}

## download monthly data from the web for each month
for mth in months:
    # build the archive URL and the name of the CSV member stored inside it
    filename = str(year) + mth + citi_extension
    citibike_zip_filename = citidir + filename + ".zip"
    citibike_csv_filename = filename + ".csv"
    print(citibike_zip_filename)

    # fetch the archive; stop on an HTTP error here instead of handing an
    # error page to zipfile, which would fail with a misleading message
    requested_file = requests.get(citibike_zip_filename)
    requested_file.raise_for_status()

    # read the CSV straight out of the in-memory zip; the with-blocks close
    # both the archive and the member handle once the frame is loaded
    with zipfile.ZipFile(io.BytesIO(requested_file.content)) as z:
        with z.open(citibike_csv_filename) as csv_file:
            d[mth] = pd.read_csv(csv_file, header=0, sep=',', quotechar='"')

https://s3.amazonaws.com/tripdata/201501-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201502-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201503-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201504-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201505-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201506-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201507-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201508-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201509-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201510-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201511-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/201512-citibike-tripdata.zip


In [16]:
## Concatenate into single yearly dataframe
# Build the yearly frame with one pd.concat over the monthly frames (in the
# order given by `months`) instead of growing it with DataFrame.append in a
# loop: append re-copies every accumulated row on each iteration and was
# removed in pandas 2.0. ignore_index=True gives a fresh 0..N-1 row index.
df = pd.concat([d[mth] for mth in months], ignore_index=True)

In [17]:
# preview the first five rows (column headers plus a few sample values)
print(df.head(n=5))

   tripduration      starttime       stoptime  start station id  \
0          1346  1/1/2015 0:01  1/1/2015 0:24               455   
1           363  1/1/2015 0:02  1/1/2015 0:08               434   
2           346  1/1/2015 0:04  1/1/2015 0:10               491   
3           182  1/1/2015 0:04  1/1/2015 0:07               384   
4           969  1/1/2015 0:05  1/1/2015 0:21               474   

        start station name  start station latitude  start station longitude  \
0          1 Ave & E 44 St               40.750020               -73.969053   
1          9 Ave & W 18 St               40.743174               -74.003664   
2     E 24 St & Park Ave S               40.740964               -73.986022   
3  Fulton St & Waverly Ave               40.683178               -73.965964   
4          5 Ave & E 29 St               40.745168               -73.986831   

   end station id             end station name  end station latitude  \
0             265     Stanton St & Chrystie St    

In [18]:
# median of the tripduration column across all loaded rides
trip_durations = df["tripduration"]
print("Median trip duration:", trip_durations.median())

Median trip duration: 629.0


In [19]:
#add new column to dataframe indicating whether start and end station IDs are the same or not
# (the comparison already returns a boolean Series aligned to df's index)
df['same_start_end'] = df["start station id"] == df["end station id"]

# normalized counts of False / True; same_frac stays a Series indexed by the boolean value
same_frac = df['same_start_end'].value_counts(normalize = True)

# .get() with a default avoids a KeyError when no ride starts and ends at the
# same station (True would then be absent from the value_counts index)
print("Fraction of rides that start and end at same station:", same_frac.get(True, 0.0))

Fraction of rides that start and end at same station: 0.0223583913373


In [20]:
## Use groupby to figure out how many unique stations each bike has visited
bikes = df.groupby('bikeid')

#store the number of visited stations in dictionary format, using each bikeid as key
visited_stations = {}

# Iterating a GroupBy yields (key, sub-frame) pairs, so work on the yielded
# sub-frame directly instead of calling get_group() twice for every bike.
for bike_id, bike_trips in bikes:
    # every station this bike appears at, as either a trip start or a trip end
    stations = pd.concat(
        [bike_trips['start station id'], bike_trips['end station id']],
        axis=0, ignore_index=True)

    # .unique() drops repeats, so its length is the number of distinct stations
    visited_stations[bike_id] = len(stations.unique())

In [21]:
# spread of the per-bike distinct-station counts (np.std defaults to ddof=0,
# i.e. the population standard deviation)
stations_per_bike = list(visited_stations.values())
visited_stations_std = np.std(stations_per_bike)
print("Standard deviation of unique stations visited by each bike:", visited_stations_std)

Standard deviation of unique stations visited by each bike: 54.5418965359


In [22]:
## Next question: figure out how average trip duration varies by month
#strategy: convert everything first to datetime
date_format_noseconds = '%m/%d/%Y %H:%M'
date_format_seconds = '%m/%d/%Y %H:%M:%S'

# Some months' starttime strings include seconds and others do not (originally
# noted as January-March and June lacking them). Rather than hard-coding the row
# offsets where the format switches, try the with-seconds format on every row
# and re-parse only the rows that did not match using the no-seconds format.
raw_starttime = df['starttime']
start_datetimes = pd.to_datetime(raw_starttime, format=date_format_seconds, errors='coerce')
unparsed = start_datetimes.isna()
start_datetimes = start_datetimes.fillna(
    pd.to_datetime(raw_starttime[unparsed], format=date_format_noseconds, errors='coerce'))

# errors='coerce' turns non-matching strings into NaT; fail loudly rather than
# silently writing a missing month into the frame
if start_datetimes.isna().any():
    raise ValueError("starttime values matching neither expected format: %d"
                     % start_datetimes.isna().sum())

# NOTE: the parsed Series is deliberately not named `datetime` -- that would
# rebind the datetime class imported at the top of the notebook and make this
# cell fail when run a second time.
df['startmonth'] = start_datetimes.dt.month #add new column startmonth from the parsed timestamps

In [24]:
## mean trip duration within each start month
df_by_month = df.groupby('startmonth')
mean_duration_by_month = df_by_month['tripduration'].mean()
print(mean_duration_by_month)

# gap between the largest and the smallest of the monthly means
longest_mean = mean_duration_by_month.max()
shortest_mean = mean_duration_by_month.min()
maxdiff = longest_mean - shortest_mean
print('Difference in seconds between max and min monthly mean duration:',maxdiff)

startmonth
1      654.325583
2      649.383207
3      734.316673
4      929.884482
5     1000.233767
6      904.602783
7      967.670092
8     1017.478667
9     1050.849829
10    1079.953503
11     972.018670
12     945.711619
Name: tripduration, dtype: float64
Difference in seconds between max and min monthly mean duration: 430.57029597


In [25]:
## What percentage of riders go overtime (different limit for customer and subscriber)?
# NOTE(review): groupby drops rows whose usertype is missing by default, so such
# rides count in neither the numerator nor the denominator -- confirm intended.
users = df.groupby('usertype')

# per-usertype limit, compared directly against the tripduration column;
# a usertype not listed here raises KeyError in the loop below
max_ride_time = {'Customer': 1800, 'Subscriber': 2700}
ride_number = {}
rides_overtime = {}

# Iterating a GroupBy yields (key, sub-frame) pairs, so use the yielded
# sub-frame directly instead of looking the group up again with get_group().
for user_type, user_trips in users:
    ride_times = user_trips['tripduration']
    ride_number[user_type] = len(ride_times)
    # summing the boolean mask counts the rides strictly over the limit
    rides_overtime[user_type] = np.sum(ride_times > max_ride_time[user_type])

# overtime rides across all user types divided by all counted rides
overtime_frac = np.sum(list(rides_overtime.values()))/np.sum(list(ride_number.values()))
print('Fraction of all rides that go overtime:',overtime_frac)

Fraction of all rides that go overtime: 0.0381067801681


In [None]:
### REMAINING QUESTIONS
### What is the average length, in kilometers, of a trip? 
#Assume trips follow great circle arcs from the start station to the end station. 
#Ignore trips that start and end at the same station, as well as those with obviously wrong data.

### Let us define the hourly usage fraction of a station to be the fraction of all rides 
#starting at that station that leave during a specific hour. 
#A station has surprising usage patterns if it has an hourly usage fraction for an hour 
#significantly different from the corresponding hourly usage fraction of the system as a whole. 
#What is the largest ratio of station hourly usage fraction to system hourly usage fraction 
#(hence corresponding to the most "surprising" station-hour pair)?

### Most of the time, a bike will begin a trip at the same station where its previous trip ended. 
#Sometimes a bike will be moved by the program, either for maintenance or to rebalance the distribution of bikes. 
#What is the average number of times a bike is moved during this period, as detected by seeing 
#if it starts at a different station than where the previous ride ended?