In [1]:
%matplotlib inline
import matplotlib
import seaborn as sns
# Double the resolution used when figures are saved/rendered.
# The baseline is taken from matplotlib's built-in defaults rather than the
# live rcParams value, so re-running this cell does not keep doubling it.
_base_dpi = matplotlib.rcParamsDefault['savefig.dpi']
if _base_dpi == 'figure':
    # Newer matplotlib uses the string 'figure' here ("use the figure's own
    # dpi"); multiplying that string by 2 would produce an invalid setting,
    # so resolve it to the numeric figure.dpi first.
    _base_dpi = matplotlib.rcParamsDefault['figure.dpi']
matplotlib.rcParams['savefig.dpi'] = 2 * _base_dpi

In [2]:
# Data source: unofficial MTA transit data archive
# http://data.beta.nyc/dataset/unofficial-mta-transit-data-archive

# Documentation of the Bus Time archive format
# http://bustime.mta.info/wiki/Developers/ArchiveData

# GTFS specification
# https://developers.google.com/transit/gtfs/reference
# Bus Time archives for 2015-01-28 and 2015-01-29.
# `wget -nc` skips the download when the file already exists locally;
# `7z -y x` extracts the archive, answering "yes" to any prompt.
!wget -nc http://data.mytransit.nyc.s3.amazonaws.com/bus_time/2015/2015-01/bus_time_20150128.csv.xz
!7z -y x bus_time_20150128.csv.xz
!wget -nc http://data.mytransit.nyc.s3.amazonaws.com/bus_time/2015/2015-01/bus_time_20150129.csv.xz
!7z -y x bus_time_20150129.csv.xz

# NYCT bus GTFS bundle; `unzip -o` overwrites existing files without asking.
# Presumably this is where trips.txt, stops.txt and stop_times.txt (read by a
# later cell) come from -- verify against the zip contents.
!wget -nc http://data.mytransit.nyc.s3.amazonaws.com/gtfs/2015/gtfs_nyct_bus_20150103.zip
!unzip -o gtfs_nyct_bus_20150103.zip

--2017-11-21 13:43:23--  http://data.mytransit.nyc.s3.amazonaws.com/bus_time/2015/2015-01/bus_time_20150128.csv.xz
Resolving data.mytransit.nyc.s3.amazonaws.com (data.mytransit.nyc.s3.amazonaws.com)... 52.216.229.251
Connecting to data.mytransit.nyc.s3.amazonaws.com (data.mytransit.nyc.s3.amazonaws.com)|52.216.229.251|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41847772 (40M) [application/x-xz]
Saving to: ‘bus_time_20150128.csv.xz’


2017-11-21 13:43:24 (64.8 MB/s) - ‘bus_time_20150128.csv.xz’ saved [41847772/41847772]


7-Zip [64] 9.20  Copyright (c) 1999-2010 Igor Pavlov  2010-11-18
p7zip Version 9.20 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,8 CPUs)

Processing archive: bus_time_20150128.csv.xz

Extracting  bus_time_20150128.csv

Everything is Ok

Size:       359163860
Compressed: 41847772
--2017-11-21 13:43:31--  http://data.mytransit.nyc.s3.amazonaws.com/bus_time/2015/2015-01/bus_time_20150129.csv.xz
Resolving data.mytransit.nyc.s3.amazonaws.com (data.

In [14]:
#ETL
import pandas as pd
import numpy as np
from datetime import timedelta, datetime


def convert_to_int64(row):
    """Coerce a single value to ``np.int64``, falling back to ``np.nan``.

    Despite the parameter name, ``row`` is one scalar value; the function is
    meant to be applied element-wise to a column.

    Parameters
    ----------
    row : object
        Value to convert, e.g. an int, a float or a numeric string.

    Returns
    -------
    np.int64 or float
        The converted integer, or ``np.nan`` when the value is missing,
        non-numeric, or too large / not finite.
    """
    try:
        return np.int64(row)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN.
        # TypeError: None and other objects that cannot be read as a number.
        # OverflowError: infinities and integers that do not fit in 64 bits.
        return np.nan

def utc_to_est(row):
    """Shift a UTC timestamp back five hours (US Eastern Standard Time).

    The offset is a fixed UTC-5; daylight saving time is not taken into
    account, so results are only correct for dates on standard time.

    Parameters
    ----------
    row : datetime-like
        A single value supporting subtraction of a ``timedelta``
        (e.g. ``datetime.datetime``).

    Returns
    -------
    datetime-like
        ``row`` minus five hours.

    Raises
    ------
    TypeError
        If ``row`` does not support subtracting a ``timedelta``.
    """
    # The original wrapped this in ``try/except: raise``, which only re-raised
    # the same exception; errors propagate identically without the wrapper.
    return row - timedelta(hours=5)

# Raw Bus Time pings: one CSV per day, stacked row-wise. Each file keeps its
# own row labels, so index values repeat across the two days.
csv_f = pd.read_csv
daily_dumps = ("bus_time_20150128.csv", "bus_time_20150129.csv")
archive = pd.concat([csv_f(path) for path in daily_dumps])

# Static GTFS tables.
trips = csv_f("trips.txt")
stops = csv_f("stops.txt")
schedules = csv_f("stop_times.txt")

# Disabled conversions, kept for reference (`_25_to_other` is not defined in
# the cells visible here):
#schedules.departure_time = pd.to_datetime(schedules.departure_time.apply(_25_to_other))
#archive.timestamp = pd.to_datetime(archive.timestamp).apply(utc_to_est)

# Turn stop ids into integers; values that cannot be converted become NaN.
archive["next_stop_id"] = archive["next_stop_id"].apply(convert_to_int64)

In [15]:
#query and clean
SERVICE_DATE = 20150128  # the service day under study
MIN_REPORTS = 15         # a trip must have MORE than this many pings to be kept


def _has_enough_reports(group):
    """True when a trip's group holds more than MIN_REPORTS pings."""
    return len(group) > MIN_REPORTS


# Only pings from buses "assigned" to a route (block_assigned > 0).
live_archive = archive.loc[archive["block_assigned"] > 0]

# Due to the time shift the files also contain service dates 1/27 and 1/29;
# keep the target day only.
today = live_archive.loc[live_archive["service_date"] == SERVICE_DATE]

# Throw away trips with 15 or fewer reports.
good_trips_only = today.groupby(today["trip_id"]).filter(_has_enough_reports)

# Write this as our "clean" dataset.
good_trips_only.to_csv("realtime.csv")

# Merged dataframe: realtime pings + GTFS trip attributes + details of the
# stop each ping is heading to.
partial = pd.merge(good_trips_only, trips, on="trip_id")
df = pd.merge(partial, stops, left_on="next_stop_id", right_on="stop_id")

In [25]:
# Do not truncate columns when a frame is displayed.
pd.set_option("display.max_columns", None)

# Rows whose route_id contains "M15" (substring match). reset_index() keeps
# the previous row labels in a new "index" column.
is_m15 = df["route_id"].str.contains("M15")
m15_only = df.loc[is_m15].reset_index()

# Parse the timestamp strings into datetimes so they can be sorted/subtracted.
m15_only["timestamp"] = pd.to_datetime(m15_only["timestamp"])

In [42]:
# Largest dist_along_route value among the M15 pings (each value parsed as float).
m15_only["dist_along_route"].map(float).max()

13909.66

In [51]:
# Order the pings chronologically, then bucket them by trip.
m15_by_time = m15_only.sort_values(by="timestamp")
g = m15_by_time.groupby(by="trip_id")

# Inspect every ping of one example trip.
example_trip_id = "OH_A5-Weekday-SDon-002000_M15_2"
g.get_group(example_trip_id)

Unnamed: 0,index,timestamp,vehicle_id,latitude,longitude,bearing,progress,service_date,trip_id,block_assigned,next_stop_id,dist_along_route,dist_from_stop,route_id,service_id,trip_headsign,direction_id,shape_id,stop_id,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station
0,161849,2015-01-28 05:23:55,6786,40.803310,-73.933166,233.93,0,20150128,OH_A5-Weekday-SDon-002000_M15_2,1,401738,180.23,108.89,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,401738,2 AV/E 125 ST,,40.802494,-73.933891,,,0,
1,161850,2015-01-28 05:24:26,6786,40.802600,-73.933682,234.02,0,20150128,OH_A5-Weekday-SDon-002000_M15_2,1,401738,180.23,18.77,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,401738,2 AV/E 125 ST,,40.802494,-73.933891,,,0,
852,162701,2015-01-28 05:24:58,6786,40.801908,-73.934191,232.14,0,20150128,OH_A5-Weekday-SDon-002000_M15_2,1,401739,422.49,173.05,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,401739,2 AV/E 122 ST,,40.800591,-73.935287,,,0,
1079,162928,2015-01-28 05:26:00,6786,40.796702,-73.937992,234.01,0,20150128,OH_A5-Weekday-SDon-002000_M15_2,1,401741,910.98,0,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,401741,2 AV/E 116 ST,,40.796745,-73.938095,,,0,
1540,163389,2015-01-28 05:26:32,6786,40.796415,-73.938201,234.01,0,20150128,OH_A5-Weekday-SDon-002000_M15_2,1,401742,1152.28,204.87,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,401742,2 AV/E 113 ST,,40.794853,-73.939499,,,0,
1541,163390,2015-01-28 05:27:04,6786,40.795950,-73.938543,233.53,0,20150128,OH_A5-Weekday-SDon-002000_M15_2,1,401742,1152.28,145.69,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,401742,2 AV/E 113 ST,,40.794853,-73.939499,,,0,
1705,163554,2015-01-28 05:27:35,6786,40.792996,-73.940703,233.41,0,20150128,OH_A5-Weekday-SDon-002000_M15_2,1,401743,1391.08,9.01,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,401743,2 AV/E 110 ST,,40.792965,-73.940849,,,0,
1873,163722,2015-01-28 05:28:07,6786,40.790381,-73.942612,233.95,0,20150128,OH_A5-Weekday-SDon-002000_M15_2,1,401745,1723.26,9.01,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,401745,2 AV/E 106 ST,,40.790352,-73.942764,,,0,
1989,163838,2015-01-28 05:28:39,6786,40.787300,-73.944873,234.10,0,20150128,OH_A5-Weekday-SDon-002000_M15_2,1,803182,2543.27,437.1,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,803182,2 AV/E 96 ST,,40.783901,-73.947472,,,0,
1990,163839,2015-01-28 05:29:10,6786,40.785040,-73.946517,233.98,0,20150128,OH_A5-Weekday-SDon-002000_M15_2,1,803182,2543.27,150.25,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,803182,2 AV/E 96 ST,,40.783901,-73.947472,,,0,


In [44]:
def speed(x):
    """Average speed of one trip: final ``dist_along_route`` over elapsed seconds.

    Parameters
    ----------
    x : pandas.DataFrame
        Pings of a single trip, with a datetime-like ``timestamp`` column and
        a ``dist_along_route`` column whose values convert to ``float``.

    Returns
    -------
    float
        ``dist_along_route`` of the latest ping divided by the number of
        seconds between the earliest and the latest ping. ``nan`` when the
        group is empty or spans no time (instead of raising ``IndexError`` /
        ``ZeroDivisionError``).
    """
    if len(x) == 0:
        return float("nan")

    ordered = x.sort_values("timestamp")
    first_ping, last_ping = ordered.iloc[0], ordered.iloc[-1]

    # total_seconds() includes whole days; ``.seconds`` only holds the sub-day
    # remainder and would under-report any span of 24 hours or more.
    elapsed = (last_ping["timestamp"] - first_ping["timestamp"]).total_seconds()
    if elapsed <= 0:
        return float("nan")

    # NOTE(review): the numerator is the last reported distance, not the
    # distance covered between the first and last ping; trips whose final
    # ping reports 0 therefore get speed 0 -- confirm this is intended.
    return float(last_ping["dist_along_route"]) / elapsed
# Average speed per trip: one value for each trip_id.
m15_trips = m15_only.groupby(by="trip_id")
m15_trips.apply(speed)

trip_id
OF_A5-Weekday-SDon-028500_SBS15_1     0.000000
OF_A5-Weekday-SDon-029500_SBS15_2     0.000000
OF_A5-Weekday-SDon-030500_SBS15_3     0.000000
OF_A5-Weekday-SDon-031400_SBS15_4     4.866921
OF_A5-Weekday-SDon-032200_SBS15_5     4.122602
OF_A5-Weekday-SDon-032900_SBS15_6     0.000000
OF_A5-Weekday-SDon-033500_SBS15_7     4.476878
OF_A5-Weekday-SDon-033800_SBS15_1     3.792366
OF_A5-Weekday-SDon-034000_SBS15_8     0.000000
OF_A5-Weekday-SDon-034500_SBS15_9     0.000000
OF_A5-Weekday-SDon-035000_SBS15_10    3.782883
OF_A5-Weekday-SDon-035500_SBS15_11    4.050571
OF_A5-Weekday-SDon-035800_SBS15_3     3.855024
OF_A5-Weekday-SDon-035900_SBS15_12    0.000000
OF_A5-Weekday-SDon-036300_SBS15_13    0.000000
OF_A5-Weekday-SDon-036700_SBS15_14    0.000000
OF_A5-Weekday-SDon-036800_SBS15_4     4.430734
OF_A5-Weekday-SDon-037100_SBS15_15    4.524938
OF_A5-Weekday-SDon-037500_SBS15_16    3.916008
OF_A5-Weekday-SDon-037800_SBS15_5     4.497318
OF_A5-Weekday-SDon-037900_SBS15_17    0.000000
OF_A5

Unnamed: 0,index,timestamp,vehicle_id,latitude,longitude,bearing,progress,service_date,trip_id,block_assigned,next_stop_id,dist_along_route,dist_from_stop,route_id,service_id,trip_headsign,direction_id,shape_id,stop_id,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station
0,161849,2015-01-28T05:23:55Z,6786,40.803310,-73.933166,233.93,0,20150128,OH_A5-Weekday-SDon-002000_M15_2,1,401738,180.23,108.89,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,401738,2 AV/E 125 ST,,40.802494,-73.933891,,,0,
1,161850,2015-01-28T05:24:26Z,6786,40.802600,-73.933682,234.02,0,20150128,OH_A5-Weekday-SDon-002000_M15_2,1,401738,180.23,18.77,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,401738,2 AV/E 125 ST,,40.802494,-73.933891,,,0,
2,161851,2015-01-28T06:07:21Z,3904,40.803394,-73.933011,157.83,0,20150128,OH_A5-Weekday-SDon-006000_M15_3,1,401738,180.23,130.16,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,401738,2 AV/E 125 ST,,40.802494,-73.933891,,,0,
3,161852,2015-01-28T06:07:53Z,3904,40.802610,-73.933675,234.02,0,20150128,OH_A5-Weekday-SDon-006000_M15_3,1,401738,180.23,20.03,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,401738,2 AV/E 125 ST,,40.802494,-73.933891,,,0,
4,161853,2015-01-28T06:33:26Z,3892,40.802570,-73.933704,234.02,0,20150128,OH_A5-Weekday-SDon-009000_M15_4,1,401738,180.23,15.02,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,401738,2 AV/E 125 ST,,40.802494,-73.933891,,,0,
5,161854,2015-01-28T07:05:57Z,6702,40.803162,-73.933273,233.93,0,20150128,OH_A5-Weekday-SDon-012000_M15_1,1,401738,180.23,90.11,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,401738,2 AV/E 125 ST,,40.802494,-73.933891,,,0,
6,161855,2015-01-28T07:06:29Z,6702,40.802452,-73.933790,234.02,0,20150128,OH_A5-Weekday-SDon-012000_M15_1,1,401738,180.23,0,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,401738,2 AV/E 125 ST,,40.802494,-73.933891,,,0,
7,161856,2015-01-28T07:24:48Z,6786,40.803224,-73.932591,157.39,2,20150128,OH_A5-Weekday-SDon-015000_M15_2,1,401738,180.23,170.21,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,401738,2 AV/E 125 ST,,40.802494,-73.933891,,,0,
8,161857,2015-01-28T07:25:20Z,6786,40.803224,-73.932591,157.39,2,20150128,OH_A5-Weekday-SDon-015000_M15_2,1,401738,180.23,170.21,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,401738,2 AV/E 125 ST,,40.802494,-73.933891,,,0,
9,161858,2015-01-28T07:25:52Z,6786,40.803224,-73.932591,157.39,2,20150128,OH_A5-Weekday-SDon-015000_M15_2,1,401738,180.23,170.21,M15,OH_A5-Weekday-SDon,SOUTH FERRY via 2 AV,1,M150351,401738,2 AV/E 125 ST,,40.802494,-73.933891,,,0,
