In [1]:
cd ..

/home/moses/galvanize/bus_project


In [2]:
pwd

u'/home/moses/galvanize/bus_project'

In [3]:
import pandas as pd
import numpy as np
import networkx as nx
import cPickle as pickle
from code import utilities as ut

# Reading in static schedule data
gtfs_dir = '../bus_project_data/google_transit/'
stops_full = pd.read_csv(gtfs_dir + 'stops.txt', index_col='stop_id')
routes = pd.read_csv(gtfs_dir + 'routes.txt', index_col='route_id')
trips = pd.read_csv(gtfs_dir + 'trips.txt', index_col='trip_id')
stop_times = pd.read_csv(gtfs_dir + 'stop_times.txt')
shapes = pd.read_csv(gtfs_dir + 'shapes.txt')

# Some of these stops are named "Not a public stop" but are still in trips.
# Luckily, in the few trips they appear in, they're only either at the
# beginning or the end, so we can remove them now and we'll still build
# a nice graph with the connections we expect.
bad_stop_ids = stops_full[stops_full.stop_name == 'Not a public stop'].index.values
stops = stops_full[stops_full.stop_name != 'Not a public stop']
stop_times = stop_times[~stop_times['stop_id'].isin([7520, 7530, 7531, 7540])]

# Oh and some stops are in stops.txt but not used in trips... let's remove 'em
used_stops = set(stop_times['stop_id'].unique())
stops = stops[stops.index.isin(used_stops)]

# Let's make some sorted stop-timepoint lists for each stop_id to
# make lookup faster for things
#all_stop_timepoints = {}
#for stopid in used_stops:
#    node_names = stop_times[stop_times['stop_id'] == stopid].\
#                    apply(lambda x: '{0}_{1}'.\
#                              format(stopid, x['arrival_time']),\
#                          axis=1)
#    all_stop_timepoints[stopid] = sorted(list(set(node_names)))


In [88]:
data_dir = '../bus_project_data/avl-raw/'
raw = pd.read_csv(data_dir + 'sfmtaAVLRawData01062016.csv')
raw['TRAIN_ASSIGNMENT'] = raw['TRAIN_ASSIGNMENT'].apply(lambda x: int(x) if str(x).isdigit() else 'F')
raw = raw[raw['TRAIN_ASSIGNMENT'] != 'F']

In [89]:
trips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29022 entries, 6776796 to 6743838
Data columns (total 6 columns):
route_id         29022 non-null int64
service_id       29022 non-null int64
trip_headsign    28941 non-null object
direction_id     29022 non-null int64
block_id         29022 non-null int64
shape_id         29022 non-null int64
dtypes: int64(5), object(1)
memory usage: 1.5+ MB


In [90]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 90964 entries, 265 to 131071
Data columns (total 9 columns):
REV                 90964 non-null int64
REPORT_TIME         90964 non-null object
VEHICLE_TAG         90964 non-null object
LONGITUDE           90964 non-null float64
LATITUDE            90964 non-null float64
SPEED               90964 non-null float64
HEADING             90964 non-null float64
TRAIN_ASSIGNMENT    90964 non-null object
PREDICTABLE         90964 non-null int64
dtypes: float64(4), int64(2), object(3)
memory usage: 6.9+ MB


In [91]:
raw.head()

Unnamed: 0,REV,REPORT_TIME,VEHICLE_TAG,LONGITUDE,LATITUDE,SPEED,HEADING,TRAIN_ASSIGNMENT,PREDICTABLE
265,1506,01/06/2016 06:12:33,2,-122.42076,37.80681,0.0,351.0,6003,0
266,1506,01/06/2016 06:17:03,2,-122.42076,37.80681,0.0,351.0,6003,0
267,1506,01/06/2016 06:24:05,2,-122.41951,37.80136,0.0,351.0,6003,1
268,1506,01/06/2016 06:37:18,2,-122.41389,37.79418,0.0,351.0,6003,0
269,1506,01/06/2016 06:40:18,2,-122.41389,37.79418,0.0,351.0,6003,0


In [92]:
block = 6003
block_trips = trips[trips.block_id == block]
trip = block_trips.index.values[1]
route_id = trips.loc[trip]['route_id']
shape_id = trips.loc[trip]['shape_id']
route_name = routes.loc[route_id]['route_short_name']
trip_stops = stop_times[stop_times.trip_id == trip]['stop_id'].values
shape = shapes[shapes.shape_id == shape_id]

In [111]:
block_trips[block_trips.service_id == 1]

Unnamed: 0_level_0,route_id,service_id,trip_headsign,direction_id,block_id,shape_id
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6722108,1060,1,Washington & Mason,1,6003,133416
6721818,1060,1,Hyde & Beach,0,6003,133412
6721806,1060,1,Hyde & Beach,0,6003,133412
6721925,1060,1,Hyde & Beach,0,6003,133412
6721917,1060,1,Hyde & Beach,0,6003,133412
6721912,1060,1,Hyde & Beach,0,6003,133412
6721895,1060,1,Hyde & Beach,0,6003,133412
6721891,1060,1,Hyde & Beach,0,6003,133412
6721887,1060,1,Hyde & Beach,0,6003,133412
6722032,1060,1,Hyde & Beach,0,6003,133412


In [107]:
block_trips.shape_id.unique()

array([133416, 133412, 133411, 134627])

In [168]:
block_trip_info = []
for tr in block_trips[block_trips.service_id == 1].index.values:
    block_trip_info.append((tr, stop_times[stop_times.trip_id == tr].arrival_time.min(),\
            stop_times[stop_times.trip_id == tr].arrival_time.max(),\
                   trips.loc[tr].shape_id))
block_trip_info.sort(key=(lambda x: x[1]))

In [169]:
block_trip_info

[(6721876, '05:55:00', '06:04:00', 133411),
 (6722123, '06:12:00', '06:31:00', 134627),
 (6722005, '06:43:00', '07:01:00', 133412),
 (6722126, '07:12:00', '07:31:00', 134627),
 (6721887, '07:43:00', '08:01:00', 133412),
 (6722274, '08:12:00', '08:31:00', 134627),
 (6721891, '08:43:00', '09:01:00', 133412),
 (6722281, '09:12:00', '09:31:00', 134627),
 (6722032, '09:43:00', '10:01:00', 133412),
 (6722301, '10:19:00', '10:40:00', 134627),
 (6721925, '10:50:00', '11:10:00', 133412),
 (6722312, '11:30:00', '11:51:00', 134627),
 (6721895, '12:02:00', '12:24:00', 133412),
 (6722151, '12:49:00', '13:10:00', 134627),
 (6721957, '13:22:00', '13:44:00', 133412),
 (6722163, '14:09:00', '14:30:00', 134627),
 (6721970, '14:42:00', '15:04:00', 133412),
 (6722045, '15:29:00', '15:50:00', 134627),
 (6721806, '16:02:00', '16:25:00', 133412),
 (6722193, '16:49:00', '17:10:00', 134627),
 (6721818, '17:22:00', '17:45:00', 133412),
 (6722204, '18:10:00', '18:30:00', 134627),
 (6721979, '18:42:00', '19:04:00

In [158]:
block_trips[block_trips.service_id == 1].index.values

array([6722108, 6721818, 6721806, 6721925, 6721917, 6721912, 6721895,
       6721891, 6721887, 6722032, 6722005, 6721997, 6721990, 6721979,
       6721970, 6721957, 6721876, 6722163, 6722151, 6722126, 6722123,
       6722086, 6722045, 6722312, 6722301, 6722281, 6722274, 6722243,
       6722232, 6722214, 6722204, 6722193])

In [159]:
stop_times[stop_times.trip_id == 6797583]

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
1050587,6797583,16:02:00,16:02:00,6063,1,,,,
1050588,6797583,16:03:48,16:03:48,6068,2,,,,
1050589,6797583,16:04:41,16:04:41,6058,3,,,,
1050590,6797583,16:05:33,16:05:33,6072,4,,,,
1050591,6797583,16:06:26,16:06:26,6075,5,,,,
1050592,6797583,16:07:25,16:07:25,6047,6,,,,
1050593,6797583,16:08:18,16:08:18,6069,7,,,,
1050594,6797583,16:09:26,16:09:26,6049,8,,,,
1050595,6797583,16:09:59,16:09:59,6073,9,,,,
1050596,6797583,16:10:49,16:10:49,6051,10,,,,


In [93]:
raw[raw.TRAIN_ASSIGNMENT == 6003].VEHICLE_TAG.unique()

array([2, 6, 15, 16, 28], dtype=object)

In [170]:
trace = raw[(raw.TRAIN_ASSIGNMENT == 6003) & (raw.VEHICLE_TAG == 15)]
trace

Unnamed: 0,REV,REPORT_TIME,VEHICLE_TAG,LONGITUDE,LATITUDE,SPEED,HEADING,TRAIN_ASSIGNMENT,PREDICTABLE
61477,1506,01/06/2016 09:16:23,15,-122.41934,37.80051,3.889,171.0,6003,1
61478,1506,01/06/2016 14:29:01,15,-122.41226,37.79485,0.0,4.0,6003,0
61479,1506,01/06/2016 14:32:01,15,-122.41237,37.79498,0.0,307.0,6003,0
61480,1506,01/06/2016 14:36:31,15,-122.41212,37.7951,0.0,193.0,6003,0
61481,1506,01/06/2016 14:41:01,15,-122.41223,37.79501,0.0,268.0,6003,0
61482,1506,01/06/2016 14:59:01,15,-122.41222,37.79498,0.0,0.0,6003,0
61483,1506,01/06/2016 15:09:31,15,-122.41223,37.79501,0.0,67.0,6003,0
61484,1506,01/06/2016 15:14:01,15,-122.41226,37.79498,0.0,0.0,6003,0
61485,1506,01/06/2016 15:21:31,15,-122.41205,37.79475,0.0,44.0,6003,0
61486,1506,01/06/2016 15:30:31,15,-122.41218,37.79455,0.0,133.0,6003,0


In [184]:
print trace.REPORT_TIME.values

['01/06/2016 09:16:23' '01/06/2016 14:29:01' '01/06/2016 14:32:01'
 '01/06/2016 14:36:31' '01/06/2016 14:41:01' '01/06/2016 14:59:01'
 '01/06/2016 15:09:31' '01/06/2016 15:14:01' '01/06/2016 15:21:31'
 '01/06/2016 15:30:31' '01/06/2016 15:42:31' '01/06/2016 15:44:01'
 '01/06/2016 15:51:52' '01/06/2016 16:44:06' '01/06/2016 16:51:22'
 '01/06/2016 16:52:05' '01/06/2016 17:06:22' '01/06/2016 17:12:22'
 '01/06/2016 17:42:34' '01/06/2016 17:50:04' '01/06/2016 18:02:04'
 '01/06/2016 18:06:34' '01/06/2016 18:11:04' '01/06/2016 18:15:34'
 '01/06/2016 18:36:45' '01/06/2016 18:38:04' '01/06/2016 18:48:34'
 '01/06/2016 18:49:37' '01/06/2016 18:53:04' '01/06/2016 19:00:34'
 '01/06/2016 19:01:38' '01/06/2016 19:05:04' '01/06/2016 19:08:04'
 '01/06/2016 19:09:51' '01/06/2016 19:11:04' '01/06/2016 19:15:34'
 '01/06/2016 19:15:45' '01/06/2016 19:20:04' '01/06/2016 19:24:34'
 '01/06/2016 19:30:34']


In [182]:
trace_trips = []
for pt in trace.iterrows():
    pt_time = pt[1].REPORT_TIME[-8:]
    for i, trip in enumerate(block_trip_info):
        if (pt_time >= trip[1]) and (pt_time <= trip[2]):
            trace_trips.append(trip[0])
            break
        elif (pt_time >= trip[2]) and (pt_time <= block_trip_info[i+1][1]):
            trace_trips.append(trip[0])
            break

In [183]:
trace_trips

[6722281,
 6722163,
 6722163,
 6722163,
 6722163,
 6721970,
 6721970,
 6721970,
 6721970,
 6722045,
 6722045,
 6722045,
 6722045,
 6721806,
 6722193,
 6722193,
 6722193,
 6722193,
 6721818,
 6721818,
 6721818,
 6721818,
 6722204,
 6722204,
 6722204,
 6722204,
 6721979,
 6721979,
 6721979,
 6721979,
 6721979,
 6721979,
 6721979,
 6721979,
 6721979,
 6721979,
 6721979,
 6721979,
 6722214,
 6722214]

In [102]:
shapes[shapes.shape_id == 133416]

Unnamed: 0,shape_id,shape_pt_lon,shape_pt_lat,shape_pt_sequence,shape_dist_traveled
28311,133416,-122.421223,37.807025,1,0
28312,133416,-122.420569,37.806651,2,71
28313,133416,-122.420375,37.805714,3,177
28314,133416,-122.420272,37.805232,4,231
28315,133416,-122.42018,37.804786,5,281
28316,133416,-122.420066,37.804232,6,343
28317,133416,-122.419997,37.803884,7,382
28318,133416,-122.419803,37.802921,8,491
28319,133416,-122.41962,37.801993,9,596
28320,133416,-122.419425,37.801056,10,702


In [103]:
shapes.head()

Unnamed: 0,shape_id,shape_pt_lon,shape_pt_lat,shape_pt_sequence,shape_dist_traveled
0,133138,-122.446793,37.787256,1,0
1,133138,-122.448467,37.787042,2,149
2,133138,-122.450106,37.786828,3,295
3,133138,-122.450221,37.78681,4,305
4,133138,-122.451757,37.786614,5,442
