In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import cPickle as pickle
from model import utilities as ut

# Reading in static schedule data
stops_full = pd.read_csv('data/google_transit/stops.txt', index_col='stop_id')
routes = pd.read_csv('data/google_transit/routes.txt', index_col='route_id')
trips = pd.read_csv('data/google_transit/trips.txt', index_col='trip_id')
stop_times = pd.read_csv('data/google_transit/stop_times.txt')
shapes = pd.read_csv('data/google_transit/shapes.txt')

# Some of these stops are named "Not a public stop" but are still in trips.
# Luckily, in the few trips they appear in, they're only either at the
# beginning or the end, so we can remove them now and we'll still build
# a nice graph with the connections we expect.
stops = stops_full[~stops_full.index.isin([7520, 7530, 7531, 7540])]
stop_times = stop_times[~stop_times['stop_id'].isin([7520, 7530, 7531, 7540])]

# Oh and some stops are in stops.txt but not used in trips... let's remove 'em
used_stops = set(stop_times['stop_id'].unique())
stops = stops[stops.index.isin(used_stops)]

# Let's make some sorted stop-timepoint lists for each stop_id to
# make lookup faster for things
all_stop_timepoints = {}
for stopid in used_stops:
    node_names = stop_times[stop_times['stop_id'] == stopid].\
                    apply(lambda x: '{0}_{1}'.\
                              format(stopid, x['arrival_time']),\
                          axis=1)
    all_stop_timepoints[stopid] = sorted(list(set(node_names)))

In [29]:
blocks = pd.read_csv('../project/lookUpBlockIDToBlockNumNam.csv')

In [5]:
open?

In [7]:
with open('../project/test3.csv', 'r+') as f:
    foo = f.readlines()
    foo[0] = foo[0][:89] + '\n' + foo[0][89:]
    f.seek(0)
    f.write(''.join(foo))

In [8]:
raw_test = pd.read_csv('../project/test1.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [28]:
raw_test.iloc[800:822]

Unnamed: 0,REV,REPORT_TIME,VEHICLE_TAG,LONGITUDE,LATITUDE,SPEED,HEADING,TRAIN_ASSIGNMENT,PREDICTABLE
800,1506,01/06/2016 17:39:15,5,-122.42088,37.80689,0.0,307.0,6007.0,0
801,1506,01/06/2016 17:48:15,5,-122.42089,37.8069,0.0,307.0,6007.0,0
802,1506,01/06/2016 17:51:15,5,-122.4209,37.80691,0.0,307.0,6007.0,0
803,1506,01/06/2016 18:13:45,5,-122.41218,37.79493,0.0,157.0,6007.0,0
804,1506,01/06/2016 18:24:15,5,-122.41186,37.79477,0.0,49.0,6007.0,0
805,1506,01/06/2016 18:37:45,5,-122.40996,37.79467,0.0,81.0,,0
806,1506,01/06/2016 18:51:37,5,-122.41269,37.80097,2.5,351.0,5903.0,1
807,1506,01/06/2016 19:04:45,5,-122.41509,37.80463,0.0,352.0,5903.0,0
808,1506,01/06/2016 19:13:45,5,-122.41519,37.80458,0.0,352.0,5903.0,0
809,1506,01/06/2016 19:18:15,5,-122.41515,37.80471,0.0,355.0,5903.0,0


In [10]:
raw_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304830 entries, 0 to 304829
Data columns (total 9 columns):
REV                 304830 non-null int64
REPORT_TIME         304830 non-null object
VEHICLE_TAG         304830 non-null object
LONGITUDE           304830 non-null float64
LATITUDE            304830 non-null float64
SPEED               304830 non-null float64
HEADING             304830 non-null float64
TRAIN_ASSIGNMENT    239082 non-null object
PREDICTABLE         304830 non-null int64
dtypes: float64(4), int64(2), object(3)
memory usage: 20.9+ MB


In [11]:
raw_test.VEHICLE_TAG.unique()

array([1, 2, 3, ..., 'T445', 'T448', 'T727'], dtype=object)

In [12]:
raw_test3 = pd.read_csv('../project/test3.csv')

In [13]:
raw_test3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 304830 entries, 0 to 304829
Data columns (total 9 columns):
REV                 304830 non-null int64
REPORT_TIME         304830 non-null object
VEHICLE_TAG         304830 non-null object
LONGITUDE           304830 non-null float64
LATITUDE            304830 non-null float64
SPEED               304830 non-null float64
HEADING             304830 non-null float64
TRAIN_ASSIGNMENT    239082 non-null object
PREDICTABLE         304830 non-null int64
dtypes: float64(4), int64(2), object(3)
memory usage: 20.9+ MB


In [14]:
routes.head()

Unnamed: 0_level_0,agency_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color
route_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
11047,SFMTA,1,CALIFORNIA,,3,,,
1033,SFMTA,1AX,CALIFORNIA A EXPRESS,,3,,,
1034,SFMTA,1BX,CALIFORNIA B EXPRESS,,3,,,
1031,SFMTA,31AX,BALBOA A EXPRESS,,3,,,
1032,SFMTA,31BX,BALBOA B EXPRESS,,3,,,


In [16]:
raw_test3.TRAIN_ASSIGNMENT.unique()

array([nan, '6003', '6004', ..., 905.0, 2905.0, 8888.0], dtype=object)

In [18]:
trips.head()

Unnamed: 0_level_0,route_id,service_id,trip_headsign,direction_id,block_id,shape_id
trip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6858659,11047,2,Geary + 33rd Avenue,0,115,135594
7048008,11047,1,Geary + 33rd Avenue,0,110,135595
7048006,11047,1,Geary + 33rd Avenue,0,116,135595
7048005,11047,1,Geary + 33rd Avenue,0,103,135595
7048004,11047,1,Geary + 33rd Avenue,0,113,135595


In [22]:
routes.loc[1032]

agency_id                      SFMTA
route_short_name                31BX
route_long_name     BALBOA B EXPRESS
route_desc                          
route_type                         3
route_url                           
route_color                         
route_text_color                    
Name: 1032, dtype: object

In [30]:
blocks.head()

Unnamed: 0,SIGNID,BLOCKID,BLOCKNUM,BLOCKNAME
0,69,332917,107,107
1,69,332918,108,108
2,69,332919,109,109
3,69,332920,110,110
4,69,332921,102,102


In [31]:
blocks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71670 entries, 0 to 71669
Data columns (total 4 columns):
SIGNID       71670 non-null int64
BLOCKID      71670 non-null int64
BLOCKNUM     71670 non-null int64
BLOCKNAME    67983 non-null object
dtypes: int64(3), object(1)
memory usage: 2.2+ MB


In [32]:
len(blocks.BLOCKID.unique())

71670

In [33]:
len(blocks.BLOCKNUM.unique())

2491