In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import cPickle as pickle
from model.utilities import haversine, diff_timestamps

# Reading in static schedule data
stops_full = pd.read_csv('data/google_transit/stops.txt', index_col='stop_id')
routes = pd.read_csv('data/google_transit/routes.txt', index_col='route_id')
trips = pd.read_csv('data/google_transit/trips.txt', index_col='trip_id')
stop_times = pd.read_csv('data/google_transit/stop_times.txt')
shapes = pd.read_csv('data/google_transit/shapes.txt')

# Some of these stops are named "Not a public stop" but are still in trips.
# Luckily, in the few trips they appear in, they're only either at the
# beginning or the end, so we can remove them now and we'll still build
# a nice graph with the connections we expect.
stops = stops_full[~stops_full.index.isin([7520, 7530, 7531, 7540])]
stop_times = stop_times[~stop_times['stop_id'].isin([7520, 7530, 7531, 7540])]

# Oh and some stops are in stops.txt but not used in trips... let's remove 'em
used_stops = set(stop_times['stop_id'].unique())
stops = stops[stops.index.isin(used_stops)]

In [3]:
stops.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3566 entries, 390 to 7854
Data columns (total 6 columns):
stop_name    3566 non-null object
stop_desc    3566 non-null object
stop_lat     3566 non-null float64
stop_lon     3566 non-null float64
zone_id      3566 non-null object
stop_url     3566 non-null object
dtypes: float64(2), object(4)
memory usage: 195.0+ KB


In [2]:
G_x = nx.read_gpickle('graph_x_4.gpkl')

In [3]:
G_x.number_of_edges()

1494706

In [4]:
G_x.number_of_nodes()

749075

In [5]:
sorted_nodes = sorted(G_x.nodes())

In [68]:
sorted_nodes[134400:134413]

['390_09:16:00',
 '390_09:22:00',
 '390_09:26:00',
 '390_09:28:00',
 '390_09:30:00',
 '390_09:32:00',
 '390_09:36:00',
 '390_09:40:00',
 '390_09:42:00',
 '390_09:46:00',
 '390_09:53:00',
 '390_09:55:00',
 '390_09:56:00']

In [40]:
len(stop_times['stop_id'].unique())

3566

In [41]:
stops.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4623 entries, 98 to 7857
Data columns (total 6 columns):
stop_name    4623 non-null object
stop_desc    4623 non-null object
stop_lat     4623 non-null float64
stop_lon     4623 non-null float64
zone_id      4623 non-null object
stop_url     4623 non-null object
dtypes: float64(2), object(4)
memory usage: 252.8+ KB


In [7]:
#foot speeds in m/s
walk_speed = 1.39
run_speed = 4.47
sprint_speed = 6.7

In [29]:
stop_times[stop_times['stop_id'] == 98]

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled


In [9]:
dits = stops.apply(lambda x: haversine(x['stop_lon'], x['stop_lat'], -122.401225,37.789255), axis=1)

In [10]:
dits.head()

stop_id
98     0.000000
100    0.820693
202    0.951172
210    7.185659
383    7.185659
dtype: float64

In [11]:
np.ceil((1000*dits.head()/walk_speed))

stop_id
98        0.0
100     591.0
202     685.0
210    5170.0
383    5170.0
dtype: float64

In [20]:
for n, d in G_x.nodes_iter(data=True):
    if d['stop_id'] == 4506:
        print n

4506_13:52:47
4506_11:21:48
4506_13:39:48
4506_17:11:48
4506_22:33:05
4506_15:02:47
4506_08:17:26
4506_08:47:05
4506_19:18:27
4506_12:38:48
4506_11:42:47
4506_12:45:48
4506_17:23:48
4506_16:37:47
4506_11:59:48
4506_23:39:44
4506_17:09:27
4506_12:01:47
4506_25:12:06
4506_20:10:27
4506_19:42:26
4506_13:54:48
4506_20:40:27
4506_14:08:48
4506_24:07:06
4506_22:07:27
4506_10:32:47
4506_13:57:48
4506_06:55:05
4506_23:36:27
4506_13:29:48
4506_08:58:27
4506_11:57:48
4506_18:25:26
4506_07:26:27
4506_16:21:48
4506_16:43:47
4506_16:43:48
4506_16:32:47
4506_17:11:47
4506_22:13:06
4506_22:19:27
4506_09:57:27
4506_17:36:48
4506_09:32:26
4506_21:02:48
4506_19:52:26
4506_20:29:48
4506_19:33:06
4506_22:57:27
4506_10:32:26
4506_13:41:26
4506_25:03:06
4506_15:01:26
4506_08:14:48
4506_07:05:05
4506_16:22:47
4506_16:16:27
4506_20:31:27
4506_17:38:47
4506_11:41:26
4506_15:54:47
4506_14:33:48
4506_15:31:26
4506_12:09:48
4506_19:24:49
4506_21:29:48
4506_18:33:27
4506_11:01:27
4506_21:27:05
4506_11:33:48
4506_1

In [42]:
G_x.subgraph?

In [None]:
# Adding edges between stops that are within walking distance (200 meters)
# iterate over every stop
for row in stops.iterrows():
    stop_id = row[0]
    stop_info = row[1]
    
    # calculate distance between this stop and every other stop
    dists = stops.apply(lambda x: haversine(x['stop_lon'], \
                        x['stop_lat'], stop_info['stop_lon'],\
                        stop_info['stop_lat']), axis=1)
    
    # only keep the stops within 200 meters
    dists = dists[dists <= 0.2]
    
    # don't connect the stop to itself, duh
    dists = dists.drop(row[0])
    
    stop_timepoints = (n for n,d in G_x.nodes_iter(data=True) if d['stop_id'] == stop_id)
    # for every closest stop, add an edge between each stop-timepoint
    # and each walkable stop-timepoint
    for close_stop in dists.iteritems():
        close_stop_id = close_stop[0]
        
        for n, d in G_x.nodes_iter(data=True):
            if d['stop_id'] == close_stop_id:
                if G.has_edge(stop['stop_id'], close_stop_id):
                    G[stop['stop_id']][close_stop_id]['d'] = close_stop[1]
                else:
                    G.add_edge(stop['stop_id'], close_stop_id , {'d':close_stop[1]})

In [13]:
foo = sorted(list(set(stop_times[stop_times['stop_id'] == 4555].\
            apply(lambda x: '{0}_{1}'.format(x['stop_id'], x['arrival_time']), axis=1))))

In [14]:
foo

['4555_05:28:00',
 '4555_05:28:48',
 '4555_05:48:00',
 '4555_05:48:48',
 '4555_06:09:00',
 '4555_06:29:00',
 '4555_06:49:00',
 '4555_07:04:00',
 '4555_07:09:00',
 '4555_07:19:00',
 '4555_07:29:00',
 '4555_07:34:13',
 '4555_07:47:00',
 '4555_07:49:13',
 '4555_08:04:13',
 '4555_08:07:00',
 '4555_08:18:13',
 '4555_08:27:00',
 '4555_08:33:13',
 '4555_08:47:00',
 '4555_08:48:13',
 '4555_09:02:13',
 '4555_09:07:00',
 '4555_09:17:13',
 '4555_09:27:00',
 '4555_09:32:13',
 '4555_09:47:00',
 '4555_09:47:13',
 '4555_10:02:13',
 '4555_10:06:00',
 '4555_10:17:13',
 '4555_10:21:00',
 '4555_10:32:13',
 '4555_10:36:00',
 '4555_10:47:13',
 '4555_10:51:00',
 '4555_11:02:13',
 '4555_11:06:00',
 '4555_11:17:13',
 '4555_11:21:00',
 '4555_11:32:13',
 '4555_11:36:00',
 '4555_11:47:13',
 '4555_11:51:00',
 '4555_12:02:13',
 '4555_12:06:00',
 '4555_12:17:13',
 '4555_12:21:13',
 '4555_12:32:13',
 '4555_12:36:13',
 '4555_12:47:13',
 '4555_12:51:13',
 '4555_13:02:13',
 '4555_13:06:13',
 '4555_13:17:13',
 '4555_13:

In [8]:
all_stop_timepoints = {}
for stopid in used_stops:
    all_stop_timepoints[stopid] = sorted([n for n in if d['stop_id'] == stopid])

SyntaxError: invalid syntax (<ipython-input-8-5e55f53ae58c>, line 3)