In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier

from urllib.request import Request, urlopen  # Python 3
import json
import os.path

from datetime import datetime

In [5]:
API_Key = open("../variables/google_directions_query.txt", 'r').read()

In [13]:
# Load the data sets
data_dir = './data/'
df_Trip = pd.read_csv(data_dir + 'Trip_Household_Merged.csv')    
df_Trip = df_Trip[(df_Trip["origin_lat"] < 48) & (df_Trip["origin_lat"] > 47)]
df_Trip = df_Trip[(df_Trip["dest_lat"] < 48) & (df_Trip["dest_lat"] > 47)]
df_Trip = df_Trip[(df_Trip["dest_lng"] > -123) & (df_Trip["dest_lng"] < -122)]
df_Trip = df_Trip[(df_Trip["origin_lng"] > -123) & (df_Trip["origin_lng"] < -122)]
df_Trip['bg_origin'] = df_Trip['bg_origin'].astype(str)

df_Blockgroups = pd.read_csv(data_dir + 'SeattleCensusBlocksandNeighborhoodCorrelationFile.csv') 
df_Blockgroups['BLOCKGROUP'] = df_Blockgroups['BLOCKGROUP'].astype(str)

#print (df_Blockgroups)

#print (list(df_Trip.columns.values))
print (df_Trip)

df_Merged= pd.merge(left=df_Trip, right=df_Blockgroups, how='inner', left_on='bg_dest', right_on='BLOCKGROUP')
print (list(df_Merged.columns.values))

print (df_Merged)


  interactivity=interactivity, compiler=compiler, result=result)


             tripid      hhid  origin_lat  origin_lng   dest_lat    dest_lng  \
1573   1.710000e+12  17136194   47.365692 -122.019256  47.372828 -122.000661   
1574   1.710000e+12  17136194   47.365692 -122.019256  47.372828 -122.000661   
1575   1.710000e+12  17104125   47.347689 -122.028953  47.372828 -122.000661   
1577   1.710000e+12  17141466   47.341028 -122.053314  47.356041 -122.002411   
1578   1.710000e+12  17141466   47.307718 -122.230442  47.356041 -122.002411   
1579   1.720000e+12  17150556   47.618090 -122.055470  47.581400 -122.003200   
1580   1.710000e+12  17117293   47.463440 -122.110580  47.257730 -122.004030   
1582   1.710000e+12  17115594   47.756960 -122.347550  47.259610 -122.004260   
1584   1.710000e+12  17112690   47.858809 -122.011849  47.851652 -122.004467   
1585   1.710000e+12  17112690   47.799965 -122.109273  47.851652 -122.004467   
1586   1.710000e+12  17112690   47.799965 -122.109273  47.851652 -122.004467   
1587   1.710000e+12  17112690   47.64036

In [14]:
# Calculate the distance (and travel time) to each destination 
df_Polylines = pd.DataFrame()
    
Latitude = []
Longitude = []
SortOrder = []
End = []
SegmentID = []
Mode = []
Travelers = []
Depart_time_timestamp = []
Bg_origin = []
Bg_dest = []

tripID = 0
sortOrder = 0
segmentID = 0

for index, row in df_Merged.iterrows():
    # Build the Origin and Destination strings
    Origin = str(row["origin_lat"]) + "," + str(row["origin_lng"])
    Destination = str(row["dest_lat"]) + "," + str(row["dest_lng"])
    URL = "https://maps.googleapis.com/maps/api/directions/json?&origin=" + Origin + \
                    "&destination=" + Destination + "&key=" + API_Key
    #print (URL)
    q = Request(URL)
    a = urlopen(q).read()
    data = json.loads(a)
    sortOrder = 0  
        
    if 'errorZ' in data:
        print (data["error"])
        
    polyline = []
    polyline =  decode_polyline(data['routes'][0]['overview_polyline']['points'])
    

    startLat = 0
    endLat = 0
    startLong = 0
    endLong = 0
    
    for i, coordinate in enumerate(polyline):
        
        # first point
        if i == 0:
            startLat = coordinate[0]
            startLong = coordinate[1]
            
            Latitude.append(startLat)
            Longitude.append(startLong)
            SortOrder.append(0)
            Depart_time_timestamp.append(row['depart_time_timestamp'])
            Travelers.append(row['travelers_total'])
            SegmentID.append(segmentID)
            Bg_origin.append(row['bg_origin'])
            Bg_dest.append(row['bg_dest'])

        
        # not last point
        if i > 0 and i < len(polyline):

            # end of point
            Latitude.append(coordinate[0])
            Longitude.append(coordinate[1])
            SortOrder.append(1)
            Depart_time_timestamp.append(row['depart_time_timestamp'])
            Travelers.append(row['travelers_total'])
            SegmentID.append(segmentID)
            Bg_origin.append(row['bg_origin'])
            Bg_dest.append(row['bg_dest'])
            
            segmentID = segmentID + 1     
            
            # start of point         
            Latitude.append(coordinate[0])
            Longitude.append(coordinate[1])
            SortOrder.append(0)
            Depart_time_timestamp.append(row['depart_time_timestamp'])
            Travelers.append(row['travelers_total'])
            SegmentID.append(segmentID)
            Bg_origin.append(row['bg_origin'])
            Bg_dest.append(row['bg_dest'])
        
 
        # last point
        if i == len(polyline):
        
            Latitude.append(coordinate[0])
            Longitude.append(coordinate[1])
            SortOrder.append(1)
            Depart_time_timestamp.append(row['depart_time_timestamp'])
            Travelers.append(row['travelers_total'])
            SegmentID.append(segmentID)
            Bg_origin.append(row['bg_origin'])
            Bg_dest.append(row['bg_dest'])
            
    segmentID = segmentID + 1   
                    
df_Polylines['lat'] = Latitude
df_Polylines['lon'] = Longitude
df_Polylines['sort_order'] = SortOrder
df_Polylines['segmentID'] = SegmentID 
df_Polylines['travelers'] = Travelers
df_Polylines['bg_origin'] = Bg_origin
df_Polylines['bg_dest'] = Bg_dest

df_Polylines['depart_time_timestamp'] = Depart_time_timestamp
   

In [5]:
# Decode polyline
def decode_polyline(polyline_str):
    '''Pass a Google Maps encoded polyline string; returns list of lat/lon pairs'''
    index, lat, lng = 0, 0, 0
    coordinates = []
    changes = {'latitude': 0, 'longitude': 0}

    # Coordinates have variable length when encoded, so just keep
    # track of whether we've hit the end of the string. In each
    # while loop iteration, a single coordinate is decoded.
    while index < len(polyline_str):
        # Gather lat/lon changes, store them in a dictionary to apply them later
        for unit in ['latitude', 'longitude']: 
            shift, result = 0, 0

            while True:
                byte = ord(polyline_str[index]) - 63
                index+=1
                result |= (byte & 0x1f) << shift
                shift += 5
                if not byte >= 0x20:
                    break

            if (result & 1):
                changes[unit] = ~(result >> 1)
            else:
                changes[unit] = (result >> 1)

        lat += changes['latitude']
        lng += changes['longitude']

        coordinates.append((lat / 100000.0, lng / 100000.0))

    return coordinates


In [None]:

# Export to csv
if os.path.exists(data_dir + 'Living_Graph.csv'):
    df_Polylines.to_csv(data_dir + 'Living_Graph.csv', mode='a', header=False, index=False)
else:
    df_Polylines.to_csv(data_dir + 'Living_Graph.csv', mode='w', header=True, index=False)

    

In [16]:
df_Polylines.to_csv(data_dir + 'Living_Graph.csv', mode='w', header=True, index=False)

In [17]:
#df_Polylines = pd.read_csv(data_dir + 'Living_Graph.csv')    

df_Polylines['depart_time_timestamp'] = pd.to_datetime(df_Polylines['depart_time_timestamp'])
df_Polylines['pair'] = df_Polylines['lat'].astype(str) + df_Polylines['lon'].astype(str) + "-" + df_Polylines['sort_order'].astype(str)
df_Polylines['hour'] = df_Polylines.depart_time_timestamp.dt.hour

print (df_Polylines)

              lat        lon  sort_order  segmentID  travelers     bg_origin  \
0        47.49679 -122.22046           0          0          1  530330260024   
1        47.49681 -122.22081           1          0          1  530330260024   
2        47.49681 -122.22081           0          1          1  530330260024   
3        47.49715 -122.22082           1          1          1  530330260024   
4        47.49715 -122.22082           0          2          1  530330260024   
5        47.49719 -122.22077           1          2          1  530330260024   
6        47.49719 -122.22077           0          3          1  530330260024   
7        47.49722 -122.21999           1          3          1  530330260024   
8        47.49722 -122.21999           0          4          1  530330260024   
9        47.49806 -122.22023           1          4          1  530330260024   
10       47.49806 -122.22023           0          5          1  530330260024   
11       47.49891 -122.22051           1

In [18]:
#aggregate by urban village OD Pair
df_Polylines_Agg = df_Polylines.groupby(['pair','hour','sort_order'], as_index=False).agg({'travelers':sum,'lat':['max'],
                                                                             'lon':['max'],'segmentID':['max'],
                                                                            'bg_origin':['max'],'bg_dest':['max']})
df_Polylines_Agg.columns = df_Polylines_Agg.columns.droplevel(level=1)

print (df_Polylines.shape)
print (df_Polylines_Agg.shape)
                                                                              
print (df_Polylines_Agg.head(10))                                                         
                                                                              
                                                                    


(3950504, 10)
(1184822, 9)
                   pair  hour  sort_order  travelers       lat        lon  \
0   47.0303-122.83171-0  11.0           0          3  47.03030 -122.83171   
1  47.03083-122.83167-0  11.0           0          3  47.03083 -122.83167   
2  47.03083-122.83167-1  11.0           1          3  47.03083 -122.83167   
3  47.03164-122.83142-0  11.0           0          3  47.03164 -122.83142   
4  47.03164-122.83142-1  11.0           1          3  47.03164 -122.83142   
5    47.03181-122.891-0  16.0           0          1  47.03181 -122.89100   
6    47.03181-122.891-1  16.0           1          1  47.03181 -122.89100   
7  47.03182-122.89108-0  13.0           0          1  47.03182 -122.89108   
8  47.03182-122.89108-1  13.0           1          1  47.03182 -122.89108   
9  47.03184-122.89117-0  16.0           0          2  47.03184 -122.89117   

   segmentID     bg_origin       bg_dest  
0     216886  530670113002  530330007001  
1     216887  530670113002  53033000700

In [19]:
df_Polylines_Agg.to_csv(data_dir + 'Living_Graph.csv', mode='w', header=True, index=False)