In [204]:
import pandas as pd
import math
import datetime

In [205]:
def decode(point_str):
    '''Turn a Google-encoded polyline string into a list of coordinates.
    http://code.google.com/apis/maps/documentation/polylinealgorithm.html

    Each character is shifted down by 63 and then carries five payload bits
    plus a continuation flag (0x20).  Payloads are assembled, least
    significant first, into signed offsets scaled by 1e5.  The offsets are
    consumed in (latitude, longitude) pairs and summed into running totals.

    Pairs whose two offsets are both zero are skipped, an unpaired trailing
    offset is ignored, and a trailing value whose last character still has
    the continuation flag set is discarded.

    :param point_str: Encoded polyline string.
    :type point_str: string
    :returns: List of 2-tuples where each tuple is (latitude, longitude)
    :rtype: list

    '''
    # one coordinate offset is spread over one or more 5-bit payloads; a
    # character with the continuation flag cleared closes the current offset
    groups = []
    pending = []
    for symbol in point_str:
        raw = ord(symbol) - 63
        pending.append(raw & 0x1F)
        if not raw & 0x20:
            groups.append(pending)
            pending = []
    # whatever is left in ``pending`` never saw a terminator and is dropped

    offsets = []
    for group in groups:
        packed = 0
        for position, bits in enumerate(group):
            packed |= bits << (5 * position)

        # the lowest bit marks a negative offset, which is stored inverted
        if packed & 0x1:
            packed = ~packed
        offsets.append((packed >> 1) / 100000.0)

    # walk the flat offset list two at a time and accumulate absolute values
    points = []
    lat_total = 0
    lng_total = 0
    for lat_step, lng_step in zip(offsets[0::2], offsets[1::2]):
        if lat_step == 0 and lng_step == 0:
            continue

        lng_total += lng_step
        lat_total += lat_step
        # rounding to 6 digits removes the float noise picked up while summing
        points.append((round(lat_total, 6), round(lng_total, 6)))

    return points

In [206]:
def create_carto_df(df):
    '''Expand each route into one timestamped row per polyline point and
    scheduled departure.

    For every row of ``df`` the ``Polyline`` value is decoded once.  The
    'Start' endpoint walks the decoded points in order, the 'End' endpoint
    walks them reversed.  Each point is given the fraction of the route's
    cumulative straight-line (lat/lng) length reached at that point, and its
    timestamp is the departure time plus that fraction of the endpoint's
    duration, interpreted as minutes.

    Schedule columns are the columns of ``df`` whose name contains the
    endpoint name and a '-'.  The i-th such column is mapped to the date
    2017-05-(1 + i).  Each cell is a bracketed, comma separated list of
    'HH:MM' strings; missing cells and blank entries are skipped.

    :param df: Route table with ``Route``, ``Polyline``, ``Start Duration``
        and ``End Duration`` columns plus the schedule columns.
    :type df: pandas.DataFrame
    :returns: Frame with columns lat, lng, timestamp, Route, Duration and a
        fresh 0..n-1 index (empty, with those columns, when nothing is
        scheduled).
    :rtype: pandas.DataFrame

    '''
    output_columns = ['lat', 'lng', 'timestamp', 'Route', 'Duration']
    endpoints = ['Start', 'End']

    # The schedule columns depend only on the header, so resolve them once
    # instead of once per row.
    day_columns_by_endpoint = {}
    for endpoint in endpoints:
        day_columns_by_endpoint[endpoint] = [
            col for col in df.columns if endpoint in col and '-' in col
        ]

    # Collect the per-departure frames and concatenate a single time at the
    # end; concatenating inside the loop re-copies everything built so far.
    frames = []
    total_rows = df.shape[0]

    for position, (_, row) in enumerate(df.iterrows(), 1):
        # Both endpoints share the same geometry, so decode only once.
        decoded_points = decode(row['Polyline'])
        route = row['Route']

        for endpoint in endpoints:
            if endpoint == 'Start':
                geocode_points = decoded_points
            else:
                geocode_points = decoded_points[::-1]
            if not geocode_points:
                # Nothing to place on the map for an empty polyline.
                continue

            df_route = pd.DataFrame(geocode_points, columns=['lat', 'lng'])

            # Distance from the previous point; the first point has no
            # predecessor (NaN from diff) and therefore contributes 0.
            squared_step = df_route['lat'].diff() ** 2 + df_route['lng'].diff() ** 2
            step = squared_step.fillna(0).apply(math.sqrt)

            total_length = step.sum()
            if total_length > 0:
                dist_prop = 1.0 * step.cumsum() / total_length
            else:
                # All points coincide: avoid 0/0 and keep every point at the
                # departure time.
                dist_prop = step * 0.0

            duration = row[endpoint + ' Duration']

            for i, day in enumerate(day_columns_by_endpoint[endpoint]):
                cell = row[day]
                if pd.isnull(cell):
                    # Empty CSV cells are read as NaN: no departures that day.
                    continue

                for time in cell.replace('[', '').replace(']', '').split(','):
                    time = time.strip()
                    if time == '':
                        continue

                    time_parts = time.split(':')
                    hour = int(time_parts[0])
                    minute = int(time_parts[1])
                    start_time = datetime.datetime(2017, 5, (1 + i), hour, minute, 0)

                    df_route_time = df_route.copy()
                    df_route_time['timestamp'] = [
                        start_time + datetime.timedelta(minutes=(prop * duration))
                        for prop in dist_prop
                    ]
                    df_route_time['Route'] = route
                    df_route_time['Duration'] = duration
                    frames.append(df_route_time)

        print("{0} out of {1} Complete".format(position, total_rows))

    if not frames:
        return pd.DataFrame(columns=output_columns)
    return pd.concat(frames, ignore_index=True)[output_columns]

In [207]:
def records_to_filter(row):
    '''Decide whether a generated record should be removed.

    A record is flagged when its timestamp falls after day 7 of the month,
    or when it belongs to one of the listed routes and falls on day 1
    before 04:00.

    :param row: Mapping-like record exposing ``Route`` and ``timestamp``;
        the timestamp must provide ``day`` and ``hour`` attributes.
    :returns: True when the record should be dropped, False otherwise.
    :rtype: bool

    '''
    early_cutoff_routes = ['Capitol Corridor', 'Texas Eagle', 'Pacific Surfliner']
    on_listed_route = row['Route'] in early_cutoff_routes
    stamp = row['timestamp']

    before_four_on_day_one = stamp.day == 1 and stamp.hour < 4
    past_day_seven = stamp.day > 7
    return bool((on_listed_route and before_four_on_day_one) or past_day_seven)

In [208]:
# Load the route table.  create_carto_df reads its 'Route', 'Polyline',
# 'Start Duration' / 'End Duration' columns and the schedule columns whose
# names contain 'Start' or 'End' together with a '-'.
df = pd.read_csv('Routes Polylines and Times.csv')

In [209]:
# Expand every route into one row per polyline point and scheduled departure;
# the function prints one progress line per input row.
carto_df = create_carto_df(df)

1 out of 61 Complete
2 out of 61 Complete
3 out of 61 Complete
4 out of 61 Complete
5 out of 61 Complete
6 out of 61 Complete
7 out of 61 Complete
8 out of 61 Complete
9 out of 61 Complete
10 out of 61 Complete
11 out of 61 Complete
12 out of 61 Complete
13 out of 61 Complete
14 out of 61 Complete
15 out of 61 Complete
16 out of 61 Complete
17 out of 61 Complete
18 out of 61 Complete
19 out of 61 Complete
20 out of 61 Complete
21 out of 61 Complete
22 out of 61 Complete
23 out of 61 Complete
24 out of 61 Complete
25 out of 61 Complete
26 out of 61 Complete
27 out of 61 Complete
28 out of 61 Complete
29 out of 61 Complete
30 out of 61 Complete
31 out of 61 Complete
32 out of 61 Complete
33 out of 61 Complete
34 out of 61 Complete
35 out of 61 Complete
36 out of 61 Complete
37 out of 61 Complete
38 out of 61 Complete
39 out of 61 Complete
40 out of 61 Complete
41 out of 61 Complete
42 out of 61 Complete
43 out of 61 Complete
44 out of 61 Complete
45 out of 61 Complete
46 out of 61 Comple

In [210]:
# Report the (rows, columns) size of the generated frame, then display its
# first rows as the cell output.
print(carto_df.shape)
carto_df.head()

(547002, 5)


Unnamed: 0,lat,lng,timestamp,Route,Duration
0,40.75033,-73.99446,2017-05-01 08:15:00.000000,Adirondack,656
1,40.75432,-74.00292,2017-05-01 08:16:02.461654,Adirondack,656
2,40.75941,-73.99681,2017-05-01 08:16:55.565584,Adirondack,656
3,40.77389,-73.98985,2017-05-01 08:18:42.849384,Adirondack,656
4,40.82071,-73.95887,2017-05-01 08:24:57.748478,Adirondack,656


In [211]:
# Flag each record with records_to_filter, keep only the unflagged ones and
# remove the helper column again.
carto_df['to_filter'] = carto_df.apply(records_to_filter, axis = 1)
carto_df = carto_df[carto_df['to_filter'].eq(False)].drop('to_filter', axis = 1)

In [212]:
# Write the current carto_df to CSV, leaving out the index column.
carto_df.to_csv('amtrak_carto_full.csv', index = False)