In [1]:
from google.transit import gtfs_realtime_pb2
from google.protobuf.json_format import MessageToDict
from pandas.io.json import json_normalize

import json
import requests
import numpy as np
import pandas as pd
import datetime
from datetime import timedelta 
import os, time
import pytz

from elasticsearch import Elasticsearch

# Trial 2 - Adding Elasticsearch Functionality

## Helper Functions

In [2]:
def get_feed(url='http://files.transport.act.gov.au/feeds/lightrail.pb', timeout=30):
    """Download a GTFS-realtime feed and return it as a plain dict.

    Parameters
    ----------
    url : str
        Location of the protobuf feed. Defaults to the light rail feed
        this notebook was written against.
    timeout : float
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    dict
        The parsed ``FeedMessage`` converted with ``MessageToDict``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    # get the response from the api
    response = requests.get(url, allow_redirects=True, timeout=timeout)

    # fail loudly on an HTTP error instead of handing an error page to the parser
    response.raise_for_status()

    # initialise the feed message parser from Google and pass it the payload
    feed = gtfs_realtime_pb2.FeedMessage()
    feed.ParseFromString(response.content)

    # convert to dict from our original protobuf feed
    return MessageToDict(feed)

In [3]:
def get_vehicles(feed_obj):
    """Return the feed entities that carry a vehicle position.

    Parameters
    ----------
    feed_obj : dict or None
        Feed converted to a dict (as returned by ``get_feed``).

    Returns
    -------
    list or None
        ``None`` when ``feed_obj`` is ``None`` or empty; otherwise the list
        of entities containing a ``'vehicle'`` key. The list is empty when
        the feed has no such entities or no ``'entity'`` key at all.
    """
    # an empty (or missing) feed is reported as None
    if not feed_obj:
        return None

    # a feed may have no 'entity' key; treat that as no entities rather than
    # raising KeyError
    entities = feed_obj.get('entity', [])

    # keep only the vehicle positions listed on the feed
    return [entity for entity in entities if 'vehicle' in entity]

In [4]:
def get_updates(feed_obj):
    """Return the feed entities that carry a trip update.

    Parameters
    ----------
    feed_obj : dict or None
        Feed converted to a dict (as returned by ``get_feed``).

    Returns
    -------
    list or None
        ``None`` when ``feed_obj`` is ``None`` or empty; otherwise the list
        of entities containing a ``'tripUpdate'`` key. The list is empty when
        the feed has no such entities or no ``'entity'`` key at all.
    """
    # an empty (or missing) feed is reported as None
    if not feed_obj:
        return None

    # a feed may have no 'entity' key; treat that as no entities rather than
    # raising KeyError
    entities = feed_obj.get('entity', [])

    # get the trip updates listed on the feed
    return [entity for entity in entities if 'tripUpdate' in entity]

In [5]:
def updates_to_dataframe(updates):
    """Flatten trip-update entities into a DataFrame, one row per entity.

    Only the FIRST stop-time update of each entity is kept. Its fields are
    flattened into columns, and the entity's trip id, timestamp, delay and
    id are appended as ``tripUpdate.trip.tripId``, ``tripUpdate.timestamp``,
    ``tripUpdate.delay`` and ``id``.

    The epoch-second columns ``arrival.time``, ``departure.time`` and
    ``tripUpdate.timestamp`` are converted to naive datetimes in the
    machine's local timezone (``datetime.fromtimestamp`` semantics).
    Missing values become ``NaT`` instead of raising.

    Parameters
    ----------
    updates : list of dict or None
        Entities as returned by ``get_updates``.

    Returns
    -------
    pandas.DataFrame
        One row per entity; an empty frame when ``updates`` is ``None`` or
        empty.
    """
    # json_normalize lives at the top level in current pandas; fall back to
    # the older pandas.io.json location only when that is all there is.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        from pandas.io.json import json_normalize as normalize

    if updates is None:
        updates = []
    print("length of updates: {}".format(len(updates))) # debug: print number of updates in the feed
    if not updates:
        return pd.DataFrame()

    def first_stop(stop_updates):
        # entities without stop-time updates contribute an all-missing row
        if isinstance(stop_updates, list) and stop_updates:
            return stop_updates[0]
        return {}

    def to_local_datetime(epoch_seconds):
        # a stop may report only an arrival or only a departure
        if pd.isna(epoch_seconds):
            return pd.NaT
        return datetime.datetime.fromtimestamp(int(epoch_seconds))

    # transform feed to a dataframe
    df = normalize(updates)

    # format feed: flatten the first stop-time update of every entity
    if 'tripUpdate.stopTimeUpdate' in df.columns:
        first_stops = [first_stop(stops) for stops in df['tripUpdate.stopTimeUpdate']]
    else:
        first_stops = [{} for _ in range(len(df))]
    x = normalize(first_stops)

    # carry the entity-level fields over (kept in this order, appended last)
    for column in ('tripUpdate.trip.tripId', 'tripUpdate.timestamp', 'tripUpdate.delay', 'id'):
        x[column] = df[column] if column in df.columns else None

    # format date time and make sure the columns end up with a datetime dtype
    for column in ('arrival.time', 'departure.time', 'tripUpdate.timestamp'):
        if column in x.columns:
            x[column] = pd.to_datetime(x[column].apply(to_local_datetime))

    return x

In [6]:
def validate(updates):
    """Report whether ``updates`` holds a feed result.

    Prints ``'Feed is empty'`` and returns ``False`` when ``updates`` is
    ``None``; returns ``True`` for anything else (including an empty list).
    """
    has_feed = updates is not None
    if not has_feed:
        print('Feed is empty')
    return has_feed

## Get API Response

In [7]:
# Pull the live feed, keep its trip-update entities and flatten them
# into one row per entity.
raw_feed = get_feed()
trip_updates = get_updates(raw_feed)
df = updates_to_dataframe(trip_updates)
df.head()

length of updates: 129


Unnamed: 0,arrival.delay,arrival.time,arrival.uncertainty,departure.delay,departure.time,departure.uncertainty,scheduleRelationship,stopId,stopSequence,tripUpdate.trip.tripId,tripUpdate.timestamp,tripUpdate.delay,id
0,179,2019-06-16 08:26:59,0,179,2019-06-16 08:26:59,0,SCHEDULED,8129,13,780,2019-06-16 11:50:15,0,16218343
1,26,2019-06-16 08:54:26,0,26,2019-06-16 08:54:26,0,SCHEDULED,8100,13,718,2019-06-16 11:50:15,0,16218344
2,49,2019-06-16 09:24:49,0,49,2019-06-16 09:24:49,0,SCHEDULED,8129,13,784,2019-06-16 11:50:15,0,16218345
3,22,2019-06-16 09:54:22,0,22,2019-06-16 09:54:22,0,SCHEDULED,8100,13,722,2019-06-16 11:50:15,0,16218346
4,23,2019-06-16 10:24:23,0,23,2019-06-16 10:24:23,0,SCHEDULED,8129,13,788,2019-06-16 11:50:15,0,16218347


In [8]:
# Rename by explicit mapping rather than by position: the order in which
# json_normalize emits columns is not guaranteed across pandas versions,
# and a positional assignment would silently attach the wrong label.
df = df.rename(columns={
    'arrival.delay': 'Arrival Delay',
    'arrival.time': 'Arrival Time',
    'arrival.uncertainty': 'Arrival Uncertainty',
    'departure.delay': 'Departure Delay',
    'departure.time': 'Departure Time',
    'departure.uncertainty': 'Departure Uncertainty',
    'scheduleRelationship': 'Schedule Relationship',
    'stopId': 'Stop ID',
    'stopSequence': 'Stop Sequence',
    'tripUpdate.trip.tripId': 'Trip ID',
    'tripUpdate.timestamp': 'Request Timestamp',
    'tripUpdate.delay': 'Delay',
    'id': 'Feed ID',
})

## Set Index to `Arrival Time`

In [9]:
# Read the clock once so both bounds are offsets from the same instant;
# the window is then exactly two minutes wide, centred on "now" (local time).
now = datetime.datetime.now()
start = now - datetime.timedelta(minutes=1)
end = now + datetime.timedelta(minutes=1)

In [10]:
# Reduce both bounds to 'HH:MM' strings, the form handed to between_time below.
# NOTE: this cell is not re-runnable on its own - after one run the names
# hold strings, so re-execute the previous cell first.
start = start.strftime('%H:%M')
end = end.strftime('%H:%M')

In [11]:
# Preview the first rows to check the renamed column headers.
df.head()

Unnamed: 0,Arrival Delay,Arrival Time,Arrival Uncertainty,Departure Delay,Departure Time,Departure Uncertainty,Schedule Relationship,Stop ID,Stop Sequence,Trip ID,Request Timestamp,Delay,Feed ID
0,179,2019-06-16 08:26:59,0,179,2019-06-16 08:26:59,0,SCHEDULED,8129,13,780,2019-06-16 11:50:15,0,16218343
1,26,2019-06-16 08:54:26,0,26,2019-06-16 08:54:26,0,SCHEDULED,8100,13,718,2019-06-16 11:50:15,0,16218344
2,49,2019-06-16 09:24:49,0,49,2019-06-16 09:24:49,0,SCHEDULED,8129,13,784,2019-06-16 11:50:15,0,16218345
3,22,2019-06-16 09:54:22,0,22,2019-06-16 09:54:22,0,SCHEDULED,8100,13,722,2019-06-16 11:50:15,0,16218346
4,23,2019-06-16 10:24:23,0,23,2019-06-16 10:24:23,0,SCHEDULED,8129,13,788,2019-06-16 11:50:15,0,16218347


In [12]:
# Show the formatted lower bound of the time window (an 'HH:MM' string).
start

'11:49'

In [13]:
# Keep only the rows whose arrival time-of-day falls inside [start, end].
# between_time needs a DatetimeIndex, so 'Arrival Time' is moved to the
# index for the filter and restored as a column afterwards.
print(start, end)
records = (
    df.set_index('Arrival Time')
      .between_time(start, end)
      .reset_index()
)
records

11:49 11:51


Unnamed: 0,Arrival Time,Arrival Delay,Arrival Uncertainty,Departure Delay,Departure Time,Departure Uncertainty,Schedule Relationship,Stop ID,Stop Sequence,Trip ID,Request Timestamp,Delay,Feed ID
0,2019-06-16 11:49:51,74,0,74,2019-06-16 11:50:11,0,SCHEDULED,8109,10,730,2019-06-16 11:50:15,0,16218350
1,2019-06-16 11:50:11,54,0,54,2019-06-16 11:50:31,0,SCHEDULED,8122,10,794,2019-06-16 11:50:15,0,16218415
2,2019-06-16 11:50:36,-4,0,-4,2019-06-16 11:50:56,0,SCHEDULED,8108,4,795,2019-06-16 11:50:15,0,16218447


## Elasticsearch

In [14]:
def _to_epoch_seconds(moment):
    """Return ``moment`` as whole epoch seconds, reading it as local time."""
    return int(time.mktime(moment.timetuple()))


# Replace the three datetime columns with integer epoch seconds.
for _column in ('Arrival Time', 'Departure Time', 'Request Timestamp'):
    records[_column] = records[_column].apply(_to_epoch_seconds)

In [17]:
# Build the document id from arrival time, stop sequence and trip id.
# The parts are joined with '-' because plain concatenation is ambiguous:
# stop sequence 1 + trip 3780 and stop sequence 13 + trip 780 would both
# end in '13780' and overwrite each other in the index.
# NOTE(review): this changes the id format; documents indexed under the old
# undelimited ids will not be overwritten by a re-index.
records['Record ID'] = (
    records['Arrival Time'].astype(str)
    + '-' + records['Stop Sequence'].astype(str)
    + '-' + records['Trip ID'].astype(str)
)

In [18]:
# Turn the epoch-second columns back into pandas datetimes.
# NOTE(review): unit='s' is read as seconds since the UTC epoch, so the
# resulting values look like UTC wall-clock times - confirm this is what
# the index should store.
for _column in ('Arrival Time', 'Departure Time', 'Request Timestamp'):
    records[_column] = pd.to_datetime(records[_column], unit='s')

In [15]:
# Connection settings are read from the environment so real credentials do
# not have to be committed with the notebook. The fallbacks are the values
# this cell used before, so it behaves the same when nothing is set.
es = Elasticsearch([{
    'host': os.environ.get('ES_HOST', 'localhost'),
    'port': int(os.environ.get('ES_PORT', 9200)),
    'http_auth': (
        os.environ.get('ES_USER', 'elastic'),
        os.environ.get('ES_PASSWORD', 'changeme'),
    ),
}])

In [16]:
# Index every row as its own document, keyed by 'Record ID' so that
# re-indexing the same row overwrites the document instead of duplicating it.
documents = records.to_dict(orient='records')
for record in documents:
    es.index(index='tripupdates', doc_type='lightrail', body=record, id=record['Record ID'])

In [17]:
# Summarise the dtypes and non-null counts of the records frame.
# NOTE(review): the saved output below lists 5 rows and 13 columns with no
# 'Record ID', so it appears to come from an earlier run - re-run after a
# kernel restart to refresh it.
records.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 13 columns):
Arrival Time             5 non-null datetime64[ns]
Arrival Delay            5 non-null int64
Arrival Uncertainty      5 non-null int64
Departure Delay          5 non-null int64
Departure Time           5 non-null datetime64[ns]
Departure Uncertainty    5 non-null int64
Schedule Relationship    5 non-null object
Stop ID                  5 non-null object
Stop Sequence            5 non-null int64
Trip ID                  5 non-null object
Request Timestamp        5 non-null datetime64[ns]
Delay                    5 non-null int64
Feed ID                  5 non-null object
dtypes: datetime64[ns](3), int64(6), object(4)
memory usage: 600.0+ bytes


---

### Scratch Pad

In [7]:
# Fetch a fresh feed, keep its vehicle-position entities and flatten them
# into one row per vehicle.
vehicle_entities = get_vehicles(get_feed())
vehicles = json_normalize(vehicle_entities)

In [8]:
# Display the flattened vehicle-position entities.
vehicles

Unnamed: 0,id,isDeleted,vehicle.congestionLevel,vehicle.currentStatus,vehicle.currentStopSequence,vehicle.occupancyStatus,vehicle.position.bearing,vehicle.position.latitude,vehicle.position.longitude,vehicle.position.odometer,vehicle.position.speed,vehicle.stopId,vehicle.timestamp,vehicle.trip.tripId,vehicle.vehicle.id,vehicle.vehicle.label,vehicle.vehicle.licensePlate
0,15850704,False,RUNNING_SMOOTHLY,IN_TRANSIT_TO,10,EMPTY,0.0,-35.252193,149.133484,17008941.0,11.136111,,1560610095,703,2,LRV2,LRV2
1,15850705,False,RUNNING_SMOOTHLY,STOPPED_AT,4,EMPTY,0.0,-35.265896,149.131241,19017609.0,0.322222,,1560610095,624,6,LRV6,LRV6
2,15850706,False,RUNNING_SMOOTHLY,IN_TRANSIT_TO,10,EMPTY,0.0,-35.206329,149.147903,16735492.0,18.058332,,1560610095,623,8,LRV8,LRV8
3,15850707,False,RUNNING_SMOOTHLY,IN_TRANSIT_TO,5,EMPTY,0.0,-35.205772,149.148087,20612069.0,16.466667,,1560610095,704,12,LRV12,LRV12


In [9]:
# List the flattened column names before they are renamed.
vehicles.columns

Index(['id', 'isDeleted', 'vehicle.congestionLevel', 'vehicle.currentStatus',
       'vehicle.currentStopSequence', 'vehicle.occupancyStatus',
       'vehicle.position.bearing', 'vehicle.position.latitude',
       'vehicle.position.longitude', 'vehicle.position.odometer',
       'vehicle.position.speed', 'vehicle.stopId', 'vehicle.timestamp',
       'vehicle.trip.tripId', 'vehicle.vehicle.id', 'vehicle.vehicle.label',
       'vehicle.vehicle.licensePlate'],
      dtype='object')

In [10]:
# Rename by explicit mapping rather than by position: optional feed fields
# and pandas-version differences can change which columns appear and in
# what order, and a positional assignment would then mislabel the data or
# fail on a length mismatch.
vehicles = vehicles.rename(columns={
    'id': 'Feed Id',
    'isDeleted': 'isDeleted',
    'vehicle.congestionLevel': 'Congestion Level',
    'vehicle.currentStatus': 'Current Status',
    'vehicle.currentStopSequence': 'Current Stop Sequence',
    'vehicle.occupancyStatus': 'Occupancy Status',
    'vehicle.position.bearing': 'Bearing',
    'vehicle.position.latitude': 'Latitude',
    'vehicle.position.longitude': 'Longitude',
    'vehicle.position.odometer': 'Odometer',
    'vehicle.position.speed': 'Speed',
    'vehicle.stopId': 'Stop Id',
    'vehicle.timestamp': 'Timestamp',
    'vehicle.trip.tripId': 'Trip Id',
    'vehicle.vehicle.id': 'Vehicle Id',
    'vehicle.vehicle.label': 'Vehicle Label',
    'vehicle.vehicle.licensePlate': 'Vehicle License Plate',
})

In [11]:
# Drop the 'Stop Id' column (all-empty in the sample output above).
# Rebinding instead of inplace=True, together with errors='ignore', keeps
# the cell re-runnable: a second run no longer raises KeyError.
vehicles = vehicles.drop(columns='Stop Id', errors='ignore')

In [12]:
# Display the frame again after renaming the columns and dropping 'Stop Id'.
vehicles

Unnamed: 0,Feed Id,isDeleted,Congestion Level,Current Status,Current Stop Sequence,Occupancy Status,Bearing,Latitude,Longitude,Odometer,Speed,Timestamp,Trip Id,Vehicle Id,Vehicle Label,Vehicle License Plate
0,15850704,False,RUNNING_SMOOTHLY,IN_TRANSIT_TO,10,EMPTY,0.0,-35.252193,149.133484,17008941.0,11.136111,1560610095,703,2,LRV2,LRV2
1,15850705,False,RUNNING_SMOOTHLY,STOPPED_AT,4,EMPTY,0.0,-35.265896,149.131241,19017609.0,0.322222,1560610095,624,6,LRV6,LRV6
2,15850706,False,RUNNING_SMOOTHLY,IN_TRANSIT_TO,10,EMPTY,0.0,-35.206329,149.147903,16735492.0,18.058332,1560610095,623,8,LRV8,LRV8
3,15850707,False,RUNNING_SMOOTHLY,IN_TRANSIT_TO,5,EMPTY,0.0,-35.205772,149.148087,20612069.0,16.466667,1560610095,704,12,LRV12,LRV12


In [16]:
# Convert epoch seconds to datetimes.
# The values are cast to numbers first because they may arrive as strings
# (presumably how MessageToDict renders 64-bit integers - confirm), and
# pandas has deprecated parsing strings together with `unit`.
# The dtype guard keeps the cell safe to re-run once the column is converted.
if not pd.api.types.is_datetime64_any_dtype(vehicles['Timestamp']):
    vehicles['Timestamp'] = pd.to_datetime(pd.to_numeric(vehicles['Timestamp']), unit='s')

In [55]:
# Index every vehicle row as its own document, keyed by the feed entity id.
vehicle_documents = vehicles.to_dict(orient='records')
for record in vehicle_documents:
    es.index(index='vehicles', doc_type='lightrail', body=record, id=record['Feed Id'])