In [1]:
from google.transit import gtfs_realtime_pb2
from google.protobuf.json_format import MessageToDict
from pandas.io.json import json_normalize
# from apscheduler.schedulers.background import BackgroundScheduler

from sqlalchemy import inspect
from sqlalchemy import MetaData
from sqlalchemy import Table
import sqlalchemy as db

import json
import requests
import numpy as np
import pandas as pd
import datetime
from datetime import timedelta 
import os, time
import pytz

In [2]:
def get_feed(url='http://files.transport.act.gov.au/feeds/lightrail.pb', timeout=30):
    """Download a GTFS-realtime feed and return it as a plain dict.

    Args:
        url: Address of the protobuf feed; defaults to the light rail feed
            this notebook was written against.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        dict: The parsed ``FeedMessage`` converted with ``MessageToDict``.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an error status.
    """
    # get the response from the api
    response = requests.get(url, allow_redirects=True, timeout=timeout)

    # fail here on an HTTP error rather than handing an error page to the parser
    response.raise_for_status()

    # initialise the feed message parser from Google and pass it the payload
    feed = gtfs_realtime_pb2.FeedMessage()
    feed.ParseFromString(response.content)

    # convert to dict from our original protobuf feed
    return MessageToDict(feed)

In [3]:
def get_updates(feed_obj):
    """Return the feed entities that carry a trip update.

    Args:
        feed_obj: Feed dict as produced by ``get_feed`` (or ``None``).

    Returns:
        list | None: Entities containing a ``'tripUpdate'`` key, in feed
        order; ``None`` when the feed is empty or has no ``'entity'`` list.
    """
    # covers None as well as an empty dict
    if not feed_obj:
        return None

    # a feed can be non-empty (e.g. header only) and still have no entities
    entities = feed_obj.get('entity')
    if entities is None:
        return None

    # get the trip updates listed on the feed
    return [entity for entity in entities if 'tripUpdate' in entity]

In [4]:
def updates_to_dataframe(updates):
    """Flatten trip updates into one row per trip, using each trip's first stop.

    Args:
        updates: List of feed entity dicts that carry a ``'tripUpdate'`` key
            (as returned by ``get_updates``), or ``None``.

    Returns:
        pandas.DataFrame: The flattened first stop-time update of every trip
        plus the ``tripUpdate.trip.tripId``, ``tripUpdate.timestamp``,
        ``tripUpdate.delay`` and ``id`` columns. ``arrival.time``,
        ``departure.time`` and ``tripUpdate.timestamp`` are converted from
        epoch seconds to datetimes. An empty frame is returned when there is
        nothing to convert.
    """
    updates = list(updates) if updates else []
    print("length of updates: {}".format(len(updates))) # debug: print number of updates in the feed

    # Only the first stop-time update is used, so a trip without any cannot
    # be turned into a row and is left out.
    with_stops = [
        update for update in updates
        if (update.get('tripUpdate') or {}).get('stopTimeUpdate')
    ]
    if not with_stops:
        return pd.DataFrame()

    # pandas.io.json.json_normalize is deprecated in favour of the top-level
    # function; fall back to the imported name only on a pandas without it.
    normalize = getattr(pd, 'json_normalize', None)
    if normalize is None:
        normalize = json_normalize

    # transform feed to a dataframe
    trips = normalize(with_stops)
    first_stops = [stops[0] for stops in trips['tripUpdate.stopTimeUpdate']]

    # format feed
    x = normalize(first_stops)
    for column in ('tripUpdate.trip.tripId', 'tripUpdate.timestamp', 'tripUpdate.delay', 'id'):
        x[column] = trips[column]

    # format date time
    # NOTE: fromtimestamp() interprets the epoch in the machine's local timezone.
    for column in ('arrival.time', 'departure.time', 'tripUpdate.timestamp'):
        x[column] = pd.to_datetime(
            x[column].apply(lambda epoch: datetime.datetime.fromtimestamp(int(epoch)))
        )

    return x

In [5]:
def validate(updates):
    """Tell whether there are updates to process.

    Prints ``'Feed is empty'`` and returns ``False`` when ``updates`` is
    ``None``; any other value is accepted and yields ``True``.
    """
    if updates is not None:
        return True

    print('Feed is empty')
    return False

In [6]:
# Fetch the live feed and keep its trip updates. Stop with a clear message
# when the feed has none, instead of failing inside the transform.
trip_updates = get_updates(get_feed())
if not validate(trip_updates):
    raise RuntimeError('No trip updates available in the light rail feed')

df = updates_to_dataframe(trip_updates)
df

length of updates: 250


Unnamed: 0,arrival.delay,arrival.time,arrival.uncertainty,departure.delay,departure.time,departure.uncertainty,scheduleRelationship,stopId,stopSequence,tripUpdate.trip.tripId,tripUpdate.timestamp,tripUpdate.delay,id
0,30,2019-06-14 06:24:30,0,30,2019-06-14 06:24:30,0,SCHEDULED,8129,13,407,2019-06-14 07:31:45,0,13783017
1,24,2019-06-14 06:54:24,0,24,2019-06-14 06:54:24,0,SCHEDULED,8100,13,273,2019-06-14 07:31:45,0,13783018
2,46,2019-06-14 07:24:46,0,46,2019-06-14 07:24:46,0,SCHEDULED,8129,13,413,2019-06-14 07:31:45,0,13783019
3,14,2019-06-14 07:32:08,0,14,2019-06-14 07:32:28,0,SCHEDULED,8127,2,288,2019-06-14 07:31:45,0,13783020
4,0,2019-06-14 08:00:00,1,0,2019-06-14 08:00:00,1,SCHEDULED,8100,1,423,2019-06-14 07:31:45,0,13783021
5,0,2019-06-14 08:30:00,1,0,2019-06-14 08:30:00,1,SCHEDULED,8129,1,298,2019-06-14 07:31:45,0,13783022
6,0,2019-06-14 09:00:00,1,0,2019-06-14 09:00:00,1,SCHEDULED,8100,1,433,2019-06-14 07:31:45,0,13783023
7,0,2019-06-14 09:30:00,1,0,2019-06-14 09:30:00,1,SCHEDULED,8129,1,310,2019-06-14 07:31:45,0,13783024
8,0,2019-06-14 10:00:00,1,0,2019-06-14 10:00:00,1,SCHEDULED,8100,1,443,2019-06-14 07:31:45,0,13783025
9,0,2019-06-14 10:30:00,1,0,2019-06-14 10:30:00,1,SCHEDULED,8129,1,316,2019-06-14 07:31:45,0,13783026


In [7]:
# Rename by source column name rather than by position, so the labels stay
# correct even if the flattened feed's column order changes.
column_labels = {
    'arrival.delay': 'Arrival Delay',
    'arrival.time': 'Arrival Time',
    'arrival.uncertainty': 'Arrival Uncertainty',
    'departure.delay': 'Departure Delay',
    'departure.time': 'Departure Time',
    'departure.uncertainty': 'Departure Uncertainty',
    'scheduleRelationship': 'Schedule Relationship',
    'stopId': 'Stop ID',
    'stopSequence': 'Stop Sequence',
    'tripUpdate.trip.tripId': 'Trip ID',
    'tripUpdate.timestamp': 'Request Timestamp',
    'tripUpdate.delay': 'Delay',
    'id': 'Feed ID',
}
df = df.rename(columns=column_labels)

In [8]:
# Number of distinct trips in the feed (missing values counted as a value).
df['Trip ID'].nunique(dropna=False)

250

In [9]:
# Index by arrival time; the later between_time() filter works on the index.
df = df.set_index('Arrival Time')

In [10]:
# Build a window of one minute either side of a single reading of the clock,
# so both bounds are anchored to the same instant.
now = datetime.datetime.now()
start = now - datetime.timedelta(minutes=1)
end = now + datetime.timedelta(minutes=1)

In [11]:
# Reduce both bounds to 'HH:MM' strings (seconds are dropped).
start = start.strftime('%H:%M')
end = end.strftime('%H:%M')

In [12]:
# Preview the first rows now that 'Arrival Time' is the index.
df.head()

Unnamed: 0_level_0,Arrival Delay,Arrival Uncertainty,Departure Delay,Departure Time,Departure Uncertainty,Schedule Relationship,Stop ID,Stop Sequence,Trip ID,Request Timestamp,Delay,Feed ID
Arrival Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-06-14 06:24:30,30,0,30,2019-06-14 06:24:30,0,SCHEDULED,8129,13,407,2019-06-14 07:31:45,0,13783017
2019-06-14 06:54:24,24,0,24,2019-06-14 06:54:24,0,SCHEDULED,8100,13,273,2019-06-14 07:31:45,0,13783018
2019-06-14 07:24:46,46,0,46,2019-06-14 07:24:46,0,SCHEDULED,8129,13,413,2019-06-14 07:31:45,0,13783019
2019-06-14 07:32:08,14,0,14,2019-06-14 07:32:28,0,SCHEDULED,8127,2,288,2019-06-14 07:31:45,0,13783020
2019-06-14 08:00:00,0,1,0,2019-06-14 08:00:00,1,SCHEDULED,8100,1,423,2019-06-14 07:31:45,0,13783021


In [13]:
# Show the window bounds, then keep only the rows whose arrival time of day
# falls between them.
# NOTE(review): this overwrites `df`, so re-running the cell filters the
# already-filtered frame rather than the full feed.
print( start, end )
df = df.between_time(start, end)
df

07:31 07:33


Unnamed: 0_level_0,Arrival Delay,Arrival Uncertainty,Departure Delay,Departure Time,Departure Uncertainty,Schedule Relationship,Stop ID,Stop Sequence,Trip ID,Request Timestamp,Delay,Feed ID
Arrival Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2019-06-14 07:32:08,14,0,14,2019-06-14 07:32:28,0,SCHEDULED,8127,2,288,2019-06-14 07:31:45,0,13783020
2019-06-14 07:31:40,-24,0,-24,2019-06-14 07:32:00,0,SCHEDULED,8104,2,418,2019-06-14 07:31:45,0,13783085
2019-06-14 07:32:25,-35,0,-35,2019-06-14 07:32:45,0,SCHEDULED,8115,8,287,2019-06-14 07:31:45,0,13783125
2019-06-14 07:31:50,33,0,33,2019-06-14 07:32:10,0,SCHEDULED,8122,10,415,2019-06-14 07:31:45,0,13783169
2019-06-14 07:31:06,-25,0,-25,2019-06-14 07:31:26,0,SCHEDULED,8110,5,417,2019-06-14 07:31:45,0,13783175
2019-06-14 07:32:33,0,1,0,2019-06-14 07:32:33,1,SCHEDULED,8111,1,286,2019-06-14 07:31:45,0,13783208


In [14]:
# Turn 'Arrival Time' back into a regular column. Reassigning instead of
# mutating in place keeps the step explicit.
df = df.reset_index()

In [15]:
# Cast 'Feed ID' to int.
# NOTE(review): presumably it arrives as a string from the protobuf-to-dict
# conversion — confirm.
df['Feed ID']=df['Feed ID'].astype(int)

In [16]:
# Check column dtypes and non-null counts after the cast.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 13 columns):
Arrival Time             6 non-null datetime64[ns]
Arrival Delay            6 non-null int64
Arrival Uncertainty      6 non-null int64
Departure Delay          6 non-null int64
Departure Time           6 non-null datetime64[ns]
Departure Uncertainty    6 non-null int64
Schedule Relationship    6 non-null object
Stop ID                  6 non-null object
Stop Sequence            6 non-null int64
Trip ID                  6 non-null object
Request Timestamp        6 non-null datetime64[ns]
Delay                    6 non-null int64
Feed ID                  6 non-null int64
dtypes: datetime64[ns](3), int64(7), object(3)
memory usage: 704.0+ bytes


In [17]:
# Preview the filtered rows with 'Arrival Time' restored as a column.
df.head()

Unnamed: 0,Arrival Time,Arrival Delay,Arrival Uncertainty,Departure Delay,Departure Time,Departure Uncertainty,Schedule Relationship,Stop ID,Stop Sequence,Trip ID,Request Timestamp,Delay,Feed ID
0,2019-06-14 07:32:08,14,0,14,2019-06-14 07:32:28,0,SCHEDULED,8127,2,288,2019-06-14 07:31:45,0,13783020
1,2019-06-14 07:31:40,-24,0,-24,2019-06-14 07:32:00,0,SCHEDULED,8104,2,418,2019-06-14 07:31:45,0,13783085
2,2019-06-14 07:32:25,-35,0,-35,2019-06-14 07:32:45,0,SCHEDULED,8115,8,287,2019-06-14 07:31:45,0,13783125
3,2019-06-14 07:31:50,33,0,33,2019-06-14 07:32:10,0,SCHEDULED,8122,10,415,2019-06-14 07:31:45,0,13783169
4,2019-06-14 07:31:06,-25,0,-25,2019-06-14 07:31:26,0,SCHEDULED,8110,5,417,2019-06-14 07:31:45,0,13783175


## Read Postgres and check whether these Feed IDs already exist

In [39]:
# The connection URL can be overridden through the GTFSR_DATABASE_URL
# environment variable; the fallback is the local database this notebook was
# written against.
database_url = os.environ.get(
    'GTFSR_DATABASE_URL',
    'postgresql://postgres@localhost:5432/noelangelo',
)
engine = db.create_engine(database_url)

# Small sample of the rows already stored.
rt_feed = pd.read_sql('SELECT * FROM public.gtfsr_feed LIMIT 10 ', engine)

# Connection used for the insert further down.
conn = engine.connect()

In [40]:
# Create MetaData instance and load the table definitions from the database.
# Calling reflect() explicitly replaces the deprecated
# MetaData(engine, reflect=True) constructor form.
metadata = MetaData()
metadata.reflect(bind=engine)

  


In [41]:
# Look up the reflected `gtfsr_feed` table; it is the target of the insert
# further down.
gtfsr_table = metadata.tables['gtfsr_feed']

In [42]:
# Preview the feed rows that will be compared against the database.
# NOTE(review): execution counts in this section are out of order relative
# to the cells above — re-run the notebook top to bottom before trusting the
# saved output.
df.head()

Unnamed: 0,Arrival Time,Arrival Delay,Arrival Uncertainty,Departure Delay,Departure Time,Departure Uncertainty,Schedule Relationship,Stop ID,Stop Sequence,Trip ID,Request Timestamp,Delay,Feed ID
0,2019-06-06 22:33:07,-17,0,-17,2019-06-06 22:33:27,0,SCHEDULED,8125,3,127,2019-06-06 22:33:30,0,5488973
1,2019-06-06 22:33:45,-19,0,-19,2019-06-06 22:34:05,0,SCHEDULED,8106,3,257,2019-06-06 22:33:30,0,5489036


In [43]:
# Load stored rows whose Feed IDs the new rows are compared against.
# NOTE(review): LIMIT 100 without ORDER BY returns an arbitrary subset, so
# any stored Feed ID outside it will look new — confirm this is intended.
postgres = pd.read_sql('SELECT * FROM public.gtfsr_feed LIMIT 100', engine)

In [44]:
# Display the rows read from Postgres.
# NOTE(review): the saved output shows the same Feed IDs stored more than once.
postgres

Unnamed: 0,Arrival Delay,Arrival Time,Arrival Uncertainty,Departure Delay,Departure Time,Departure Uncertainty,Schedule Relationship,Stop ID,Stop Sequence,Trip ID,Request Timestamp,Delay,Feed ID
0,-24,2019-06-06 22:31:30,0,-24,2019-06-06 22:31:50,0,SCHEDULED,8127,2,127,2019-06-06 22:31:45,0,5487251
1,-22,2019-06-06 22:31:42,0,-22,2019-06-06 22:32:02,0,SCHEDULED,8104,2,257,2019-06-06 22:31:45,0,5487314
2,23,2019-06-06 22:32:40,0,23,2019-06-06 22:33:00,0,SCHEDULED,8120,9,256,2019-06-06 22:31:45,0,5487356
3,-10,2019-06-06 22:31:23,0,-10,2019-06-06 22:31:43,0,SCHEDULED,8111,9,126,2019-06-06 22:31:45,0,5487446
4,-24,2019-06-06 22:31:30,0,-24,2019-06-06 22:31:50,0,SCHEDULED,8127,2,127,2019-06-06 22:31:45,0,5487251
5,-22,2019-06-06 22:31:42,0,-22,2019-06-06 22:32:02,0,SCHEDULED,8104,2,257,2019-06-06 22:31:45,0,5487314
6,23,2019-06-06 22:32:40,0,23,2019-06-06 22:33:00,0,SCHEDULED,8120,9,256,2019-06-06 22:31:45,0,5487356
7,-10,2019-06-06 22:31:23,0,-10,2019-06-06 22:31:43,0,SCHEDULED,8111,9,126,2019-06-06 22:31:45,0,5487446
8,-24,2019-06-06 22:31:30,0,-24,2019-06-06 22:31:50,0,SCHEDULED,8127,2,127,2019-06-06 22:31:45,0,5487251
9,-22,2019-06-06 22:31:42,0,-22,2019-06-06 22:32:02,0,SCHEDULED,8104,2,257,2019-06-06 22:31:45,0,5487314


In [45]:
# Display the feed rows that are candidates for insertion.
df

Unnamed: 0,Arrival Time,Arrival Delay,Arrival Uncertainty,Departure Delay,Departure Time,Departure Uncertainty,Schedule Relationship,Stop ID,Stop Sequence,Trip ID,Request Timestamp,Delay,Feed ID
0,2019-06-06 22:33:07,-17,0,-17,2019-06-06 22:33:27,0,SCHEDULED,8125,3,127,2019-06-06 22:33:30,0,5488973
1,2019-06-06 22:33:45,-19,0,-19,2019-06-06 22:34:05,0,SCHEDULED,8106,3,257,2019-06-06 22:33:30,0,5489036


In [47]:
# Insert only the rows whose Feed ID is not already known. Records are
# collected first and written in a single call; inserting the whole frame
# once per unseen ID would store every row several times.
# NOTE(review): `postgres` holds only the rows loaded earlier, so the check
# is limited to that sample.
known_feed_ids = set(postgres['Feed ID'])
new_records = []
for record in df.to_dict(orient='records'):
    if record['Feed ID'] in known_feed_ids:
        print('Feed already exists')
    else:
        new_records.append(record)
        # guard against the same Feed ID appearing twice in this batch
        known_feed_ids.add(record['Feed ID'])

if new_records:
    conn.execute(gtfsr_table.insert(), new_records)

In [48]:
# Feed IDs present in the current frame.
list(df['Feed ID'])

[5488973, 5489036]

In [57]:
# Collect [arrival time, stop sequence, trip id] for every row of the frame.
my_list = [
    [row['Arrival Time'], row['Stop Sequence'], row['Trip ID']]
    for _, row in df.iterrows()
]

In [55]:
# Inspect the collected rows.
# NOTE(review): the saved output has two values per row while the cell that
# builds `my_list` appends three — the output looks stale; re-run to refresh.
my_list

[[Timestamp('2019-06-06 22:33:07'), 3], [Timestamp('2019-06-06 22:33:45'), 3]]

## Elasticsearch

In [31]:
from elasticsearch import Elasticsearch

In [32]:
# Credentials are read from ES_USER / ES_PASSWORD when set, so real secrets
# need not be written into the notebook. The fallbacks are the values this
# cell previously hard-coded.
es = Elasticsearch([{
    'host': 'localhost',
    'port': 9200,
    'http_auth': (
        os.environ.get('ES_USER', 'elastic'),
        os.environ.get('ES_PASSWORD', 'changeme'),
    ),
}])

In [39]:
# Index every row under its Feed ID and echo the ID as progress output.
# The enumerate() counter was never used, so the loop iterates the records
# directly.
for record in df.to_dict(orient='records'):
    es.index(index='transportcanberra',doc_type='lightrail',body=record, id=record['Feed ID'])
    print(record['Feed ID'])

13783020
13783085
13783125
13783169
13783175
13783208


In [41]:
# Fetch one indexed document back by its Feed ID.
res = es.get(index='transportcanberra',doc_type='lightrail',id=13783020)

In [42]:
# Tabulate the raw response: one row per `_source` field, with the scalar
# metadata fields repeated on every row.
pd.DataFrame.from_dict(res)

Unnamed: 0,_index,_type,_id,_version,_seq_no,_primary_term,found,_source
Arrival Delay,transportcanberra,lightrail,13783020,4,18,1,True,14
Arrival Time,transportcanberra,lightrail,13783020,4,18,1,True,2019-06-14T07:32:08
Arrival Uncertainty,transportcanberra,lightrail,13783020,4,18,1,True,0
Delay,transportcanberra,lightrail,13783020,4,18,1,True,0
Departure Delay,transportcanberra,lightrail,13783020,4,18,1,True,14
Departure Time,transportcanberra,lightrail,13783020,4,18,1,True,2019-06-14T07:32:28
Departure Uncertainty,transportcanberra,lightrail,13783020,4,18,1,True,0
Feed ID,transportcanberra,lightrail,13783020,4,18,1,True,13783020
Request Timestamp,transportcanberra,lightrail,13783020,4,18,1,True,2019-06-14T07:31:45
Schedule Relationship,transportcanberra,lightrail,13783020,4,18,1,True,SCHEDULED


In [47]:
# Search body: match every document, asking for up to 10000 hits.
match_everything = {'match_all': {}}
doc = {'size': 10000, 'query': match_everything}

# Run the search against the index with a one-minute scroll window.
res = es.search(index='transportcanberra', body=doc, scroll='1m')

In [53]:
# One row per search hit; `_source` holds the indexed record as a dict.
pd.DataFrame.from_dict(res['hits']['hits'])

Unnamed: 0,_id,_index,_score,_source,_type
0,13783020,transportcanberra,1.0,"{'Arrival Time': '2019-06-14T07:32:08', 'Arriv...",lightrail
1,13783085,transportcanberra,1.0,"{'Arrival Time': '2019-06-14T07:31:40', 'Arriv...",lightrail
2,13783125,transportcanberra,1.0,"{'Arrival Time': '2019-06-14T07:32:25', 'Arriv...",lightrail
3,13783169,transportcanberra,1.0,"{'Arrival Time': '2019-06-14T07:31:50', 'Arriv...",lightrail
4,13783175,transportcanberra,1.0,"{'Arrival Time': '2019-06-14T07:31:06', 'Arriv...",lightrail
5,13783208,transportcanberra,1.0,"{'Arrival Time': '2019-06-14T07:32:33', 'Arriv...",lightrail
