# Light Rail Realtime to Elasticsearch

In [2]:
from pandas.io.json import json_normalize
from sodapy import Socrata
from elasticsearch import Elasticsearch

import json
import requests
import numpy as np
import pandas as pd
import datetime
from datetime import timedelta 
import os, time
import pytz
import arrow

In [4]:
# Load the Socrata login details from the JSON credentials file that lives
# one directory above the notebook.
with open('../.credentials') as credentials_file:
    raw_credentials = credentials_file.read()
# Surrounding whitespace is stripped before parsing, as before.
credentials = json.loads(raw_credentials.strip())

# Data Acquisition

1. Socrata
2. GTFS (`trips.csv`, `stops.csv`) 

## Get data from Socrata

In [5]:
def getRealtimeFeed():
    """Fetch the current contents of Socrata dataset ``r9a8-xw6s`` as a DataFrame.

    Authenticates against ``www.data.act.gov.au`` with the username and
    password held in the module-level ``credentials`` dict.

    Returns:
        pandas.DataFrame: one row per record returned by the endpoint.
    """
    # NOTE(review): the app token is hardcoded here; consider moving it into
    # the credentials file next to the username and password.
    # The client is used as a context manager so its underlying HTTP session
    # is closed after every call; this function is invoked repeatedly when
    # polling, and the original never closed the client.
    with Socrata('www.data.act.gov.au',
                 '4jqogoRJ9NKj1gr8QAZ9CCKFI',
                 username=credentials['username'],
                 password=credentials['password']) as client:
        # NOTE(review): no explicit limit is passed, so the number of rows is
        # whatever the API returns by default - confirm that is sufficient.
        results = client.get("r9a8-xw6s")
    # Convert the list of records to a pandas DataFrame
    return pd.DataFrame.from_records(results)

## Get data from GTFS

In [6]:
# Read the static GTFS tables (trips and stops) that are later joined onto
# the realtime feed.
trips_path = '../GTFS/google_transit_lr/trips.csv'
stops_path = '../GTFS/google_transit_lr/stops.csv'

trips = pd.read_csv(trips_path)
stops = pd.read_csv(stops_path)

## Converting, Sorting, Dropping and Merging Columns

In [59]:
# Convert, Sort, Drop, Merge: cleans the realtime feed and adds information
# from trips.csv and stops.csv
def CSDM(dff, trips, stops):
    """Clean a realtime feed frame and enrich it with trip and stop details.

    Steps: drop rows whose ``trip_id`` contains ``'R'``, convert the delay,
    sequence and id columns to ``int``, parse the time columns with ``arrow``
    (timezone ``Australia/Canberra``), sort by arrival time, keep a fixed
    subset of columns, then right-merge stop and trip details onto the feed.

    None of the three input frames is modified; all work is done on copies.

    Args:
        dff: realtime feed with string columns ``trip_id``, ``stop_id``,
            ``stop_sequence``, ``arrival_delay``, ``depature_delay``,
            ``arrival_time``, ``depature_time`` and ``timestamp``.
        trips: frame with ``route_id``, ``service_id``, ``trip_id``,
            ``trip_headsign`` and ``direction_id``.
        stops: frame containing ``stop_id``; only its first two columns are
            kept, and ``stop_id`` must be one of them for the merge to work.

    Returns:
        pandas.DataFrame: one row per remaining feed row, ordered by
        ``arrival_time``, with the kept trip and stop columns attached.
    """
    # TODO: check with Selva later for R (reserved buses) but removing them for now
    # .copy() makes the filtered rows an independent frame, so the column
    # assignments below neither warn (SettingWithCopyWarning) nor risk writing
    # through to the caller's data.
    realtime = dff[dff['trip_id'].map(lambda trip_id: 'R' not in trip_id)].copy()

    # convert numeric columns to int
    for column in ('arrival_delay', 'depature_delay', 'stop_sequence', 'trip_id'):
        realtime[column] = realtime[column].astype(int)

    # parse time columns into timezone-aware arrow objects
    for column in ('arrival_time', 'depature_time', 'timestamp'):
        realtime[column] = realtime[column].apply(
            lambda value: arrow.get(value, tzinfo='Australia/Canberra'))
    realtime['arrival_date'] = realtime['arrival_time'].apply(lambda moment: moment.date())
    realtime['depature_date'] = realtime['depature_time'].apply(lambda moment: moment.date())

    # sort rows based on arrival time
    realtime = realtime.sort_values(by='arrival_time', ascending=True)

    # keep only the columns needed downstream
    realtime_kept = realtime[['arrival_delay', 'arrival_time', 'arrival_date',
                              'depature_delay', 'depature_time', 'depature_date',
                              'stop_id', 'stop_sequence', 'trip_id', 'timestamp']]
    # assign() returns new frames, so the key dtypes are aligned for the
    # merges without mutating the caller's trips / stops tables.
    trips_kept = trips.assign(trip_id=trips['trip_id'].astype(int))[
        ['route_id', 'service_id', 'trip_id', 'trip_headsign', 'direction_id']]
    stops_kept = stops.assign(stop_id=stops['stop_id'].astype(str))[stops.columns[:2]]

    # right merges keep every feed row even when no stop / trip matches
    merged = pd.merge(left=stops_kept, left_on='stop_id',
                      right=realtime_kept, right_on='stop_id', how='right')
    merged = pd.merge(left=trips_kept, left_on='trip_id',
                      right=merged, right_on='trip_id', how='right')
    return merged

# Periodically Collect Data

In [41]:
# Take a single snapshot of the realtime feed, then clean it and join the
# GTFS trip / stop details onto it.
realtime_snapshot = getRealtimeFeed()
df = CSDM(realtime_snapshot, trips, stops)

|     Dates    |
----------------
|  2019-06-29  |
|  2019-06-30  |


In [None]:
# Poll the realtime feed a fixed number of times and print the rows whose
# feed timestamp falls shortly after the departure time.
N_POLLS = 20                # number of snapshots to take
POLL_INTERVAL_SECONDS = 5   # pause between snapshots
MAX_LAG_SECONDS = 30        # keep rows with 0 days and <= this many seconds of lag

# The original range(20, 1) is an empty range, so the loop body never ran.
# Counting 1..N_POLLS also makes the separate counter variable unnecessary.
for counter in range(1, N_POLLS + 1):
    print('Count {}'.format(counter))
    df = CSDM(getRealtimeFeed(), trips, stops)
    # Append the lag column at the end rather than at a hard-coded position,
    # which would raise if the number of columns ever changed.
    df['timedelta'] = df['timestamp'] - df['depature_time']
    is_recent = df['timedelta'].map(
        lambda lag: lag.days == 0 and lag.seconds <= MAX_LAG_SECONDS)
    df = df[is_recent].sort_values(by=['timedelta'])
    print(df[['trip_id', 'stop_sequence', 'timedelta']])
    time.sleep(POLL_INTERVAL_SECONDS)

Count 1
Empty DataFrame
Columns: [trip_id, stop_sequence, timedelta]
Index: []
Count 2
Empty DataFrame
Columns: [trip_id, stop_sequence, timedelta]
Index: []
Count 3
Empty DataFrame
Columns: [trip_id, stop_sequence, timedelta]
Index: []
Count 4
Empty DataFrame
Columns: [trip_id, stop_sequence, timedelta]
Index: []
Count 5
Empty DataFrame
Columns: [trip_id, stop_sequence, timedelta]
Index: []
Count 6
Empty DataFrame
Columns: [trip_id, stop_sequence, timedelta]
Index: []
Count 7
Empty DataFrame
Columns: [trip_id, stop_sequence, timedelta]
Index: []
Count 8
Empty DataFrame
Columns: [trip_id, stop_sequence, timedelta]
Index: []
Count 9
Empty DataFrame
Columns: [trip_id, stop_sequence, timedelta]
Index: []
Count 10
Empty DataFrame
Columns: [trip_id, stop_sequence, timedelta]
Index: []
Count 11
Empty DataFrame
Columns: [trip_id, stop_sequence, timedelta]
Index: []
Count 12
Empty DataFrame
Columns: [trip_id, stop_sequence, timedelta]
Index: []
Count 13
Empty DataFrame
Columns: [trip_id, stop_

Unnamed: 0,trip_id,arrival_time,stop_sequence,timestamp,timedelta


# Moving to Elastic

In [299]:
# Client for the Elasticsearch node on localhost:9200.
# NOTE(review): the username and password are hardcoded; consider reading
# them from the credentials file instead.
es_node = {
    'host': 'localhost',
    'port': 9200,
    'http_auth': ('elastic', 'changeme'),
}
es = Elasticsearch([es_node])

In [300]:
# NOTE(review): df3 is not defined in any cell visible here - confirm where
# it comes from before relying on Restart & Run All.
# Replace each value in the time columns with its .datetime attribute before
# indexing. The columns are overwritten in place, so this cell is presumably
# not safe to run twice on the same df3 - TODO confirm.
for time_column in ('arrival_time', 'depature_time', 'timestamp'):
    df3[time_column] = df3[time_column].apply(lambda moment: moment.datetime)

# Index one document per row into the 'trips_realtime' index.
for record in df3.to_dict(orient='records'):
    es.index(index='trips_realtime', doc_type='lightrail', body=record)