# Light Rail Realtime to Elasticsearch

In [1]:
from pandas.io.json import json_normalize
from sodapy import Socrata
from elasticsearch import Elasticsearch

import json
import requests
import numpy as np
import pandas as pd
import datetime
from datetime import timedelta 
import os, time
import pytz
import arrow

In [3]:
# get credentials
with open('.credentials') as f:
    credentials = f.readlines()
    credentials = json.loads(''.join(credentials).strip())

# Data Acquisition

1. Socrata
2. GTFS (`trips.csv`, `stops.csv`) 

## Get data from Socrata

In [35]:
def getRealtimeFeed():
    # authenticate Socrata
    client = Socrata('www.data.act.gov.au',
                 '4jqogoRJ9NKj1gr8QAZ9CCKFI',
                 username=credentials['username'],
                 password=credentials['password'])
    # get endpoint
    results = client.get("r9a8-xw6s")
    # Convert to pandas DataFrame
    results_df = pd.DataFrame.from_records(results)
    return results_df

## Get data from GTFS

In [25]:
# get data from trips and stops
trips = pd.read_csv('../GTFS/google_transit_lr/trips.csv')
stops = pd.read_csv('../GTFS/google_transit_lr/stops.csv')

## Converting, Sorting and Dropping Columns

In [282]:
# adds information from trips.csv and stops.csv
def CSDM(dff, trips, stops):
    
    # TODO: check with Selva later for R (reserved buses) but removing them for now
    dff = dff[dff['trip_id'].map(lambda x: 'R' not in x)]
    
    # convert columns to datetime and int
    dff['arrival_delay'] = dff['arrival_delay'].astype(int)
    dff['depature_delay'] = dff['depature_delay'].astype(int)
    dff['stop_sequence'] = dff['stop_sequence'].astype(int)
    dff['trip_id'] = dff['trip_id'].astype(int)
    trips['trip_id'] = trips['trip_id'].astype(int)
    stops['stop_id'] = stops['stop_id'].astype(str)
    
    dff['arrival_time'] = dff['arrival_time'].apply(lambda x: arrow.get(x, tzinfo='Australia/Canberra'))
    dff['arrival_date'] = dff['arrival_time'].apply(lambda x: x.date())
    dff['depature_time'] = dff['depature_time'].apply(lambda x: arrow.get(x, tzinfo='Australia/Canberra'))
    dff['depature_date'] = dff['depature_time'].apply(lambda x: x.date())
    dff['timestamp'] = dff['timestamp'].apply(lambda x: arrow.get(x, tzinfo='Australia/Canberra'))
    
    # sort columns based on arrival time
    dff = dff.sort_values(by='arrival_time',ascending=True)

    # drop columns
    dff = dff[['arrival_delay', 'arrival_time', 'arrival_date', 'depature_delay', 'depature_time', 'depature_date', 'stop_id', 'stop_sequence', 'trip_id', 'timestamp']]
    trips = trips[['route_id', 'service_id', 'trip_id','trip_headsign', 'direction_id']]
    stops = stops[stops.columns[:2]]
    
    # merge columns
    merged = pd.merge(left=stops, left_on='stop_id', right=dff, right_on='stop_id', how='right')
    merged = pd.merge(left=trips, left_on='trip_id', right=merged, right_on='trip_id', how='right')
    print('|     Dates    |\n----------------')
    for i in sorted(merged['arrival_date'].unique()):
        print('|  {}  |'.format(str(i)))
    return merged

# Periodically Collect Data

In [49]:
df = CSDM(getRealtimeFeed(), trips, stops)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.

|     Dates    |
----------------
|  2019-06-26  |
|  2019-06-27  |


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 723 entries, 0 to 722
Data columns (total 14 columns):
route_id          723 non-null object
service_id        723 non-null object
trip_id           723 non-null int64
trip_headsign     723 non-null object
direction_id      723 non-null int64
stop_id           723 non-null object
stop_name         723 non-null object
arrival_delay     723 non-null int64
arrival_time      723 non-null object
arrival_date      723 non-null object
depature_delay    723 non-null int64
depature_time     723 non-null object
depature_date     723 non-null object
stop_sequence     723 non-null int64
dtypes: int64(5), object(9)
memory usage: 84.7+ KB


In [1]:
old = CSDM(getRealtimeFeed(), trips, stops)
time.sleep(15)
now = arrow.now()
new = CSDM(getRealtimeFeed(), trips, stops)
df2 = pd.concat([new, old])
df2.sort_values(by='trip_id', inplace=True)
df3 = df2.drop_duplicates(subset=['trip_id', 'arrival_time'])
df3 = df3[df3['depature_time'].map(lambda x: (now - x).seconds <= 15)]

NameError: name 'CSDM' is not defined

In [305]:
df3.to_dict(orient='records')

[]

In [298]:
df3.to_dict(orient='records')[0]

{'route_id': 'X1',
 'service_id': 'WD',
 'trip_id': 4,
 'trip_headsign': 'Gungahlin Pl',
 'direction_id': 0,
 'stop_id': '8111',
 'stop_name': 'Well Station Drive Platform 2',
 'arrival_delay': 0,
 'arrival_time': <Arrow [2019-06-27T06:03:33+10:00]>,
 'arrival_date': datetime.date(2019, 6, 27),
 'depature_delay': 0,
 'depature_time': <Arrow [2019-06-27T06:03:33+10:00]>,
 'depature_date': datetime.date(2019, 6, 27),
 'stop_sequence': 1,
 'timestamp': <Arrow [2019-06-27T06:02:30+10:00]>}

# Moving to Elastic

In [299]:
es=Elasticsearch([{'host':'localhost','port':9200, 'http_auth':('elastic', 'changeme')}])

In [300]:
df3['arrival_time'] = df3['arrival_time'].apply(lambda x: x.datetime)
df3['depature_time'] = df3['depature_time'].apply(lambda x: x.datetime)
df3['timestamp'] = df3['timestamp'].apply(lambda x: x.datetime)
for record in df3.to_dict(orient='records'):
    es.index(index='trips_realtime', doc_type='lightrail', body=record)