In [1]:
from pandas.io.json import json_normalize
from sodapy import Socrata
from elasticsearch import Elasticsearch

import json
import requests
import numpy as np
import pandas as pd
import datetime
from datetime import timedelta 
import os, time
import pytz
import arrow

In [2]:
# Load the Socrata login details from the JSON file one directory up
with open('../.credentials') as credentials_file:
    credentials = json.loads(credentials_file.read().strip())

In [3]:
# Socrata client for the www.data.act.gov.au portal.
# Prefer an app token supplied in the credentials file; the literal is kept only as a
# backward-compatible fallback. TODO(review): move it out of the notebook entirely.
client = Socrata('www.data.act.gov.au',
                 credentials.get('app_token', '4jqogoRJ9NKj1gr8QAZ9CCKFI'),
                 username=credentials['username'],
                 password=credentials['password'])

In [4]:
# Fetch up to 50,000 rows of dataset "jxpp-4iiz" in one request
results = client.get("jxpp-4iiz", limit=50000)

In [5]:
# Convert the records returned by the API into a pandas DataFrame
results_df = pd.DataFrame.from_records(results)

## Converting, Sorting and Dropping Columns

In [6]:
# Parse the three time columns into pandas datetimes
for time_col in ('arrival_time', 'depature_time', 'timestamp'):
    results_df[time_col] = pd.to_datetime(results_df[time_col])

In [7]:
# Order chronologically by arrival (ascending is the default)
results_df = results_df.sort_values('arrival_time')

In [8]:
# Keep only the columns used downstream (this drops `timestamp`, among others)
results_df = results_df[[
    'arrival_delay',
    'arrival_time',
    'arrivaluncertainty',
    'depature_delay',
    'depature_time',
    'depatureuncertainty',
    'stop_id',
    'stop_sequence',
    'trip_id',
]]

In [9]:
# Cast the delay and sequence columns to integers in a single pass
results_df = results_df.astype({
    'arrival_delay': int,
    'depature_delay': int,
    'stop_sequence': int,
})

In [10]:
# Check dtypes and null counts after the conversions above
results_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29535 entries, 9255 to 21194
Data columns (total 9 columns):
arrival_delay          29535 non-null int64
arrival_time           29535 non-null datetime64[ns]
arrivaluncertainty     29535 non-null object
depature_delay         29535 non-null int64
depature_time          29535 non-null datetime64[ns]
depatureuncertainty    29535 non-null object
stop_id                29535 non-null object
stop_sequence          29535 non-null int64
trip_id                29535 non-null object
dtypes: datetime64[ns](2), int64(3), object(4)
memory usage: 2.3+ MB


In [11]:
# Peek at the earliest arrivals (the frame is sorted by arrival_time)
results_df.head()

Unnamed: 0,arrival_delay,arrival_time,arrivaluncertainty,depature_delay,depature_time,depatureuncertainty,stop_id,stop_sequence,trip_id
9255,127,2019-06-19 06:13:07,0,127,2019-06-19 06:13:07,0,8100,5,4
9317,149,2019-06-19 06:14:58,0,149,2019-06-19 06:14:58,0,8129,8,134
9215,-4,2019-06-19 06:23:56,0,-4,2019-06-19 06:23:56,0,8129,13,135
9281,194,2019-06-19 06:27:14,0,194,2019-06-19 06:27:14,0,8100,13,5
9256,6,2019-06-19 06:39:06,0,6,2019-06-19 06:39:06,0,8129,13,136


In [12]:
# Calendar date of each arrival, taken from the datetime accessor
results_df['arrival_date'] = results_df['arrival_time'].dt.date

In [13]:
# Distinct service dates present in the download, as ISO strings
list(map(str, results_df['arrival_date'].unique()))

['2019-06-19',
 '2019-06-20',
 '2019-06-21',
 '2019-06-22',
 '2019-06-23',
 '2019-06-24',
 '2019-06-25',
 '2019-06-26',
 '2019-06-27',
 '2019-06-28',
 '2019-06-29',
 '2019-06-30']

## Combine Tables

In [14]:
# Load the GTFS trips table and inspect its columns and dtypes
trips = pd.read_csv('../GTFS/google_transit_lr/trips.csv')
trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 847 entries, 0 to 846
Data columns (total 9 columns):
route_id                 847 non-null object
service_id               847 non-null object
trip_id                  847 non-null int64
trip_headsign            847 non-null object
direction_id             847 non-null int64
block_id                 847 non-null object
shape_id                 847 non-null int64
wheelchair_accessible    847 non-null int64
bikes_allowed            847 non-null int64
dtypes: int64(5), object(4)
memory usage: 59.6+ KB


In [15]:
# Keep only the trip columns of interest
trips = trips[['route_id', 'service_id', 'trip_id','trip_headsign', 'direction_id']]

In [16]:
# trip_id is read as int64 from the CSV but is a string in results_df; align the types
trips = trips.assign(trip_id=trips['trip_id'].astype(str))

In [17]:
# Index by arrival_time so getDate can select a day's rows with a date string via .loc
df = results_df.set_index('arrival_time')

## Function

In [23]:
def getDate(date, df, grouped=False, trips=None):
    """Return one day's stop records joined with their GTFS trip details.

    Parameters
    ----------
    date : str
        Date string (e.g. ``'2019-06-20'``) used to select rows from ``df`` with ``.loc``.
    df : pandas.DataFrame
        Stop records indexed by ``arrival_time``. Must contain a ``trip_id`` column
        (strings) and a ``depature_time`` column. It is not modified.
    grouped : bool, default False
        When True, split the joined frame by ``route_id`` and return a list of frames.
    trips : pandas.DataFrame, optional
        Trips table to join against. When omitted, it is read from
        ``'../GTFS/google_transit_lr/trips.csv'``. It is not modified.

    Returns
    -------
    pandas.DataFrame or list of pandas.DataFrame
        With ``grouped=False``: the day's rows plus ``route_id``, ``service_id``,
        ``trip_headsign`` and ``direction_id``, with ``arrival_time`` and
        ``depature_time`` made timezone-aware in 'Australia/Canberra'.
        With ``grouped=True``: ``[ACTO001, X1]`` or ``[ACTO001, X1, X2]``, where X2 is
        included only when that route has rows on the day.

    Raises
    ------
    KeyError
        If ``date`` matches no rows, or if ``grouped=True`` and route 'ACTO001' or
        'X1' has no rows on the day.
    """
    # reset_index() returns a new frame, so neither the caller's data nor a .loc
    # slice is ever mutated in place.
    df_date = df.loc[date].reset_index()

    if trips is None:
        trips = pd.read_csv('../GTFS/google_transit_lr/trips.csv')

    # Select the wanted trip columns by name rather than trimming the merged frame by
    # position, so extra or reordered columns in trips.csv cannot change the result.
    trip_info = trips[['trip_id', 'route_id', 'service_id', 'trip_headsign', 'direction_id']].copy()
    # trip_id is a string in the realtime feed; align the types before joining.
    trip_info['trip_id'] = trip_info['trip_id'].astype(str)

    merged = pd.merge(left=df_date, right=trip_info, on='trip_id', how='left')

    # Attach the Canberra timezone to the timestamps.
    for time_col in ('arrival_time', 'depature_time'):
        merged[time_col] = merged[time_col].apply(
            lambda x: arrow.get(x, 'Australia/Canberra').datetime)

    if not grouped:
        return merged

    by_route = merged.groupby('route_id')
    frames = [by_route.get_group('ACTO001'), by_route.get_group('X1')]
    # X2 does not run every day; test for it explicitly instead of swallowing every
    # exception with a bare except.
    if 'X2' in by_route.groups:
        frames.append(by_route.get_group('X2'))
    return frames

In [24]:
# One merged DataFrame per service day, 20-28 June 2019
(june_20, june_21, june_22,
 june_23, june_24, june_25,
 june_26, june_27, june_28) = [getDate('2019-06-{}'.format(day), df)
                               for day in range(20, 29)]

## Test

In [29]:
# Summary statistics of the two delay columns for 25 June
june_25[['arrival_delay', 'depature_delay']].describe()

Unnamed: 0,arrival_delay,depature_delay
count,2989.0,2989.0
mean,10.54734,10.54734
std,48.135325,48.135325
min,-353.0,-353.0
25%,-12.0,-12.0
50%,1.0,1.0
75%,31.0,31.0
max,246.0,246.0


In [73]:
# Look up the row with the largest arrival delay (246 is the max reported by describe())
june_25[june_25['arrival_delay'] == 246]

Unnamed: 0,arrival_time,arrival_delay,arrivaluncertainty,depature_delay,depature_time,depatureuncertainty,stop_id,stop_sequence,trip_id,arrival_date,route_id,service_id,trip_headsign,direction_id
501,2019-06-25 08:28:06+10:00,246,0,246,2019-06-25 08:28:06+10:00,0,8129,13,151,2019-06-25,ACTO001,WD,Alinga St,1.0


In [47]:
# Exclude arrivals with zero delay; .copy() keeps later edits from touching june_25
no_zero = june_25[june_25['arrival_delay'] != 0].copy()

## Plotly

In [77]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.figure_factory as ff
import plotly.graph_objs as go

# Set up plotly's offline notebook rendering before any iplot call
init_notebook_mode(connected=True)

### Plot Distribution

In [57]:
# create_distplot expects a list of series and a matching list of labels
hist_data = [no_zero['arrival_delay']]
group_labels = ['Arrival Delay']

In [74]:
# Create distplot of the non-zero arrival delays with a custom bin_size of 25
fig = ff.create_distplot(hist_data, group_labels, bin_size=25)

In [75]:
# `iplot` is imported directly from plotly.offline; no `pyo` alias is ever defined,
# so the original call raised NameError on a fresh kernel.
iplot(fig)

In [116]:
# Side-by-side box plots of arrival and departure delays, with every point drawn
# beside its box.
data = [
    go.Box(
        y=no_zero['arrival_delay'],
        boxpoints='all',
        jitter=0.3,
        pointpos=-1.8,
        name='Arrival'
    ),

    go.Box(
        y=no_zero['depature_delay'],
        boxpoints='all',
        jitter=0.3,
        pointpos=-1.8,
        name='Departure'
    )
]
fig = go.Figure(data)
# `iplot` is imported directly from plotly.offline; no `pyo` alias is ever defined.
iplot(fig)

In [104]:
# Recompute the calendar date from the (now timezone-aware) arrival_time
june_25['arrival_date'] = june_25['arrival_time'].apply(lambda x: x.date())

In [105]:
# Drop the tzinfo while keeping the wall-clock time
# NOTE(review): presumably so the plots show local time — confirm
june_25['arrival_time'] = june_25['arrival_time'].apply(lambda x: x.replace(tzinfo=None))

In [106]:
# Rebuild no_zero so it picks up the arrival_date / arrival_time changes made to june_25
no_zero = june_25[june_25['arrival_delay'] != 0].copy()

In [123]:
# Scatter of arrival delay against arrival time, split into late (> 0) and early (< 0).
# Each subset is filtered once with a vectorised comparison.
late_arrivals = no_zero[no_zero['arrival_delay'] > 0]
early_arrivals = no_zero[no_zero['arrival_delay'] < 0]

trace0 = go.Scatter(
    x = late_arrivals['arrival_time'],
    y = late_arrivals['arrival_delay'],
    name = 'Late Arrivals',
    mode = 'markers',
    marker = dict(
        size = 10,
        line = dict(
            width = 1
        )
    )
)

trace1 = go.Scatter(
    x = early_arrivals['arrival_time'],
    y = early_arrivals['arrival_delay'],
    name = 'Early Arrivals',
    mode = 'markers',
    marker = dict(
        size = 10,
        line = dict(
            width = 1
        )
    )
)

data = [trace0, trace1]

layout = dict(title = 'Arrival Times',
              yaxis = dict(zeroline = False),
              xaxis = dict(zeroline = False)
             )

fig = dict(data=data, layout=layout)
# `iplot` is imported directly from plotly.offline; no `pyo` alias is ever defined.
iplot(fig)

In [124]:
# Same scatter restricted to the tails: arrivals at least 116 late or at least 88 early.
# NOTE(review): the 116 / -88 cut-offs are not derived anywhere in the notebook —
# document where they come from.
very_late = no_zero[no_zero['arrival_delay'] >= 116]
very_early = no_zero[no_zero['arrival_delay'] <= -88]

trace0 = go.Scatter(
    x = very_late['arrival_time'],
    y = very_late['arrival_delay'],
    name = '116 or more seconds late',
    mode = 'markers',
    marker = dict(
        size = 10,
        line = dict(
            width = 1
        )
    )
)

trace1 = go.Scatter(
    x = very_early['arrival_time'],
    y = very_early['arrival_delay'],
    name = '88 or more seconds early',
    mode = 'markers',
    marker = dict(
        size = 10,
        line = dict(
            width = 1
        )
    )
)

data = [trace0, trace1]

layout = dict(title = 'Arrival Times',
              yaxis = dict(zeroline = False),
              xaxis = dict(zeroline = False)
             )

fig = dict(data=data, layout=layout)
# `iplot` is imported directly from plotly.offline; no `pyo` alias is ever defined.
iplot(fig)

In [125]:
# Number of arrivals in each tail: delay >= 116 first, then delay <= -88
arrival_delays = no_zero['arrival_delay']
print(int((arrival_delays >= 116).sum()))
print(int((arrival_delays <= -88).sum()))

73
37


## Elasticsearch

In [30]:
# Connect to an Elasticsearch node on localhost:9200 with basic auth.
# NOTE(review): the login is hardcoded here — move it to the credentials file.
es=Elasticsearch([{'host':'localhost','port':9200, 'http_auth':('elastic', 'changeme')}])

In [57]:
# Index every row of every service day into the 'trips' index.
#
# getDate(..., grouped=False) returns a single DataFrame, and iterating a DataFrame
# yields its column names, so the original `for record in june_20` raised
# AttributeError on `record.to_dict`. Accept both shapes: one DataFrame per day, or
# the list of per-route DataFrames that grouped=True returns.
daily_results = [june_20, june_21, june_22, june_23, june_24,
                 june_25, june_26, june_27, june_28]

for day_result in daily_results:
    day_frames = [day_result] if isinstance(day_result, pd.DataFrame) else day_result
    for frame in day_frames:
        for document in frame.to_dict(orient='records'):
            es.index(index='trips', doc_type='lightrail', body=document)