# Historical Records

In [9]:
# make sure to install these packages before running:
# pip install pandas
# pip install sodapy

import pandas as pd
from sodapy import Socrata

# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("www.data.act.gov.au", None)

# Example authenticated client (needed for non-public datasets):
# client = Socrata(www.data.act.gov.au,
#                  MyAppToken,
#                  userame="user@example.com",
#                  password="AFakePassword")

# First 2000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get("jxpp-4iiz", limit=7000)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)



In [10]:
results_df.head()

Unnamed: 0,arrival_delay,arrival_time,arrivaluncertainty,depature_delay,depature_time,depatureuncertainty,gtfsrupdateid,stop_id,stop_sequence,timestamp,trip_id,tripschedulerelationship,tripupdateschedulerelationship,updatedeleted
0,150,2019-06-19T16:18:04.000,0,150,2019-06-19T16:18:24.000,0,2042486,8118,8,2019-06-19T16:18:15.000,211,SCHEDULED,SCHEDULED,
1,-34,2019-06-20T07:01:30.000,0,-34,2019-06-20T07:01:50.000,0,2903942,8104,2,2019-06-20T07:02:45.000,141,SCHEDULED,SCHEDULED,
2,81,2019-06-21T08:40:02.000,0,81,2019-06-21T08:40:22.000,0,4183373,8119,6,2019-06-21T08:40:30.000,298,SCHEDULED,SCHEDULED,
3,-10,2019-06-22T08:36:34.000,0,-10,2019-06-22T08:36:54.000,0,178952,8121,5,2019-06-22T08:37:00.000,559,SCHEDULED,SCHEDULED,
4,-10,2019-06-19T14:13:15.000,0,-10,2019-06-19T14:13:35.000,0,1918784,8116,7,2019-06-19T14:13:00.000,195,SCHEDULED,SCHEDULED,


In [11]:
results_df.tail()

Unnamed: 0,arrival_delay,arrival_time,arrivaluncertainty,depature_delay,depature_time,depatureuncertainty,gtfsrupdateid,stop_id,stop_sequence,timestamp,trip_id,tripschedulerelationship,tripupdateschedulerelationship,updatedeleted
6995,-15,2019-06-23T16:35:25.000,0,-15,2019-06-23T16:35:45.000,0,1325898,8108,4,2019-06-23T16:35:45.000,814,SCHEDULED,SCHEDULED,
6996,45,2019-06-20T06:34:22.000,0,45,2019-06-20T06:34:42.000,0,2876116,8109,10,2019-06-20T06:34:00.000,6,SCHEDULED,SCHEDULED,
6997,-32,2019-06-19T14:50:13.000,0,-32,2019-06-19T14:50:33.000,0,1955640,8117,7,2019-06-19T14:50:30.000,75,SCHEDULED,SCHEDULED,
6998,-7,2019-06-24T06:33:30.000,0,-7,2019-06-24T06:33:50.000,0,1832653,8109,10,2019-06-24T06:34:00.000,6,SCHEDULED,SCHEDULED,
6999,30,2019-06-21T07:21:05.000,0,30,2019-06-21T07:21:25.000,0,4151795,8124,11,2019-06-21T07:21:00.000,413,SCHEDULED,SCHEDULED,


In [12]:
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 14 columns):
arrival_delay                     7000 non-null object
arrival_time                      7000 non-null object
arrivaluncertainty                7000 non-null object
depature_delay                    7000 non-null object
depature_time                     7000 non-null object
depatureuncertainty               7000 non-null object
gtfsrupdateid                     7000 non-null object
stop_id                           7000 non-null object
stop_sequence                     7000 non-null object
timestamp                         7000 non-null object
trip_id                           7000 non-null object
tripschedulerelationship          7000 non-null object
tripupdateschedulerelationship    7000 non-null object
updatedeleted                     7000 non-null object
dtypes: object(14)
memory usage: 765.7+ KB


In [15]:
results_df['arrival_time'] = pd.to_datetime(results_df['arrival_time'])
results_df['depature_time'] = pd.to_datetime(results_df['depature_time'])
results_df['timestamp'] = pd.to_datetime(results_df['timestamp'])

In [18]:
results_df = results_df.sort_values(by='arrival_time',ascending=True)

In [20]:
results_df.set_index('arrival_time', inplace=True)

In [21]:
results_df.head()

Unnamed: 0_level_0,arrival_delay,arrivaluncertainty,depature_delay,depature_time,depatureuncertainty,gtfsrupdateid,stop_id,stop_sequence,timestamp,trip_id,tripschedulerelationship,tripupdateschedulerelationship,updatedeleted
arrival_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2019-06-19 10:23:24,-1,0,-1,2019-06-19 10:23:44,0,1694654,8116,7,2019-06-19 10:22:45,172,SCHEDULED,SCHEDULED,
2019-06-19 10:23:41,17,0,17,2019-06-19 10:24:01,0,1694727,8125,3,2019-06-19 10:22:45,49,SCHEDULED,SCHEDULED,
2019-06-19 10:23:57,104,0,104,2019-06-19 10:24:17,0,1694548,8126,12,2019-06-19 10:22:45,171,SCHEDULED,SCHEDULED,
2019-06-19 10:24:04,0,0,0,2019-06-19 10:24:24,0,1694586,8106,3,2019-06-19 10:22:45,173,SCHEDULED,SCHEDULED,
2019-06-19 10:25:12,27,0,27,2019-06-19 10:25:32,0,1697649,8123,4,2019-06-19 10:25:45,49,SCHEDULED,SCHEDULED,


In [23]:
df_06_20 = results_df.loc['2019-06-20']

In [24]:
df_06_20.reset_index(inplace=True)

In [32]:
df_06_20['trip_id'] = df_06_20['trip_id'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


# June 20, 2019 - Thursday
---

Expected Trips: (GTFS)
- Full (13 stop sequences): 212 full trips
- NIS: 28 trips are not in service
- X1: 19 Partial Light rail service from Well Station to Gungahlin Place
- X2: 7 Partial Light rail service from EPIC to Alinga Street

In [27]:
trips = pd.read_csv('./GTFS/google_transit_lr/trips.csv')

In [28]:
trips.head()

Unnamed: 0,route_id,service_id,trip_id,trip_headsign,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed
0,NIS,SU,712,Gungahlin Pl,0,3,1007,1,1
1,ACTO001,SU,780,Alinga St,1,3,1003,1,1
2,ACTO001,SU,718,Gungahlin Pl,0,3,1004,1,1
3,ACTO001,SU,784,Alinga St,1,3,1003,1,1
4,ACTO001,SU,722,Gungahlin Pl,0,3,1004,1,1


In [30]:
trips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 847 entries, 0 to 846
Data columns (total 9 columns):
route_id                 847 non-null object
service_id               847 non-null object
trip_id                  847 non-null int64
trip_headsign            847 non-null object
direction_id             847 non-null int64
block_id                 847 non-null object
shape_id                 847 non-null int64
wheelchair_accessible    847 non-null int64
bikes_allowed            847 non-null int64
dtypes: int64(5), object(4)
memory usage: 59.6+ KB


In [35]:
df = pd.merge(left=df_06_20, left_on='trip_id', right=trips, right_on='trip_id', how='left')

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1764 entries, 0 to 1763
Data columns (total 22 columns):
arrival_time                      1764 non-null datetime64[ns]
arrival_delay                     1764 non-null object
arrivaluncertainty                1764 non-null object
depature_delay                    1764 non-null object
depature_time                     1764 non-null datetime64[ns]
depatureuncertainty               1764 non-null object
gtfsrupdateid                     1764 non-null object
stop_id                           1764 non-null object
stop_sequence                     1764 non-null object
timestamp                         1764 non-null datetime64[ns]
trip_id                           1764 non-null int64
tripschedulerelationship          1764 non-null object
tripupdateschedulerelationship    1764 non-null object
updatedeleted                     1764 non-null object
route_id                          1764 non-null object
service_id                        1764 non-nu

In [50]:
df = df[df.columns[:-4]]

In [59]:
grouped = df.groupby('route_id')

## Full Service

In [60]:
full_service = grouped.get_group('ACTO001')

In [63]:
len(full_service.trip_id.unique())

212

## X1

In [65]:
x1 = grouped.get_group('X1')
len(x1.trip_id.unique())

19

## X2

In [66]:
x2 = grouped.get_group('X2')
len(x2.trip_id.unique())

7