## Estimate trip arrival times

In [1]:
import pandas as pd

# Inputs for line 804 on 2019-01-16: the processed vehicle positions (with the
# "datetime" column parsed into timestamps) and the direction-A station list.
positions_csv = "data/vehicle_tracking/processed/804_lametro-rail/2019-01-16.csv"
stations_csv = "data/line_info/804/804_directionA_stations.csv"

vehicle_positions = pd.read_csv(
    positions_csv,
    index_col=0,
    parse_dates=["datetime"],
)
stations = pd.read_csv(stations_csv, index_col=0)

In [2]:
# Keep only the observations whose direction flag is 0.
directionA = vehicle_positions[vehicle_positions['direction'] == 0]

# Group by the column *name* rather than a one-element list: with a list key,
# pandas >= 2.0 yields 1-tuples such as (trip_id,) when the groupby is iterated,
# which would then be written into the 'trip_id' column downstream.
trips = directionA[['datetime', 'trip_id', 'relative_position']].groupby('trip_id')

# Station rows act as placeholders whose arrival time is interpolated later.
# .assign() returns a new frame, so the flag is not written onto a slice of
# `stations` (the source of the SettingWithCopyWarning).
estimates = stations[['stop_id', 'relative_position']].assign(estimate=True)

def create_estimated_arrival_times(trip_id, trip):
    """Estimate when one trip passed each station by linear interpolation.

    The station rows from the module-level ``estimates`` frame are combined with
    the trip's observed positions and sorted by ``relative_position``. Every
    station row then looks at the rows immediately before and after it and
    interpolates a timestamp in proportion to where the station lies between
    those two positions.

    Parameters
    ----------
    trip_id
        Identifier written into the ``trip_id`` column of the station rows.
        A 1-tuple (as yielded by iterating a groupby keyed on a one-element
        list in newer pandas) is unwrapped to its single value.
    trip : pandas.DataFrame
        Observations for one trip with ``datetime``, ``trip_id`` and
        ``relative_position`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per station row in ``estimates``. ``datetime`` holds the
        interpolated time and ``datetime_round`` the same value rounded to the
        second. ``datetime`` is NaT whenever a neighbouring row carries no
        timestamp (another station row, or no neighbour at all at either end).
    """
    if isinstance(trip_id, tuple) and len(trip_id) == 1:
        trip_id = trip_id[0]

    # Work on fresh frames: neither the caller's group nor the shared
    # `estimates` template is mutated (the original aliased `estimates` and
    # added a 'trip_id' column to it on every call).
    observed = trip.assign(estimate=False)  # real observations, not estimates
    station_rows = estimates.assign(trip_id=trip_id)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    combined = (
        pd.concat([observed, station_rows], sort=False)
        .sort_values('relative_position')
        .reset_index(drop=True)
    )

    # Pull the neighbouring position/time into each row so the interpolation
    # can be computed row-wise.
    combined['previous_pos'] = combined['relative_position'].shift()
    combined['next_pos'] = combined['relative_position'].shift(-1)
    combined['previous_dt'] = combined['datetime'].shift()
    combined['next_dt'] = combined['datetime'].shift(-1)

    # Explicit copy so the column assignments below do not target a slice.
    select = combined[combined['estimate'].astype(bool)].copy()
    span = select['next_pos'] - select['previous_pos']
    select['weight'] = (select['relative_position'] - select['previous_pos']) / span
    select['time_interpolation'] = (select['next_dt'] - select['previous_dt']) * select['weight']
    select['datetime'] = select['previous_dt'] + select['time_interpolation']
    select['datetime_round'] = select['datetime'].dt.round('s')
    select['stop_id'] = pd.to_numeric(select['stop_id'], downcast='integer')
    return select

# One frame of interpolated station arrival times per trip.
groups = [
    create_estimated_arrival_times(*trip_item)
    for trip_item in trips
]
"DONE"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col

'DONE'

In [3]:
# Stack the per-trip estimates, then index them by stop for later lookups.
all_estimates = pd.concat(groups)
# Group by the column name rather than ['stop_id']: with a one-element list key,
# newer pandas expects a 1-tuple in get_group(), whereas the lookups in this
# notebook pass a plain stop id.
stop_estimates = all_estimates.groupby('stop_id')

## Assemble/parse raw predictions data

In [4]:
import json
import os
import pendulum
from helpers.datetimefs import DateTimeFS, construct_filename

# Agency slug and service date; both are interpolated into the data paths below.
agency = "lametro-rail"
today = "2019-01-16"

# The prediction window ends at the moment this cell is executed.
# NOTE(review): this makes a re-run depend on the wall clock rather than on
# `today` — confirm that is intended.
now = pendulum.now("America/Los_Angeles")
end_datetime = now

def process_frame(datetime, path_base):
    """Load one stored predictions file and return its first prediction.

    Parameters
    ----------
    datetime
        Timestamp of the stored frame. It is handed to ``construct_filename``
        to locate the file and must provide ``to_iso8601_string()``.
    path_base : str
        Base directory handed to ``construct_filename``.

    Returns
    -------
    pandas.DataFrame or None
        A one-row frame built from the first prediction, with two extra
        columns: ``datetime_prediction_made`` (parsed from ``datetime``) and
        ``datetime_predicted`` (``epochTime`` read as milliseconds since the
        epoch). ``None`` is returned when the file holds no usable prediction:
        the ``[0]["direction"]["prediction"]`` path is missing, the prediction
        list is empty, or the value is neither a dict nor a list.
    """
    source_path = construct_filename(path_base, datetime, ".json")
    with open(source_path, "r") as infile:
        raw = json.load(infile)["predictions"]

    # Only the "structure is not there" errors are treated as "no prediction";
    # anything else (e.g. KeyboardInterrupt) is no longer swallowed.
    try:
        directionB = raw[0]["direction"]["prediction"]
    except (IndexError, KeyError, TypeError):
        return None

    if isinstance(directionB, dict):
        first_prediction = directionB
    elif isinstance(directionB, list) and directionB:
        first_prediction = directionB[0]  # get first prediction only
    else:
        # Empty list or unexpected type: previously an IndexError or an
        # UnboundLocalError on `df`.
        return None

    df = pd.DataFrame(first_prediction, index=[0])
    df["datetime_prediction_made"] = pd.to_datetime(datetime.to_iso8601_string())
    df["datetime_predicted"] = pd.to_datetime(df["epochTime"], unit='ms')
    return df

# Collect, per line and per stop, the first prediction of every stored frame
# between the first scheduled time of the day and `end_datetime`.
line_predictions = {}
for line in range(804, 805):  # currently only line 804
    schedule_path = f"data/schedule/{line}_{agency}/{today}.csv"
    schedule = pd.read_csv(schedule_path, index_col=0)
    # The window starts at the earliest scheduled time in the file.
    start_datetime = pendulum.parse(
        schedule.datetime.min(), tz="America/Los_Angeles"
    )
    stops = stations.stop_id
    stop_predictions = {}
    for stop_id in stops:
        path_base = f"data/predictions/raw/{line}_{agency}/{stop_id}"
        dtfs = DateTimeFS(path_base)
        # The range bounds are converted to UTC before being handed to the helper.
        datetimes = dtfs.get_datetimes_in_range(
            start_datetime.in_tz("UTC"), end_datetime.in_tz("UTC")
        )
        # process_frame returns None for frames without a usable prediction.
        # Drop those explicitly and skip stops with nothing left, because
        # pd.concat raises on an empty or all-None list.
        frames = [process_frame(frame_time, path_base) for frame_time in datetimes]
        frames = [frame for frame in frames if frame is not None]
        if not frames:
            continue
        # sort=False keeps the column order stable and silences the
        # "sort by default" FutureWarning; only the selected columns are kept.
        predictions = pd.concat(frames, sort=False).drop_duplicates(
            ["dirTag", "datetime_predicted", "vehicle"]
        )[["datetime_prediction_made", "datetime_predicted", "vehicle", "tripTag"]]
        stop_predictions[stop_id] = predictions
    if stop_predictions:
        # Dict keys become the outer index level (stop_id).
        line_predictions[line] = pd.concat(stop_predictions)
# Dict keys become the outermost index level (line); raises if nothing was collected.
all_predictions = pd.concat(line_predictions)
"DONE"

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




'DONE'

In [6]:
# Persist the assembled predictions, creating the output directory if needed.
processed_dir = "data/predictions/processed/804_lametro-rail"
processed_csv = "data/predictions/processed/804_lametro-rail/2019-01-16.csv"
os.makedirs(processed_dir, exist_ok=True)
all_predictions.to_csv(processed_csv)

In [7]:
def match_predictions_and_schedule(key, group):
    """Attach the nearest estimated arrival and the scheduled time to predictions.

    Parameters
    ----------
    key : tuple
        Group key of the predictions; ``key[1]`` is used as the stop id.
    group : pandas.DataFrame
        Predictions for that key with ``datetime_predicted`` and ``tripTag``
        columns. It is not modified.

    Returns
    -------
    pandas.DataFrame or None
        A copy of ``group`` with two added columns: ``closest_actual`` (the
        estimated arrival at this stop, from the module-level
        ``stop_estimates``, nearest in time to each prediction) and
        ``scheduled`` (the ``datetime`` of the first row in the module-level
        ``schedule`` matching the trip and stop, or ``None`` when there is no
        such row). ``None`` is returned when the stop has no estimate with a
        timestamp.
    """
    stop_id = key[1]
    est = stop_estimates.get_group(stop_id).dropna(subset=['datetime'])
    if est.empty:
        return None

    # A sorted, duplicate-free index is required for nearest-neighbour lookup;
    # removing duplicates cannot change which timestamp is nearest.
    actual_times = pd.DatetimeIndex(est['datetime']).unique().sort_values()
    # Index.get_loc(..., method='nearest') was removed in pandas 2.0;
    # get_indexer does the same lookup for all predictions at once.
    nearest = actual_times.get_indexer(
        pd.DatetimeIndex(group['datetime_predicted']), method='nearest'
    )

    # Filter the schedule for this stop once instead of once per prediction;
    # keeping the first row per trip mirrors the original `.values[0]`.
    stop_rows = schedule[schedule["stop_id"] == int(stop_id)].drop_duplicates("trip_id")
    scheduled_by_trip = dict(zip(stop_rows["trip_id"], stop_rows["datetime"]))

    # Copy so the new columns are not written onto a slice of the grouped frame.
    matched = group.copy()
    matched["closest_actual"] = actual_times[nearest]
    matched["scheduled"] = matched["tripTag"].map(
        lambda tag: scheduled_by_trip.get(int(tag))
    )
    return matched

# Match each group of predictions (first two index levels of `all_predictions`)
# against estimates and schedule, then stack the per-group results.
matched_groups = [
    match_predictions_and_schedule(group_key, group_frame)
    for group_key, group_frame in all_predictions.groupby(level=[0, 1])
]
comparable = pd.concat(matched_groups)

# Prediction error: predicted arrival minus the nearest estimated arrival.
closest_actual_times = pd.DatetimeIndex(comparable["closest_actual"])
comparable["difference"] = comparable["datetime_predicted"] - closest_actual_times
"DONE"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


'DONE'

In [10]:
# Persist the prediction/arrival comparison, creating the directory if needed.
comparison_dir = "data/predictions/comparison/804_lametro-rail"
comparison_csv = "data/predictions/comparison/804_lametro-rail/2019-01-16.csv"
os.makedirs(comparison_dir, exist_ok=True)
comparable.to_csv(comparison_csv)

## Results

Mean difference between predicted time and estimated arrival time (predicted − actual, so a negative value means the train arrived later than predicted):

In [11]:
comparable["difference"].mean()

Timedelta('-1 days +23:59:40.513339')

Standard deviation of the difference between predicted time and estimated arrival time:

In [12]:
comparable["difference"].std()

Timedelta('0 days 00:06:50.040284')

In [13]:
comparable.shape[0]

16181