In [1]:
import h3
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.functions import col, lit, udf, acos, cos, sin, radians, rank, element_at
from pyspark.sql.types import StringType, FloatType, BooleanType
import pandas as pd

In [2]:
ls ./source

intersection_status_report/ tripdatas/
opticomdevicelog/           triplogs/


In [3]:
import os

In [4]:
def dist(lat_x, long_x, lat_y, long_y):
    """Great-circle distance in meters, built as a Spark Column expression.

    Spherical law of cosines with an Earth radius of 6371 km. The arguments are
    handed straight to pyspark's radians(), so they can be column names (this
    notebook calls it as dist("lat", "lon", "actual_lat", "actual_lon")).

    lat_x, long_x - latitude / longitude of the first point, in degrees
    lat_y, long_y - latitude / longitude of the second point, in degrees

    NOTE(review): the acos argument is not clamped to [-1, 1]; for (near-)identical
    points rounding may push it just above 1, which presumably yields NaN - confirm.
    """
    phi_x = radians(lat_x)
    phi_y = radians(lat_y)
    delta_lambda = radians(long_x) - radians(long_y)

    sin_term = sin(phi_x) * sin(phi_y)
    cos_term = cos(phi_x) * cos(phi_y) * cos(delta_lambda)

    earth_radius_m = lit(6371.0 * 1000)
    return acos(sin_term + cos_term) * earth_radius_m


In [5]:
import math 

def dist_simple(lat_x, long_x, lat_y, long_y):
    """Great-circle distance in meters between two points, in plain Python (no Spark).

    Spherical law of cosines with an Earth radius of 6371 km.

    lat_x, long_x - latitude / longitude of the first point, in degrees
    lat_y, long_y - latitude / longitude of the second point, in degrees

    returns: distance in meters as a float (NaN if any input is NaN)
    """
    cos_central_angle = (
        math.sin(math.radians(lat_x)) * math.sin(math.radians(lat_y)) +
        math.cos(math.radians(lat_x)) * math.cos(math.radians(lat_y)) *
            math.cos(math.radians(long_x) - math.radians(long_y))
    )
    # Rounding can push the value a hair outside [-1, 1] for identical or antipodal
    # points, and math.acos raises ValueError there. Clamp with comparisons (not
    # min/max) so that a NaN still propagates as NaN.
    if cos_central_angle > 1.0:
        cos_central_angle = 1.0
    elif cos_central_angle < -1.0:
        cos_central_angle = -1.0
    return math.acos(cos_central_angle) * 6371.0 * 1000 # in meters


In [6]:
# Spark UDF around h3.geo_to_h3, declared to return a string.
# It is called below as geo_to_h3_udf(lat, lon, lit(9)); the result is stored as 'zone_id'.
geo_to_h3_udf = udf(h3.geo_to_h3, StringType())

In [7]:
# Local Spark session on all available cores, with spark.driver.memory set to 2G.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config('spark.driver.memory', '2G')
    .getOrCreate()
)

In [8]:
# Intersection status report read from the 2021-05-01 parquet folder.
intersection_status_report = spark.read.parquet("./source/intersection_status_report/2021-05-01")

In [9]:
# Trip logs read from the 05-01-2021 parquet folder (note: MM-DD-YYYY here, unlike the report folder).
# routeName is dropped; presumably to avoid a duplicate routeName column after joining with trip_datas - TODO confirm.
trip_logs = spark.read.parquet("./source/triplogs/05-01-2021").drop('routeName')

In [10]:
# Trip data (the per-point records joined to trip_logs below) read from the 05-01-2021 parquet folder.
trip_datas = spark.read.parquet("./source/tripdatas/05-01-2021")

In [11]:
# Inner join of logs and data on (logID, deviceID); rows without a match on both sides are discarded.
trip_log_datas = trip_logs.join(trip_datas, on=['logID', 'deviceID'], how='inner')

In [12]:
# Intersections whose status is not 'Error', with short column names, coordinates cast
# to double, and the H3 cell (resolution 9) of each intersection stored as zone_id.
intersections = (
    intersection_status_report
    .filter("status <> 'Error'")
    .withColumnRenamed('Latitude', 'actual_lat')
    .withColumnRenamed('Longitude', 'actual_lon')
    .withColumnRenamed('LocationName', 'name')
    .withColumnRenamed('LocationId', 'id')
    .withColumn('actual_lat', col('actual_lat').cast("double"))
    .withColumn('actual_lon', col('actual_lon').cast("double"))
    .withColumn('zone_id', geo_to_h3_udf('actual_lat', 'actual_lon', lit(9)))
    .select('id', 'name', 'actual_lat', 'actual_lon', 'zone_id')
)

In [13]:
from pyspark.sql.functions import row_number

# Breadcrumbs recorded with tspMode 'alwaysOn'. loc.coordinates is read as [lon, lat]
# (element 1 -> lon, element 2 -> lat). Each breadcrumb gets its H3 cell (resolution 9,
# same as the intersections) and a 1-based index within its (logID, deviceID) trip, ordered by time.
trip_time_window = Window.partitionBy(['logID', 'deviceID']).orderBy('time')

zoned_trip_data = (
    trip_log_datas
    .withColumn('lon', element_at(col("loc.coordinates"), 1))
    .withColumn('lat', element_at(col("loc.coordinates"), 2))
    .filter("tspMode == 'alwaysOn'")
    .select('lon', 'lat', 'time', 'routeName', 'direction', 'tripID', 'logID', 'deviceID', 'event', 'mph')
    .withColumn('zone_id', geo_to_h3_udf('lat', 'lon', lit(9)))
    .withColumn('breadcrumb_index', row_number().over(trip_time_window))
)

## Detect the trip-data point closest to each intersection (the point where the trip crosses it)

In [14]:
# Maximum distance in meters between a breadcrumb and an intersection for the bus
# to be considered as possibly crossing that intersection.
# It can be thought of as a tolerance for sideways GPS error.
dist_to_be_crossing_thresh = 45

In [15]:
# Pair every breadcrumb with the intersections that share its H3 cell, measure the
# distance to each, and rank the breadcrumbs of a trip per intersection by that distance.
# NOTE(review): the join is an equality on zone_id, so a breadcrumb is never paired
# with an intersection lying in a neighbouring cell.
per_trip_and_intersection = Window.partitionBy(['logID', 'deviceID', 'id', 'routeName', 'direction'])

zoned_trip_data_intersection = (
    zoned_trip_data
    .join(intersections, on='zone_id')
    .withColumn('dist_to_intersection', dist("lat", "lon", "actual_lat", "actual_lon"))
    .withColumn('rank_dist', row_number().over(per_trip_and_intersection.orderBy('dist_to_intersection')))
)

# The closest breadcrumb per trip and intersection counts as the crossing point,
# provided it lies within the crossing threshold.
trip_datas_crossing_intersections = (
    zoned_trip_data_intersection
    .filter('rank_dist == 1')
    .filter(col('dist_to_intersection') < dist_to_be_crossing_thresh)
    .orderBy("time")
)

In [16]:
# Keep the crossing breadcrumbs in a driver-side dict, grouped per trip under the
# key "[logID]_[deviceID]", so later lookups are plain dict accesses.

from collections import defaultdict

log_id_device_id_to_trip_datas_crossing_intersections = defaultdict(list)

crossing_rows = trip_datas_crossing_intersections.collect()
for crossing in crossing_rows:
    trip_key = crossing['logID'] + "_" + crossing['deviceID']
    log_id_device_id_to_trip_datas_crossing_intersections[trip_key].append(crossing)


In [17]:
# Save the breadcrumbs whose event is 'stop arrive' or 'stop depart' in a dict keyed by
# "[logID]_[deviceID]", to later detect the breadcrumbs lying between those events.
# NOTE(review): collect() is called without an orderBy, so the order of the rows inside
# each list is whatever Spark happens to return.

log_id_device_id_to_trip_datas_arrive_depart = defaultdict(list)

for row in zoned_trip_data.filter("event IN ('stop arrive', 'stop depart')").collect():
    log_id_device_id_to_trip_datas_arrive_depart[row['logID'] + "_" + row['deviceID']].append(row)

In [18]:
# For every trip, pair each 'stop arrive' with the 'stop depart' that immediately
# follows it and keep the breadcrumb indexes of both:
#   {"[logID]_[deviceID]": [{'arrive': <index>, 'depart': <index>}, ...]}
log_id_device_id_to_stoparrive_stopdepart_indexes = defaultdict(list)

for key, val in log_id_device_id_to_trip_datas_arrive_depart.items():
    # The rows were collected without an explicit ordering, and pairing neighbours
    # only makes sense in trip order, so sort by breadcrumb_index first.
    stop_arrive_depart_breadcrumbs = sorted(val, key=lambda row: row["breadcrumb_index"])

    log_id_device_id_to_stoparrive_stopdepart_indexes[key] = [
        {'arrive': current["breadcrumb_index"], 'depart': following["breadcrumb_index"]}
        for current, following in zip(stop_arrive_depart_breadcrumbs, stop_arrive_depart_breadcrumbs[1:])
        if current['event'] == "stop arrive" and following['event'] == "stop depart"
    ]
    

In [19]:
log_id_device_id_to_stoparrive_stopdepart_indexes['20210430-t5_4010KJ1133']

[{'arrive': 5, 'depart': 22},
 {'arrive': 42, 'depart': 79},
 {'arrive': 107, 'depart': 135},
 {'arrive': 154, 'depart': 158},
 {'arrive': 234, 'depart': 294},
 {'arrive': 338, 'depart': 343},
 {'arrive': 367, 'depart': 374},
 {'arrive': 432, 'depart': 435},
 {'arrive': 499, 'depart': 504},
 {'arrive': 573, 'depart': 621},
 {'arrive': 648, 'depart': 687},
 {'arrive': 709, 'depart': 713},
 {'arrive': 767, 'depart': 771},
 {'arrive': 830, 'depart': 900},
 {'arrive': 961, 'depart': 967},
 {'arrive': 1012, 'depart': 1015},
 {'arrive': 1074, 'depart': 1078},
 {'arrive': 1101, 'depart': 1154},
 {'arrive': 1170, 'depart': 1175},
 {'arrive': 1301, 'depart': 1305},
 {'arrive': 1386, 'depart': 1415},
 {'arrive': 1489, 'depart': 1494},
 {'arrive': 1546, 'depart': 1588},
 {'arrive': 1679, 'depart': 1684},
 {'arrive': 1744, 'depart': 1749},
 {'arrive': 1816, 'depart': 1839},
 {'arrive': 1884, 'depart': 1946},
 {'arrive': 2002, 'depart': 2027},
 {'arrive': 2100, 'depart': 2106},
 {'arrive': 2401, 'd

## Upcoming intersection detection

In [20]:
import json

def get_upcoming_intersection_factory(log_id_device_id_to_trip_datas_crossing_intersections_broadcasted):
    """
    Build the Spark UDF that looks up the upcoming intersection of a breadcrumb.

    log_id_device_id_to_trip_datas_crossing_intersections_broadcasted - broadcast variable whose value
    is a dict where keys are strings [logID]_[deviceID] and values are lists of rows in tripdatas
    where crossing the intersection happened

    returns: udf wrapping get_upcoming_intersection
    """

    def as_json(intersection_loc_id, dist_to_upcoming_intersection, note_upcoming):
        # every outcome is serialised with the same three keys, in this order
        return json.dumps({
            "intersection_loc_id": intersection_loc_id,
            "dist_to_upcoming_intersection": dist_to_upcoming_intersection,
            "note_upcoming": note_upcoming,
        })

    def get_upcoming_intersection(log_id, device_id, lat, lon, breadcrumb_index, dist_threshold):
        """
        log_id - log_id to identify trip in combination with device_id
        device_id - device_id to identify trip in combination with log_id
        lat, lon - position of the given breadcrumb
        breadcrumb_index - breadcrumb_index attribute for given breadcrumb (generated at previous steps)
        dist_threshold - max distance of bus to intersection so it is considered to be affected by that intersection

        returns: string representation of dict (JSON) with the fields

        intersection_loc_id - id of location of upcoming intersection if found, else None
        dist_to_upcoming_intersection - distance in m to upcoming intersection if found, else None
        note_upcoming - None if found, else string explaining why an upcoming intersection
        is considered to be not found

        The upcoming intersection is the crossing of this trip with the smallest breadcrumb_index
        larger than the given one; it only counts as found when it is closer than dist_threshold.
        """
        # all crossing rows recorded for this trip
        trip_crossings = log_id_device_id_to_trip_datas_crossing_intersections_broadcasted.value.get(log_id + "_" + device_id)
        if not trip_crossings:
            return as_json(None, None, "no crossing intersections on trip")

        # keep only the crossings that happen after the current breadcrumb
        later_crossings = [crossing for crossing in trip_crossings if crossing["breadcrumb_index"] > breadcrumb_index]
        if not later_crossings:
            return as_json(None, None, "no upcoming found")

        # of those, the next one along the trip
        next_crossing = min(later_crossings, key=lambda crossing: crossing["breadcrumb_index"])
        dist_to_upcoming_intersection = dist_simple(next_crossing["actual_lat"], next_crossing["actual_lon"], lat, lon)
        if dist_to_upcoming_intersection < dist_threshold:
            return as_json(next_crossing["id"], dist_to_upcoming_intersection, None)

        too_far_note = "distance to upcoming intersection: " + str(dist_to_upcoming_intersection) + " >= " + str(dist_threshold) + " \n potential upcoming id: " + str(next_crossing["id"])
        return as_json(None, None, too_far_note)

    return udf(get_upcoming_intersection)
    
    

In [21]:
# Broadcast the per-trip crossing dict; the UDF reads it through .value.
log_id_device_id_to_trip_datas_crossing_intersections_broadcasted = spark.sparkContext.broadcast(log_id_device_id_to_trip_datas_crossing_intersections)

In [22]:
# UDF bound to the broadcast crossing dict; it returns its result as a JSON string.
get_upcoming_intersection_udf = get_upcoming_intersection_factory(log_id_device_id_to_trip_datas_crossing_intersections_broadcasted)

In [23]:
# Maximum distance in meters from a breadcrumb to its upcoming intersection for that
# intersection to be considered as possibly affecting (delaying) the bus.
dist_threshold = 70

In [24]:
# Attach the JSON result of the upcoming-intersection lookup to every row.
zoned_trip_data_intersection = zoned_trip_data_intersection.withColumn(
    'upcoming_intersection',
    get_upcoming_intersection_udf('logID', 'deviceID', 'lat', 'lon', 'breadcrumb_index', lit(dist_threshold)),
)

In [25]:
def upcoming_intersection_str_to_loc_id(upcoming_intersection):
    """Return the "intersection_loc_id" field of the JSON string produced by the upcoming-intersection UDF."""
    return json.loads(upcoming_intersection)["intersection_loc_id"]

def upcoming_intersection_str_to_dist(upcoming_intersection):
    """Return the "dist_to_upcoming_intersection" field of the JSON string produced by the upcoming-intersection UDF."""
    return json.loads(upcoming_intersection)["dist_to_upcoming_intersection"]

def upcoming_intersection_str_to_note_upcoming(upcoming_intersection):
    """Return the "note_upcoming" field of the JSON string produced by the upcoming-intersection UDF."""
    return json.loads(upcoming_intersection)["note_upcoming"]

# Spark UDF wrappers around the three JSON accessors, with explicit return types.
# NOTE(review): the distance is declared as FloatType, which is narrower than the
# Python float computed upstream - confirm the reduced precision is acceptable.
upcoming_intersection_str_to_loc_id_udf = udf(upcoming_intersection_str_to_loc_id, StringType())
upcoming_intersection_str_to_dist_udf = udf(upcoming_intersection_str_to_dist, FloatType())
upcoming_intersection_str_to_note_upcoming_udf = udf(upcoming_intersection_str_to_note_upcoming, StringType())

In [26]:
# Unpack the JSON string in 'upcoming_intersection' into three typed columns.
zoned_trip_data_intersection = (
    zoned_trip_data_intersection
    .withColumn('upcoming_intersection_loc_id', upcoming_intersection_str_to_loc_id_udf("upcoming_intersection"))
    .withColumn('dist_to_upcoming_intersection', upcoming_intersection_str_to_dist_udf("upcoming_intersection"))
    .withColumn('upcoming_intersection_note', upcoming_intersection_str_to_note_upcoming_udf("upcoming_intersection"))
)


In [27]:
# Bring up to 500 rows to pandas for a visual check; no ordering is applied before limit().
t = zoned_trip_data_intersection.limit(500).toPandas()

In [28]:
t.head()

Unnamed: 0,zone_id,lon,lat,time,routeName,direction,tripID,logID,deviceID,event,...,id,name,actual_lat,actual_lon,dist_to_intersection,rank_dist,upcoming_intersection,upcoming_intersection_loc_id,dist_to_upcoming_intersection,upcoming_intersection_note
0,89283082a77ffff,-122.396278,37.793412,2021-05-01 05:50:49.448,NBUS,outbound,9823544,20210430-t11,4010KJ1113,stop depart,...,05F728F4-6E79-48E1-A328-A87CE2415D96,Market/Main and Drumm,37.7932,-122.396,33.942522,1,"{""intersection_loc_id"": null, ""dist_to_upcomin...",,,distance to upcoming intersection: 119.5724681...
1,89283082a77ffff,-122.396278,37.793412,2021-05-01 05:50:49.479,NBUS,outbound,9823544,20210430-t11,4010KJ1113,TSP enable,...,05F728F4-6E79-48E1-A328-A87CE2415D96,Market/Main and Drumm,37.7932,-122.396,33.942522,2,"{""intersection_loc_id"": null, ""dist_to_upcomin...",,,distance to upcoming intersection: 119.5724681...
2,89283082a77ffff,-122.396278,37.793412,2021-05-01 05:50:50.023,NBUS,outbound,9823544,20210430-t11,4010KJ1113,GPS,...,05F728F4-6E79-48E1-A328-A87CE2415D96,Market/Main and Drumm,37.7932,-122.396,33.942522,3,"{""intersection_loc_id"": null, ""dist_to_upcomin...",,,distance to upcoming intersection: 119.5724681...
3,89283082a77ffff,-122.396342,37.793357,2021-05-01 05:50:51.274,NBUS,outbound,9823544,20210430-t11,4010KJ1113,GPS,...,05F728F4-6E79-48E1-A328-A87CE2415D96,Market/Main and Drumm,37.7932,-122.396,34.710246,4,"{""intersection_loc_id"": null, ""dist_to_upcomin...",,,distance to upcoming intersection: 111.4460743...
4,89283082a77ffff,-122.396218,37.793462,2021-05-01 05:50:48.775,NBUS,outbound,9823544,20210430-t11,4010KJ1113,GPS,...,05F728F4-6E79-48E1-A328-A87CE2415D96,Market/Main and Drumm,37.7932,-122.396,34.851729,5,"{""intersection_loc_id"": ""05F728F4-6E79-48E1-A3...",05F728F4-6E79-48E1-A328-A87CE2415D96,34.851601,


In [29]:
def is_delayed_by_intersection_factory(log_id_device_id_to_stoparrive_stopdepart_indexes_broadcasted):
    """
    Build the Spark UDF that decides whether a breadcrumb is delayed by its upcoming intersection.

    log_id_device_id_to_stoparrive_stopdepart_indexes_broadcasted - broadcast variable whose value
    is a dict where keys are strings [logID]_[deviceID] and values are lists of
    {'arrive': breadcrumb_index, 'depart': breadcrumb_index} dicts

    returns: udf wrapping is_delayed_by_intersection
    """

    def is_delayed_by_intersection(log_id, device_id, breadcrumb_index, upcoming_intersection_loc_id, mph, mph_threshold):
        """
        log_id - log_id to identify trip in combination with device_id
        device_id - device_id to identify trip in combination with log_id
        breadcrumb_index - breadcrumb_index attribute for given breadcrumb (generated at previous steps)
        upcoming_intersection_loc_id - id of upcoming intersection if found else None
        mph - speed of vehicle at that point in mph
        mph_threshold - max speed in mph of bus so it is considered to be delayed (usually low number such as 3-5)

        returns: string representation of dict (JSON) with the fields

        is_delayed_by_intersection - bool (True if all conditions to be considered delayed by intersection are satisfied)
        is_delayed_by_intersection_note - string explaining why the bus is considered not to be delayed
        (None if is_delayed_by_intersection is True)
        """

        def is_on_busstop(log_id, device_id, breadcrumb_index):
            # helper function to detect if breadcrumb is between stop arrive, stop depart events
            # A trip without any stop arrive / stop depart event has no entry in the dict;
            # treat it as "never at a stop" instead of iterating over None.
            stoparrive_stopdeparts = log_id_device_id_to_stoparrive_stopdepart_indexes_broadcasted.value.get(log_id + "_" + device_id) or []
            return any(
                arrive_depart['arrive'] <= breadcrumb_index <= arrive_depart['depart']
                for arrive_depart in stoparrive_stopdeparts
            )

        # if no upcoming intersection detected -> False
        if upcoming_intersection_loc_id is None:
            return json.dumps({"is_delayed_by_intersection": False, "is_delayed_by_intersection_note": "upcoming intersection not found"})

        # moving faster than the threshold -> False
        if mph > mph_threshold:
            return json.dumps({"is_delayed_by_intersection": False, "is_delayed_by_intersection_note": "mph > mph_threshold"})

        # if mph <= mph_threshold and it is between stop arrive, stop depart events -> False
        if is_on_busstop(log_id, device_id, breadcrumb_index):
            return json.dumps({"is_delayed_by_intersection": False, "is_delayed_by_intersection_note": "on bus stop"})

        return json.dumps({"is_delayed_by_intersection": True, "is_delayed_by_intersection_note": None})

    return udf(is_delayed_by_intersection)
        

In [30]:
# Broadcast the per-trip arrive/depart index pairs; the UDF reads them through .value.
log_id_device_id_to_stoparrive_stopdepart_indexes_broadcasted = spark.sparkContext.broadcast(log_id_device_id_to_stoparrive_stopdepart_indexes)

In [31]:
# UDF bound to the broadcast arrive/depart pairs; it returns its result as a JSON string.
is_delayed_by_intersection_udf = is_delayed_by_intersection_factory(log_id_device_id_to_stoparrive_stopdepart_indexes_broadcasted)

In [32]:
# Speed limit in mph for the delay check: a breadcrumb with mph > mph_threshold is never marked as delayed.
mph_threshold = 3

In [33]:
# Attach the JSON result of the delay check to every row.
zoned_trip_data_intersection = zoned_trip_data_intersection.withColumn(
    'is_delayed_by_intersection_str',
    is_delayed_by_intersection_udf('logID', 'deviceID', 'breadcrumb_index', 'upcoming_intersection_loc_id', 'mph', lit(mph_threshold)),
)


In [34]:
def is_delayed_by_intersection_str_to_bool(is_delayed_by_intersection_str):
    """Return the "is_delayed_by_intersection" field of the JSON string produced by the delay-check UDF."""
    return json.loads(is_delayed_by_intersection_str)["is_delayed_by_intersection"]

def is_delayed_by_intersection_str_to_note(is_delayed_by_intersection_str):
    """Return the "is_delayed_by_intersection_note" field of the JSON string produced by the delay-check UDF."""
    return json.loads(is_delayed_by_intersection_str)["is_delayed_by_intersection_note"]

# Spark UDF wrappers around the two JSON accessors.
# The note UDF is created without an explicit return type, unlike the other accessor
# UDFs in this notebook, so it relies on udf()'s default.
is_delayed_by_intersection_str_to_bool_udf = udf(is_delayed_by_intersection_str_to_bool, BooleanType())
is_delayed_by_intersection_str_to_note_udf = udf(is_delayed_by_intersection_str_to_note)

In [35]:
# Unpack the JSON string in 'is_delayed_by_intersection_str' into a boolean flag and a note.
zoned_trip_data_intersection = (
    zoned_trip_data_intersection
    .withColumn('is_delayed_by_intersection', is_delayed_by_intersection_str_to_bool_udf("is_delayed_by_intersection_str"))
    .withColumn('is_delayed_by_intersection_note', is_delayed_by_intersection_str_to_note_udf("is_delayed_by_intersection_str"))
)

In [36]:
# Refresh the pandas preview (up to 500 rows, no ordering applied) now that the delay columns exist.
t = zoned_trip_data_intersection.limit(500).toPandas()

In [37]:
t[["mph","is_delayed_by_intersection_note", "upcoming_intersection_note", "is_delayed_by_intersection"]].head()

Unnamed: 0,mph,is_delayed_by_intersection_note,upcoming_intersection_note,is_delayed_by_intersection
0,17.491856,upcoming intersection not found,distance to upcoming intersection: 119.5724681...,False
1,17.491856,upcoming intersection not found,distance to upcoming intersection: 119.5724681...,False
2,17.491856,upcoming intersection not found,distance to upcoming intersection: 119.5724681...,False
3,18.872792,upcoming intersection not found,distance to upcoming intersection: 111.4460743...,False
4,15.53553,mph > mph_threshold,,False


In [38]:
t[t["is_delayed_by_intersection"]].head()

Unnamed: 0,zone_id,lon,lat,time,routeName,direction,tripID,logID,deviceID,event,...,actual_lon,dist_to_intersection,rank_dist,upcoming_intersection,upcoming_intersection_loc_id,dist_to_upcoming_intersection,upcoming_intersection_note,is_delayed_by_intersection_str,is_delayed_by_intersection,is_delayed_by_intersection_note
40,89283082a77ffff,-122.395507,37.794022,2021-05-01 05:50:20.200,NBUS,outbound,9823544,20210430-t11,4010KJ1113,GPS,...,-122.396,101.127172,41,"{""intersection_loc_id"": ""32938F73-0173-4459-93...",32938F73-0173-4459-9342-109E7854013A,49.86607,,"{""is_delayed_by_intersection"": true, ""is_delay...",True,
41,89283082a77ffff,-122.395505,37.794022,2021-05-01 05:50:16.430,NBUS,outbound,9823544,20210430-t11,4010KJ1113,GPS,...,-122.396,101.190028,42,"{""intersection_loc_id"": ""32938F73-0173-4459-93...",32938F73-0173-4459-9342-109E7854013A,49.993328,,"{""is_delayed_by_intersection"": true, ""is_delay...",True,
42,89283082a77ffff,-122.395505,37.794022,2021-05-01 05:50:17.727,NBUS,outbound,9823544,20210430-t11,4010KJ1113,GPS,...,-122.396,101.190028,43,"{""intersection_loc_id"": ""32938F73-0173-4459-93...",32938F73-0173-4459-9342-109E7854013A,49.993328,,"{""is_delayed_by_intersection"": true, ""is_delay...",True,
43,89283082a77ffff,-122.395503,37.794022,2021-05-01 05:50:15.194,NBUS,outbound,9823544,20210430-t11,4010KJ1113,GPS,...,-122.396,101.253068,44,"{""intersection_loc_id"": ""32938F73-0173-4459-93...",32938F73-0173-4459-9342-109E7854013A,50.120804,,"{""is_delayed_by_intersection"": true, ""is_delay...",True,
44,89283082a77ffff,-122.395505,37.794023,2021-05-01 05:50:18.963,NBUS,outbound,9823544,20210430-t11,4010KJ1113,GPS,...,-122.396,101.357381,45,"{""intersection_loc_id"": ""32938F73-0173-4459-93...",32938F73-0173-4459-9342-109E7854013A,50.085007,,"{""is_delayed_by_intersection"": true, ""is_delay...",True,


# Testing

1) closest points - actual: yellow / expected: blue

2) all breadcrumbs of trip - blue

3) all breadcrumbs with mph <= mph_threshold - yellow

4) stop arrive / depart events - green

5) is_delayed_by_intersection - red

In [39]:
# All distinct 'stop arrive' / 'stop depart' breadcrumbs as a pandas frame; it is explored in the next cells.
stop_arrive_stopdepart_pdf = zoned_trip_data.filter("event IN ('stop arrive', 'stop depart')").distinct().toPandas()

In [40]:
stop_arrive_stopdepart_pdf["routeName"].unique()

array(['NBUS', '49', '25', '8', '14R', '38', '29', 'MBUS', '43', '15',
       '54', '55', '9', '38R', '28', '44', '1', 'L-OWL', '8AX', '24',
       '12', '37', 'LBUS', 'KBUS', '91', '27', '48', '30', '45', '19',
       '5', '14', '22', 'N-OWL', '7', 'TBUS', '33', '67', '9R', '90'],
      dtype=object)

In [41]:
stop_arrive_stopdepart_pdf.groupby(["routeName", "direction", "logID", "deviceID"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,lon,lat,time,tripID,event,mph,zone_id,breadcrumb_index
routeName,direction,logID,deviceID,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,inbound,20210501-t10,4010KW2045,55,55,55,55,55,55,55,55
1,inbound,20210501-t12,4010KW2045,54,54,54,54,54,54,54,54
1,inbound,20210501-t14,4010KW2045,55,55,55,55,55,55,55,55
1,inbound,20210501-t2,4010KO2013,53,53,53,53,53,53,53,53
1,inbound,20210501-t2,4010KW2045,52,52,52,52,52,52,52,52
...,...,...,...,...,...,...,...,...,...,...,...
TBUS,outbound,20210501-t2,4010KK1240,43,43,43,43,43,43,43,43
TBUS,outbound,20210501-t2,4010KK2085,43,43,43,43,43,43,43,43
TBUS,outbound,20210501-t3,4010KK2078,41,41,41,41,41,41,41,41
TBUS,outbound,20210501-t3,4010KO2027,43,43,43,43,43,43,43,43


In [42]:
stop_arrive_stopdepart_pdf.groupby(["routeName", "direction", "logID", "deviceID"]).count().loc["1"]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,lon,lat,time,tripID,event,mph,zone_id,breadcrumb_index
direction,logID,deviceID,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
inbound,20210501-t10,4010KW2045,55,55,55,55,55,55,55,55
inbound,20210501-t12,4010KW2045,54,54,54,54,54,54,54,54
inbound,20210501-t14,4010KW2045,55,55,55,55,55,55,55,55
inbound,20210501-t2,4010KO2013,53,53,53,53,53,53,53,53
inbound,20210501-t2,4010KW2045,52,52,52,52,52,52,52,52
inbound,20210501-t4,4010KW2045,55,55,55,55,55,55,55,55
inbound,20210501-t6,4010KW2045,55,55,55,55,55,55,55,55
inbound,20210501-t8,4010KW2045,55,55,55,55,55,55,55,55
outbound,20210501-t1,4010KJ1039,2,2,2,2,2,2,2,2
outbound,20210501-t1,4010KO2013,45,45,45,45,45,45,45,45


In [43]:
# selecting current trip
# These four values are used below both to filter the Spark frames down to one trip
# and to build the names of the exported CSV files.
cur_direction = "outbound"
routeName = "1"
cur_logid = "20210501-t1"
cur_deviceid = "4010KO2013"

In [44]:
# all breadcrumbs of trip - blue

In [45]:
# Breadcrumbs of the selected trip that were paired with an intersection, together with
# the upcoming-intersection and delay results; distinct() collapses the rows duplicated
# by the join once the intersection-specific columns are dropped.
# Columns are addressed by name with col(): the frame being filtered is
# zoned_trip_data_intersection, so it should not be filtered through the column objects
# of a different DataFrame (trip_log_datas).
cur_all_breadcrumbs = zoned_trip_data_intersection \
    .filter(col("routeName") == routeName) \
    .filter(col("direction") == cur_direction) \
    .filter(col("logID") == cur_logid) \
    .filter(col("deviceID") == cur_deviceid) \
    .select("lon", "lat", "routeName", "direction", "logID", "deviceID", "mph", "time", "event", "zone_id", "breadcrumb_index", 
            "upcoming_intersection_loc_id", "dist_to_upcoming_intersection", "is_delayed_by_intersection", "is_delayed_by_intersection_note", "upcoming_intersection_note") \
    .distinct().toPandas()

In [46]:
# We save these breadcrumbs as "all" because zoned_trip_data_intersection only contains
# the breadcrumbs that were joined with an intersection (those in H3 zones that have intersections).
# Columns are addressed by name with col(): the frame being filtered is zoned_trip_data,
# so it should not be filtered through the column objects of a different DataFrame (trip_log_datas).
all_basic_breadcrumbs = zoned_trip_data \
    .filter(col("routeName") == routeName) \
    .filter(col("direction") == cur_direction) \
    .filter(col("logID") == cur_logid) \
    .filter(col("deviceID") == cur_deviceid) \
    .select("lon", "lat", "routeName", "direction", "logID", "deviceID", "mph", "time", "event", "zone_id", "breadcrumb_index") \
    .distinct().toPandas()

In [47]:
all_basic_breadcrumbs.head()

Unnamed: 0,lon,lat,routeName,direction,logID,deviceID,mph,time,event,zone_id,breadcrumb_index
0,-122.397617,37.794232,1,outbound,20210501-t1,4010KO2013,1.15078,2021-05-02 01:02:15.862,trip new,89283082a2bffff,1
1,-122.397617,37.794232,1,outbound,20210501-t1,4010KO2013,1.15078,2021-05-02 01:02:16.413,stop jump,89283082a2bffff,2
2,-122.397617,37.794237,1,outbound,20210501-t1,4010KO2013,1.035702,2021-05-02 01:02:16.520,trip pending,89283082a2bffff,3
3,-122.397617,37.794237,1,outbound,20210501-t1,4010KO2013,1.035702,2021-05-02 01:02:17.060,stop arrive,89283082a2bffff,4
4,-122.397617,37.794237,1,outbound,20210501-t1,4010KO2013,1.035702,2021-05-02 01:02:17.144,GPS,89283082a2bffff,5


In [48]:
# Export the full track of the selected trip. This is the first export in the notebook,
# so create the output folder here: to_csv does not create missing directories.
os.makedirs("bus_stop_maps", exist_ok=True)
all_basic_breadcrumbs.to_csv("bus_stop_maps/all_" + routeName + "_" + cur_direction +  "_" + cur_logid + "_" + cur_deviceid + ".csv")

In [49]:
# all breadcrumbs with mph <= mph_threshold - yellow

In [50]:
# Breadcrumbs of the selected trip whose speed is at or below mph_threshold.
cur_breadcrumbs_mph_less_than_thresh = cur_all_breadcrumbs.loc[cur_all_breadcrumbs["mph"] <= mph_threshold]

In [51]:
cur_breadcrumbs_mph_less_than_thresh.head()

Unnamed: 0,lon,lat,routeName,direction,logID,deviceID,mph,time,event,zone_id,breadcrumb_index,upcoming_intersection_loc_id,dist_to_upcoming_intersection,is_delayed_by_intersection,is_delayed_by_intersection_note,upcoming_intersection_note
1,-122.446618,37.787333,1,outbound,20210501-t1,4010KO2013,0.0,2021-05-02 01:26:02.907,GPS,8928308293bffff,1288,42B1E300-2137-443A-AE98-4707A5419D1C,33.743725,True,,
6,-122.407553,37.793037,1,outbound,20210501-t1,4010KO2013,0.115078,2021-05-02 01:08:42.955,GPS,89283082bcbffff,357,DE683B43-4CCD-4AEB-A22F-C1C6D707B032,43.246754,False,on bus stop,
7,-122.425638,37.790945,1,outbound,20210501-t1,4010KO2013,0.0,2021-05-02 01:18:29.525,GPS,89283082bbbffff,885,F4B044CE-852F-4DC8-97E6-95699AF99381,32.171864,True,,
9,-122.446618,37.787333,1,outbound,20210501-t1,4010KO2013,0.0,2021-05-02 01:25:54.262,GPS,8928308293bffff,1281,42B1E300-2137-443A-AE98-4707A5419D1C,33.743725,True,,
12,-122.420688,37.791633,1,outbound,20210501-t1,4010KO2013,2.646794,2021-05-02 01:14:54.393,GPS,89283082b13ffff,689,05E5BE1D-3A7F-410C-B1C6-36FD39219292,27.63612,True,,


In [52]:
# NOTE(review): both conditions test dist_to_upcoming_intersection, and "< 3.0" already
# implies "< 70.0", so this shows the breadcrumbs closer than 3.0 to their upcoming intersection.
# If the intent was "near an intersection AND slow", the second condition was presumably
# meant to test mph - confirm.
cur_all_breadcrumbs[((cur_all_breadcrumbs["dist_to_upcoming_intersection"] < 70.0) & (cur_all_breadcrumbs["dist_to_upcoming_intersection"] < 3.0))]

Unnamed: 0,lon,lat,routeName,direction,logID,deviceID,mph,time,event,zone_id,breadcrumb_index,upcoming_intersection_loc_id,dist_to_upcoming_intersection,is_delayed_by_intersection,is_delayed_by_intersection_note,upcoming_intersection_note
226,-122.433978,37.789892,1,outbound,20210501-t1,4010KO2013,10.011786,2021-05-02 01:21:38.695,GPS,8928308294fffff,1052,976878FF-898A-475D-89DA-B735E411E111,2.118568,False,mph > mph_threshold,
799,-122.446982,37.787297,1,outbound,20210501-t1,4010KO2013,6.559446,2021-05-02 01:26:43.941,GPS,8928308293bffff,1322,42B1E300-2137-443A-AE98-4707A5419D1C,1.655254,False,mph > mph_threshold,


In [53]:
cur_all_breadcrumbs[((cur_all_breadcrumbs["dist_to_upcoming_intersection"] < 70.0))].sort_values("breadcrumb_index")

Unnamed: 0,lon,lat,routeName,direction,logID,deviceID,mph,time,event,zone_id,breadcrumb_index,upcoming_intersection_loc_id,dist_to_upcoming_intersection,is_delayed_by_intersection,is_delayed_by_intersection_note,upcoming_intersection_note
1008,-122.397617,37.794232,1,outbound,20210501-t1,4010KO2013,1.150780,2021-05-02 01:02:15.862,trip new,89283082a2bffff,1,24D26F5E-BCC6-4622-8202-2B7BF531295D,44.997528,True,,
358,-122.397617,37.794232,1,outbound,20210501-t1,4010KO2013,1.150780,2021-05-02 01:02:16.413,stop jump,89283082a2bffff,2,24D26F5E-BCC6-4622-8202-2B7BF531295D,44.997528,True,,
490,-122.397617,37.794237,1,outbound,20210501-t1,4010KO2013,1.035702,2021-05-02 01:02:16.520,trip pending,89283082a2bffff,3,24D26F5E-BCC6-4622-8202-2B7BF531295D,44.630901,True,,
491,-122.397617,37.794237,1,outbound,20210501-t1,4010KO2013,1.035702,2021-05-02 01:02:17.060,stop arrive,89283082a2bffff,4,24D26F5E-BCC6-4622-8202-2B7BF531295D,44.630901,False,on bus stop,
331,-122.397617,37.794237,1,outbound,20210501-t1,4010KO2013,1.035702,2021-05-02 01:02:17.144,GPS,89283082a2bffff,5,24D26F5E-BCC6-4622-8202-2B7BF531295D,44.630901,False,on bus stop,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
529,-122.446823,37.787315,1,outbound,20210501-t1,4010KO2013,8.861006,2021-05-02 01:26:40.190,GPS,8928308293bffff,1318,42B1E300-2137-443A-AE98-4707A5419D1C,15.614476,False,mph > mph_threshold,
596,-122.446868,37.787310,1,outbound,20210501-t1,4010KO2013,8.861006,2021-05-02 01:26:41.429,GPS,8928308293bffff,1319,42B1E300-2137-443A-AE98-4707A5419D1C,11.624051,False,mph > mph_threshold,
83,-122.446910,37.787307,1,outbound,20210501-t1,4010KO2013,8.515772,2021-05-02 01:26:42.696,GPS,8928308293bffff,1320,42B1E300-2137-443A-AE98-4707A5419D1C,7.943424,False,mph > mph_threshold,
616,-122.446947,37.787302,1,outbound,20210501-t1,4010KO2013,6.904680,2021-05-02 01:26:43.570,stop arrive,8928308293bffff,1321,42B1E300-2137-443A-AE98-4707A5419D1C,4.691379,False,mph > mph_threshold,


In [54]:
cur_all_breadcrumbs[((cur_all_breadcrumbs["mph"] < 3.0))].sort_values("breadcrumb_index")

Unnamed: 0,lon,lat,routeName,direction,logID,deviceID,mph,time,event,zone_id,breadcrumb_index,upcoming_intersection_loc_id,dist_to_upcoming_intersection,is_delayed_by_intersection,is_delayed_by_intersection_note,upcoming_intersection_note
1008,-122.397617,37.794232,1,outbound,20210501-t1,4010KO2013,1.150780,2021-05-02 01:02:15.862,trip new,89283082a2bffff,1,24D26F5E-BCC6-4622-8202-2B7BF531295D,44.997528,True,,
358,-122.397617,37.794232,1,outbound,20210501-t1,4010KO2013,1.150780,2021-05-02 01:02:16.413,stop jump,89283082a2bffff,2,24D26F5E-BCC6-4622-8202-2B7BF531295D,44.997528,True,,
490,-122.397617,37.794237,1,outbound,20210501-t1,4010KO2013,1.035702,2021-05-02 01:02:16.520,trip pending,89283082a2bffff,3,24D26F5E-BCC6-4622-8202-2B7BF531295D,44.630901,True,,
491,-122.397617,37.794237,1,outbound,20210501-t1,4010KO2013,1.035702,2021-05-02 01:02:17.060,stop arrive,89283082a2bffff,4,24D26F5E-BCC6-4622-8202-2B7BF531295D,44.630901,False,on bus stop,
331,-122.397617,37.794237,1,outbound,20210501-t1,4010KO2013,1.035702,2021-05-02 01:02:17.144,GPS,89283082a2bffff,5,24D26F5E-BCC6-4622-8202-2B7BF531295D,44.630901,False,on bus stop,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,-122.446618,37.787332,1,outbound,20210501-t1,4010KO2013,0.000000,2021-05-02 01:26:27.725,GPS,8928308293bffff,1308,42B1E300-2137-443A-AE98-4707A5419D1C,33.723953,True,,
466,-122.446618,37.787332,1,outbound,20210501-t1,4010KO2013,0.000000,2021-05-02 01:26:28.962,GPS,8928308293bffff,1309,42B1E300-2137-443A-AE98-4707A5419D1C,33.723953,True,,
528,-122.446617,37.787332,1,outbound,20210501-t1,4010KO2013,0.000000,2021-05-02 01:26:30.193,GPS,8928308293bffff,1310,42B1E300-2137-443A-AE98-4707A5419D1C,33.869556,True,,
460,-122.446617,37.787332,1,outbound,20210501-t1,4010KO2013,0.000000,2021-05-02 01:26:31.441,GPS,8928308293bffff,1311,42B1E300-2137-443A-AE98-4707A5419D1C,33.869556,True,,


In [55]:
# Export the slow breadcrumbs of the selected trip.
cur_breadcrumbs_mph_less_than_thresh.to_csv(
    "bus_stop_maps/mph_less_than_thresh_" + "_".join([routeName, cur_direction, cur_logid, cur_deviceid]) + ".csv"
)

In [56]:
# stop arrive / depart events - green

In [57]:
# Breadcrumbs of the selected trip that carry a 'stop arrive' or 'stop depart' event.
cur_breadcrumbs_stoparrive_depart = cur_all_breadcrumbs.loc[
    cur_all_breadcrumbs["event"].isin(["stop arrive", "stop depart"])
]


In [58]:
cur_breadcrumbs_stoparrive_depart.head()

Unnamed: 0,lon,lat,routeName,direction,logID,deviceID,mph,time,event,zone_id,breadcrumb_index,upcoming_intersection_loc_id,dist_to_upcoming_intersection,is_delayed_by_intersection,is_delayed_by_intersection_note,upcoming_intersection_note
40,-122.400368,37.794123,1,outbound,20210501-t1,4010KO2013,7.134836,2021-05-02 01:03:56.583,stop depart,89283082a2bffff,100,,,False,upcoming intersection not found,distance to upcoming intersection: 356.2029310...
98,-122.419515,37.79172,1,outbound,20210501-t1,4010KO2013,2.30156,2021-05-02 01:14:11.781,stop arrive,89283082b13ffff,650,,,False,upcoming intersection not found,distance to upcoming intersection: 131.1691161...
112,-122.419565,37.791742,1,outbound,20210501-t1,4010KO2013,8.976084,2021-05-02 01:14:30.384,stop depart,89283082b13ffff,668,,,False,upcoming intersection not found,distance to upcoming intersection: 127.0752848...
154,-122.422255,37.791398,1,outbound,20210501-t1,4010KO2013,3.45234,2021-05-02 01:16:10.449,stop arrive,89283082b8fffff,760,,,False,upcoming intersection not found,distance to upcoming intersection: 156.8833302...
167,-122.427317,37.79074,1,outbound,20210501-t1,4010KO2013,5.868978,2021-05-02 01:19:32.511,stop arrive,89283082bbbffff,938,,,False,upcoming intersection not found,distance to upcoming intersection: 594.6640824...


In [59]:
# Export the stop arrive / stop depart breadcrumbs of the selected trip.
cur_breadcrumbs_stoparrive_depart.to_csv(
    "bus_stop_maps/stop_arrive_depart_" + "_".join([routeName, cur_direction, cur_logid, cur_deviceid]) + ".csv"
)

In [60]:
# is_delayed_by_intersection - red

In [61]:
# Breadcrumbs of the selected trip flagged as delayed by an intersection.
cur_breadcrumbs_delayed_by_intersection = cur_all_breadcrumbs.loc[cur_all_breadcrumbs["is_delayed_by_intersection"]]

In [62]:
cur_breadcrumbs_delayed_by_intersection.head()

Unnamed: 0,lon,lat,routeName,direction,logID,deviceID,mph,time,event,zone_id,breadcrumb_index,upcoming_intersection_loc_id,dist_to_upcoming_intersection,is_delayed_by_intersection,is_delayed_by_intersection_note,upcoming_intersection_note
1,-122.446618,37.787333,1,outbound,20210501-t1,4010KO2013,0.0,2021-05-02 01:26:02.907,GPS,8928308293bffff,1288,42B1E300-2137-443A-AE98-4707A5419D1C,33.743725,True,,
7,-122.425638,37.790945,1,outbound,20210501-t1,4010KO2013,0.0,2021-05-02 01:18:29.525,GPS,89283082bbbffff,885,F4B044CE-852F-4DC8-97E6-95699AF99381,32.171864,True,,
9,-122.446618,37.787333,1,outbound,20210501-t1,4010KO2013,0.0,2021-05-02 01:25:54.262,GPS,8928308293bffff,1281,42B1E300-2137-443A-AE98-4707A5419D1C,33.743725,True,,
12,-122.420688,37.791633,1,outbound,20210501-t1,4010KO2013,2.646794,2021-05-02 01:14:54.393,GPS,89283082b13ffff,689,05E5BE1D-3A7F-410C-B1C6-36FD39219292,27.63612,True,,
14,-122.425638,37.790947,1,outbound,20210501-t1,4010KO2013,0.0,2021-05-02 01:18:32.019,GPS,89283082bbbffff,887,F4B044CE-852F-4DC8-97E6-95699AF99381,32.201122,True,,


In [63]:
# Export the delayed breadcrumbs of the selected trip.
cur_breadcrumbs_delayed_by_intersection.to_csv(
    "bus_stop_maps/delayed_by_intersection_" + "_".join([routeName, cur_direction, cur_logid, cur_deviceid]) + ".csv"
)

In [64]:
# points where crossing intersection - white

In [65]:
# Crossing rows cached for the selected trip, turned into a pandas frame (one dict per row).
cur_trip_key = cur_logid + "_" + cur_deviceid
cur_breadcrumbs_crossing_intersections = pd.DataFrame(
    [crossing.asDict() for crossing in log_id_device_id_to_trip_datas_crossing_intersections[cur_trip_key]]
)

In [66]:
# Export the crossing points of the selected trip.
cur_breadcrumbs_crossing_intersections.to_csv(
    "bus_stop_maps/crossing_intersections_" + "_".join([routeName, cur_direction, cur_logid, cur_deviceid]) + ".csv"
)
