Lateness Report [link](https://gttwiki.atlassian.net/wiki/spaces/GD/pages/2492956684/Lateness+report)

Approach 2: compare the time late at the last stop for TSP-on vs. TSP-off trips.

In [1]:
%load_ext lab_black
import numpy as np
import pandas as pd
import pyspark
import scipy.stats

# Start a Spark session for this notebook, or attach to one that is already
# running (getOrCreate), under a recognisable application name.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("LatnessHypothesisCountLate [pyspark]")
    .getOrCreate()
)
# Bare last expression: show the session summary as the cell output.
spark

In [2]:
# Provenance: list the raw input files this notebook reads, with human-readable
# sizes (-h), ordered by size (--sort=size).
# NOTE(review): the dataset location is a hard-coded absolute path that is
# repeated in the table-creation cells; it must exist on the machine running this.
!tree -h --sort=size /var/lib/gtt/2020_11_8_to_2020_11_14

[01;34m/var/lib/gtt/2020_11_8_to_2020_11_14[00m
├── [4.0K]  [01;34mCMS[00m
│   ├── [2.0G]  [01;32mopticomdevicelog.csv[00m
│   ├── [607K]  [01;32mintersectionstatusreport.csv[00m
│   ├── [ 25K]  [01;32mopticomdevicelog_data_dictonary.docx[00m
│   └── [ 15K]  [01;32mintersectionstatusreport_data_dictionary.docx[00m
└── [4.0K]  [01;34mCVP[00m
    ├── [5.4G]  [01;32mtripdatas.csv[00m
    ├── [3.3M]  [01;32mtriplogs.csv[00m
    ├── [1.4M]  [01;32mdevices.csv[00m
    ├── [ 18K]  [01;32mtriplogs_data_dictionary.docx[00m
    ├── [ 15K]  [01;32mdevices_data_dictionary.docx[00m
    └── [ 15K]  [01;32mtripdatas_data_dictionary.docx[00m

2 directories, 10 files


In [3]:
%%time
# Register the CVP trip-data CSV as the Spark table `tripdatas`.
# - `header = true`: the first CSV line supplies the column names.
# - `inferSchema = true`: column types are inferred from the data rather than
#   declared (presumably the reason this cell takes about a minute on the
#   5.4G file — confirm).
# - `if not exists`: an already-registered table is left untouched, so the cell
#   can be re-run.
# NOTE(review): the path is a hard-coded absolute location.
spark.sql(
    """
    create table if not exists tripdatas using csv options (
        header = true,
        inferSchema = true,
        path = '/var/lib/gtt/2020_11_8_to_2020_11_14/CVP/tripdatas.csv'
    );
    """
)

CPU times: user 10.1 ms, sys: 3.54 ms, total: 13.6 ms
Wall time: 1min 6s


DataFrame[]

In [4]:
%%time
# Register the CVP trip-log CSV as the Spark table `triplogs`, using the same
# options as `tripdatas`: header row for column names, inferred column types,
# and `if not exists` so that re-running the cell is harmless.
# NOTE(review): the path is a hard-coded absolute location.
spark.sql(
    """
    create table if not exists triplogs using csv options (
        header = true,
        inferSchema = true,
        path = '/var/lib/gtt/2020_11_8_to_2020_11_14/CVP/triplogs.csv'
    );
    """
)

CPU times: user 1.07 ms, sys: 272 µs, total: 1.34 ms
Wall time: 426 ms


DataFrame[]

In [5]:
%%time
# Build and cache `breadcrumbs`: one row per trip, keyed by
# gd_uid = "<deviceid>_<logid>", holding the number of distinct `time` values
# among that trip's rows with event = 'GPS'.
# The `triplogs_good` cell joins this table on gd_uid and uses `count` as a
# data-coverage measure.
# The trailing `;` after the call suppresses the cell's `DataFrame[]` repr only
# when it is the last thing evaluated; with %%time the repr is still shown.
spark.sql(
    """
    cache table breadcrumbs as (
        select deviceid || '_' || logid as gd_uid,
               count(distinct time)     as count
        from tripdatas
        where event = 'GPS'
        group by 1
        order by 1
    );
    """
);

CPU times: user 12.6 ms, sys: 3.14 ms, total: 15.8 ms
Wall time: 1min 23s


DataFrame[]

In [6]:
%%time
# Build and cache `triplogs_good`: the trip logs that pass the quality filters
# below and are not duration outliers.
#
# CTE `triplogs` (shadows the base table inside this statement):
#   keeps the columns used later and derives gd_starttime / gd_endtime
#   (timestamps), gd_hour (hour of the start time) and gd_uid
#   ("<deviceid>_<logid>", the same key as in `breadcrumbs`).
#
# CTE `triplogs_good` keeps distinct rows where
#   - `valid` is true and endstatus = 'completed',
#   - the trip starts before it ends and duration > 0,
#   - tspmode is not 'normal',
#   - more than 70% of the stops were hit (countstopshit / stops > 0.7),
#   - breadcrumbs.count / duration > 0.7 (distinct GPS timestamps per unit of
#     duration; presumably ~1 Hz GPS with duration in seconds — TODO confirm).
#   Because the WHERE clause tests breadcrumbs.count, trips with no matching
#   breadcrumbs row are dropped, so the left join effectively acts as an inner
#   join.
#
# CTE `three_sigma`: mean(duration) ± 3 * stddev(duration), computed once over
#   all surviving trips together (there is no group by, so the bounds are
#   global rather than per route).
#
# Final select: trips whose duration lies strictly inside those bounds.
spark.sql(
    """
    cache table triplogs_good as (
        with --
             triplogs as (
                 select countlate,
                        countstopshit,
                        direction,
                        duration,
                        endstatus,
                        routename,
                        stops,
                        tspmode,
                        valid,
                        to_timestamp(starttime)                    as gd_starttime,
                        to_timestamp(endtime)                      as gd_endtime,
                        date_part('hour', to_timestamp(starttime)) as gd_hour,
                        deviceid || '_' || logid                   as gd_uid
                 from triplogs
             ),
             triplogs_good as (
                 select distinct triplogs.*
                 from triplogs
                          left join breadcrumbs using (gd_uid)
                 where valid
                   and gd_starttime < gd_endtime
                   and endstatus = 'completed'
                   and tspmode != 'normal'
                   and duration > 0
                   and countstopshit / stops > 0.7
                   and breadcrumbs.count / duration > 0.7
             ),
             three_sigma as (
                 select mean(duration) - 3 * stddev(duration) as min,
                        mean(duration) + 3 * stddev(duration) as max
                 from triplogs_good
             )
        select triplogs_good.*
        from triplogs_good,
             three_sigma
        where duration > three_sigma.min
          and duration < three_sigma.max
    );
    """
);

CPU times: user 0 ns, sys: 2.88 ms, total: 2.88 ms
Wall time: 4.74 s


DataFrame[]

## Whole trip

In [7]:
%%time
# Build and cache `whole_trip`: one row per (routename, direction, stops) group
# holding two samples of `countlate` values, one from trips with
# tspmode = 'alwaysOn' and one from trips with tspmode = 'alwaysOff'.
# Each `case` yields NULL for trips of the other mode; collect_list leaves
# NULLs out, so each list contains only that mode's values.
# The HAVING clause keeps only groups with more than 3 values in both samples.
spark.sql(
    """
    cache table whole_trip as (
        select routename,
               direction,
               stops,
               collect_list(case when tspmode = 'alwaysOn' then countlate end)  as countlate_alwayson,
               collect_list(case when tspmode = 'alwaysOff' then countlate end) as countlate_alwaysoff
        from triplogs_good
        where tspmode in ('alwaysOn', 'alwaysOff')
        group by 1, 2, 3
        having size(countlate_alwayson) > 3
           and size(countlate_alwaysoff) > 3
        order by 1, 2, 3
    );
    """
);

CPU times: user 0 ns, sys: 3.05 ms, total: 3.05 ms
Wall time: 5.58 s


DataFrame[]

In [8]:
def statistical_testing(row, max_std_gap=3, preview_chars=25):
    """Test whether alwaysOn ``countlate`` values are lower than alwaysOff ones.

    Intended for ``DataFrame.apply(statistical_testing, axis=1)``. Two
    one-sided tests (``alternative="less"``, alwaysOn sample first) are run on
    the two samples stored in the row: a Mann-Whitney U test and an
    independent two-sample t-test.

    Parameters
    ----------
    row : pandas.Series
        Must hold ``countlate_alwayson`` and ``countlate_alwaysoff``, each a
        sequence of numbers.
    max_std_gap : float, default 3
        The t-test result is discarded (NaN) when the sample standard
        deviations (``ddof=1``) differ by more than this, as a rough guard for
        the test's equal-variance assumption.
    preview_chars : int, default 25
        The two sample columns are replaced by the first ``preview_chars``
        characters of their string form so the result table stays narrow.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with ``mannwhitneyu_p`` and ``ttest_p`` appended and
        the two sample columns shortened. A p-value is NaN when the test could
        not be evaluated: a sample is empty, every value in both samples is
        identical, SciPy raised ``ValueError``, or (t-test only) the standard
        deviation gap exceeds ``max_std_gap``.
    """
    # DataFrame.apply does not support functions that mutate the row they are
    # given, so all writes go to a private copy.
    row = row.copy()
    on_raw = row["countlate_alwayson"]
    off_raw = row["countlate_alwaysoff"]

    try:
        on = np.asarray(on_raw, dtype=float).ravel()
        off = np.asarray(off_raw, dtype=float).ravel()
    except ValueError:
        # Non-numeric content: treat like any other untestable group.
        testable = False
    else:
        pooled = np.concatenate([on, off])
        # With no variation at all there is nothing to rank or compare. Older
        # SciPy signalled this with ValueError while newer releases return a
        # number, so the case is detected here to keep the NaN outcome
        # independent of the installed SciPy version.
        testable = bool(
            on.size > 0 and off.size > 0 and not np.all(pooled == pooled[0])
        )

    mannwhitneyu_p = np.nan
    ttest_p = np.nan
    if testable:
        try:
            mannwhitneyu_p = scipy.stats.mannwhitneyu(
                on, off, alternative="less"
            ).pvalue
        except ValueError:
            mannwhitneyu_p = np.nan

        # Check the variance guard first so the t-test is only run when its
        # result will be kept. A NaN gap compares False and does not block it.
        std_gap = np.abs(np.std(on, ddof=1) - np.std(off, ddof=1))
        if not std_gap > max_std_gap:
            try:
                ttest_p = scipy.stats.ttest_ind(on, off, alternative="less").pvalue
            except ValueError:
                ttest_p = np.nan

    row["mannwhitneyu_p"] = mannwhitneyu_p
    row["ttest_p"] = ttest_p
    # Keep only a short preview of each sample for display.
    row["countlate_alwayson"] = str(on_raw)[:preview_chars]
    row["countlate_alwaysoff"] = str(off_raw)[:preview_chars]
    return row

In [9]:
# Bring the cached per-group samples into pandas and run both hypothesis tests
# on every (routename, direction, stops) group.
whole_trip_results = (
    spark.table("whole_trip")
    .toPandas()
    .set_index(["routename", "direction", "stops"])
    .apply(statistical_testing, axis=1)
)

# Summary, one row per test: share of evaluated groups with p < 0.05, and share
# of all groups whose p-value is NaN.
display(
    whole_trip_results.filter(regex=r"_p$")
    .apply(
        lambda s: pd.Series(
            [
                f"{(s < 0.05).sum() / s.notna().sum():.2%} routes are significantly faster",
                f"{s.isna().sum() / len(s):.2%} routes errored out",
            ],
            index=["results", "errors"],
        )
    )
    .transpose()
)

# Full table: p-values below 0.05 are shaded green, those above it red, and
# NaN (or exactly 0.05) is left unshaded.
with pd.option_context("display.max_rows", None):
    display(
        whole_trip_results.style.applymap(
            lambda x: (
                "background-color:rgba(0,256,0,.2)"
                if x < 0.05
                else ("background-color:rgba(256,0,0,.2)" if x > 0.05 else "")
            ),
            subset=["mannwhitneyu_p", "ttest_p"],
        )
    )

Unnamed: 0,results,errors
mannwhitneyu_p,15.87% routes are significantly faster,7.35% routes errored out
ttest_p,2.22% routes are significantly faster,33.82% routes errored out


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,countlate_alwayson,countlate_alwaysoff,mannwhitneyu_p,ttest_p
routename,direction,stops,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,inbound,21,"[0, 0, 0, 0, 0, 0]","[0, 0, 0, 0]",,
1,inbound,48,"[0, 0, 1, 1, 1, 1, 0, 0,","[0, 0, 0, 0, 1, 0, 0, 1]",0.925995,0.91929
1,outbound,48,"[0, 0, 20, 0, 0, 0, 0, 0,","[0, 0, 0, 0, 0, 0, 0, 0,",0.960273,
12,inbound,18,"[0, 0, 0, 2, 0, 0, 0, 0,","[0, 0, 0, 13, 0, 0, 0, 0,",0.575349,0.353733
12,outbound,19,"[0, 3, 4, 0, 0, 0, 0, 0,","[0, 0, 0, 0, 0, 0, 0, 0,",0.866204,0.715482
14R,outbound,22,"[0, 0, 0, 0, 0, 0, 0, 0,","[0, 0, 0, 0]",,
19,inbound,70,"[0, 0, 0, 0, 0, 0, 1, 0,","[0, 3, 2, 0, 0, 1, 13, 0,",0.50627,0.424839
19,outbound,64,"[53, 1, 0, 21, 0, 0, 0, 0","[0, 10, 0, 0, 0, 0, 58, 0",0.570198,0.418442
22,inbound,43,"[0, 1, 1, 0, 0, 13, 0, 1,","[1, 0, 1, 1, 0, 0, 1, 1,",0.439923,0.819628
22,outbound,49,"[0, 0, 0, 0, 0, 0, 0, 0,","[0, 0, 0, 0, 0, 0, 0, 0,",0.030423,0.093695


## With hourly separation

In [10]:
%%time
# Build and cache `hourly_separation`: the same two `countlate` samples as the
# whole-trip table (alwaysOn vs alwaysOff), but additionally split by gd_hour,
# the hour of the trip's start time. There is one row per
# (routename, direction, stops, gd_hour) group.
# Each `case` yields NULL for trips of the other mode; collect_list leaves
# NULLs out, so each list contains only that mode's values.
# The HAVING clause keeps only groups with more than 3 values in both samples.
spark.sql(
    """
    cache table hourly_separation as (
        select routename,
               direction,
               stops,
               gd_hour,
               collect_list(case when tspmode = 'alwaysOn' then countlate end)  as countlate_alwayson,
               collect_list(case when tspmode = 'alwaysOff' then countlate end) as countlate_alwaysoff
        from triplogs_good
        where tspmode in ('alwaysOn', 'alwaysOff')
        group by 1, 2, 3, 4
        having size(countlate_alwayson) > 3
           and size(countlate_alwaysoff) > 3
        order by 1, 2, 3, 4
    );
    """
);

CPU times: user 2.12 ms, sys: 434 µs, total: 2.56 ms
Wall time: 6.82 s


DataFrame[]

In [11]:
# Bring the cached per-group samples into pandas and run both hypothesis tests
# on every (routename, direction, stops, gd_hour) group.
hourly_separation_results = (
    spark.table("hourly_separation")
    .toPandas()
    .set_index(["routename", "direction", "stops", "gd_hour"])
    .apply(statistical_testing, axis=1)
)

# Summary, one row per test: share of evaluated groups with p < 0.05, and share
# of all groups whose p-value is NaN.
display(
    hourly_separation_results.filter(regex=r"_p$")
    .apply(
        lambda s: pd.Series(
            [
                f"{(s < 0.05).sum() / s.notna().sum():.2%} routes are significantly faster",
                f"{s.isna().sum() / len(s):.2%} routes errored out",
            ],
            index=["results", "errors"],
        )
    )
    .transpose()
)

# Full table: p-values below 0.05 are shaded green, those above it red, and
# NaN (or exactly 0.05) is left unshaded.
with pd.option_context("display.max_rows", None):
    display(
        hourly_separation_results.style.applymap(
            lambda x: (
                "background-color:rgba(0,256,0,.2)"
                if x < 0.05
                else ("background-color:rgba(256,0,0,.2)" if x > 0.05 else "")
            ),
            subset=["mannwhitneyu_p", "ttest_p"],
        )
    )

Unnamed: 0,results,errors
mannwhitneyu_p,3.86% routes are significantly faster,27.64% routes errored out
ttest_p,4.38% routes are significantly faster,57.45% routes errored out


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,countlate_alwayson,countlate_alwaysoff,mannwhitneyu_p,ttest_p
routename,direction,stops,gd_hour,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12,outbound,19,18,"[0, 0, 0, 0]","[0, 0, 0, 1]",0.226627,0.177959
12,outbound,19,19,"[0, 0, 0, 0, 0, 0]","[0, 0, 2, 0, 0]",0.180655,0.148333
12,outbound,19,20,"[0, 3, 0, 0, 0, 0]","[2, 0, 0, 0, 0]",0.553852,0.558523
25,inbound,7,3,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]",,
25,inbound,7,5,"[0, 0, 0, 0, 0]","[0, 0, 0, 0]",,
28,inbound,24,0,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0]",,
28,inbound,24,17,"[0, 0, 0, 0, 0, 0]","[0, 0, 5, 0, 0]",0.180655,0.148333
28,inbound,24,19,"[0, 0, 0, 0, 0]","[0, 0, 6, 0, 0, 0]",0.232604,0.194642
28,inbound,24,21,"[0, 0, 0, 0, 0]","[2, 0, 0, 0]",0.185547,0.146176
28,inbound,24,22,"[0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0]",,
