In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import pickle

In [2]:
# Read files

summary_df = pd.read_parquet("out/qc/qc_summary_df.parquet")
with open("out/qc/issue_examples.pickle", "rb") as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    issue_trips = pickle.load(f)

In [3]:
print(summary_df.shape)
print(summary_df.sum(axis=0))

(845, 6)
pid                         5908814974561939616276661141288088116767175154...
avg_speed_trips                                                             0
na_times_trips                                                              0
same_time_trips                                                           432
min_max_time_issue_trips                                                    0
very_long_trips                                                          1956
dtype: object


In [6]:
issue_trips

[{'pid': '5908',
  'avg_speed_trips': [],
  'na_times_trips': [],
  'same_time_trips': [],
  'min_max_time_issue_trips': [],
  'very_long_trips': []},
 {'pid': '8149',
  'avg_speed_trips': [],
  'na_times_trips': [],
  'same_time_trips': [],
  'min_max_time_issue_trips': [],
  'very_long_trips': []},
 {'pid': '7456',
  'avg_speed_trips': [],
  'na_times_trips': [],
  'same_time_trips': ['52A7456.0108746114432023-04-01',
   '52A7456.0109050979142023-06-12'],
  'min_max_time_issue_trips': [],
  'very_long_trips': []},
 {'pid': '19396',
  'avg_speed_trips': [],
  'na_times_trips': [],
  'same_time_trips': [],
  'min_max_time_issue_trips': [],
  'very_long_trips': []},
 {'pid': '1627',
  'avg_speed_trips': [],
  'na_times_trips': [],
  'same_time_trips': [],
  'min_max_time_issue_trips': [],
  'very_long_trips': []},
 {'pid': '6661',
  'avg_speed_trips': [],
  'na_times_trips': [],
  'same_time_trips': [],
  'min_max_time_issue_trips': [],
  'very_long_trips': []},
 {'pid': '14128',
  'avg

In [5]:
summary_df.sort_values('same_time_trips', ascending=False).head(10)

Unnamed: 0,pid,avg_speed_trips,na_times_trips,same_time_trips,min_max_time_issue_trips,very_long_trips
655,7103,0,0,39,0,3
10,1541,0,0,17,0,8
138,7120,0,0,14,0,46
60,7555,0,0,13,0,1
552,7111,0,0,11,0,16
555,6128,0,0,10,0,9
114,6126,0,0,9,0,1
120,14120,0,0,8,0,3
131,8227,0,0,8,0,37
551,7101,0,0,8,0,0


In [None]:
grouping = summary_df.groupby('pid').sum().sum(axis =1).reset_index()
grouping.rename(columns={0: 'errors'}, inplace=True)
grouping = grouping[grouping.errors > 0]

# num of issues trips as perc of processed
new_df = []
for row in grouping.itertuples():
    pid = row.pid
    trips_df = gpd.read_parquet(f"out/trips/trips_{pid}_full.parquet")
    total_trips = trips_df['unique_trip_vehicle_day'].nunique()
    new_row = {'pid': pid, 'total_trips': total_trips, 'issues': row.errors, 'perc': (row.errors/total_trips)*100}
    new_df.append(new_row)

errors_review = pd.DataFrame(new_df)

In [70]:
errors_review.sort_values('perc', ascending=False).head(10)

Unnamed: 0,pid,total_trips,issues,perc
197,19380,9147,99,1.082322
169,9379,139,1,0.719424
141,7904,149,1,0.671141
3,704,170,1,0.588235
170,9718,200,1,0.5
90,5693,666,3,0.45045
125,7109,690,3,0.434783
164,9371,860,3,0.348837
163,9370,696,2,0.287356
150,8144,356,1,0.280899


Look at some specific issue trips

In [9]:
issue_trips

[{'pid': '5908',
  'avg_speed_trips': [],
  'na_times_trips': [],
  'same_time_trips': [],
  'min_max_time_issue_trips': [],
  'very_long_trips': []},
 {'pid': '8149',
  'avg_speed_trips': [],
  'na_times_trips': [],
  'same_time_trips': [],
  'min_max_time_issue_trips': [],
  'very_long_trips': []},
 {'pid': '7456',
  'avg_speed_trips': [],
  'na_times_trips': [],
  'same_time_trips': ['52A7456.0108746114432023-04-01',
   '52A7456.0109050979142023-06-12'],
  'min_max_time_issue_trips': [],
  'very_long_trips': []},
 {'pid': '19396',
  'avg_speed_trips': [],
  'na_times_trips': [],
  'same_time_trips': [],
  'min_max_time_issue_trips': [],
  'very_long_trips': []},
 {'pid': '1627',
  'avg_speed_trips': [],
  'na_times_trips': [],
  'same_time_trips': [],
  'min_max_time_issue_trips': [],
  'very_long_trips': []},
 {'pid': '6661',
  'avg_speed_trips': [],
  'na_times_trips': [],
  'same_time_trips': [],
  'min_max_time_issue_trips': [],
  'very_long_trips': []},
 {'pid': '14128',
  'avg

In [10]:
pid = '7103'
for dict in issue_trips:
    if dict['pid'] == pid:
        print(dict['same_time_trips'])
        break

['357103.010017032023-05-15', '357103.0100503317552023-09-04', '357103.010117552023-08-30', '357103.0101182917552024-05-12', '357103.0101241216212022-08-26']


In [17]:
trip = '357103.010117552023-08-30'

df1 = pd.read_parquet(f"out/trips/trips_{pid}_full.parquet")
test_trip = df1[(df1['unique_trip_vehicle_day'] == trip)]

In [24]:
actual = pd.read_parquet(f"out/pids/{pid}.parquet")
actual[actual['unique_trip_vehicle_day'] == trip].sort_values('tmstmp')

Unnamed: 0,vid,tmstmp,lat,lon,hdg,pid,rt,des,pdist,dly,tatripid,tablockid,zone,scrape_file,data_time,data_hour,data_date,unique_trip_vehicle_day
206989,1755,20230830 06:57,41.831524,-87.610809,166,7103.0,35,24th Pl/Cicero,0,False,101,35 -205,,bus_data/2023-08-30/06:57:56.json,2023-08-30 06:57:00,6,2023-08-30,357103.010117552023-08-30
206993,1755,20230830 07:02,41.831524,-87.610809,166,7103.0,35,24th Pl/Cicero,0,False,101,35 -205,,bus_data/2023-08-30/07:02:56.json,2023-08-30 07:02:00,7,2023-08-30,357103.010117552023-08-30
206997,1755,20230830 07:07,41.831129,-87.621284,271,7103.0,35,24th Pl/Cicero,2975,False,101,35 -205,,bus_data/2023-08-30/07:07:56.json,2023-08-30 07:07:00,7,2023-08-30,357103.010117552023-08-30
207001,1755,20230830 07:12,41.830902,-87.640549,270,7103.0,35,24th Pl/Cicero,8250,False,101,35 -205,,bus_data/2023-08-30/07:12:56.json,2023-08-30 07:12:00,7,2023-08-30,357103.010117552023-08-30
207005,1755,20230830 07:16,41.830639,-87.655747,268,7103.0,35,24th Pl/Cicero,12439,False,101,35 -205,,bus_data/2023-08-30/07:17:56.json,2023-08-30 07:16:00,7,2023-08-30,357103.010117552023-08-30
207009,1755,20230830 07:22,41.830418,-87.67672,268,7103.0,35,24th Pl/Cicero,18171,False,101,35 -205,,bus_data/2023-08-30/07:22:56.json,2023-08-30 07:22:00,7,2023-08-30,357103.010117552023-08-30
207013,1755,20230830 07:27,41.830071,-87.680293,357,7103.0,35,24th Pl/Cicero,20694,False,101,35 -205,,bus_data/2023-08-30/07:27:56.json,2023-08-30 07:27:00,7,2023-08-30,357103.010117552023-08-30
207018,1755,20230830 07:27,41.830071,-87.680293,357,7103.0,35,24th Pl/Cicero,20694,False,101,35 -205,,bus_data/2023-08-30/07:32:56.json,2023-08-30 07:27:00,7,2023-08-30,357103.010117552023-08-30
207023,1755,20230830 07:37,41.830505,-87.70462,359,7103.0,35,24th Pl/Cicero,29577,False,101,35 -205,,bus_data/2023-08-30/07:37:56.json,2023-08-30 07:37:00,7,2023-08-30,357103.010117552023-08-30
207028,1755,20230830 07:42,41.83707,-87.716873,272,7103.0,35,24th Pl/Cicero,35350,False,101,35 -205,,bus_data/2023-08-30/07:42:56.json,2023-08-30 07:42:00,7,2023-08-30,357103.010117552023-08-30
