# Valid Images
Create a DataFrame for each session with the valid images.

**Validation:**
1. The image name must be in the format `[0-9]*s[0-9]*ms`.

2. `withinRange` confirms that the second before the image (`image_s` - 1) is at or after the start of `rec_df`, `rough_df`, and `gps_df`, and that the image time is at or before the end of `rec_df`, `rough_df`, and `gps_df`.

3. Create `rec_df_window` which has the time window in `record.csv` used in calculating the timestamp corresponding to 5 meters ahead of the image and the time window used in calculating the roughness metric at this target timestamp. This checks for four additional withinRange validation errors:
  - There must be a reading at the same `utc_s (s)` as the image.
  - `target_dist` must be <= the last distance reading in `rec_df`.
  - We must have readings in the time window used for the roughness calculation.
  - The timestamp corresponding to the index of `target_dist` should be in `rough_df`.

4. `nonzero` iterates through `rec_df_window` to make sure that all speeds are nonzero in this time window.

5. We then create `rough_df_window` and `gps_df_window`, corresponding to the same time window as `rec_df_window` but for `roughness_metric.csv` and `gps.csv`. `continuous` then iterates through `rec_df_window`, `gps_df_window`, and `rough_df_window` to make sure that the timestamp readings are continuous as follows:
  - Readings in `rec_df_window` should be no more than 1 second apart.
  - Readings in `rough_df_window` should be no more than 50 ms apart (most readings are ~ 10 ms apart). **Note:** Consider changing the tolerance to 100 ms.
  - Readings in `gps_df_window` should be no more than 500 ms apart (most readings are ~ 100 ms apart).

6. `consistent_utc` iterates through `gps_df_window` to check that `abs(utc_s (s) - utc_timestamp (s)) <= 1` for every reading.

## Set-up

In [None]:
# Mount Google Drive at /gdrive; the data paths used below all live under it.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import matplotlib.pyplot as plt
from collections import namedtuple
import csv
import re

## Helper functions

In [None]:
# Signed difference in ms between time 1 (s1, ms1) and time 2 (s2, ms2).
# Negative if time 1 is before time 2, positive if time 2 is before time 1,
# and zero if they are equal.
def findDifferenceMs(s1, ms1, s2, ms2):
  if s1 > s2:
    # Whole seconds strictly between the two times, plus the partial second
    # remaining after time 2 and the partial second elapsed before time 1.
    whole_seconds_ms = (s1 - s2 - 1) * 1000
    return whole_seconds_ms + (1000 - ms2) + ms1

  if s1 < s2:
    # Mirror image of the case above, negated.
    whole_seconds_ms = (s2 - s1 - 1) * 1000
    return -(whole_seconds_ms + (1000 - ms1) + ms2)

  # Same second: only the millisecond parts differ.
  return ms1 - ms2

# Parse the integer seconds X and milliseconds Y from an image name that
# begins with XsYms. Anything after the first "ms" (e.g. ".jpg") is ignored.
# Raises ValueError if the name cannot be split or converted this way.
def parse_times(name):
  seconds_text, remainder = name.split("s", 1)
  millis_text, _suffix = remainder.split("ms", 1)
  seconds = int(seconds_text)
  millis = int(millis_text)
  return seconds, millis

## Criteria for a valid image

#### Helper functions

In [None]:
# Return an empty DataFrame if there was a validation issue, else return
# the window of rec_df corresponding to the values used in labeling the image.
#
# The window starts at the first reading whose 'utc_s (s)' equals s and ends
# one reading past the first reading that is more than m_ahead meters beyond
# the starting distance estimate.
#
# ms and window are part of the signature but are not used in this function.
#
# NOTE(review): index labels (taken from the filtered frames) are used
# interchangeably with positions (.iloc and integer slicing), so this
# presumably expects rec_df to have a default 0..n-1 index -- confirm against
# record.csv.
def rec_df_window(s, ms, rec_df, rough_df, m_ahead, window):

  # Validation 1: Make sure that a reading exists in rec_df with 'utc_s (s)' == s.
  # This is used for the closest_and_before calculation in the labeling script.
  row1 = rec_df[rec_df['utc_s (s)'] == s]
  if len(row1) == 0:
    return pd.DataFrame()

  # index1 corresponds to the image.
  index1 = row1.index.values[0]

  # Overestimate of the distance window: every reading at or after s, minus
  # the first of them.
  distance_window = rec_df[rec_df['utc_s (s)'] >= s][1:]

  # If the image's second is the last reading there is nothing ahead of it to
  # measure from, so the image cannot be labeled.
  if len(distance_window) == 0:
    return pd.DataFrame()
  d0_estimate = distance_window['distance (m)'].iloc[0]

  # Validation 2: Make sure the target_dist is in rec_df.
  target_dist = d0_estimate + m_ahead
  if rec_df['distance (m)'].iloc[-1] < target_dist:
    return pd.DataFrame()

  # index2 corresponds to the target distance: the first reading strictly
  # beyond target_dist. There may be none (e.g. the last reading is exactly
  # target_dist), in which case the image is invalid.
  past_target = distance_window[distance_window['distance (m)'] > target_dist]
  if len(past_target) == 0:
    return pd.DataFrame()
  index2 = past_target.index.values[0]

  # Validation 3: The roughness reading is from 1 second around 5 meters ahead.
  # We need at least one reading after index2 to calculate the roughness metric.
  if index2 == len(rec_df) - 1:
    return pd.DataFrame()

  index2 += 1

  # Validation 4: The time corresponding to index2 should be in rough_df.
  # TODO: Should we also check if it is in gps.csv?
  rough_end_s = rough_df['utc_s'].iloc[-1]
  rough_end_ms = rough_df['utc_ms'].iloc[-1]
  target_s = rec_df['utc_s (s)'].iloc[index2]
  target_ms = rec_df['utc_ms (ms)'].iloc[index2]
  if findDifferenceMs(rough_end_s, rough_end_ms, target_s, target_ms) < 0:
    return pd.DataFrame()

  return rec_df[index1:index2+1]

In [None]:
# Return the rows of df that cover the time span of rec_window.
# utc_s and utc_ms name the seconds and milliseconds columns of df. The slice
# runs from the entry just before the first entry at or after the start of
# rec_window, through the first entry at or after the end of rec_window.
def df_window(rec_window, df, utc_s, utc_ms):
  window_start = (rec_window['utc_s (s)'].iloc[0], rec_window['utc_ms (ms)'].iloc[0])
  window_end = (rec_window['utc_s (s)'].iloc[-1], rec_window['utc_ms (ms)'].iloc[-1])
  row_count = len(df)

  # True when the entry at position `at` is strictly earlier than `stamp`.
  def strictly_before(at, stamp):
    target_s, target_ms = stamp
    return df.at[at, utc_s] < target_s or (df.at[at, utc_s] == target_s and df.at[at, utc_ms] < target_ms)

  # If we have an entry one second before the window start, begin the scan
  # there instead of at the top of df.
  previous_second = df[df[utc_s] == window_start[0] - 1]
  lower = previous_second.index.values[0] if len(previous_second) != 0 else 0

  # Advance to the first entry that is not before the window start ...
  while lower < row_count and strictly_before(lower, window_start):
    lower += 1

  # ... then step back one so the slice begins with the entry preceding it.
  if lower > 0:
    lower -= 1

  # Advance to the first entry that is not before the window end.
  upper = lower
  while upper < row_count and strictly_before(upper, window_end):
    upper += 1

  # TODO: if lower == upper, should we declare the image invalid?
  return df[lower:upper+1]
  

#### Validity checks

In [None]:
# Check that the image time fits inside the time span of a DataFrame.
# (df_s1, df_ms1) is the start time of df, (df_s2, df_ms2) is the end time of df.
# Passes when (s - 1, 0 ms) is not before the start and (s, ms) is not after
# the end. window is accepted but not used.
def withinRange(s, ms, window, df_s1, df_ms1, df_s2, df_ms2):
  return (
      not (findDifferenceMs(s - 1, 0, df_s1, df_ms1) < 0)
      and not (findDifferenceMs(df_s2, df_ms2, s, ms) < 0)
  )

In [None]:
# Return True if no reading in rec_window has an 'enhanced_speed (m/s)' of 0.
def nonzero(rec_window):
  return not any(
      reading['enhanced_speed (m/s)'] == 0
      for _, reading in rec_window.iterrows()
  )

In [None]:
# Return True if no reading in df is more than `tolerance` ms after the
# reading before it. utc_s and utc_ms name the seconds and milliseconds columns.
def continuous(df, tolerance, utc_s, utc_ms):
  for current in range(1, len(df)):
    previous = current - 1
    gap_ms = findDifferenceMs(
        df[utc_s].iloc[current], df[utc_ms].iloc[current],
        df[utc_s].iloc[previous], df[utc_ms].iloc[previous])
    if gap_ms > tolerance:
      return False

  return True

In [None]:
# Return True if, for every reading in gps_window, 'utc_s (s)' and
# 'utc_timestamp (s)' are within 1 second of each other (in either direction).
# The first offending reading is printed before returning False.
def consistent_utc(gps_window):
  for _, row in gps_window.iterrows():
    # abs() must wrap the difference, not the comparison; otherwise a reading
    # whose utc_s is far behind utc_timestamp would be accepted.
    offset = abs(row['utc_s (s)'] - row['utc_timestamp (s)'])
    if not (offset <= 1):
      print(row)
      return False

  return True

In [None]:
# Return True if the image passes every validation criterion below. At the
# first criterion that fails, print the reason and return False.
#
# image is the image file name, rec_df / rough_df / gps_df are the readings
# from record.csv, roughness_metric.csv and gps.csv, and window and m_ahead
# are passed through to the helper checks.
def is_valid(image, rec_df, rough_df, gps_df, window, m_ahead):
  # Criteria 1: Image name is in format XsYms. This is checked before parsing,
  # and requires at least one digit in each field, so that a malformed name is
  # rejected here instead of raising ValueError in parse_times.
  if not re.match(r"[0-9]+s[0-9]+ms", image):
    print("Image", image, "has an improper name format.")
    return False

  s, ms = parse_times(image)

  # Criteria 2: The image time must fall inside the time span of each of
  # rec_df, rough_df and gps_df (see withinRange).
  time_columns = (
      (rec_df, 'utc_s (s)', 'utc_ms (ms)'),
      (rough_df, 'utc_s', 'utc_ms'),
      (gps_df, 'utc_s (s)', 'utc_ms (ms)'),
  )
  for df, s_col, ms_col in time_columns:
    if not withinRange(s, ms, window, df[s_col].iloc[0], df[ms_col].iloc[0], df[s_col].iloc[-1], df[ms_col].iloc[-1]):
      print("Image", image, "is not in range.")
      return False

  # Criteria 3: Additional range checks during creation of rec_window
  # If one of these check fails, rec_window = pd.DataFrame()
  rec_window = rec_df_window(s, ms, rec_df, rough_df, m_ahead, window)
  if len(rec_window) == 0:
    print("Image", image, "had validation errors in rec_df_window.")
    return False

  # Criteria 4: The speeds in the window surrounding the image must be nonzero
  if not nonzero(rec_window):
    print("Image", image, "has one or more speed readings = 0.")
    return False

  # Criteria 5: Timestamps in record.csv, roughness_metric.csv, and gps.csv are continuous
  rough_window = df_window(rec_window, rough_df, 'utc_s', 'utc_ms')
  gps_window = df_window(rec_window, gps_df, 'utc_s (s)', 'utc_ms (ms)')

  continuous_rec = continuous(rec_window, 1001, 'utc_s (s)', 'utc_ms (ms)')
  continuous_rough = continuous(rough_window, 50, 'utc_s', 'utc_ms')
  continuous_gps = continuous(gps_window, 500, 'utc_s (s)', 'utc_ms (ms)')

  if not (continuous_rec and continuous_rough and continuous_gps):
    print("Image", image, "does not have continuous readings.")
    return False

  # Criteria 6: Calculated UTC timestamp matches the recorded UTC timestamp
  if not consistent_utc(gps_window):
    print("Image", image, "does not have consistent UTC time.")
    return False

  return True


## Test data

In [None]:
# test_rec_df = pd.read_csv('/gdrive/My Drive/Labeling/SensorData/Garmin/2020-09-29-09-46-42-garmin/record.csv', index_col=0)
# test_rec_df = test_rec_df.filter(items=['utc_s (s)', 'utc_ms (ms)', 'distance (m)', 'enhanced_speed (m/s)'])
# test_rough_df = pd.read_csv('/gdrive/My Drive/Labeling/SensorData/Virb/2020-09-29-09-46-42/roughness_metric.csv', index_col=0)
# test_gps_df = pd.read_csv('/gdrive/My Drive/Labeling/SensorData/Virb/2020-09-29-09-46-42/gps.csv', index_col=0)
# test_gps_df = test_gps_df.filter(items=['utc_s (s)', 'utc_ms (ms)', 'utc_timestamp (s)'])

# window = 500
# m_ahead = 5

# # Test nonzero
# Range with 0 values: 2771 - 2820, 970328033 - 970328081
# Range with no 0 values: 100 - 120, 970325361 - 970325380, however not in range with gps.csv
# Range where second part has 0 values: 2746 - 2747, 970328008 - 970328009 
# Range where first part has 0 values: 2868 - 2869, 970328130 - 970328131
# print(is_valid("970328034s0ms", test_rec_df, test_rough_df, test_gps_df, window, m_ahead))
# print(is_valid("970325361s50ms", test_rec_df, test_rough_df, test_gps_df, window, m_ahead))
# print(is_valid("970328008s50ms", test_rec_df, test_rough_df, test_gps_df, window, m_ahead))
# print(is_valid("970328130s50ms", test_rec_df, test_rough_df, test_gps_df, window, m_ahead))

# # Test withinRange
# # Start time of test_rec_df = (970325257, 0)
# # Start time of test_rough_df = (970325257, 571)
# # Start time of test_gps_df = (970325478, 800.0)
# print(is_valid("970325257s0ms", test_rec_df, test_rough_df, test_gps_df, window, m_ahead))
# print(is_valid("970325258s0ms", test_rec_df, test_rough_df, test_gps_df, window, m_ahead))
# print(is_valid("970325258s571ms", test_rec_df, test_rough_df, test_gps_df, window, m_ahead))
# print(is_valid("970325500s0ms", test_rec_df, test_rough_df, test_gps_df, window, m_ahead))

# Test consistent_utc
# print(is_valid("970327043s0ms", test_rec_df, test_rough_df, test_gps_df, window, m_ahead))
# print(is_valid("970329000s0ms", test_rec_df, test_rough_df, test_gps_df, window, m_ahead))

## Create DataFrame with valid images

In [None]:
# Pair of directories holding the Virb and Garmin CSV files for one session.
CSVDirs = namedtuple('CSVDirs', ['virb_dir', 'garmin_dir'])

# Build the CSVDirs for a session folder name: the Virb directory uses the
# folder name as-is, the Garmin directory uses the folder name + '-garmin'.
def csv_path(folder):
  sensor_root = '/gdrive/My Drive/Labeling/SensorData'
  return CSVDirs(
      virb_dir=os.path.join(sensor_root, 'Virb', folder),
      garmin_dir=os.path.join(sensor_root, 'Garmin', folder + '-garmin'),
  )

# Map each video-frame folder name to the CSV directories of the recording
# session it belongs to. Several video folders can share one session.
video_csvs = {
    video: csv_path(session)
    for session, videos in (
        ('2020-07-28-06-01-11', (
            "2020-07-28-06-01-11",
        )),
        ('2020-09-23-16-10-10', (
            "2020-09-23-VIRB0001",
            "2020-09-23-VIRB0002",
            "2020-09-23-VIRB0004",
            "2020-09-23-VIRB0008",
        )),
        ('2020-09-24-12-07-41', (
            "2020-09-24-1",
            "2020-09-24-2",
            "2020-09-24-3",
            "2020-09-24-4",
        )),
        ('2020-09-29-09-46-42', (
            "2020-09-29-09-46-42-1",
            "2020-09-29-09-46-42-2",
            "2020-09-29-09-46-42-3",
            "2020-09-29-09-46-42-4",
        )),
        ('2020-10-02-10-17-05', (
            "2020-10-02-10-17-05-1",
            "2020-10-02-10-17-05-2",
            "2020-10-02-10-17-05-3",
        )),
    )
    for video in videos
}

In [None]:
# Validate the images in every folder of
# Labeling/VideoData/RealVideoSplit/VideoFrames that is listed in video_csvs.
# For each such folder, write <folder>.csv with the valid image names to
# Labeling/ValidImages and add a line with the valid and total image counts
# to Labeling/ValidImages/valid_images.txt.
def valid_images():
  window = 500
  m_ahead = 5

  with open('/gdrive/My Drive/Labeling/ValidImages/valid_images.txt', 'w') as outfile:

    frame_directory = '/gdrive/My Drive/Labeling/VideoData/RealVideoSplit/VideoFrames'

    # Only the immediate subdirectories are needed, so take the first entry
    # that os.walk yields instead of descending into every frame folder.
    _, folders, _ = next(os.walk(frame_directory), (frame_directory, [], []))

    for folder in folders:
      if folder not in video_csvs:
        continue

      folder_path = os.path.join(frame_directory, folder)
      csv_dirs = video_csvs[folder]
      df = valid_images_in_folder(folder_path, csv_dirs.virb_dir, csv_dirs.garmin_dir, window, m_ahead)

      total_images = len(os.listdir(folder_path))
      output = folder + "\nNumber of valid images: " + str(len(df)) + " Total images: " + str(total_images) + "\n"
      outfile.write(output)

      csv_name = folder + '.csv'
      df.to_csv(os.path.join('/gdrive/My Drive/Labeling/ValidImages/', csv_name), index=False)

In [None]:
# Return a DataFrame with a single 'images' column listing the entries of
# folder_dir that pass is_valid. record.csv is read from garmin_dir and
# roughness_metric.csv / gps.csv from virb_dir. Progress is printed every
# 100 entries, and the folder name is printed after each rejected entry.
def valid_images_in_folder(folder_dir, virb_dir, garmin_dir, window, m_ahead):
  folder = os.path.split(folder_dir)[1]
  print("Creating list of valid images in folder", folder)

  # Keep only the columns the validation uses.
  rec_df = pd.read_csv(os.path.join(garmin_dir, "record.csv"), index_col=0).filter(
      items=['utc_s (s)', 'utc_ms (ms)', 'distance (m)', 'enhanced_speed (m/s)'])
  rough_df = pd.read_csv(os.path.join(virb_dir, "roughness_metric.csv"), index_col=0)
  gps_df = pd.read_csv(os.path.join(virb_dir, "gps.csv"), index_col=0).filter(
      items=['utc_s (s)', 'utc_ms (ms)', 'utc_timestamp (s)'])

  accepted = []
  for seen, image in enumerate(os.listdir(folder_dir), start=1):
    if seen % 100 == 0:
      print(seen)

    if is_valid(image, rec_df, rough_df, gps_df, window, m_ahead):
      accepted.append(image)
    else:
      print("Folder:", folder)

  result = pd.DataFrame()
  result['images'] = accepted
  return result

In [None]:
# Run the validation over every video folder listed in video_csvs.
valid_images()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Image 970327469s225ms.jpg does not have continuous readings.
Folder: 2020-09-29-09-46-42-2
Image 970327470s226ms.jpg does not have continuous readings.
Folder: 2020-09-29-09-46-42-2
Image 970327471s227ms.jpg does not have continuous readings.
Folder: 2020-09-29-09-46-42-2
Image 970327472s228ms.jpg does not have continuous readings.
Folder: 2020-09-29-09-46-42-2
Image 970327473s229ms.jpg does not have continuous readings.
Folder: 2020-09-29-09-46-42-2
Image 970327474s230ms.jpg does not have continuous readings.
Folder: 2020-09-29-09-46-42-2
Image 970327475s231ms.jpg does not have continuous readings.
Folder: 2020-09-29-09-46-42-2
Image 970327476s199ms.jpg does not have continuous readings.
Folder: 2020-09-29-09-46-42-2
Image 970327477s200ms.jpg does not have continuous readings.
Folder: 2020-09-29-09-46-42-2
Image 970327478s201ms.jpg does not have continuous readings.
Folder: 2020-09-29-09-46-42-2
Image 970327479s202ms.jpg