# Valid Images
Create a DataFrame for each session with the valid images.

**Validation:**
Each image considers the readings closest to and after the image timestamp, up until 1 second ahead. For simplicity, we express the ranges in whole seconds as follows:
- The reading closest to and after `image_s, image_ms` will always have a seconds reading of `image_s` or `image_s + 1`. For example (1000, 999) may be closest to (1001, 2). If the closest reading falls outside of this range, it is not valid.
- The calculation of the roughness metric considers readings up to 1 second after the roughness reading that is closest to and after the image timestamp. Consider the example above, where the closest reading was (1001, 2). In this case, readings will be considered up until (1002, 2). Since we just want to deal with whole seconds, we check that there is data up to `image_s + 3`.

1. The image name must be in the format `[0-9]*s[0-9]*ms`.

2. `withinRange` confirms that `image_s` is after `rec_df` and `rough_df` begin and `image_s + 3` is before `rec_df` and `rough_df` end.

3. `nonzero` checks that `rec_df` from `image_s` to `image_s + 3` have nonzero speeds.

4. `continuous` checks that:
  - Readings in `rec_df` from `image_s` to `image_s + 3` should be no more than 1 second apart.
  - Readings in `rough_df` from `image_s` to `image_s + 3` should be no more than 50 ms apart (most readings are ~ 10 ms apart).

## Set-up

In [1]:
# Mount Google Drive at /gdrive so that the '/gdrive/My Drive/...' paths used
# throughout this notebook resolve.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import matplotlib.pyplot as plt
from collections import namedtuple
import csv
import re

## Helper functions

In [3]:
# Signed gap in milliseconds between timestamp (s1, ms1) and timestamp (s2, ms2).
# The result is negative when (s1, ms1) is the earlier timestamp, positive when
# (s2, ms2) is the earlier one, and zero when the two coincide.
def findDifferenceMs(s1, ms1, s2, ms2):
  whole_seconds_gap = s1 - s2
  sub_second_gap = ms1 - ms2
  return whole_seconds_gap * 1000 + sub_second_gap

# Parse the seconds and milliseconds from an image name in the format XsYms.
# Anything after the first "ms" (for example a ".jpg" extension) is ignored.
# Returns (seconds, milliseconds) as ints.
# Raises ValueError when the name has no "s" followed by an "ms", or when X or
# Y is not an integer.
def parse_times(name):
  seconds_text, s_marker, remainder = name.partition("s")
  millis_text, ms_marker, _ = remainder.partition("ms")

  # partition() returns an empty separator when the marker is missing; report
  # that explicitly instead of failing with an opaque unpacking error.
  if not s_marker or not ms_marker:
    raise ValueError("Image name %r is not in the format XsYms" % (name,))

  return int(seconds_text), int(millis_text)

## Criteria for a valid image

#### Helper functions

In [4]:
# Select the rows of rec_df whose 'utc_s (s)' value lies in [s, s + 3]; these
# are the readings used when labeling the image.
# The window is returned only when it holds exactly 4 rows. Any other row count
# is treated as a validation issue and an empty DataFrame is returned instead.
# The ms argument is not used.
def rec_df_window(s, ms, rec_df):
  in_window = rec_df[rec_df['utc_s (s)'].between(s, s + 3)]

  # Exactly four rows are expected: one reading each at s, s + 1, s + 2, s + 3.
  return in_window if len(in_window) == 4 else pd.DataFrame()

In [5]:
# Select the rows of rough_df whose 'utc_s' value lies in [s, s + 3].
# No row-count validation is done here; the ms argument is not used.
def rough_df_window(s, ms, rough_df):
  stamp_s = rough_df['utc_s']
  keep = (stamp_s >= s) & (stamp_s <= s + 3)
  return rough_df[keep]

#### Validity checks

In [6]:
# Check that the seconds s through s + 3 are covered by a DataFrame's time span.
# (df_s1, df_ms1) is the start time of df, (df_s2, df_ms2) is the end time of df.
# Both comparisons are made at 0 ms of the given second; the ms argument is not
# used. Returns False when (s, 0) precedes the start or (s + 3, 0) follows the end.
def withinRange(s, ms, df_s1, df_ms1, df_s2, df_ms2):
  starts_too_early = findDifferenceMs(s, 0, df_s1, df_ms1) < 0
  ends_too_late = findDifferenceMs(df_s2, df_ms2, s + 3, 0) < 0
  return not (starts_too_early or ends_too_late)

In [7]:
# Return True when no row of rec_window has an 'enhanced_speed (m/s)' equal to
# 0, and False as soon as at least one such row exists.
def nonzero(rec_window):
  # A window without rows has nothing to reject (and may not have the column).
  if len(rec_window) == 0:
    return True

  speeds = rec_window['enhanced_speed (m/s)']
  return not (speeds == 0).any()

In [8]:
# Check that consecutive rows of df are no more than `tolerance` ms apart.
# utc_s and utc_ms are the names of the seconds and milliseconds columns.
# Each row is compared with the row directly before it; a frame with fewer
# than two rows is always continuous.
def continuous(df, tolerance, utc_s, utc_ms):
  gaps_ms = (
      findDifferenceMs(df[utc_s].iloc[k], df[utc_ms].iloc[k],
                       df[utc_s].iloc[k - 1], df[utc_ms].iloc[k - 1])
      for k in range(1, len(df))
  )
  return not any(gap > tolerance for gap in gaps_ms)

In [9]:
# Decide whether an image can be labeled from the sensor readings.
#   image    - image file name, expected to start with XsYms
#   rec_df   - record.csv readings with 'utc_s (s)', 'utc_ms (ms)' and
#              'enhanced_speed (m/s)' columns
#   rough_df - roughness readings with 'utc_s' and 'utc_ms' columns
# Returns True when every criterion below holds; otherwise prints the reason
# and returns False.
def is_valid(image, rec_df, rough_df):
  # Criteria 1: Image name is in format XsYms.
  # This runs before parsing so that a stray file in the frame folder is
  # rejected instead of raising.
  if not bool(re.match("[0-9]*s[0-9]*ms", image)):
    print("Image", image, "has an improper name format.")
    return False

  try:
    s, ms = parse_times(image)
  except ValueError:
    # The pattern accepts empty digit runs (e.g. "sms"), which cannot be
    # converted to integers.
    print("Image", image, "has an improper name format.")
    return False

  # Criteria 2: Second s must not be before the first reading, and second
  # s + 3 must not be after the last reading, in both rec_df and rough_df.
  # With no readings at all there is no range to be within.
  if len(rec_df) == 0 or len(rough_df) == 0:
    print("Image", image, "is not in range.")
    return False

  in_range_rec = withinRange(s, ms, rec_df['utc_s (s)'].iloc[0], rec_df['utc_ms (ms)'].iloc[0], rec_df['utc_s (s)'].iloc[-1], rec_df['utc_ms (ms)'].iloc[-1])
  in_range_rough = withinRange(s, ms, rough_df['utc_s'].iloc[0], rough_df['utc_ms'].iloc[0], rough_df['utc_s'].iloc[-1], rough_df['utc_ms'].iloc[-1])
  if not in_range_rec or not in_range_rough:
    print("Image", image, "is not in range.")
    return False

  # Criteria 3: Additional range checks during creation of rec_window
  # If one of these checks fails, rec_window = pd.DataFrame()
  rec_window = rec_df_window(s, ms, rec_df)
  if len(rec_window) == 0:
    print("Image", image, "had validation errors in rec_df_window.")
    return False

  # Criteria 4: The speeds in the window surrounding the image must be nonzero
  if not nonzero(rec_window):
    print("Image", image, "has one or more speed readings = 0.")
    return False

  # Criteria 5: Timestamps in the rec_df and rough_df windows are continuous:
  # rec_df readings at most 1001 ms apart, rough_df readings at most 50 ms apart.
  rough_window = rough_df_window(s, ms, rough_df)

  continuous_rec = continuous(rec_window, 1001, 'utc_s (s)', 'utc_ms (ms)')
  continuous_rough = continuous(rough_window, 50, 'utc_s', 'utc_ms')

  if not (continuous_rec and continuous_rough):
    print("Image", image, "does not have continuous readings.")
    return False

  return True

## Test data

In [17]:
# test_rec_df = pd.read_csv('/gdrive/My Drive/Labeling/SensorData/Garmin/2020-09-29-09-46-42-garmin/record.csv', index_col=0)
# test_rec_df = test_rec_df.filter(items=['utc_s (s)', 'utc_ms (ms)', 'distance (m)', 'enhanced_speed (m/s)'])
# test_rough_df = pd.read_csv('/gdrive/My Drive/Labeling/SensorData/Virb/2020-09-29-09-46-42/roughness_metric.csv', index_col=0)

# # Test nonzero
# # Range with 0 values: 2771 - 2820, 970328033 - 970328081
# # Range with no 0 values: 100 - 120, 970325361 - 970325380, however not in range with gps.csv
# # Range where second part has 0 values: 2746 - 2747, 970328008 - 970328009 
# # Range where first part has 0 values: 2868 - 2869, 970328130 - 970328131
# print('\n', is_valid("970328034s0ms", test_rec_df, test_rough_df))
# print('\n', is_valid("970325361s50ms", test_rec_df, test_rough_df))
# print('\n', is_valid("970328008s50ms", test_rec_df, test_rough_df))
# print('\n', is_valid("970328130s50ms", test_rec_df, test_rough_df))

# # Test withinRange
# # Start time of test_rec_df = (970325257, 0)
# # Start time of test_rough_df = (970325257, 571)
# print('\n', is_valid("970325257s0ms", test_rec_df, test_rough_df))
# print('\n', is_valid("970325258s0ms", test_rec_df, test_rough_df))
# print('\n', is_valid("970325258s571ms", test_rec_df, test_rough_df))
# print('\n', is_valid("970325500s0ms", test_rec_df, test_rough_df))

# # Test continuous
# print('\n', is_valid("970327043s0ms", test_rec_df, test_rough_df))
# print('\n', is_valid("970329000s0ms", test_rec_df, test_rough_df))

      utc_s (s)  utc_ms (ms)  distance (m)  enhanced_speed (m/s)
2772  970328034            0       9522.26                   0.0
2773  970328035            0       9522.26                   0.0
2774  970328036            0       9522.26                   0.0
2775  970328037            0       9522.26                   0.0
Image 970328034s0ms has one or more speed readings = 0.

 False
     utc_s (s)  utc_ms (ms)  distance (m)  enhanced_speed (m/s)
100  970325361            0        253.85                 2.519
101  970325362            0        256.49                 2.641
102  970325363            0        258.92                 2.435
103  970325364            0        261.63                 2.706

 True
      utc_s (s)  utc_ms (ms)  distance (m)  enhanced_speed (m/s)
2746  970328008            0       9522.26                 2.967
2747  970328009            0       9522.26                 0.000
2748  970328010            0       9522.26                 0.000
2749  970328011         

## Create DataFrame with valid images

In [10]:
# Pair of directories for one recording session: the Virb folder and the
# Garmin folder.
CSVDirs = namedtuple('CSVDirs', 'virb_dir garmin_dir')

# Build the CSVDirs for a session folder name. Both directories live under the
# SensorData root; the Garmin folder name carries an extra '-garmin' suffix.
def csv_path(folder):
  root_csv_dir = '/gdrive/My Drive/Labeling/SensorData'
  return CSVDirs(
      virb_dir=os.path.join(root_csv_dir, 'Virb', folder),
      garmin_dir=os.path.join(root_csv_dir, 'Garmin', folder + '-garmin'),
  )

# Maps each video frame folder name to the CSV directories of the recording
# session it belongs to. Several video folders can share one session.
video_csvs = {
    video: csv_path(session)
    for video, session in (
        ("2020-07-28-06-01-11", '2020-07-28-06-01-11'),
        ("2020-09-23-VIRB0001", '2020-09-23-16-10-10'),
        ("2020-09-23-VIRB0002", '2020-09-23-16-10-10'),
        ("2020-09-23-VIRB0004", '2020-09-23-16-10-10'),
        ("2020-09-23-VIRB0008", '2020-09-23-16-10-10'),
        ("2020-09-24-1", '2020-09-24-12-07-41'),
        ("2020-09-24-2", '2020-09-24-12-07-41'),
        ("2020-09-24-3", '2020-09-24-12-07-41'),
        ("2020-09-24-4", '2020-09-24-12-07-41'),
        ("2020-09-29-09-46-42-1", '2020-09-29-09-46-42'),
        ("2020-09-29-09-46-42-2", '2020-09-29-09-46-42'),
        ("2020-09-29-09-46-42-3", '2020-09-29-09-46-42'),
        ("2020-09-29-09-46-42-4", '2020-09-29-09-46-42'),
        ("2020-10-02-10-17-05-1", '2020-10-02-10-17-05'),
        ("2020-10-02-10-17-05-2", '2020-10-02-10-17-05'),
        ("2020-10-02-10-17-05-3", '2020-10-02-10-17-05'),
    )
}

In [11]:
# Find the valid images of every video folder listed in video_csvs.
#   frame_directory  - folder whose immediate sub-folders hold the video frames
#   output_directory - existing folder that receives the results
# For each sub-folder of frame_directory that is a key of video_csvs, this
# writes <folder>.csv (the valid image names) into output_directory and appends
# a "valid / total" summary to valid_images.txt there. Sub-folders without an
# entry in video_csvs are skipped.
def valid_images(frame_directory='/gdrive/My Drive/Labeling/VideoData/RealVideoSplit/VideoFrames',
                 output_directory='/gdrive/My Drive/Labeling/ValidImages/Metric_2'):
  with open(os.path.join(output_directory, 'valid_images.txt'), 'w') as outfile:
    # Only the immediate sub-folders are needed, so take just the first entry
    # os.walk yields instead of descending into every frame folder. The
    # fallback covers a missing directory, for which os.walk yields nothing.
    _, video_folders, _ = next(os.walk(frame_directory), (frame_directory, [], []))

    for video in video_folders:
      if video not in video_csvs:
        continue

      video_dir = os.path.join(frame_directory, video)
      csv_dirs = video_csvs[video]
      df = valid_images_in_folder(video_dir, csv_dirs.virb_dir, csv_dirs.garmin_dir)

      total_images = len(os.listdir(video_dir))
      outfile.write(f"{video}\nNumber of valid images: {len(df)} Total images: {total_images}\n")

      df.to_csv(os.path.join(output_directory, video + '.csv'), index=False)

In [12]:
# Collect the names of the valid images in one video folder.
# Reads record.csv from garmin_dir (keeping only the timestamp, distance and
# speed columns) and roughness_metric.csv from virb_dir, then runs is_valid on
# every entry of folder_dir. A running count is printed every 100 images, and
# the folder name is printed after each rejected image.
# Returns a DataFrame with a single 'images' column holding the accepted names.
def valid_images_in_folder(folder_dir, virb_dir, garmin_dir):
  _, folder = os.path.split(folder_dir)
  print("Creating list of valid images in folder", folder)

  record_columns = ['utc_s (s)', 'utc_ms (ms)', 'distance (m)', 'enhanced_speed (m/s)']
  rec_df = pd.read_csv(os.path.join(garmin_dir, "record.csv"), index_col=0).filter(items=record_columns)
  rough_df = pd.read_csv(os.path.join(virb_dir, "roughness_metric.csv"), index_col=0)

  accepted = []
  for seen, image in enumerate(os.listdir(folder_dir), start=1):
    if seen % 100 == 0:
      print(seen)

    if not is_valid(image, rec_df, rough_df):
      print("Folder:", folder)
      continue

    accepted.append(image)

  result = pd.DataFrame()
  result['images'] = accepted
  return result

In [13]:
# Build the per-folder valid-image CSVs and the valid_images.txt summary.
valid_images()

Creating list of valid images in folder 2020-09-24-1
Image 969902285s230ms.jpg does not have continuous readings.
Folder: 2020-09-24-1
Image 969902286s231ms.jpg does not have continuous readings.
Folder: 2020-09-24-1
Image 969902287s232ms.jpg does not have continuous readings.
Folder: 2020-09-24-1
Image 969902288s233ms.jpg does not have continuous readings.
Folder: 2020-09-24-1
100
200
Image 969902506s251ms.jpg does not have continuous readings.
Folder: 2020-09-24-1
Image 969902507s252ms.jpg does not have continuous readings.
Folder: 2020-09-24-1
Image 969902508s253ms.jpg does not have continuous readings.
Folder: 2020-09-24-1
Image 969902509s254ms.jpg does not have continuous readings.
Folder: 2020-09-24-1
300
Image 969902543s254ms.jpg does not have continuous readings.
Folder: 2020-09-24-1
Image 969902544s255ms.jpg does not have continuous readings.
Folder: 2020-09-24-1
Image 969902545s256ms.jpg does not have continuous readings.
Folder: 2020-09-24-1
Image 969902546s257ms.jpg does no