# Auto-labeling script for TSM 1 original groups

All bounds checking and filtering of images is done in ValidImages_TSM1.ipynb.

## Set-up

In [None]:
# Mount Google Drive at /gdrive; every data path in this notebook starts with
# '/gdrive/My Drive/'.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import matplotlib.pyplot as plt
from collections import namedtuple
import csv
import re

## Helper functions

In [None]:
def findDifferenceMs(s1, ms1, s2, ms2):
  """Return the signed gap t1 - t2 in milliseconds.

  Each timestamp is split into whole seconds and a millisecond remainder:
  t1 = (s1, ms1) and t2 = (s2, ms2). The result is negative when t1 is
  earlier than t2, positive when t1 is later, and 0 when they coincide.
  """
  whole_seconds_ms = (s1 - s2) * 1000
  remainder_ms = ms1 - ms2
  return whole_seconds_ms + remainder_ms

def parse_times(name):
  """Extract (seconds, milliseconds) from a name shaped like 'XsYms...'.

  X is the text before the first 's'; Y is the text between that 's' and the
  next 'ms'. Whatever follows 'ms' is discarded. Both parts are converted
  with int(), so a name that does not follow the pattern raises ValueError.
  """
  seconds_text, after_seconds = name.split("s", 1)
  millis_text, _ = after_seconds.split("ms", 1)
  return int(seconds_text), int(millis_text)

#### Closest readings to an input.

In [None]:
def closest_roughness(rough_df, time1_s, time1_ms):
  """Return the roughness group of the reading nearest to a timestamp.

  Args:
    rough_df: DataFrame with 'utc_s', 'utc_ms' and 'roughness_group' columns.
    time1_s: Seconds part of the target timestamp.
    time1_ms: Millisecond part of the target timestamp.

  Returns:
    The 'roughness_group' value of the row whose (utc_s, utc_ms) timestamp is
    closest to the target, looking only at rows whose utc_s is within one
    second of time1_s. If several rows are equally close, the first one in
    rough_df wins. Returns -1 when no row falls inside that window.
  """
  seconds = rough_df['utc_s']
  window = rough_df[(seconds >= time1_s - 1) & (seconds <= time1_s + 1)]
  if window.empty:
    return -1

  # Compare every candidate in the window instead of stopping at the first
  # row that fails to improve: the result then does not depend on the rows
  # being sorted by time or on timestamps being unique.
  gap_ms = ((window['utc_s'] - time1_s) * 1000
            + (window['utc_ms'] - time1_ms)).abs()

  # Positional argmin (first minimum) so duplicate index labels are harmless.
  nearest = gap_ms.to_numpy().argmin()
  return window['roughness_group'].iloc[nearest]

def closest_and_above(target_dist, start_index, rec_df):
  """Return the first position at or after start_index whose distance >= target_dist.

  Scans the 'distance (m)' column of rec_df forward from start_index. If no
  reading reaches target_dist, the position of the last row is returned.
  """
  distances = rec_df['distance (m)']
  last_position = len(rec_df) - 1

  position = start_index
  while True:
    # Same evaluation order as a short-circuit `and`: distance check first,
    # then the bounds check.
    if not distances.iloc[position] < target_dist:
      break
    if not position < last_position:
      break
    position += 1
  return position

#### Calculate target distance `m_ahead` meters ahead of an image.

**Description:** `t1` = (`t1_s`, `t1_ms`) is the time at which the image occurs, and `index1` is the index of the reading in `rec_df` that most closely corresponds to, and is not after, `t1`. We want to find the distance `m_ahead` meters ahead of the image.
- `t0` = (`t0_s`, `t0_ms`) is the timestamp corresponding to `rec_df`[`index1`].
- `d1` is the distance at this entry, `d2` is the distance one entry ahead which is >= the image distance.

**Solve:**
1. Calculate distance elapsed between `d1` and the image using $\Delta distance = speed * \Delta time$.
  - `s_avg` = `d2` - `d1`, since readings in `record.csv` are one second apart.
  - `t_elapsed` = `t1` - `t0`
  - `d_elapsed` = `s_avg` * `t_elapsed`.

2. Calculate the distance reading corresponding to the image by adding `d_elapsed` to `d1`.

3. Calculate the target distance by adding `m_ahead` to `image_distance`.


In [None]:
def calculate_target_distance(t1_s, t1_ms, index1, m_ahead, rec_df):
  """Return the distance reading expected m_ahead meters past an image.

  Args:
    t1_s, t1_ms: Seconds and milliseconds of the image timestamp.
    index1: Position (used with .iloc) of the rec_df reading closest to and
      before the image.
    m_ahead: Meters to add to the image's own distance.
    rec_df: DataFrame with 'utc_s (s)', 'utc_ms (ms)' and 'distance (m)'.

  Returns:
    The image's distance plus m_ahead. If the image time equals the reading
    at index1, the image distance is that reading's distance; otherwise it is
    interpolated towards the reading at index1 + 1.
  """
  reading_s = rec_df['utc_s (s)'].iloc[index1]
  reading_ms = rec_df['utc_ms (ms)'].iloc[index1]
  reading_dist = rec_df['distance (m)'].iloc[index1]

  # Image coincides with a reading: no interpolation, and index1 + 1 is never
  # touched (so this also works on the last row).
  is_exact_match = t1_s == reading_s and t1_ms == reading_ms
  if is_exact_match:
    return reading_dist + m_ahead

  # Distance covered between two consecutive readings, used directly as a
  # speed in m/s on the assumption that readings are one second apart.
  next_dist = rec_df['distance (m)'].iloc[index1 + 1]
  speed = next_dist - reading_dist

  # Seconds between the reading and the image.
  elapsed_s = abs(findDifferenceMs(reading_s, reading_ms, t1_s, t1_ms)) / 1000.

  return reading_dist + speed * elapsed_s + m_ahead

#### Calculate `target_time` = (`target_s`, `target_ms`) corresponding to `target_dist`.

**Description:** Given `target_dist` and `index2`, the index of `rec_df` with a distance reading most closely corresponding to and after `target_dist`, we want to find the timestamp `target_time` = (`target_s`, `target_ms`) corresponding to `target_dist`.

**Solve:**
1. Find the window most closely surrounding `target_dist`, [`d1`, `d2`].
2. Calculate the time elapsed between `d1` and `target_dist` using
$\Delta time = \frac{\Delta distance}{speed}$.
3. Find the `target_time` by adding `time_elapsed` to the timestamp of the `d1` reading.


In [None]:
def target_time(index2, target_dist, rec_df):
  """Return the (seconds, milliseconds) timestamp at which target_dist is reached.

  Args:
    index2: Position (used with .iloc) of the rec_df reading closest to and
      at/after target_dist.
    target_dist: Distance whose timestamp is wanted.
    rec_df: DataFrame with 'utc_s (s)', 'utc_ms (ms)' and 'distance (m)'.

  Returns:
    The timestamp of the reading at index2 when its distance equals
    target_dist exactly; otherwise a timestamp interpolated from the reading
    at index2 - 1, treating the distance between the two readings as the
    speed per second.

  NOTE(review): with index2 == 0 the "previous" reading is .iloc[-1], i.e. the
  last row; and two equal neighbouring distances make the speed 0 - confirm
  callers never hit either case.
  """
  distances = rec_df['distance (m)']
  seconds = rec_df['utc_s (s)']
  millis = rec_df['utc_ms (ms)']

  upper_dist = distances.iloc[index2]
  if upper_dist == target_dist:
    return seconds.iloc[index2], millis.iloc[index2]

  previous = index2 - 1
  lower_dist = distances.iloc[previous]

  # time = distance / speed, measured from the previous reading.
  speed = upper_dist - lower_dist
  elapsed = (target_dist - lower_dist) / speed

  whole_seconds = int(elapsed)
  out_s = whole_seconds + seconds.iloc[previous]
  out_ms = int((elapsed - whole_seconds) * 1000.) + millis.iloc[previous]

  # Carry a millisecond overflow into the seconds part.
  if out_ms >= 1000:
    out_s += 1
    out_ms %= 1000

  return out_s, out_ms

## Auto-labeling

#### Dictionary with CSV directories for each video

`CSVDirs` is a `namedtuple` holding the Virb and Garmin CSV files for each folder of images.

`csv_path` returns a `CSVDirs` namedtuple with the paths to the Virb and Garmin files for a given folder of images.

`video_csvs` is a dictionary mapping each folder of images to the corresponding `CSVDirs`. To access the Virb or Garmin files for a video:

- `video_csvs[video_name].virb_dir`
- `video_csvs[video_name].garmin_dir`

In [None]:
# Pair of directories holding the Virb and Garmin CSV files for one folder.
CSVDirs = namedtuple('CSVDirs', ['virb_dir', 'garmin_dir'])

def csv_path(folder):
  """Return the CSVDirs for the sensor-data folder named `folder`.

  The Virb directory is <root>/Virb/<folder> and the Garmin directory is
  <root>/Garmin/<folder>-garmin, where <root> is the SensorData directory on
  the mounted drive.
  """
  sensor_root = '/gdrive/My Drive/Labeling/SensorData'
  return CSVDirs(
      virb_dir=os.path.join(sensor_root, 'Virb', folder),
      garmin_dir=os.path.join(sensor_root, 'Garmin', folder + '-garmin'),
  )

# Each sensor-data folder, followed by the image folders that use its CSVs.
_IMAGE_FOLDERS_BY_SENSOR_FOLDER = (
    ('2020-07-28-06-01-11', (
        '2020-07-28-06-01-11',
    )),
    ('2020-09-23-16-10-10', (
        '2020-09-23-VIRB0001',
        '2020-09-23-VIRB0002',
        '2020-09-23-VIRB0004',
        '2020-09-23-VIRB0008',
    )),
    ('2020-09-24-12-07-41', (
        '2020-09-24-1',
        '2020-09-24-2',
        '2020-09-24-3',
        '2020-09-24-4',
    )),
    ('2020-09-29-09-46-42', (
        '2020-09-29-09-46-42-1',
        '2020-09-29-09-46-42-2',
        '2020-09-29-09-46-42-3',
        '2020-09-29-09-46-42-4',
    )),
    ('2020-10-02-10-17-05', (
        '2020-10-02-10-17-05-1',
        '2020-10-02-10-17-05-2',
        '2020-10-02-10-17-05-3',
    )),
)

# Maps every image folder name to the CSVDirs of its sensor recordings.
video_csvs = {}
for _sensor_folder, _image_folders in _IMAGE_FOLDERS_BY_SENSOR_FOLDER:
  for _image_folder in _image_folders:
    video_csvs[_image_folder] = csv_path(_sensor_folder)

#### Label images

In [None]:
# Label all images in Labeling/VideoData/RealVideoSplit/VideoFrames
def label():
  """Label every known video folder and write the combined result to CSV.

  Only the immediate sub-folders of the frame directory are considered, and
  of those only the ones that have an entry in `video_csvs`.

  Returns:
    (valid_images_per_video, labels): a dict mapping each processed folder
    name to the number of images labeled in it, and a DataFrame with 'video',
    'image' and 'group' columns (one row per labeled image). The DataFrame is
    also saved, without its index, to /gdrive/My Drive/Labeling/labels.csv.
  """
  frame_directory = '/gdrive/My Drive/Labeling/VideoData/RealVideoSplit/VideoFrames'

  valid_images_per_video = {}
  col_videos = []
  col_images = []
  col_groups = []

  for root, folders, _ in os.walk(frame_directory):
    for folder in folders:
      if folder not in video_csvs:
        continue
      csv_dirs = video_csvs[folder]
      v, i, g = label_images_in_folder(
          os.path.join(root, folder), csv_dirs.virb_dir, csv_dirs.garmin_dir)
      col_videos.extend(v)
      col_images.extend(i)
      col_groups.extend(g)
      valid_images_per_video[folder] = len(i)
    # Only the top level holds video folders; stop here so os.walk does not
    # go on to traverse every image folder underneath.
    break

  labels = pd.DataFrame({
      'video': col_videos,
      'image': col_images,
      'group': col_groups,
  })
  labels.to_csv('/gdrive/My Drive/Labeling/labels.csv', index=False)

  return valid_images_per_video, labels

In [None]:
def label_images_in_folder(folder_dir, virb_dir, garmin_dir):
  """Label every valid image found in one folder.

  Args:
    folder_dir: Directory containing the images; its last path component is
      used as the video name and to locate ValidImages/<name>.csv.
    virb_dir: Directory containing roughness_metric.csv.
    garmin_dir: Directory containing record.csv.

  Returns:
    Three parallel lists (videos, images, groups): the video name, the image
    file name and the roughness group for each image in the folder that is
    listed in the 'images' column of the valid-images CSV.
  """
  folder = os.path.basename(folder_dir)
  print("Labeling images in folder", folder)

  valid_images_csv = os.path.join('/gdrive/My Drive/Labeling/ValidImages', folder + '.csv')
  valid_images_set = set(pd.read_csv(valid_images_csv)['images'].unique())

  # Only the timestamp and distance columns of the Garmin record are needed.
  rec_df = pd.read_csv(os.path.join(garmin_dir, "record.csv"), index_col=0)
  rec_df = rec_df.filter(items=['utc_s (s)', 'utc_ms (ms)', 'distance (m)'])
  rough_df = pd.read_csv(os.path.join(virb_dir, "roughness_metric.csv"), index_col=0)

  labeled_images = [name for name in os.listdir(folder_dir) if name in valid_images_set]
  groups = [label_image(name, rec_df, rough_df) for name in labeled_images]
  videos = [folder] * len(labeled_images)

  return videos, labeled_images, groups

In [None]:
def label_image(image, rec_df, rough_df):
  """Return the roughness group found 5 meters ahead of a single image.

  The image timestamp is parsed from its name, converted to a distance via
  rec_df, advanced by 5 meters, converted back to a timestamp, and matched
  to the nearest reading in rough_df.

  NOTE(review): the index *label* of the matching rec_df row is handed to
  helpers that use it as an .iloc position; that is only equivalent when
  rec_df's index runs 0..n-1 - confirm against record.csv.
  """
  lookahead_m = 5
  image_s, image_ms = parse_times(image)

  # Garmin record.csv readings are always on X s, 0 ms, so the reading at or
  # before the image is the one sharing its whole-second value.
  same_second = rec_df['utc_s (s)'] == image_s
  start_index = rec_df[same_second].index.values[0]

  ahead_dist = calculate_target_distance(image_s, image_ms, start_index, lookahead_m, rec_df)
  ahead_index = closest_and_above(ahead_dist, start_index, rec_df)
  ahead_s, ahead_ms = target_time(ahead_index, ahead_dist, rec_df)

  return closest_roughness(rough_df, ahead_s, ahead_ms)

In [None]:
# Label all images: label() returns the per-folder image counts and the full
# labels DataFrame, and also writes that DataFrame to labels.csv.
valid_images_per_video, labels = label()

Labeling images in folder 2020-09-24-1
Labeling images in folder 2020-09-24-2
Labeling images in folder 2020-10-02-10-17-05-3
Labeling images in folder 2020-10-02-10-17-05-2
Labeling images in folder 2020-09-23-VIRB0001
Labeling images in folder 2020-09-23-VIRB0002
Labeling images in folder 2020-09-23-VIRB0004
Labeling images in folder 2020-09-23-VIRB0008
Labeling images in folder 2020-09-29-09-46-42-4
Labeling images in folder 2020-09-24-3
Labeling images in folder 2020-09-24-4
Labeling images in folder 2020-07-28-06-01-11
Labeling images in folder 2020-09-29-09-46-42-1
Labeling images in folder 2020-09-29-09-46-42-2
Labeling images in folder 2020-10-02-10-17-05-1
Labeling images in folder 2020-09-29-09-46-42-3


In [None]:
# Overall and per-video counts of labeled images.
print("\nNumber of valid images:", len(labels))
print("Valid images per video:")
for video_name, image_count in valid_images_per_video.items():
  print("  ", video_name, image_count)

# Number of images that fell into each roughness group 0-3.
print()
images_per_group = [len(labels[labels['group'] == group_id]) for group_id in range(4)]
for group_id, group_count in enumerate(images_per_group):
  print("Number of images in group", group_id, ":", group_count)

# Summary statistics of the assigned groups.
print()
print(labels['group'].describe())


Number of valid images: 8982
Valid images per video:
   2020-09-24-1 1434
   2020-09-24-2 250
   2020-10-02-10-17-05-3 515
   2020-10-02-10-17-05-2 0
   2020-09-23-VIRB0001 0
   2020-09-23-VIRB0002 0
   2020-09-23-VIRB0004 37
   2020-09-23-VIRB0008 79
   2020-09-29-09-46-42-4 659
   2020-09-24-3 1405
   2020-09-24-4 352
   2020-07-28-06-01-11 1262
   2020-09-29-09-46-42-1 1173
   2020-09-29-09-46-42-2 383
   2020-10-02-10-17-05-1 0
   2020-09-29-09-46-42-3 1433

Number of images in group 0 : 1569
Number of images in group 1 : 3678
Number of images in group 2 : 2066
Number of images in group 3 : 1669

count    8982.000000
mean        1.426965
std         0.982736
min         0.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         3.000000
Name: group, dtype: float64
