# Auto-labeling script for TSM 2

All bounds checking and filtering of images is done in ValidImages_TSM2.ipynb.

## Set-up

In [1]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import matplotlib.pyplot as plt
from collections import namedtuple
import csv
import re

## Helper functions

In [3]:
# Difference in ms between t1 and t2
# Negative if s1 is before s2, positive if s2 is before s1
def findDifferenceMs(s1, ms1, s2, ms2):
  if (s1 > s2):
      return (s1-s2-1)*1000 + (1000-ms2) + ms1
  elif (s1 < s2):
      return -((s2-s1-1)*1000 + (1000-ms1) + ms2)
  else:
      return ms1-ms2

# Parse s and ms from image name in format XsYms
def parse_times(name):
  lhs, rhs = name.split("s", 1)
  rhs, ignore = rhs.split("ms", 1);
  return int(lhs), int(rhs)

In [4]:
# Closest reading exactly at or after a given timestamp
def closest_and_after(rough_df, t1_s, t1_ms):
  filtered = rough_df[rough_df['utc_s'] >= t1_s]

  for _, row in filtered.iterrows():
    if findDifferenceMs(t1_s, t1_ms, row['utc_s'], row['utc_ms'] >= 0):
      return row['roughness_group_2'], row['roughness_group_3'], row['roughness_group_4'], row['roughness_group_original']
  
  return -1, -1, -1, -1

## Auto-labeling

#### Dictionary with CSV directories for each video

`CSVDirs` is a `namedtuple` holding the Virb and Garmin CSV files for each folder of images.

`csv_path` returns a `namedtuple CSVDirs` with the path to the Virb and Garmin files for a given folder of iamges.

`video_csvs` is a dictionary mapping each folder of iamges to the corresponding `CSVDirs`. To access the Virb or Garmin files for a video:

- `video_csvs[video_name].virb_dir`
- `video_csvs[video_name].garmin_dir`

In [5]:
CSVDirs = namedtuple('CSVDirs', ['virb_dir', 'garmin_dir'])

def csv_path(folder):
  root_csv_dir = '/gdrive/My Drive/Labeling/SensorData'
  virb_dir = os.path.join(root_csv_dir, 'Virb', folder)
  garmin_dir = os.path.join(root_csv_dir, 'Garmin', folder + '-garmin')
  return CSVDirs(virb_dir, garmin_dir)

video_csvs = {
    "2020-07-28-06-01-11": csv_path('2020-07-28-06-01-11'),
    "2020-09-23-VIRB0001": csv_path('2020-09-23-16-10-10'),
    "2020-09-23-VIRB0002": csv_path('2020-09-23-16-10-10'),
    "2020-09-23-VIRB0004": csv_path('2020-09-23-16-10-10'),
    "2020-09-23-VIRB0008": csv_path('2020-09-23-16-10-10'),
    "2020-09-24-1": csv_path('2020-09-24-12-07-41'),
    "2020-09-24-2": csv_path('2020-09-24-12-07-41'),
    "2020-09-24-3": csv_path('2020-09-24-12-07-41'),
    "2020-09-24-4": csv_path('2020-09-24-12-07-41'),
    "2020-09-29-09-46-42-1": csv_path('2020-09-29-09-46-42'),
    "2020-09-29-09-46-42-2": csv_path('2020-09-29-09-46-42'),
    "2020-09-29-09-46-42-3": csv_path('2020-09-29-09-46-42'),
    "2020-09-29-09-46-42-4": csv_path('2020-09-29-09-46-42'),
    "2020-10-02-10-17-05-1": csv_path('2020-10-02-10-17-05'),
    "2020-10-02-10-17-05-2": csv_path('2020-10-02-10-17-05'),
    "2020-10-02-10-17-05-3": csv_path('2020-10-02-10-17-05'),
}

#### Label images

In [6]:
# Label all images in Labeling/VideoData/RealVideoSplit/VideoFrames
def label():

  valid_images_per_video = {}

  col_videos = []
  col_images = []
  col_groups_2 = []
  col_groups_3 = []
  col_groups_4 = []
  col_groups_original = []

  frame_directory = '/gdrive/My Drive/Labeling/VideoData/RealVideoSplit/VideoFrames'
  for root, dirs, files in os.walk(frame_directory):
    if root == '/gdrive/My Drive/Labeling/VideoData/RealVideoSplit/VideoFrames':
      for dir in dirs:
        if dir in video_csvs:
          virb_dir = video_csvs[dir].virb_dir
          garmin_dir = video_csvs[dir].garmin_dir
          v, i, g2, g3, g4, g = label_images_in_folder(os.path.join(root, dir), virb_dir, garmin_dir)
          col_videos.extend(v)
          col_images.extend(i)
          col_groups_2.extend(g2)
          col_groups_3.extend(g3)
          col_groups_4.extend(g4)
          col_groups_original.extend(g)
          valid_images_per_video[dir] = len(i)
  
  labels = pd.DataFrame()
  labels['video'] = col_videos
  labels['image'] = col_images
  labels['group_2'] = col_groups_2
  labels['group_3'] = col_groups_3
  labels['group_4'] = col_groups_4
  labels['group_original'] = col_groups_original
  
  labels.to_csv('/gdrive/My Drive/Labeling/LabelsCSV/all_labels_metric_2.csv', index=False)

  return valid_images_per_video, labels

In [7]:
# Return lists with the video name, image name, and roughness group for all 
# images in the folder
def label_images_in_folder(folder_dir, virb_dir, garmin_dir):

  v = []
  i = []
  g2 = []
  g3 = []
  g4 = []
  g = []

  folder = os.path.split(folder_dir)[1]
  print("Labeling images in folder", folder)

  valid_images_df = pd.read_csv(os.path.join('/gdrive/My Drive/Labeling/ValidImages/Metric_2', str(folder + '.csv')))
  valid_images_set = set(valid_images_df['images'].unique())

  rec_df = pd.read_csv(os.path.join(garmin_dir, "record.csv"), index_col=0)
  rec_df = rec_df.filter(items=['utc_s (s)', 'utc_ms (ms)', 'distance (m)'])
  rough_df = pd.read_csv(os.path.join(virb_dir, "roughness_metric_2.csv"), index_col=0)

  images = os.listdir(folder_dir)
  for image in images:
    if not image in valid_images_set:
      continue
   
    group_2, group_3, group_4, group_original = label_image(image, rec_df, rough_df)
    v.append(folder)
    i.append(image)
    g2.append(group_2)
    g3.append(group_3)
    g4.append(group_4)
    g.append(group_original)
  
  return v, i, g2, g3, g4, g

In [8]:
# Return the roughness group corresponding to 5 meters ahead of a single image
def label_image(image, rec_df, rough_df):
  t1_s, t1_ms = parse_times(image)
  roughness_2, roughness_3, roughness_4, roughness_original = closest_and_after(rough_df, t1_s, t1_ms)
  return roughness_2, roughness_3, roughness_4, roughness_original

In [9]:
# Label all images
valid_images_per_video, labels = label()

Labeling images in folder 2020-09-24-1
Labeling images in folder 2020-09-24-2
Labeling images in folder 2020-10-02-10-17-05-3
Labeling images in folder 2020-10-02-10-17-05-2
Labeling images in folder 2020-09-23-VIRB0001
Labeling images in folder 2020-09-23-VIRB0002
Labeling images in folder 2020-09-23-VIRB0004
Labeling images in folder 2020-09-23-VIRB0008
Labeling images in folder 2020-09-29-09-46-42-4
Labeling images in folder 2020-09-24-3
Labeling images in folder 2020-09-24-4
Labeling images in folder 2020-07-28-06-01-11
Labeling images in folder 2020-09-29-09-46-42-1
Labeling images in folder 2020-09-29-09-46-42-2
Labeling images in folder 2020-10-02-10-17-05-1
Labeling images in folder 2020-09-29-09-46-42-3


In [10]:
print("\nNumber of valid images:", len(labels))
print("Valid images per video:")
for k in valid_images_per_video:
  print("  ", k, valid_images_per_video[k])


Number of valid images: 12118
Valid images per video:
   2020-09-24-1 1437
   2020-09-24-2 263
   2020-10-02-10-17-05-3 517
   2020-10-02-10-17-05-2 188
   2020-09-23-VIRB0001 78
   2020-09-23-VIRB0002 89
   2020-09-23-VIRB0004 37
   2020-09-23-VIRB0008 77
   2020-09-29-09-46-42-4 664
   2020-09-24-3 1434
   2020-09-24-4 353
   2020-07-28-06-01-11 1367
   2020-09-29-09-46-42-1 1416
   2020-09-29-09-46-42-2 1331
   2020-10-02-10-17-05-1 1422
   2020-09-29-09-46-42-3 1445


In [12]:
for i in range(2, 5):
  print('K-means roughness metric with k =', i)
  group_str = 'group_' + str(i)
  images_per_group = [len(labels[labels[group_str] == j]) for j in range(0,i)]
  for j in range(0, i):
    print("Number of images in group", j, ":", images_per_group[j])
  print()

print('Original roughness metric')
images_per_group = [len(labels[labels['group_original'] == j]) for j in range(0, 4)]
for j in range(0, 4):
  print("Number of images in group", j, ":", images_per_group[j])
print()

K-means roughness metric with k = 2
Number of images in group 0 : 9136
Number of images in group 1 : 2982

K-means roughness metric with k = 3
Number of images in group 0 : 6464
Number of images in group 1 : 4291
Number of images in group 2 : 1363

K-means roughness metric with k = 4
Number of images in group 0 : 5005
Number of images in group 1 : 4394
Number of images in group 2 : 2043
Number of images in group 3 : 676

Original roughness metric
Number of images in group 0 : 2381
Number of images in group 1 : 4952
Number of images in group 2 : 2626
Number of images in group 3 : 2159



In [13]:
for i in range(2, 5):
  print('Labels for k-means roughness metric with k =', i)
  group_str = 'group_' + str(i)
  print(labels[group_str].describe())
  print()

print('Labels for original roughness metric')
print(labels['group_original'].describe())

Labels for k-means roughness metric with k = 2
count    12118.000000
mean         0.246080
std          0.430744
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: group_2, dtype: float64

Labels for k-means roughness metric with k = 3
count    12118.000000
mean         0.579056
std          0.684648
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          2.000000
Name: group_3, dtype: float64

Labels for k-means roughness metric with k = 4
count    12118.000000
mean         0.867140
std          0.887224
min          0.000000
25%          0.000000
50%          1.000000
75%          1.000000
max          3.000000
Name: group_4, dtype: float64

Labels for original roughness metric
count    12118.000000
mean         1.376547
std          0.992038
min          0.000000
25%          1.000000
50%          1.000000
75%          2.000000
max          3.000000
Name: group_original, dty