# Instructions


Check that the frames in the `data/*vid_name*/` folders match the `data/labels.csv` file.

You need a 1:1 mapping between frame files and rows of `labels.csv`, or the rest of the code in this package won't run.

A mismatch might be caused by a difference in FPS in labels vs extracted frames, an issue in frame extraction or an issue in your labels file.

This notebook will do the check and help debug a mismatch between frame files and `labels.csv`.

Note: Your labels might be out by 1 or 2 frames due to the granularity of your labels vs. the FPS - the easiest solution is to delete those extra frames manually.

# Setup

In [1]:
import numpy as np
import cv2
from time import time as timer
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
%matplotlib inline

In [2]:
# setup paths
# Resolve the project root: when the kernel runs inside the `notebooks` folder,
# step up one level. Only the final path component is inspected, so a parent
# directory that merely contains the text "notebooks" is left untouched.
_cwd = os.getcwd()
if os.path.basename(_cwd) == "notebooks":
    _cwd = os.path.dirname(_cwd)

# `pwd` always ends with a path separator because the paths below are built by
# plain string concatenation.
pwd = os.path.join(_cwd, "")
path_videos = pwd + 'videos/'
path_data = pwd + 'data_cnn_ts_3d/'

In [3]:
# read frames paths
# Each video is a sub-folder of `path_data`. Keeping directories only drops
# `labels.csv`, `.DS_Store` and any other stray file that would otherwise be
# treated as a video folder and make a later `os.listdir` call fail.
paths_videos = sorted(
    path_data + v + '/'
    for v in os.listdir(path_data)
    if os.path.isdir(os.path.join(path_data, v))
)

# Check we have a 1:1 mapping between labels and frames

In [4]:
# Look for the labels file in the same location the loading cell reads it from
# (`path_data + 'labels.csv'`), so a missing file is reported here with guidance.
path_labels = path_data + 'labels.csv'
if not os.path.exists(path_labels):
    print("ERROR: labels.csv missing - please copy labels.csv to {}".format(path_labels))
    print()
    print("Sample label file below:")
    print("""video\t\t\tframe \t\t\t\t label
    20160801_no9_1\t\t20160801_no9_1_00001.jpeg\tsearch
    20160801_no9_1\t\t20160801_no9_1_00002.jpeg\tsearch
    ...""")
    print()
    print("Note you also need a 'split' column that assigns videos to train/valid/test splits - can use /notebooks/helper_add_train_valid_test_splits_to_labels.ipynb to add splits")

In [5]:
# Read the labels table that sits alongside the per-video frame folders.
labels_csv = path_data + 'labels.csv'
labels = pd.read_csv(labels_csv)

In [6]:
# Normalise the frame file extension from .jpeg to .jpg.
# The pattern is an explicit, anchored regex: the dot is escaped so it only
# matches a literal ".", and `$` restricts the match to the end of the name.
# Passing `regex=` explicitly keeps the behaviour the same across pandas versions.
labels['frame'] = labels['frame'].str.replace(r'\.jpeg$', '.jpg', regex=True)

  labels['frame'] = labels['frame'].str.replace('.jpeg','.jpg')


In [7]:
# Preview the first rows of the labels table to sanity-check its columns.
labels.head()

Unnamed: 0,video,frame,label,split
0,trajs_2017-03-08_Ut_3064_door_3,trajs_2017-03-08_Ut_3064_door_3-0000.npy,pre-deboarding,train
1,trajs_2017-03-08_Ut_3064_door_3,trajs_2017-03-08_Ut_3064_door_3-0001.npy,pre-deboarding,train
2,trajs_2017-03-08_Ut_3064_door_3,trajs_2017-03-08_Ut_3064_door_3-0002.npy,pre-deboarding,train
3,trajs_2017-03-08_Ut_3064_door_3,trajs_2017-03-08_Ut_3064_door_3-0003.npy,pre-deboarding,train
4,trajs_2017-03-08_Ut_3064_door_3,trajs_2017-03-08_Ut_3064_door_3-0004.npy,pre-deboarding,train


In [8]:
# Per-video result messages, collected first so they can be printed grouped.
vids_error, vids_ok = [], []

for path_video in paths_videos:

    # the video name is the last path component (the trailing "/" is dropped first)
    vid_name = path_video[:-1].rsplit("/", 1)[-1]

    # frame files found on disk for this video, as a one-column dataframe
    paths_frames = pd.DataFrame(
        [f for f in os.listdir(path_video) if f != '.DS_Store'],
        columns=['frame_file'],
    )

    # label rows that belong to this video
    labels_vid = labels[labels['video'] == vid_name]

    # compare the two counts and file the message under the matching group
    n_labels, n_frames = len(labels_vid), len(paths_frames)
    if n_labels == n_frames:
        vids_ok.append("{} .::. Same number of labels and frames for vid".format(vid_name))
    else:
        vids_error.append("{} .::. Different number of labels ({}) than frames ({}) ... DIFF: {} ".format(vid_name, n_labels, n_frames, n_labels - n_frames))

# report the mismatching videos first ...
print("ERRORS: {} VIDS WITH #LABELS != #FRAMES".format(len(vids_error)))
for msg in vids_error:
    print(msg)

# ... then the videos whose counts agree
print("\n")
print("OK: {} VIDS WITH #LABELS == #FRAMES".format(len(vids_ok)))
for msg in vids_ok:
    print(msg)

ERRORS: 0 VIDS WITH #LABELS != #FRAMES


OK: 10 VIDS WITH #LABELS == #FRAMES
trajs_2017-03-08_Ut_3064_door_3 .::. Same number of labels and frames for vid
trajs_2017-06-03_Ut_3030_door_4 .::. Same number of labels and frames for vid
trajs_2017-06-14_Ut_3066_door_3 .::. Same number of labels and frames for vid
trajs_2017-09-28_Ut_3126_door_3 .::. Same number of labels and frames for vid
trajs_2017-12-16_Ut_700852_door_3 .::. Same number of labels and frames for vid
trajs_2018-02-06_Ut_3048_door_4 .::. Same number of labels and frames for vid
trajs_2018-02-09_Ut_830_door_3 .::. Same number of labels and frames for vid
trajs_2018-05-08_Ut_3038_door_4 .::. Same number of labels and frames for vid
trajs_2018-05-16_Ut_3040_door_3 .::. Same number of labels and frames for vid
trajs_2018-05-21_Ut_3072_door_4 .::. Same number of labels and frames for vid


In [9]:
# The labels file must reference as many distinct videos as there are video folders.
n_videos_in_labels = labels['video'].nunique()
n_video_folders = len(paths_videos)
assert n_videos_in_labels == n_video_folders, "Different number of videos in labels file than /data/"

# Find missing frames for a video

> For each video flagged above, this cell compares the frame files on disk with that video's rows in `labels.csv` and prints how many do not match up.

In [10]:
# For every video flagged by the count check, compare labels and frame files in
# BOTH directions: labels whose frame file is missing on disk, and frame files
# on disk that have no label row.
max_shown = 10  # cap on how many offending names are listed per video

for vid_error in vids_error:

    # each error message starts with the video name, followed by " .::."
    vid_name = vid_error.split(" .::.")[0]
    print(vid_name)

    # frame files on disk for this video, sorted by name
    frame_files = sorted(f for f in os.listdir(path_data + vid_name) if f != '.DS_Store')
    paths_frames = pd.DataFrame(frame_files, columns=['frame_file'])

    # label rows for this video
    labels_vid = labels[labels['video'] == vid_name]

    # An outer join keeps rows that exist on only one side; the `_merge`
    # indicator column records which side each unmatched row came from.
    matched = pd.merge(
        labels_vid,
        paths_frames,
        left_on='frame',
        right_on='frame_file',
        how='outer',
        indicator=True,
    )
    labels_without_frame = matched.loc[matched['_merge'] == 'left_only', 'frame']
    frames_without_label = matched.loc[matched['_merge'] == 'right_only', 'frame_file']

    print("  labels without a frame file: {}".format(len(labels_without_frame)))
    if len(labels_without_frame) > 0:
        print("    e.g. {}".format(labels_without_frame.head(max_shown).tolist()))

    print("  frame files without a label: {}".format(len(frames_without_label)))
    if len(frames_without_label) > 0:
        print("    e.g. {}".format(frames_without_label.head(max_shown).tolist()))