# Data Exploration

In [None]:
import sys
sys.path.insert(0,'../src')

In [None]:
# imports
import os
from datetime import datetime
from itertools import combinations

import torch
from torch.utils.data import DataLoader
from matplotlib import pyplot as plt
import seaborn as sns

from utils import ls
from data import ImageDataset, VideoDataset

In [None]:
# helper
def render_latex(df):
    # capitalise col names
    df.columns = [' '.join(map(lambda x: x[0].upper() + x[1:], col.split('_'))) for col in df.columns]
    
    # format df
    s = df.style.highlight_max(props='bfseries: ;')
    s.format(precision=2)
    
    # render latex
    opts = {"hrules": True, "position": "h"}
    return s.to_latex(**opts)

## Raw Data

In [None]:
# Collect one metadata record (split, clip id, recording date, duration in
# whole seconds) per raw video clip.
# pandas is otherwise only imported further down the notebook; import it here
# so the cell also works when the notebook is run top to bottom.
import pandas as pd
# NOTE(review): cv2, SPLITS and RAW_DATA_PATH are not defined in the visible
# cells -- confirm they are in scope before running this cell.

raw_meta = []
for split in SPLITS:
    for clip in sorted(ls(os.path.join(RAW_DATA_PATH, split))):
        # clip names are expected to look like "<yymmdd>_<number>"
        datestr, num = clip.split('_')
        date = datetime.strptime(datestr, "%y%m%d")

        filepath = os.path.join(RAW_DATA_PATH, split, clip, "video.mov")
        cap = cv2.VideoCapture(filepath)
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        finally:
            # always free the underlying video handle
            cap.release()

        # a non-positive frame rate means the file could not be read; fail
        # with the offending path instead of a bare ZeroDivisionError
        if fps <= 0:
            raise RuntimeError(f"could not read frame rate from {filepath}")

        # compute duration (truncated to whole seconds)
        duration = int(frame_count / fps)

        # save information
        raw_meta.append({
            "split": split,
            "clip": clip,
            "date": date,
            "seconds": duration
        })

raw_meta = pd.DataFrame(raw_meta)

In [None]:
# total recorded seconds per split (only numeric columns are summed)
clips_by_split = raw_meta.groupby("split")
clips_by_split.sum(numeric_only=True)

## ImageDataset

In [None]:
# build one ImageDataset per split, keyed by split name
image_data = {}
for split in SPLITS:
    image_data[split] = ImageDataset(
        split=split,
        include_classes=CLASSES,
        ratio=1.0,
    )

In [None]:
# wrap every split in a DataLoader that yields batches of 9 samples
image_loader = {}
for split in SPLITS:
    image_loader[split] = DataLoader(image_data[split], batch_size=9)

## Statistics about Processed Data

In [None]:
# some statistics about the data: clip count and total duration per split
# pandas is otherwise only imported further down the notebook; import it here
# so the cell also works when the notebook is run top to bottom.
import pandas as pd

statistics = []
for split in SPLITS:
    frames_by_clip = image_data[split].frames_by_clip
    clips = frames_by_clip.keys()

    num_clips = len(clips)
    num_frames = sum(len(frames_by_clip[clip]) for clip in clips)
    # duration is derived from the frame count at FPS frames per second
    total_seconds = int(num_frames / FPS)
    total_mins = round(total_seconds / 60)

    statistics.append({
        "split": split,
        "num_clips": num_clips,
        "total_seconds": total_seconds,
        "total_mins": total_mins
    })

stats = pd.DataFrame(statistics).set_index("split")
stats

In [None]:
# emit the LaTeX source of the statistics table
latex_table = render_latex(stats)
print(latex_table)

### Data Collection Timeline

In [None]:
# calendar heatmap: number of clips recorded on each day
import calplot
import numpy as np
import pandas as pd
from datetime import datetime
from collections import defaultdict

# tally the clips of all splits by recording date, parsed from the
# "<yymmdd>_<number>" clip name
clips_per_day = defaultdict(int)
for split in SPLITS:
    for clip in image_data[split].frames_by_clip.keys():
        day_str, clip_no = clip.split('_')
        recorded_on = datetime.strptime(day_str, '%y%m%d')
        clips_per_day[recorded_on] += 1

dates = pd.Series(clips_per_day)
calendar = calplot.calplot(dates, cmap='YlGn', colorbar=False)
fig, ax = calendar

fig.savefig("../report/figures/data-collection-freq.jpg")

In [None]:
# calendar plot showing which recording days received a special marker value
import calplot
import numpy as np
import pandas as pd
from datetime import datetime
from collections import defaultdict

# give every recording day the same base value; build a fresh mapping so the
# per-day clip counts in clips_per_day are not overwritten and the cell can be
# re-run safely
split_per_day = {day: 5 for day in clips_per_day}
# training days
split_per_day[datetime.strptime('230222', '%y%m%d')] = 10
split_per_day[datetime.strptime('230302', '%y%m%d')] = 10

dates = pd.Series(split_per_day)
# keep the returned figure: without this assignment `fig` would still point at
# a figure from an earlier cell and the wrong plot would be saved
fig, ax = calplot.calplot(dates, cmap='cool', colorbar=False)

fig.savefig("../report/figures/data-collection-splits.jpg")

## Data Validity

In [None]:
# verify disjointness of splits: no clip id may appear in more than one split
clips = {}
for split in SPLITS:
    clips[split] = set(image_data[split].frames_by_clip.keys())

for pair in combinations(SPLITS, 2):
    fst, snd = pair
    # the sets hold clip ids, so the overlap is counted in clips
    print(f"{pair} has {len(clips[fst] & clips[snd])} clips in common")

In [None]:
# verify even class distribution: one bar chart of class counts per split
# squeeze=False always returns a 2-D array of axes, so indexing also works
# when SPLITS holds a single entry
fig, axes = plt.subplots(
    ncols=len(SPLITS), figsize=(4*len(SPLITS), 3), squeeze=False
)
ax = axes[0]  # the single row of axes, one per split
for i, split in enumerate(SPLITS):
    dist = image_data[split].class_distribution
    sns.barplot(x=list(dist.keys()), y=list(dist.values()), ax=ax[i])
    ax[i].tick_params(axis='x', rotation=90)
    ax[i].set(title=f"{split.capitalize()} Split")

## Example Training Batch

In [None]:
# materialise every sample of the train split into a list
train_split = image_data["train"]
frames = [sample for sample in train_split]

In [None]:
# plot n_frames consecutive train samples, starting at start_idx, side by side
start_idx = 30
n_frames = 4
selected = frames[start_idx:start_idx + n_frames]
images, labels = zip(*selected)
# stack the individual image tensors along a new leading dimension
images = torch.stack(images)

fig, axs = plt.subplots(ncols=n_frames, figsize=(3*n_frames, 3))
id2class = image_data["train"].id2class
for idx, axis in enumerate(axs):
    show_image(images[idx], title=id2class[labels[idx]], ax=axis)
fig.tight_layout()

fig.savefig("../report/figures/data-example-batch.jpg")

In [None]:
# train split: show the first batch with its class names as titles
images, labels = next(iter(image_loader["train"]))

id2class = image_data["train"].id2class
titles = [id2class[label.item()] for label in labels]
show_images(images, titles=titles)

In [None]:
# test split: show the first batch with its class names as titles
images, labels = next(iter(image_loader["test"]))

id2class = image_data["test"].id2class
titles = [id2class[label.item()] for label in labels]
show_images(images, titles=titles)

## Video Dataset

In [None]:
# build the video dataset from the configuration returned by
# VideoDataset.default_config(), passed on as keyword arguments
config = VideoDataset.default_config()
video_dataset = VideoDataset(**config)

In [None]:
# Render one randomly chosen clip of the video dataset as an animation, save
# it to animation.mp4 and embed it in the notebook.
import random
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import numpy as np
from IPython.display import HTML

# randrange excludes its upper bound; random.randint(0, len(...)) includes it
# and could therefore index one past the end of the dataset
clip, labels = video_dataset[random.randrange(len(video_dataset))]
image_tensors = list(clip)

# create a figure and axis object
fig, ax = plt.subplots()

# create an empty image object to hold the current frame
# NOTE(review): permute(1, 2, 0) presumably turns channel-first frames into
# the channel-last layout imshow expects -- confirm against VideoDataset
im = ax.imshow(np.zeros_like(image_tensors[0].permute(1,2,0)))

# define the update function that will be called for each frame
def update(i):
    im.set_data(image_tensors[i].permute(1, 2, 0))
    return [im]

ani = animation.FuncAnimation(fig, update, frames=len(image_tensors), interval=500, blit=True)

ani.save('animation.mp4', writer='ffmpeg')

HTML('<video controls src="animation.mp4" />')

In [None]:
# number of clips in the video dataset
len(video_dataset)

In [None]:
# distinct clip lengths and how often each occurs
# (expected: only clips with 10 frames in length)
clip_lengths = [len(clip) for clip, _ in video_dataset]
np.unique(clip_lengths, return_counts=True)