# Data Exploration

In [None]:
import sys
# Put ../src first on the module search path; presumably this is where the
# project-local modules imported in the next cell live — verify against the repo layout.
sys.path.insert(0,'../src')

In [None]:
# imports
import os
import io
import time
from datetime import datetime

import cv2
import torch
from torch.utils.data import DataLoader, Subset
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from defaults import DEFAULT
from config import RAW_DATA_PATH
from utils import ls, show_images
from data import ImageDataset, VideoDataset
from transform import ImageTransform, VideoTransform
from model import ImageClassifier, VideoClassifier

In [None]:
# Apply seaborn's "darkgrid" style to the plots drawn in this notebook.
sns.set_style("darkgrid")

## Raw Data

In [None]:
# Collect one metadata record per raw clip found under
# RAW_DATA_PATH/<split>/<clip>/video.mov: split, clip name, recording date,
# duration in whole seconds and frame count (in thousands).
clip_records = []
for split in ["train", "test"]:
    for clip in sorted(ls(os.path.join(RAW_DATA_PATH, split))):
        # clip directories are named "<yymmdd>_<num>"; only the date part is used
        datestr, num = clip.split('_')
        date = datetime.strptime(datestr, "%y%m%d")

        filepath = os.path.join(RAW_DATA_PATH, split, clip, "video.mov")
        cap = cv2.VideoCapture(filepath)
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        finally:
            # always free the decoder handle, one is opened per clip
            cap.release()

        # OpenCV reports 0 for properties it cannot read (e.g. an unreadable
        # file); treat such clips as zero-length instead of dividing by zero
        duration = int(frame_count / fps) if fps > 0 else 0

        # save information
        clip_records.append({
            "split": split,
            "clip": clip,
            "date": date,
            "seconds": duration,
            "frames": frame_count / 1000  # stored in thousands of frames
        })

raw_meta = pd.DataFrame(clip_records)

In [None]:
# Totals by day: sums the numeric columns of raw_meta ("seconds" and
# "frames") over all clips recorded on the same date.
raw_meta.groupby("date").sum(numeric_only=True)

In [None]:
# Totals by split: sums the numeric columns of raw_meta ("seconds" and
# "frames") over all clips of the train and of the test split.
raw_meta.groupby("split").sum(numeric_only=True)

In [None]:
# Totals by split and date: sums the numeric columns ("seconds", "frames").
# GroupBy.sum takes no column name; its first parameter is `numeric_only`,
# so it is passed by keyword here, exactly like the two cells above.
raw_meta.groupby(["split", "date"]).sum(numeric_only=True)

## Processed Data

In [None]:
# Untransformed (transform=None) frame datasets, one per split.
train_images, test_images = (
    ImageDataset(split=split_name, transform=None)
    for split_name in ("train", "test")
)

# The frame counts are read straight from len() of each dataset.
num_train_images, num_test_images = len(train_images), len(test_images)

print(f"Number of training images: {num_train_images}")
print(f"Number of test images: {num_test_images}")

In [None]:
# video dataset: number of clips when one clip spans
# num_frames * frame_rate frames of footage recorded at 30 frames per second
frame_rate = 8
num_frames = 8
clip_duration = num_frames * frame_rate / 30
train_clips = VideoDataset(split="train", clip_duration=clip_duration, transform=None)
test_clips = VideoDataset(split="test", clip_duration=clip_duration, transform=None)

# Clips are counted by iterating over each dataset once.
num_train_clips = sum(1 for _ in train_clips)
num_test_clips = sum(1 for _ in test_clips)

# The label is derived from clip_duration (rounds to "2.1") so it cannot
# drift from the value actually used above.
print(f"Number of training clips (Clip Duration: {clip_duration:.1f}s): {num_train_clips}")
print(f"Number of test clips (Clip Duration: {clip_duration:.1f}s): {num_test_clips}")

In [None]:
# video dataset: number of clips for a clip duration of 1 second
clip_duration = 1
train_clips = VideoDataset(split="train", clip_duration=clip_duration, transform=None)
test_clips = VideoDataset(split="test", clip_duration=clip_duration, transform=None)

# Clips are counted by iterating over each dataset once.
num_train_clips = sum(1 for _ in train_clips)
num_test_clips = sum(1 for _ in test_clips)

# The label is built from clip_duration so it always matches the value used.
print(f"Number of training clips (Clip Duration: {clip_duration}s): {num_train_clips}")
print(f"Number of test clips (Clip Duration: {clip_duration}s): {num_test_clips}")

In [None]:
# video dataset: number of clips for a clip duration of 2 seconds
clip_duration = 2
train_clips = VideoDataset(split="train", clip_duration=clip_duration, transform=None)
test_clips = VideoDataset(split="test", clip_duration=clip_duration, transform=None)

# Clips are counted by iterating over each dataset once.
num_train_clips = sum(1 for _ in train_clips)
num_test_clips = sum(1 for _ in test_clips)

# The label is built from clip_duration so it always matches the value used.
print(f"Number of training clips (Clip Duration: {clip_duration}s): {num_train_clips}")
print(f"Number of test clips (Clip Duration: {clip_duration}s): {num_test_clips}")

### Data Collection

In [None]:
# calendar plot for timing
import calplot

# calplot needs a date-indexed Series, so select the "seconds" column before
# summing: this yields the total recorded seconds per day. (Wrapping the
# grouped DataFrame in pd.Series does not produce such a series.)
dates = raw_meta.groupby("date")["seconds"].sum()
fig, ax = calplot.calplot(dates, cmap='YlGn', colorbar=False);

# fig.savefig("../report/figures/data-collection-freq.jpg")

## ImageDataset

The `ImageDataset` class is used for all image classification modules. After pre-processing, the frames are stored in the following way

```
filepath
|-class1
|  |__ frame1.jpg
|  |__ frame2.jpg
|  |__ ...
|_class2
|  |__ frame1.jpg
|  |__ frame2.jpg
|  |__ ...
```

With this, all frames can easily be read into a `torchvision.datasets.ImageFolder` class, which takes in `filepath` and a custom `transforms` object.


In [None]:
# Build the transform from the ResNet18 defaults and create both splits
# with that same transform.
config = DEFAULT["resnet18"]
transform = ImageTransform(**config["transform"])
train_image_data, test_image_data = (
    ImageDataset(split=split_name, transform=transform)
    for split_name in ("train", "test")
)

The `ImageDataset` instances `train_image_data` and `test_image_data` provide handy ways to load unbatched samples by implementing the `__iter__` and `__getitem__` dunder methods. Let's see how many frames there are in each split.

In [None]:
# total number of frames: training count first, test count second, one per line
print(len(train_image_data), len(test_image_data), sep="\n")

Why is the test split way larger? That is because the `preprocess` script extracts the frames of the train and test split differently. It specifies that only 1 frame per second (FPS) should be sampled for the training split, because it is hypothesised that including too many overly similar training frames will make the model overfit heavily. For the test split, however, this concern does not apply: we would like to evaluate the model on all frames at the normal frame rate of 30FPS to get test metrics that are as robust as possible.

Let's see if we can say something about the class distribution.

In [None]:
# num classes: the class count reported by the training image dataset
print(train_image_data.num_classes)

In [None]:
# Render the id2class mapping as a table; after the transpose each key of
# the mapping is a row and the single column is named "class".
pd.DataFrame(train_image_data.id2class, index=["class"]).T

In [None]:
# class distribution for the train and test split
def _count_classes(class_ids):
    """Return a frame with one row per class id: columns "class" and "count"."""
    counts = pd.DataFrame(class_ids, columns=["class"]).value_counts().to_frame().reset_index()
    counts.columns = ["class", "count"]
    return counts


def _plot_class_counts(counts, id2class, title, ax):
    """Draw the per-class counts on `ax`, labelling every bar with its own class name."""
    # Fix the bar order explicitly and build the tick labels from that same
    # order: a class that is absent from a split then cannot shift the labels
    # of the remaining bars.
    order = sorted(counts["class"])
    sns.barplot(data=counts, x="class", y="count", order=order, palette="Dark2", ax=ax)
    ax.set_xticks(range(len(order)))
    ax.set_xticklabels([id2class[class_id] for class_id in order], rotation=90)
    ax.set(title=title, xlabel="Class", ylabel="Count")


# class id of every frame, taken from the (path, class id) pairs of the image folder
train_classes = [cls for (_, cls) in train_image_data.dataset.imgs]
test_classes = [cls for (_, cls) in test_image_data.dataset.imgs]

train_class_dist = _count_classes(train_classes)
test_class_dist = _count_classes(test_classes)

fig, axs = plt.subplots(ncols=2, figsize=(12, 6))
_plot_class_counts(train_class_dist, train_image_data.id2class, "Train", axs[0])
_plot_class_counts(test_class_dist, test_image_data.id2class, "Test", axs[1]);

Looks about right. The testing distribution is not quite the same as in the training data, but that is fine. After all, we don't expect people to always take the same route through the building. With this established, let's look at an example batch, as used in the training loop of image classifiers.

To achieve that, let's first define a data loader (with the configurations for ResNet18)

In [None]:
# data loaders: both splits share the loader settings of the active config
train_image_loader, test_image_loader = (
    DataLoader(split_data, **config["loader"])
    for split_data in (train_image_data, test_image_data)
)

# len() of a loader is its number of batches
print(
    f"Batch Size: {config['loader']['batch_size']}",
    f"Training Batches: {len(train_image_loader)}",
    f"Testing Batchs: {len(test_image_loader)}",
    sep="\n",
)

We see that with a batch size of 32, there are 70 training batches and 734 testing batches. In the training loop, we want to have some knowledge about the out-of-sample performance. Because we don't do any hyperparameter tuning, we can use the test split as a validation split. However, we don't want to test on ~22k frames after each epoch. For this reason we use the `torch.utils.data.Subset` class and sample 5% of the entire testing data as an approximation of the test split, which we call the validation split. Let's see how that would look.

In [None]:
# Validation split: a random 5% sample of the *test* split.
k = int(len(test_image_data) * 0.05)
# replace=False draws distinct indices, so no frame appears twice
indices = np.random.choice(len(test_image_data), k, replace=False)
# The indices were drawn from the test split's range, so they must index
# the test data (indexing the training data with them picks the wrong
# frames and can run past its end).
val_image_data = Subset(test_image_data, indices)
val_image_loader = DataLoader(val_image_data, **config["loader"])

print(f"Validation Samples: {len(val_image_data)}")
print(f"Validation Batches: {len(val_image_loader)}")

We get 5% of the test split, randomly sampled, for validation. Nice! Let's look at an example batch now.

In [None]:
# Take one batch from the training loader and keep its first nine samples.
frames, labels = next(iter(train_image_loader))
frames = frames[:9]
labels = labels[:9]

# Each title is the class name looked up from the sample's integer label.
show_images(
    frames,
    titles=[train_image_data.id2class[class_id] for class_id in labels.tolist()],
    unnormalise=True,
);

Looks perfect! We see that the training batch is shuffled, the annotation and class-id mapping seem to work, and the transforms applied to the images work.

## VideoDataset

Let's now turn to the `VideoDataset` class. It is conceptually similar, as it is just a wrapper around the `LabeledVideoDataset` class provided by `pytorchvideo`. It is an instance of a PyTorch `IterableDataset` and therefore does not provide the number of samples through the `__len__` method.

To showcase the video dataset class, we will use the configuration specification of `R2+1D(18)`:

In [None]:
# Switch the active configuration to the R(2+1)D-18 defaults.
config = DEFAULT["r2plus1d_18"]

video_transform = VideoTransform(**config["transform"])

# Both splits share the dataset settings and the transform; only the test
# split is given an explicit sequential sampler.
train_video_data = VideoDataset(
    split="train",
    transform=video_transform,
    **config["dataset"],
)
test_video_data = VideoDataset(
    split="test",
    transform=video_transform,
    sampler="sequential",
    **config["dataset"],
)

We cannot get the number of videos/ clips from this instance, but we can iterate over it. Let's do that to count the number of training and testing clips

In [None]:
# Count the clips by iterating over each dataset once; the samples
# themselves are discarded.
num_train_clips = sum(1 for _ in train_video_data)
num_test_clips = sum(1 for _ in test_video_data)

In [None]:
# Report the configured clip duration and, per split, the clip count
# together with the footage it covers (count * clip duration).
print(
    f"Clip Duration: {config['dataset']['clip_duration']}s",
    f"Training Clips: {num_train_clips} ({num_train_clips * config['dataset']['clip_duration']}s)",
    f"Testing Clips: {num_test_clips} ({num_test_clips * config['dataset']['clip_duration']}s)",
    sep="\n",
)

Nice, but it looks like we are losing almost 50% of the data, because most video clips have been preprocessed to a max length of 5s, which means that the remaining 2.4s of each video clip are not used.

Let's look at how a single sample can be obtained from the training video dataset.

In [None]:
# Draw the first sample from the training video dataset and list the keys it carries.
sample = next(iter(train_video_data))
print(sample.keys())

We can see that the labelled video dataset class returns a sample as dictionary, with the following keys:

- `video`: `torch.Tensor` of dim (C,T,H,W), the actual clip
- `video_name`: `str`, name of video
- `video_index`: `int`
- `clip_index`: `int`
- `label`: `str`, class of clip

This means that before passing it into a model, we will have to 1) extract the video tensor and 2) encode the string representation of the label.

Another important thing is the differentiation between a video and a clip. Video classification models cannot handle arbitrary length videos (dim T in tensor), but have some fixed capacity. On top of that, they define the parameter `sample_rate`, which specifies the rate at which frames are taken into account from the original 30FPS stream of frames.

In the example of `x3d_s` the sample rate is 6 and the number of frames is 13. This means that for a sequence of frames `x`, only the frames marked `o` are used in the model. Because the model expects `13` frames, one training/inference clip has to consist of at least 6*13=78 frames. In a 30FPS video, this corresponds to ~2.6s. The VideoDataset class handles all that itself, but this is the reason why there is a clip index.

```
oxxxxxoxxxxxoxxxxxo...
```

If one sets `sampler="random"` then the video clips are sampled randomly. A video always starts at its first clip and, if it gets sampled again, continues with the next clip, provided there are enough seconds left.

If one sets `sampler="sequential"` then the video clips are sampled sequentially. 

In [None]:
import ipywidgets as widgets

def display_video(video, label, config, delay=1.0):
    """Play a normalised clip frame by frame inside an image widget.

    Args:
        video: clip tensor; it is permuted with (1, 0, 2, 3) so that iterating
            yields single frames, i.e. the input is expected as (C, T, H, W).
        label: title drawn above every frame.
        config: configuration whose ``config["transform"]["mean"]`` and
            ``config["transform"]["std"]`` are the per-channel statistics used
            to undo the normalisation.
        delay: pause between two frames in seconds (default 1.0, the value
            that used to be hard-coded).
    """
    stats = config["transform"]
    # per-channel statistics, shaped (C, 1, 1) to broadcast over a (C, H, W) frame
    mean = torch.as_tensor(stats["mean"], dtype=torch.float32, device=video.device)[:, None, None]
    std = torch.as_tensor(stats["std"], dtype=torch.float32, device=video.device)[:, None, None]
    frames = video.permute(1, 0, 2, 3).float()

    video_widget = widgets.Image(format='jpeg')

    # display the widget
    display(video_widget)

    # One dedicated figure is reused for every frame and closed afterwards,
    # so images do not pile up on the implicit pyplot figure and no stray
    # static figure is rendered below the widget.
    fig, ax = plt.subplots()
    try:
        ax.set_title(label)
        artist = None
        for frame in frames:
            # undo the normalisation; clamp so the uint8 cast cannot wrap around
            pixels = ((frame * std + mean).clamp(0, 1) * 255.0).permute(1, 2, 0)
            image = pixels.to(torch.uint8).cpu().numpy()
            if artist is None:
                artist = ax.imshow(image)
            else:
                artist.set_data(image)

            buffer = io.BytesIO()
            fig.savefig(buffer, format='jpeg')
            video_widget.value = buffer.getvalue()
            time.sleep(delay)
    finally:
        plt.close(fig)

display_video(sample["video"], sample["label"], config)

Looks nice. Let's define a loader to observe the different video/clip sampling behavior for the train and test split.

In [None]:
# One loader per split, both built from the loader settings of the active config.
train_video_loader = DataLoader(train_video_data, **config["loader"])
test_video_loader = DataLoader(test_video_data, **config["loader"])


def _print_batch_summary(batch):
    """Print the tensor shape, video names, clip indices and labels of one batch."""
    print(f"Batch Tensor Shape: {batch['video'].shape}")
    print(f"Video Names: {batch['video_name']}")
    print(f"Clip Index: {batch['clip_index']}")
    print(f"Labels: {batch['label']}")


print(f"Batch Size: {config['loader']['batch_size']}")

# first batch of the training loader
print("\nTraining (Random Video Sampling)")
train_batch = next(iter(train_video_loader))
_print_batch_summary(train_batch)

# first batch of the test loader
print("\nTesting (Sequential Video Sampling)")
test_batch = next(iter(test_video_loader))
_print_batch_summary(test_batch)