In [17]:
import math
import numpy as np
import torch
from torch import nn
from torchvision.io import VideoReader
from torchvision.io import write_jpeg, read_image
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import transforms
import os
from note_detector.python.video_note_detector import generate_labels
import matplotlib.pyplot as plt

# Choose the compute backend in order of preference: CUDA GPU, then Apple's
# Metal backend (MPS), then the CPU as the fallback.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [3]:
# Function to convert videos to valid training data that we can use, in the structure shown in ./training_data.
# The ith element of the array in "training_data/labels.npy" is the label(s) for the image named "i.jpg" in "training_data/frames"

def convert_video_to_training_data(vid_path, vid_name, resize_h, resize_w):
    """Append one video's frames and per-frame note labels to ./training_data.

    Each decoded frame is resized to ``(resize_h, resize_w)`` and written to
    ``./training_data/frames/<i>.jpg``, where ``i`` continues from the number
    of labels already stored in ``training_data/labels.npy``. The
    ``(frame, note)`` pairs returned by ``generate_labels`` are grouped into
    one list of notes per frame and appended to that labels file, so element
    ``i`` of the saved array is the label list for image ``i.jpg``.

    The appended label list always has exactly one entry per frame written.
    If ``generate_labels`` reports a different frame count, a warning is
    printed and labels pointing outside the written frames are dropped;
    otherwise the image/label indices of this and every later video would
    drift apart.

    Args:
        vid_path: Directory that contains the video.
        vid_name: File name of the video inside ``vid_path``.
        resize_h: Height in pixels of the saved frames.
        resize_w: Width in pixels of the saved frames.
    """
    labels_file = "training_data/labels.npy"

    all_labels = []
    if os.path.isfile(labels_file):
        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # load a labels file that this notebook produced itself.
        all_labels = np.load(labels_file, allow_pickle=True).tolist()
    first_index = len(all_labels)

    # Create the output folders up front so a first run does not depend on
    # them already existing.
    os.makedirs("./training_data/frames", exist_ok=True)

    print("processing file", vid_name)

    # get frames of video and save sequentially in training_data/frames
    resize = transforms.Resize((resize_h, resize_w))  # loop-invariant, build once
    reader = VideoReader(os.path.join(vid_path, vid_name), "video")
    frames_written = 0
    for frame in reader:
        img_name = "./training_data/frames/%s.jpg" % (first_index + frames_written)
        write_jpeg(resize(frame["data"]), img_name)
        frames_written += 1

    print("done w video read")

    # use library to get labels for each frame
    video_labels, num_frames = generate_labels(vid_path, vid_name)
    if num_frames != frames_written:
        print(
            "warning: generate_labels reported %s frames but %s frames were written for %s"
            % (num_frames, frames_written, vid_name)
        )

    # One (possibly empty) list of notes per written frame keeps labels.npy
    # index-aligned with the numbered JPEGs.
    labels_per_frame = [[] for _ in range(frames_written)]
    dropped = 0
    for frame_idx, note in video_labels:
        frame_idx = int(frame_idx)
        if 0 <= frame_idx < frames_written:
            labels_per_frame[frame_idx].append(note)
        else:
            dropped += 1
    if dropped:
        print("warning: dropped %s labels that referenced frames outside the video" % dropped)

    all_labels.extend(labels_per_frame)
    np.save(labels_file, np.array(all_labels, dtype=object))

    print("done w labeling")

In [11]:
# Call the above function to generate usable training data

# Resolution (height, width) every extracted frame is resized to before saving.
FRAME_HEIGHT, FRAME_WIDTH = 1080, 1920

# Explicit confirmation guard: the conversion writes into training_data.
answer = input("Are you sure you want to modify the training_data folder? (y/n)")

if answer == "y":
    # Order matters: frames are numbered sequentially across videos.
    for clip_name in (
        "dumb_scale_youtube.mp4",
        "e1.MOV",
        "e2_untrimmed.MOV",
        "e3_untrimmed.MOV",
        "e5.MOV",
    ):
        convert_video_to_training_data("./vids/", clip_name, FRAME_HEIGHT, FRAME_WIDTH)

processing file e5.MOV
done w video read
done w labeling


In [18]:
# define the dataset

class NoteDataset(Dataset):
    """Map-style dataset pairing each saved frame image with its note labels.

    Sample ``i`` is the image ``./training_data/frames/i.jpg`` together with
    element ``i`` of ``training_data/labels.npy``.

    NOTE(review): each label is a list of notes whose length can differ from
    frame to frame (it may be empty) -- presumably this needs a custom
    ``collate_fn`` or a fixed-size encoding before batching; confirm.
    """

    def __init__(self):
        # The labels file holds an object array; keep it as plain Python lists.
        stored = np.load("training_data/labels.npy", allow_pickle=True)
        self.labels = stored.tolist()

    def __len__(self):
        """Return the number of labelled frames."""
        n_samples = len(self.labels)
        return n_samples

    def __getitem__(self, idx):
        """Return ``(image, labels)`` for the frame numbered ``idx``."""
        frame_path = "./training_data/frames/%s.jpg" % idx
        image = read_image(frame_path)
        return image, self.labels[idx]

In [19]:
# create the dataloader

# Number of (frame, labels) samples drawn per training step.
batch_size = 64
# Frame height/width in pixels; same values as the 1080x1920 resize target
# used when the frames were generated.
FIXED_H, FIXED_W = 1080, 1920

# Load the stored labels, then serve the samples in shuffled mini-batches.
train_dataset = NoteDataset()
dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# define the CNN

# each input to the network is 3x1080x1920

class NeuralNetwork(nn.Module):
    """Convolutional network skeleton for the note-detection frames.

    Only the first convolution is in place: it maps a 3-channel input to 32
    feature maps of the same height and width. The remaining layers and the
    classification head are still to be added.

    TODO: add pooling / further conv blocks and a final layer sized to the
    number of note classes.
    NOTE(review): ``nn.Conv2d`` needs floating-point input -- presumably the
    uint8 images from the dataset must be converted first; confirm.
    """

    def __init__(self):
        super().__init__()
        self.stack = nn.Sequential(
            # Conv2d has no default kernel_size, so it must be given. A 3x3
            # kernel with padding=1 preserves the spatial size; treat both as
            # placeholders until the full architecture is decided.
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
        )

    def forward(self, x):
        """Run ``x`` through the layer stack.

        Args:
            x: Floating-point tensor with 3 channels, e.g. ``(N, 3, H, W)``.

        Returns:
            The output of ``self.stack``; with the current single layer this
            has shape ``(N, 32, H, W)``.
        """
        # The module's only submodule is `self.stack`; route the input
        # through it.
        logits = self.stack(x)
        return logits

In [None]:
# quick debug commands:

# vid_path, vid_name = "./vids/", "dumb_scale_youtube.mp4"
# # cur_video_labels, num_frames = generate_labels(vid_path, vid_name)
# reader = VideoReader(vid_path + vid_name, "video")
# frame = next(reader)["data"]
# print(frame.shape)

In [None]:
# a = np.load("training_data/labels.npy", allow_pickle=True)
# print(len(a))
# print(a[0])

In [None]:
# f, l = train_dataset[0]
# print(len(train_dataset))
# print(type(f))
# print(type(l))
# print(f.shape)
# print(l)

In [None]:
# processed_X = []
# processed_y = []
# 
# tmp_frames = []
# mn_H = math.inf
# mn_W = math.inf
# 
# # loop over each training video to assign a label to each frame and aggregate them all in one training array(s)
# for file in os.listdir("./training_data"):
#     print("processing file", file)
#     
#     # get frames of video and update the dimensions to resize to later
#     v_frames, _, _ = read_video("./training_data/%s" % file, output_format="TCHW")
#     mn_H = min(mn_H, v_frames.shape[2])
#     mn_W = min(mn_W, v_frames.shape[3])
#     for frame_num in range(v_frames.shape[0]):
#         tmp_frames.append(v_frames[frame_num])
#     
#     print("done w video read")
# 
#     
#     # use library to get labels for each frame
#     cur_video_labels, num_frames = generate_labels("./training_data/", file)
#     tmp_label_aggregator = [[] for i in range(num_frames)]
#     for frame, note in cur_video_labels:
#         tmp_label_aggregator[int(frame)].append(note)
#     processed_y.extend(tmp_label_aggregator)
#     
#     print("done w labeling")
# 
# # resize all frames to be the same size so the NN can handle them
# for frame in tmp_frames:
#     tnsr = transforms.Resize((mn_H, mn_W))(frame)
#     processed_X.append(tnsr.numpy())
# 
# processed_X = np.array(processed_X)
# processed_y = np.array(processed_y, dtype=object)
# 
# np.save("processed_frame_data", processed_X)
# np.save("processed_labels", processed_y)