In [1]:
from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os

Define hyperparameters

In [9]:
IMG_SIZE = 128
BATCH_SIZE = 16
EPOCHS = 10

MAX_SEQ_LENGTH = 40
NUM_FEATURES = 2048

INDEX = 10

In [3]:
train_df = pd.read_csv("video list/train.csv")
test_df = pd.read_csv("video list/test.csv")

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

#train_df.sample(10)

Total videos for training: 80
Total videos for testing: 20


In [44]:
def crop_center_square(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]

def get_dimension(res):
    res = res[1:-1].split(', ')
    res.reverse()

    right = int(res[0])
    up = int(res[1])
    left = int(res[2])
    down = int(res[3])
    return right, up, left, down

def load_video(path, resolution, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    cap = cv2.VideoCapture(path)
    frames = []
    right, up, left, down = get_dimension(resolution)
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame = frame[up:down, left:right]

            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

In [45]:
def build_feature_extractor():
    feature_extractor = keras.applications.InceptionV3(
    #feature_extractor = keras.applications.DenseNet121(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.inception_v3.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()

In [7]:
def prepare_all_videos_OLD(df):
    num_samples = len(df)
    video_paths = df["Direccion"].values.tolist()
    video_resolution = df["Dimension"].values.tolist()
    labels = df["Target"].values
    labels = np.reshape(labels, (len(labels), 1))

    # `frame_masks` and `frame_features` are what we will feed to our sequence model.
    # `frame_masks` will contain a bunch of booleans denoting if a timestep is
    # masked with padding or not.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    # For each video.
    for idx, path in enumerate(video_paths):
        # Gather all its frames and add a batch dimension.
        frames = load_video(path, video_resolution[idx])
        frames = frames[None, ...]

        # Initialize placeholders to store the masks and features of the current video.
        temp_frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
        temp_frame_features = np.zeros(
            shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
        )
        print(temp_frame_features.shape)
        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            length = min(MAX_SEQ_LENGTH, video_length)
            for j in range(length):
                temp_frame_features[i, j, :] = feature_extractor.predict(
                    batch[None, j, :]
                )
            temp_frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

        frame_features[idx,] = temp_frame_features.squeeze()
        frame_masks[idx,] = temp_frame_mask.squeeze()

    return (frame_features, frame_masks), labels

In [55]:
def prepare_all_videos(df):

    num_samples = len(df)
    video_paths = df["Direccion"].values.tolist()
    video_resolution = df["Dimension"].values.tolist()
    labels = df["Target"].values
    labels = np.reshape(labels, (len(labels), 1))

    shortLabel = np.zeros(
        shape=(1, 1), dtype="float32"
    )
    finalLabels = np.zeros(
        shape=(0, 1), dtype="float32"
    )

    # `frame_masks` and `frame_features` are what we will feed to our sequence model.
    # `frame_masks` will contain a bunch of booleans denoting if a timestep is
    # masked with padding or not.
    frame_masks = np.zeros(shape=(0, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(
        shape=(0, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    length = len(video_paths)
    # For each video.
    for idx, path in enumerate(video_paths):
        inf = 0
        print("IDX: " + str(idx) + "/" + str(length) + "| path: " + path + "|inf:" + str(inf))
        # Gather all its frames and add a batch dimension.
        video = load_video(path, video_resolution[idx])
        while inf < len(video):
            print("inf: " + str(inf) + "|to: " + str(inf + MAX_SEQ_LENGTH))
            frames = video[inf:inf + MAX_SEQ_LENGTH]
            inf = inf + MAX_SEQ_LENGTH
        
            frames = frames[None, ...]

            # Initialize placeholders to store the masks and features of the current video.
            temp_frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
            temp_frame_features = np.zeros(
                shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
            )
            
            # Extract features from the frames of the current video.
            for i, batch in enumerate(frames):
                video_length = batch.shape[0]
                length = min(MAX_SEQ_LENGTH, video_length)
                for j in range(length):
                    temp_frame_features[i, j, :] = feature_extractor.predict(
                        batch[None, j, :]
                    )
                temp_frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

            #frame_features[idx,] = temp_frame_features.squeeze()     
            #frame_masks[idx,] = temp_frame_mask.squeeze()       
            frame_features = np.concatenate((frame_features, temp_frame_features), axis=0)
            frame_features[idx,] = temp_frame_features.squeeze()
            
            frame_masks = np.concatenate((frame_masks, temp_frame_mask), axis=0)
            frame_masks[idx,] = temp_frame_mask.squeeze()


            shortLabel[0,0] = labels[idx, 0]
            finalLabels = np.concatenate((finalLabels, shortLabel), axis = 0)


    return (frame_features, frame_masks), finalLabels

In [56]:
train_data, train_labels = prepare_all_videos(train_df)
test_data, test_labels = prepare_all_videos(test_df)

IDX: 0/80| path: Noviembre2020\PAC32\pre (convertido).mov|inf:0
inf: 0|to: 60
inf: 60|to: 120
inf: 120|to: 180
inf: 180|to: 240
inf: 240|to: 300
inf: 300|to: 360
inf: 360|to: 420
IDX: 1/12| path: Noviembre2020\PAC35\post.mov|inf:0
inf: 0|to: 60
inf: 60|to: 120
inf: 120|to: 180
IDX: 2/59| path: Noviembre2020\PAC46\pre.mov|inf:0
inf: 0|to: 60
inf: 60|to: 120
inf: 120|to: 180
inf: 180|to: 240
inf: 240|to: 300
IDX: 3/6| path: Noviembre2020\PAC05\PRE.mov|inf:0
inf: 0|to: 60
inf: 60|to: 120
inf: 120|to: 180
inf: 180|to: 240
inf: 240|to: 300
IDX: 4/10| path: Noviembre2020\PAC21\POST.mov|inf:0
inf: 0|to: 60
inf: 60|to: 120
inf: 120|to: 180
inf: 180|to: 240
IDX: 5/1| path: Noviembre2020\PAC07\PRE.mov|inf:0
inf: 0|to: 60
inf: 60|to: 120
inf: 120|to: 180
inf: 180|to: 240
inf: 240|to: 300
inf: 300|to: 360
IDX: 6/48| path: Noviembre2020\PAC37\pre.mov|inf:0
inf: 0|to: 60
inf: 60|to: 120
inf: 120|to: 180
inf: 180|to: 240
inf: 240|to: 300
inf: 300|to: 360
IDX: 7/60| path: Noviembre2020\PAC33\13.12 (co

In [4]:
print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")
print(f"Frame labels in train set: {train_labels.shape}")
print()
print(f"Frame features in test set: {test_data[0].shape}")
print(f"Frame masks in test set: {test_data[1].shape}")
print(f"Frame labels in test set: {test_labels.shape}")

NameError: name 'train_data' is not defined

In [59]:
np.save(f"Index{INDEX}/train_data.npy", train_data[0], fix_imports=True, allow_pickle=False)
np.save(f"Index{INDEX}/train_data_mask.npy", train_data[1], fix_imports=True, allow_pickle=False)
np.save(f"Index{INDEX}/train_labels.npy", train_labels, fix_imports=True, allow_pickle=False)
np.save(f"Index{INDEX}/test_data.npy", test_data[0], fix_imports=True, allow_pickle=False)
np.save(f"Index{INDEX}/test_data_mask.npy", test_data[1], fix_imports=True, allow_pickle=False)
np.save(f"Index{INDEX}/test_labels.npy", test_labels, fix_imports=True, allow_pickle=False)

In [6]:
train_data = (np.load(f"Index{INDEX}/_features/train_data.npy"), np.load(f"Index{INDEX}/_features/train_data_mask.npy"))
train_labels = np.load(f"Index{INDEX}/_features/train_labels.npy")

test_data = (np.load(f"Index{INDEX}/_features/test_data.npy"), np.load(f"Index{INDEX}/_features/test_data_mask.npy"))
test_labels = np.load(f"Index{INDEX}/_features/test_labels.npy")

print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")

print(f"Frame features in test set: {test_data[0].shape}")
print(f"Frame masks in test set: {test_data[1].shape}")

Frame features in train set: (580, 40, 2048)
Frame masks in train set: (580, 40)
Frame features in test set: (141, 40, 2048)
Frame masks in test set: (141, 40)


Modificar Datos

In [76]:
column = train_labels[:, 0]
column[column < 3] = 0
column[column >= 3] = 1

column = test_labels[:, 0]
column[column < 3] = 0
column[column >= 3] = 1

print(f"Frame features in train set: {train_labels.shape}")
print(f"Frame features in test set: {test_labels.shape}")

Frame features in train set: (580, 1)


Contar Datos

In [12]:
import numpy as np

def Contar(tensor):
    unique_values, counts = np.unique(tensor, return_counts=True)
    totalSum = np.sum(counts)
    for value, count in zip(unique_values, counts):
        print(f"Número {value}: {count} veces, {(count / totalSum) * 100}")

print(f"Train")
Contar(train_labels[:, 0])

print(f"Test")
Contar(test_labels[:, 0])

Train
Número 0.0: 38 veces, 6.551724137931035
Número 1.0: 77 veces, 13.275862068965516
Número 2.0: 29 veces, 5.0
Número 3.0: 436 veces, 75.17241379310344
Test
Número 0.0: 14 veces, 9.929078014184398
Número 1.0: 26 veces, 18.439716312056735
Número 2.0: 13 veces, 9.219858156028367
Número 3.0: 88 veces, 62.4113475177305


The sequence model

In [16]:
def get_sequence_model():
    classes = len(np.unique(train_df["Target"]))

    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")

    # Refer to the following tutorial to understand the significance of using `mask`:
    # https://keras.io/api/layers/recurrent_layers/gru/
    x = keras.layers.LSTM(16, return_sequences=True)(
        frame_features_input, mask=mask_input
    )
    x = keras.layers.LSTM(8)(x)
    x = keras.layers.Dropout(0.4)(x)
    x = keras.layers.Dense(8, activation="relu")(x)
    output = keras.layers.Dense(classes, activation="softmax")(x)

    rnn_model = keras.Model([frame_features_input, mask_input], output)

    rnn_model.compile(
        loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    return rnn_model


# Utility for running experiments.
def run_experiment():
    filepath = f"Index{INDEX}/_model/video_classifier"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, save_best_only=True, verbose=1
    )

    seq_model = get_sequence_model()
    history = seq_model.fit(
        [train_data[0], train_data[1]],
        train_labels,
        validation_split=0.3,
        epochs=EPOCHS,
        callbacks=[checkpoint],
    )

    seq_model.load_weights(filepath)
    _, accuracy = seq_model.evaluate([test_data[0], test_data[1]], test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, seq_model

print(f"Index{INDEX}")
print(test_data[0].shape)
print(test_data[1].shape)
_, sequence_model = run_experiment()

Index10
(141, 40, 2048)
(141, 40)
Epoch 1/10
Epoch 1: val_loss improved from inf to 1.23324, saving model to Index10/_model\video_classifier
Epoch 2/10
Epoch 2: val_loss improved from 1.23324 to 1.19982, saving model to Index10/_model\video_classifier
Epoch 3/10
Epoch 3: val_loss improved from 1.19982 to 1.16476, saving model to Index10/_model\video_classifier
Epoch 4/10
Epoch 4: val_loss improved from 1.16476 to 1.11746, saving model to Index10/_model\video_classifier
Epoch 5/10
Epoch 5: val_loss improved from 1.11746 to 1.07289, saving model to Index10/_model\video_classifier
Epoch 6/10
Epoch 6: val_loss improved from 1.07289 to 1.01562, saving model to Index10/_model\video_classifier
Epoch 7/10
Epoch 7: val_loss improved from 1.01562 to 0.97272, saving model to Index10/_model\video_classifier
Epoch 8/10
Epoch 8: val_loss improved from 0.97272 to 0.91343, saving model to Index10/_model\video_classifier
Epoch 9/10
Epoch 9: val_loss improved from 0.91343 to 0.84938, saving model to Ind

Inference

In [13]:
def prepare_single_video(frames):
    frames = frames[None, ...]
    frame_mask = np.zeros(shape=(1, MAX_SEQ_LENGTH,), dtype="bool")
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
        frame_mask[i, :length] = 1  # 1 = not masked, 0 = masked

    return frame_features, frame_mask


def sequence_prediction(path):
    class_vocab = label_processor.get_vocabulary()

    frames = load_video(os.path.join("test", path))
    frame_features, frame_mask = prepare_single_video(frames)
    probabilities = sequence_model.predict([frame_features, frame_mask])[0]

    for i in np.argsort(probabilities)[::-1]:
        print(f"  {class_vocab[i]}: {probabilities[i] * 100:5.2f}%")
    return frames

test_video = np.random.choice(test_df["video_name"].values.tolist())
print(f"Test video path: {test_video}")
test_frames = sequence_prediction(test_video)

KeyError: 'video_name'