# RATING Project

Comic Mischief Detection Task

Files:

1. "train.csv" : 
-- Contains multiclass classification content annotations for each video scene used in the training set.
-- Annotations are on a scene level and do not correspond to a specific modality
-- a ".csv" file containing video URLs as well as the IDs of the scenes used in the training set.
-- Videos are available in the form of URLs, collected from the Youtube and the IMDB websites.
-- Contains metadata about the videos.
-- Four content categories related to comic mischief are used (Sarcasm, Slapstick Humor, Gory Humor, Mature Humor).

2. "val.csv" : 
-- Contains multiclass classification content annotations for each video scene used in the validation set.
-- You can use this set for performing model hyperparameter tuning before using the test set


3. "test.csv" : 
-- Contains multiclass classification content annotations for each video scene used in the test set.
-- You can use this set for evaluating your method

In [2]:
from tensorflow_docs.vis import embed
from tensorflow.keras import layers
from tensorflow import keras
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, confusion_matrix
from tensorflow.keras import backend as K

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import os

In [22]:
## global variables

train_dir = os.getcwd() + "/train_data/"
val_dir = os.getcwd() + "/val_data/"
test_dir = os.getcwd() + "/test_data/"

# Hyperparameters
MAX_SEQ_LENGTH = 60
FRAME_GAP = 24
NUM_FEATURES = 1024
IMG_SIZE = 224

EPOCHS = 10
BATCH_SIZE = 64

### References
1. https://keras.io/examples/vision/video_transformers/)
2. https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub)
3. https://colab.research.google.com/github/sayakpaul/Action-Recognition-in-TensorFlow/blob/main/Data_Preparation_UCF101.ipynb

## Data Preparation

### METADATA loading

In [3]:
# Create a dataframe which contains multiclass classification content annotations for each video scene used in the training set.
train_df = pd.read_csv('train-updated.csv', dtype={'combination': object}).iloc[:,1:]
train_df["path"] = train_dir + train_df["Video ID"]+ ".0" + train_df["Scene_ID"].astype(str) + ".mp4"
train_df.head()

Unnamed: 0,Video ID,Scene_ID,Video URL,Codec,Resolution,Avg Frame rate,Mature Humor - Scene Annotation,Slapstick Humor - Scene Annotation,Gory Humor - Scene Annotation,Sarcasm - Scene Annotation,combination,path
0,tt2872718,0,https://www.imdb.com/videoplayer/vi4179799833,h264,854 x 480,23.976024,0,0,0,0,0,c:\Users\maitr\Desktop\COSC6373-ComputerVision...
1,tt2872718,1,https://www.imdb.com/videoplayer/vi4179799833,h264,854 x 480,23.976024,0,0,1,0,10,c:\Users\maitr\Desktop\COSC6373-ComputerVision...
2,tt2788710,0,https://www.imdb.com/videoplayer/vi1114222361,h264,854 x 480,23.976024,0,0,0,0,0,c:\Users\maitr\Desktop\COSC6373-ComputerVision...
3,tt2788710,1,https://www.imdb.com/videoplayer/vi1114222361,h264,854 x 480,23.976024,1,0,0,0,1000,c:\Users\maitr\Desktop\COSC6373-ComputerVision...
4,tt2788710,2,https://www.imdb.com/videoplayer/vi1114222361,h264,854 x 480,23.976024,0,0,0,0,0,c:\Users\maitr\Desktop\COSC6373-ComputerVision...


In [4]:
# Create a dataframe which contains multiclass classification content annotations for each video scene used in the validation set.
val_df = pd.read_csv('val.csv', dtype={'combination': object}).iloc[:,1:]
val_df["path"] = val_dir + val_df["Video ID"]+ ".0" + val_df["Scene_ID"].astype(str) + ".mp4"
val_df.head()

Unnamed: 0,Video ID,Scene_ID,Video URL,Codec,Resolution,Avg Frame rate,Mature Humor - Scene Annotation,Slapstick Humor - Scene Annotation,Gory Humor - Scene Annotation,Sarcasm - Scene Annotation,combination,path
0,tt1308728,0,https://www.youtube.com/watch?v=QP9qbhTeBII,h264,640 x 360,23.975945,1,0,0,0,1000,c:\Users\maitr\Desktop\COSC6373-ComputerVision...
1,tt1308728,1,https://www.youtube.com/watch?v=QP9qbhTeBII,h264,640 x 360,23.975945,1,1,0,0,1100,c:\Users\maitr\Desktop\COSC6373-ComputerVision...
2,tt1308728,2,https://www.youtube.com/watch?v=QP9qbhTeBII,h264,640 x 360,23.975945,1,0,0,0,1000,c:\Users\maitr\Desktop\COSC6373-ComputerVision...
3,PGuqnE35cCg,0,https://www.youtube.com/watch?v=PGuqnE35cCg,h264,640 x 360,23.976024,1,0,0,0,1000,c:\Users\maitr\Desktop\COSC6373-ComputerVision...
4,PGuqnE35cCg,1,https://www.youtube.com/watch?v=PGuqnE35cCg,h264,640 x 360,23.976024,1,0,0,0,1000,c:\Users\maitr\Desktop\COSC6373-ComputerVision...


In [5]:
# Create a dataframe which contains multiclass classification content annotations for each video scene used in the test set.
test_df = pd.read_csv('test-updated.csv', dtype={'combination': object}).iloc[:,1:]
test_df["path"] = test_dir + test_df["Video ID"]+ ".0" + test_df["Scene_ID"].astype(str) + ".mp4"
test_df.head()

Unnamed: 0,Video ID,Scene_ID,Video URL,Codec,Resolution,Avg Frame rate,Mature Humor - Scene Annotation,Slapstick Humor - Scene Annotation,Gory Humor - Scene Annotation,Sarcasm - Scene Annotation,combination,path
0,tt1741243,0,https://www.imdb.com/videoplayer/vi2169153049,h264,534 x 360,29.97,1,0,0,0,1000,c:\Users\maitr\Desktop\COSC6373-ComputerVision...
1,tt1741243,1,https://www.imdb.com/videoplayer/vi2169153049,h264,534 x 360,29.97,0,1,1,0,110,c:\Users\maitr\Desktop\COSC6373-ComputerVision...
2,tt1723121,0,https://www.youtube.com/watch?v=O7NHfAzg7Yg,h264,640 x 360,23.976024,1,0,0,0,1000,c:\Users\maitr\Desktop\COSC6373-ComputerVision...
3,tt1723121,1,https://www.youtube.com/watch?v=O7NHfAzg7Yg,h264,640 x 360,23.976024,1,0,0,0,1000,c:\Users\maitr\Desktop\COSC6373-ComputerVision...
4,tt1723121,2,https://www.youtube.com/watch?v=O7NHfAzg7Yg,h264,640 x 360,23.976024,1,0,0,0,1000,c:\Users\maitr\Desktop\COSC6373-ComputerVision...


### Data processing

How to feed videos to a neural network for training? <br>

1. Use OpenCV VideoCapture() method to read frames from videos.

In [23]:
# Utilities to open video files using CV2
def crop_center_square(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y:start_y+min_dim,start_x:start_x+min_dim]

def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    count = 0
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            
            if not ret:
                break

            if count % FRAME_GAP == 0:
                frame = crop_center_square(frame)
                frame = cv2.resize(frame, resize)
                frame = frame[:, :, [2, 1, 0]]
                frames.append(frame)

            count=count+1
            
            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

#### Feature Extraction with pre-trained DenseNet121

In [24]:
def build_feature_extractor():
    feature_extractor = keras.applications.DenseNet121(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.densenet.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()

#### Label Preprocessing 

Multilabel classification -->  Multi-class Binarization



In [25]:
labels = []
for x in train_df["combination"].values:
    lst = list(map(int, x))
    arr = np.asarray(lst)
    labels.append(arr)
labels = np.reshape(labels,(len(labels),4))
print(labels.shape)

(942, 4)


In [26]:
def prepare_all_videos(df, root_dir):
    num_samples = len(df)
    video_paths = df["path"].values.tolist()

    labels = []
    for x in df["combination"].values:
        lst = list(map(int, x))
        arr = np.asarray(lst)
        labels.append(arr)
    labels = np.reshape(labels,(len(labels),4))

    # `frame_features` are what we will feed to our sequence model.
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    # For each video.
    for idx, path in enumerate(video_paths):
        print(path)
        # Gather all its frames and add a batch dimension.
        frames = load_video(path)

        # Pad shorter videos.
        if len(frames) < MAX_SEQ_LENGTH:
            diff = MAX_SEQ_LENGTH - len(frames)
            padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
            frames = np.concatenate((frames, padding))

        frames = frames[None, ...]

        # Initialize placeholder to store the features of the current video.
        temp_frame_features = np.zeros(
            shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
        )

        # Extract features from the frames of the current video.
        for i, batch in enumerate(frames):
            video_length = batch.shape[0]
            length = min(MAX_SEQ_LENGTH, video_length)
            for j in range(length):
                if np.mean(batch[j, :]) > 0.0:
                    temp_frame_features[i, j, :] = feature_extractor.predict(
                        batch[None, j, :]
                    )

                else:
                    temp_frame_features[i, j, :] = 0.0

        frame_features[idx,] = temp_frame_features.squeeze()

    return frame_features, labels

## Build the Transformer-based model - BASE MODEL

In [27]:
# Embedding Layer
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim
        )
        self.sequence_length = sequence_length
        self.output_dim = output_dim

    def call(self, inputs):
        # The inputs are of shape: `(batch_size, frames, num_features)`
        length = tf.shape(inputs)[1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_positions = self.position_embeddings(positions)
        return inputs + embedded_positions

    def compute_mask(self, inputs, mask=None):
        mask = tf.reduce_any(tf.cast(inputs, "bool"), axis=-1)
        return mask

In [28]:
# Subclassed layer
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.3
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation=tf.nn.gelu), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]

        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

### Utility functions for training

In [29]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

def get_compiled_model():
    sequence_length = MAX_SEQ_LENGTH
    embed_dim = NUM_FEATURES
    dense_dim = 4
    num_heads = 1
    # classes = len(label_processor.get_vocabulary())
    classes = 4

    inputs = keras.Input(shape=(None, None))
    x = PositionalEmbedding(
        sequence_length, embed_dim, name="frame_position_embedding"
    )(inputs)
    x = TransformerEncoder(embed_dim, dense_dim, num_heads, name="transformer_layer")(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(classes, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)

    # compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy',f1_m,precision_m, recall_m])

    model.summary()
    return model

## Model Training and Testing (max seq length = 128)

In [51]:
def run_experiment():
    log_dir = "logs/fit/video_chkpt_128" 
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

    filepath = os.getcwd() + "/seq_length_128/video_chkpt_128/video_classifier"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, 
        monitor='val_f1_m',
        mode='max',
        save_best_only=True,
        verbose = 1
    )

    with tf.device('/device:CPU:0'):
        model = get_compiled_model()
        history = model.fit(
            train_data,
            train_labels,
            validation_data=(val_data,val_labels),
            epochs=EPOCHS,
            callbacks=[checkpoint, tensorboard_callback],
        )

    model.load_weights(filepath)
    # _, accuracy = model.evaluate(test_data, test_labels)
    # evaluate the model
    loss, accuracy, f1_score, precision, recall = model.evaluate(test_data, test_labels, verbose=0)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
    print(f"F1 score: {round(f1_score, 2)}")
    print(f"Precision: {round(precision, 2)}")
    print(f"Recall: {round(recall, 2)}")

    return model

In [52]:
train_data, train_labels = np.load("seq_length_128/extracted_data/train_data.npy"), np.load("seq_length_128/extracted_data/train_labels.npy")
val_data, val_labels = np.load("seq_length_128/extracted_data/val_data.npy"), np.load("seq_length_128/extracted_data/val_labels.npy")
test_data, test_labels = np.load("seq_length_128/extracted_data/test_data.npy"), np.load("seq_length_128/extracted_data/test_labels.npy")

print(f"Frame features in train set: {train_data.shape}")

Frame features in train set: (942, 128, 1024)


In [53]:
trained_model = run_experiment()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        [(None, None, None)]      0         
_________________________________________________________________
frame_position_embedding (Po (None, None, 1024)        131072    
_________________________________________________________________
transformer_layer (Transform (None, None, 1024)        4211716   
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 1024)              0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_14 (Dense)             (None, 4)                 4100      
Total params: 4,346,888
Trainable params: 4,346,888
Non-trainable params: 0
_________________________________________________

## Model training and testing (max seq length = 60)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=7327af42-8a03-4c46-b38e-e6931aa020f3' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>

In [30]:
train_data, train_labels = np.load("seq_length_60/extracted_data/train_data.npy"), np.load("seq_length_60/extracted_data/train_labels.npy")
val_data, val_labels = np.load("seq_length_60/extracted_data/val_data.npy"), np.load("seq_length_60/extracted_data/val_labels.npy")
test_data, test_labels = np.load("seq_length_60/extracted_data/test_data.npy"), np.load("seq_length_60/extracted_data/test_labels.npy")

print(f"Frame features in train set: {train_data.shape}")

Frame features in train set: (942, 60, 1024)


In [31]:
def run_experiment_60():
    log_dir = "logs/fit/video_chkpt_60" 
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

    filepath = os.getcwd() + "/seq_length_60/video_chkpt_60_2/video_classifier"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, 
        monitor='val_f1_m',
        mode='max',
        save_best_only=True,
        verbose = 1
    )

    with tf.device('/device:CPU:0'):
        model = get_compiled_model()
        history = model.fit(
            train_data,
            train_labels,
            validation_data=(val_data,val_labels),
            epochs=EPOCHS,
            callbacks=[checkpoint, tensorboard_callback],
        )

    model.load_weights(filepath)
    # _, accuracy = model.evaluate(test_data, test_labels)
    # evaluate the model
    loss, accuracy, f1_score, precision, recall = model.evaluate(test_data, test_labels, verbose=0)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
    print(f"F1 score: {round(f1_score, 2)}")
    print(f"Precision: {round(precision, 2)}")
    print(f"Recall: {round(recall, 2)}")

    return model

In [32]:
trained_model_60 = run_experiment_60()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         [(None, None, None)]      0         
_________________________________________________________________
frame_position_embedding (Po (None, None, 1024)        61440     
_________________________________________________________________
transformer_layer (Transform (None, None, 1024)        4211716   
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 1024)              0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_14 (Dense)             (None, 4)                 4100      
Total params: 4,277,256
Trainable params: 4,277,256
Non-trainable params: 0
_________________________________________________

## Max Seq Length = 20

In [18]:
train_data, train_labels = np.load("seq_length_20/extracted_data/train_data.npy"), np.load("seq_length_20/extracted_data/train_labels.npy")
val_data, val_labels = np.load("seq_length_20/extracted_data/val_data.npy"), np.load("seq_length_20/extracted_data/val_labels.npy")
test_data, test_labels = np.load("seq_length_20/extracted_data/test_data.npy"), np.load("seq_length_20/extracted_data/test_labels.npy")

print(f"Frame features in train set: {train_data.shape}")

Frame features in train set: (942, 20, 1024)


In [19]:
def run_experiment_20():
    log_dir = "logs/fit/video_chkpt_20" 
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

    filepath = os.getcwd() + "/seq_length_20/video_chkpt_20/video_classifier"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_weights_only=True, 
        monitor='val_f1_m',
        mode='max',
        save_best_only=True,
        verbose = 1
    )

    with tf.device('/device:CPU:0'):
        model = get_compiled_model()
        history = model.fit(
            train_data,
            train_labels,
            validation_data=(val_data,val_labels),
            epochs=EPOCHS,
            callbacks=[checkpoint, tensorboard_callback],
        )

    model.load_weights(filepath)
    # _, accuracy = model.evaluate(test_data, test_labels)
    # evaluate the model
    loss, accuracy, f1_score, precision, recall = model.evaluate(test_data, test_labels, verbose=0)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")
    print(f"F1 score: {round(f1_score, 2)}")
    print(f"Precision: {round(precision, 2)}")
    print(f"Recall: {round(recall, 2)}")

    return model

In [21]:
trained_model_20 = run_experiment_20()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, None, None)]      0         
_________________________________________________________________
frame_position_embedding (Po (None, None, 1024)        20480     
_________________________________________________________________
transformer_layer (Transform (None, None, 1024)        4211716   
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 1024)              0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_11 (Dense)             (None, 4)                 4100      
Total params: 4,236,296
Trainable params: 4,236,296
Non-trainable params: 0
_________________________________________________