# How Important is Recurrence for Video Classification?
## Introduction
Recognising human behavior such as standing, running and jumping is useful for crowd control, for safety regulations, and for performance analysis (for instance, to quantify the performance of a soccer player in a given match). Accordingly, the classification of human actions is a tool that can be deployed for many use cases and is an important goal in the area of computer vision. Human actions are not instantaneous; they usually play out over a longer period of time (e.g., a couple of seconds).

Prior research has found that videos (in particular short clips) can be classified well using single frame models (Karpathy et al., 2014). However, the authors suggested utilising recurrent neural networks (RNNs) as a potential improvement for video classification. 

The goal of the current study is to estimate whether RNNs can encode temporal information of videos well and thereby improve classification accuracy of human actions.

## Method
We construct an image-encoder with a pretrained base and a trainable head (3 feed-forward layers for classification). This image-encoder can already be used for predicting actions; however, videos are sequences of images. We therefore hypothesise that a neural network that can account for temporal dependencies may improve performance. To this end, we extend the image-encoder with a recurrent neural network that takes chunks of encoded images as input.

Data set: HMDB (the video folders under `/kaggle/input/HMDB_dataset/`, one folder per action class)

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import sys
import cv2
import math
import random
import numpy as np
import datetime as dt
import tensorflow as tf
from tensorflow import keras
from keras import layers
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline
import gc

from sklearn.model_selection import train_test_split
 
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model
from tensorflow import keras

In [None]:
# Seed NumPy, Python's `random` module and TensorFlow with the same constant
# so that random choices made later in the notebook are repeatable.
seed_constant = 23
for set_seed in (np.random.seed, random.seed, tf.random.set_seed):
    set_seed(seed_constant)

In [None]:
# Get all classes.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the class-index -> class-name mapping identical on every run.
# Only directories are kept because every class is later opened as a folder
# of video files.
main_data_dir = '/kaggle/input/HMDB_dataset/'  # Path without the extra folders
all_classes_names = sorted(
    entry
    for entry in os.listdir(main_data_dir)
    if os.path.isdir(os.path.join(main_data_dir, entry))
)
print(all_classes_names)

In [None]:
# Create a Matplotlib figure
fig = plt.figure(figsize=(30, 30))

# Pick a random sample of classes each time the cell runs. The grid below has
# 5 x 4 = 20 slots, so never ask for more classes than exist or than fit.
n_preview = min(20, len(all_classes_names))
random_range = random.sample(range(len(all_classes_names)), n_preview)

# Show the first frame of one random video for every sampled class
for counter, random_index in enumerate(random_range, 1):

    # Getting Class Name using Random Index
    selected_class_Name = all_classes_names[random_index]

    # Getting a list of all the video files present in a Class Directory
    video_files_names_list = os.listdir(f'{main_data_dir}{selected_class_Name}')
    if not video_files_names_list:
        # random.choice raises IndexError on an empty list; skip empty classes
        print(f'No video files found for class {selected_class_Name}; skipping.')
        continue

    # Randomly selecting a video file
    selected_video_file_name = random.choice(video_files_names_list)
    video_path = f'{main_data_dir}{selected_class_Name}/{selected_video_file_name}'
    print(video_path)

    # Reading the first frame; the capture is released even if reading fails
    video_reader = cv2.VideoCapture(video_path)
    try:
        success, bgr_frame = video_reader.read()
    finally:
        video_reader.release()

    if not success:
        # An unreadable video yields no frame, which would crash cv2.cvtColor
        print(f'Could not read a frame from {video_path}; skipping.')
        continue

    # Converting the BGR Frame to RGB Frame
    rgb_frame = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)

    # Adding The Class Name Text on top of the Video Frame.
    cv2.putText(rgb_frame, selected_class_Name, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)

    # Assigning the Frame to a specific position of a subplot
    plt.subplot(5, 4, counter)
    plt.imshow(rgb_frame)
    plt.axis('off')



In [None]:
# Free RAM space: drop the preview figure's contents, close it, then ask the
# garbage collector to reclaim whatever became unreachable.
fig.clf()
plt.close(fig)
gc.collect()

In [None]:
# Display the list of class names read from the dataset directory.
all_classes_names

In [None]:
# Size that every video frame is resized to; also used as the input shape of
# the pretrained base model.
image_height = 128
image_width = 128

# Upper bounds on the number of images per class for each split.
max_images_per_train_class = 8000
max_images_per_test_class = 4000

# A small hand-picked subset of action classes and the matching output size.
classes_list = [
    "shoot_gun",
    "shoot_ball",
    "kiss",
    "smoke",
]
model_output_size = len(classes_list)

# Number of action classes; recomputed from the dataset folders before the
# model is built.
num_classes = 51

In [None]:
# Define data augmentation layer: random flips, rotations, zooms and contrast
# changes, applied in this order as the first stage of the model.
augmentation_steps = [
    layers.RandomFlip("horizontal_and_vertical"),
    layers.RandomRotation(0.4),
    layers.RandomZoom(0.2),
    layers.RandomContrast(0.2),
]
data_augmentation = tf.keras.Sequential(augmentation_steps)

In [None]:
# Function to extract frames from a video
def frames_extraction(video_path):
    """Read every frame of a video, resize it and divide it by 255.

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.

    Returns:
        A list with one array per decoded frame. Each frame is resized to
        ``image_height`` x ``image_width`` and divided by 255.0. The list is
        empty when no frame could be read. No colour conversion is applied,
        so frames keep the channel order OpenCV delivers.
    """
    frames_list = []
    video_reader = cv2.VideoCapture(video_path)

    try:
        while True:
            success, frame = video_reader.read()
            if not success:
                break
            # cv2.resize takes the target size as (width, height), not
            # (height, width); the order only shows once the two differ.
            resized_frame = cv2.resize(frame, (image_width, image_height))
            normalized_frame = resized_frame / 255.0  # Normalize pixel values
            frames_list.append(normalized_frame)
    finally:
        # Release the capture even if decoding or resizing raises
        video_reader.release()

    return frames_list

# Function to load data for selected classes
def create_dataset(main_data_dir, classes_list, max_images_per_class, frames_per_video=2):
    """Sample random frames from the videos of each class, split into train/test.

    A video goes to the training split when its file name contains the
    substring "training"; every other video goes to the test split.

    Args:
        main_data_dir: Directory that contains one sub-folder per class.
        classes_list: Class (folder) names; the position of a name in this
            list is used as its integer label.
        max_images_per_class: Stop reading further videos of a class once this
            many frames (train and test combined) were collected for it.
        frames_per_video: Number of distinct random frames taken from each
            video. Videos with fewer frames contribute all of their frames.

    Returns:
        Tuple ``(data_train, data_test, labels_train, labels_test)`` of NumPy
        arrays.
    """
    data_train, labels_train = [], []
    data_test, labels_test = [], []
    for class_index, class_name in enumerate(classes_list):
        class_folder = os.path.join(main_data_dir, class_name)
        # Sorted so the seeded random sampling sees the videos in a
        # reproducible order (os.listdir order is arbitrary).
        video_files = sorted(os.listdir(class_folder))

        count = 0
        for video_file in video_files:
            video_path = os.path.join(class_folder, video_file)
            frames = frames_extraction(video_path)
            if not frames:
                # Unreadable video: skip it instead of reusing the frame ids
                # that were drawn for the previous video.
                continue

            # Never request more distinct frames than the video contains
            n_samples = min(frames_per_video, len(frames))
            rand_frame_ids = np.random.choice(len(frames), size=n_samples, replace=False)

            if "training" in video_file:
                target_data, target_labels = data_train, labels_train
            else:
                target_data, target_labels = data_test, labels_test

            for frame_id in rand_frame_ids:
                target_data.append(frames[frame_id])  # randomly sampled frame
                target_labels.append(class_index)
            count += len(rand_frame_ids)

            if count >= max_images_per_class:
                break

    return np.array(data_train), np.array(data_test), np.array(labels_train), np.array(labels_test)
# Create the dataset: sampled frames and integer labels for every class
max_images_per_class = 200
X_train, X_test, y_train, y_test = create_dataset(
    main_data_dir,
    all_classes_names,
    max_images_per_class,
)
print(f"Dataset size: {y_train.shape[0] + y_test.shape[0]}, Train: {y_train.shape}, Test: {y_test.shape}")

# Training pipeline: shuffle over the whole training set, then batch
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_ds = train_ds.shuffle(y_train.shape[0]).batch(100)

# Held-out pipeline: shuffle and batch, then split it 50/50 into the final
# test set and the validation set
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test))
test_ds = test_ds.shuffle(y_test.shape[0]).batch(100)
test_ds, val_ds = tf.keras.utils.split_dataset(test_ds, left_size=0.5, right_size=0.5)

### Model Definition (Image Encoder)

In [None]:
# Define the VGG16 base model with an explicit input shape
base = tf.keras.applications.VGG16(
    include_top=False,  # Remove the fully connected layers
    weights='imagenet',  # Use pretrained weights
    input_shape=(image_height, image_width, 3)  # Explicitly define the input shape
)

# Fine-tune only the last 4 layers of the base model and freeze the rest.
# A model whose own `trainable` flag is False exposes no trainable weights,
# whatever the flags of its inner layers say. Setting `base.trainable = False`
# and then re-enabling single layers therefore trains nothing. Instead the
# base stays trainable and the layers to freeze are switched off one by one.
base.trainable = True
for layer in base.layers[:-4]:
    layer.trainable = False
base.summary()

In [None]:
# Build Model: augmentation -> pretrained base -> flatten -> softmax classifier
# with one output per class folder
num_classes = len(all_classes_names)
model_stages = [
    data_augmentation,
    base,
    layers.Flatten(),
    layers.Dense(num_classes, activation="softmax"),
]
model = keras.Sequential(model_stages)

# The model is given no input shape up front, so it is called once on a random
# batch of one image to create its weights before printing the summary.
sample_shape = (1, image_height, image_width, 3)
sample_input = np.random.random(sample_shape)
model(sample_input)

model.summary()

## Training the ImageEncoder

In [None]:
# Adam with a non-default epsilon; labels are integer class indices, hence the
# sparse categorical cross-entropy loss.
optimizer = tf.keras.optimizers.Adam(epsilon=0.01)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=["accuracy"])

## Results

In [None]:
# Multiply the learning rate by 0.3 (down to 1e-6 at most) when the validation
# loss has stopped improving for 2 epochs.
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.3, patience=2, min_lr=1e-6, verbose=1
)
# Stop when validation accuracy has not improved for 5 epochs and roll back to
# the best weights seen.
stop_early = keras.callbacks.EarlyStopping(
    monitor='val_accuracy', patience=5, restore_best_weights=True
)

train_hist = model.fit(
    train_ds,
    epochs=30,
    validation_data=val_ds,
    callbacks=[reduce_lr, stop_early],
)

In [None]:
import pandas as pd

# One row per epoch, one column per logged metric
history_frame = pd.DataFrame(train_hist.history)

# Left panel: accuracy, right panel: loss (training vs. validation curves)
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
panel_specs = [
    ('accuracy', 'val_accuracy', 'Accuracy', 'lower right', "Accuracy Curve"),
    ('loss', 'val_loss', 'Loss', 'upper right', "Loss Curve"),
]
for panel, (train_key, val_key, y_label, legend_loc, title) in zip(ax, panel_specs):
    panel.plot(history_frame.index, history_frame[train_key], label='train')
    panel.plot(history_frame.index, history_frame[val_key], label='val')
    panel.set_xlabel('Epoch')
    panel.set_ylabel(y_label)
    panel.legend(loc=legend_loc)
    panel.set_title(title)

In [None]:
# Class probabilities for every held-out image; the predicted class is the
# index with the highest probability.
# NOTE(review): X_test is the full held-out array, i.e. it also contains the
# samples that were split off into val_ds and used for early stopping.
y_dist = model.predict(X_test)
y_pred = y_dist.argmax(axis=1)

In [None]:
# Element-wise comparison of predicted and true labels; its mean is the
# fraction of correct predictions.
matches = y_pred == y_test
correct_pred = list(matches)
print(f"Accuracy: {np.mean(matches)}")

In [None]:
# visualize some false predictions
false_ids = [i for i, correct in enumerate(correct_pred) if not correct]

# One bar per class, derived from the class list instead of a hard-coded 51
n_classes = len(all_classes_names)

# Show up to 5 distinct misclassified samples, but never more than exist
n_false_plotted = min(5, len(false_ids))

if n_false_plotted == 0:
    # np.random.choice raises on an empty list, so there is nothing to plot
    print("No false predictions to visualize.")
else:
    # squeeze=False keeps `ax` two-dimensional even for a single row
    fig, ax = plt.subplots(
        n_false_plotted, 2,
        figsize=(8, 4 * n_false_plotted),
        width_ratios=[1, 2],
        squeeze=False,
    )
    # replace=False so the same sample is not shown twice
    sampled_ids = np.random.choice(false_ids, n_false_plotted, replace=False)
    for row, sample_id in enumerate(sampled_ids):
        # Undo the /255 scaling and convert BGR -> RGB for matplotlib
        img = X_test[sample_id] * 255.
        img = img.astype(np.uint8)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        ax[row, 0].imshow(img)
        ax[row, 0].set_title(f"Pred:{all_classes_names[y_pred[sample_id]]} True: {all_classes_names[y_test[sample_id]]}")
        ax[row, 0].set_axis_off()

        ax[row, 1].bar(range(n_classes), y_dist[sample_id])
        ax[row, 1].set_title("Probability Distribution")
        ax[row, 1].set_xlabel("action")
        ax[row, 1].set_ylabel("P(action|image)")
        ax[row, 1].set_xticks(range(n_classes), labels=list(all_classes_names), rotation=90, fontsize=8)
        ax[row, 1].spines['top'].set_visible(False)
        ax[row, 1].spines['right'].set_visible(False)
    # Lay out once, after all panels exist
    fig.tight_layout()

In [None]:
# Persist the trained image encoder in two files: the architecture as JSON
# and the weights as an HDF5 weights file.
architecture_path = "model_architecture.json"
weights_path = "img_enc_weights.weights.h5"

# Save the model architecture as a JSON file
model_json = model.to_json()
with open(architecture_path, "w") as json_file:
    json_file.write(model_json)

# Save the model weights with the correct file extension
model.save_weights(weights_path)



## Discussion

## References

Karpathy, A., Toderici, G., Shetty, S., Leung, T., Sukthankar, R., & Fei-Fei, L. (2014). Large-scale video classification with convolutional neural networks. In Proceedings of the IEEE conference on Computer Vision and Pattern Recognition (pp. 1725-1732).

#### Potential extra references:

A. Ullah, J. Ahmad, K. Muhammad, M. Sajjad and S. W. Baik, "Action Recognition in Video Sequences using Deep Bi-Directional LSTM With CNN Features," in IEEE Access, vol. 6, pp. 1155-1166, 2018, doi: 10.1109/ACCESS.2017.2778011.

