In [45]:
from tensorflow_docs.vis import embed
from tensorflow.keras import layers
from tensorflow import keras
from tensorflow.keras.layers import StringLookup


import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
import cv2
import time 
import os
import math
import random

In [2]:
MAX_SEQ_LENGTH = 55 # defining sequence 
NUM_FEATURES = 1024
IMG_SIZE = 128

EPOCHS = 50

In [7]:

# Define the directory path
directory = 'models/vgg16'

# Create the directory if it does not exist
if not os.path.exists(directory):
    os.makedirs(directory)

dataset_path = "sample-videos"  # dataset path

# Lists to collect data
train_data = []
test_data = []

for root, dirs, files in os.walk(dataset_path):
    class_name = os.path.basename(root)
    
    if class_name:
        print(f"Processing class: {class_name} with {len(files)} videos.")
        thresh = math.ceil(len(files) * 0.8)
        random.shuffle(files)
        
        for i, video in enumerate(files):
            video_path = os.path.join(class_name, video)
            tag = class_name
            if i < thresh:
                train_data.append({"video_name": video_path, "tag": tag})
            else:
                test_data.append({"video_name": video_path, "tag": tag})

# Create DataFrames from the collected data
train = pd.DataFrame(train_data, columns=['video_name', 'tag'])
test = pd.DataFrame(test_data, columns=['video_name', 'tag'])

# Save to CSV
train.to_csv(os.path.join(directory, 'train_transfromer_new120.csv'), index=False)
test.to_csv(os.path.join(directory, 'test_transfromer_new120.csv'), index=False)


Processing class: sample-videos with 0 videos.
Processing class: Amilo with 28 videos.
Processing class: Baisakh with 28 videos.


In [10]:
train_df = pd.read_csv("models/vgg16/train_transfromer_new120.csv")
test_df = pd.read_csv("models/vgg16/test_transfromer_new120.csv")

# train_df = pd.read_csv("models/vgg16/shuffle_train.csv") # loading train data
# test_df = pd.read_csv("models/vgg16/shuffle_test.csv") # loading test data

# Randomly shuffle the data
train_df = train_df.sample(frac=1).reset_index(drop=True)  # shuffle and reset index
test_df = test_df.sample(frac=1).reset_index(drop=True)    # shuffle and reset index
"""
    The shuffeling is done to prevent model from overfiting 
    src: https://stackoverflow.com/questions/67327697/how-to-avoid-overfitting-with-keras
    https://discuss.huggingface.co/t/why-transformer-overfit-quickly-how-to-solve-it/1842
"""

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

Total videos for training: 46
Total videos for testing: 10


In [11]:
"""
    Croping center of the frame
"""

def crop_center_square(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 3) - (min_dim // 5) 
    start_y = (y // 2) - (min_dim // 2)
    crop= frame[start_y : start_y + min_dim, start_x : start_x + min_dim] # croping center of the frame
    
    return cv2.resize(crop,(128,128)) # resizing image to 128,128 pixel

# Following method is modified from this tutorial:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub
def load_video(path, max_frames=0):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = frame[:, :, [2, 1, 0]] # accepting blue channel only
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

In [12]:
"""
Feature Extractor for vgg16

def build_feature_extractor():
    feature_extractor = keras.applications.VGG16(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.vgg_16.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")

"""
"""
def build_feature_extractor():
    feature_extractor = keras.applications.InceptionV3(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.inception_v3.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")
"""
def build_feature_extractor():
    # it reututn 1024 features
    feature_extractor = keras.applications.DenseNet121(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.densenet.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")

feature_extractor = build_feature_extractor()


# Label preprocessing with StringLookup.
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"]), mask_token=None
)
print(label_processor.get_vocabulary())


['Amilo', 'Baisakh']


In [61]:
"""
    Video Feature extraction 
"""


# Define constants
IMG_SIZE = 128
MAX_SEQ_LENGTH = 55
NUM_FEATURES = 1024  # Adjust based on your feature extractor output size

# Path to save the processed features and labels
# save_path = "models/dense121/train_dense121_seq55.npz"

# Ensure the save directory exists
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Define label processor
def create_label_processor(csv_path):
    df = pd.read_csv(csv_path)
    return tf.keras.layers.StringLookup(
        num_oov_indices=0, vocabulary=np.unique(df["tag"]), mask_token=None
    )

# Function to prepare video data using a CSV file
def prepare_all_videos(csv_path, root_dir, label_processor):
    # Load CSV
    df = pd.read_csv(csv_path)
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()
    labels = label_processor(df["tag"]).numpy()  # Encode labels

    # Initialize placeholder for frame features
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    for idx, video_name in enumerate(video_paths):
        video_path = os.path.join(root_dir, video_name)
        
        # Check if video file exists
        if not os.path.exists(video_path):
            print(f"File not found: {video_path}. Skipping...")
            continue

        # Load video frames
        frames = load_video(video_path)

        # Skip if frames are empty
        if frames.size == 0:
            print(f"Failed to load video at {video_path}. Skipping...")
            continue

        # Ensure frames have 4 dimensions (batch, sequence length, height, width, channels)
        if frames.ndim == 4:
            frames = frames[None, ...]  # Add a batch dimension if missing
        elif frames.ndim < 4:
            frames = np.expand_dims(frames, axis=(0, 1))  # Make it 5D

        # Pad shorter videos to match MAX_SEQ_LENGTH
        if frames.shape[1] < MAX_SEQ_LENGTH:
            diff = MAX_SEQ_LENGTH - frames.shape[1]
            padding = np.zeros((1, diff, IMG_SIZE, IMG_SIZE, 3))  # 5D padding
            frames = np.concatenate((frames, padding), axis=1)

        # Prepare feature extraction placeholder for the current video
        temp_frame_features = np.zeros(
            shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
        )

        # Extract features from frames of the current video
        for j in range(min(MAX_SEQ_LENGTH, frames.shape[1])):
            if np.mean(frames[0, j, :]) > 0.0:  # Process non-empty frames
                temp_frame_features[0, j, :] = feature_extractor.predict(
                    frames[0, j, :][None, ...]
                )
            else:
                temp_frame_features[0, j, :] = 0.0

        # Assign features to the main frame_features array
        frame_features[idx] = temp_frame_features.squeeze()

    return frame_features, labels



In [None]:
"""


features = np.load("/media/gpu/157/hand_sign/models/dense121/train_dense121_seq55.npz")
train_data, train_labels,test_data, test_labels = features["arr_0"], features["arr_1"], features["arr_2"], features["arr_3"]
"""
# # Check if the file already exists
# if os.path.exists(save_path):
#     # Load existing features
#     features = np.load(save_path)
#     train_data, train_labels, test_data, test_labels = (
#         features["arr_0"],
#         features["arr_1"],
#         features["arr_2"],
#         features["arr_3"],
#     )
# else:
#     # Process videos and save features
#     train_data, train_labels = prepare_all_videos(train_df, "models/features/train-videos")
#     test_data, test_labels = prepare_all_videos(test_df, "models/features/test-videos")
    
#     # Save extracted features to a file for future use
#     np.savez(save_path, train_data, train_labels, test_data, test_labels)
#     print(f"Data saved successfully to {save_path}")

# Define paths to CSVs and directories
train_csv = "models/vgg16/train_transfromer_new120.csv"
test_csv = "models/vgg16/test_transfromer_new120.csv"
train_dir = "sample-videos"
test_dir = "sample-videos"

# Load train and test data from CSV
label_processor_train = create_label_processor(train_csv)
train_data, train_labels = prepare_all_videos(train_csv, train_dir, label_processor_train)

label_processor_test = create_label_processor(test_csv)
test_data, test_labels = prepare_all_videos(test_csv, test_dir, label_processor_test)

# Define paths to save extracted features
train_save_path = "models/dense121/train_dense121_seq55.npz"
test_save_path = "models/dense121/test_dense121_seq55.npz"

# Save extracted features to separate files for train and test
os.makedirs(os.path.dirname(train_save_path), exist_ok=True)
np.savez(train_save_path, data=train_data, labels=train_labels)
print(f"Train data saved successfully to {train_save_path}")

os.makedirs(os.path.dirname(test_save_path), exist_ok=True)
np.savez(test_save_path, data=test_data, labels=test_labels)
print(f"Test data saved successfully to {test_save_path}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 95ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[

In [None]:
# Feature extraction
start_time = time.time()
train_data, train_labels = prepare_all_videos(train_df, "sample-videos") # extracting train data feature
test_data, test_labels = prepare_all_videos(test_df, "sample-videos") # # extracting test data feature


print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")
print(f"Total time taken for feature extraction is {(time.time()-start_time)/60}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0

In [64]:
# Model Building For Transformer for classification
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, output_dim, **kwargs):
        super(PositionalEmbedding,self).__init__(**kwargs)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim
        )
        self.sequence_length = sequence_length
        self.output_dim = output_dim

    def call(self, inputs):
        # The inputs are of shape: `(batch_size, frames, num_features)`
        length = tf.shape(inputs)[1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_positions = self.position_embeddings(positions)
        return inputs + embedded_positions

    def compute_mask(self, inputs, mask=None):
        mask = tf.reduce_any(tf.cast(inputs, "bool"), axis=-1)
        return mask
    
    def get_config(self):
        config = super(PositionalEmbedding,self).get_config()
        config.update({
            "position_embeddings": self.position_embeddings,
            "sequence_length": self.sequence_length,
            "output_dim": self.output_dim
        })
        return config


In [65]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super(TransformerEncoder,self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.5
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation=tf.nn.gelu),layers.Dropout(0.7), layers.Dense(embed_dim, activation=tf.nn.gelu),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]

        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)
    
    def get_config(self):
        config = super(TransformerEncoder,self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
            "attention": self.attention,
            "layernorm_1":self.layernorm_1,
            "layernorm_2":self.layernorm_2,
        })
        return config


In [66]:
""" 
    This model architecture is for vgg16
    vgg16 give 512 features

"""
tf.keras.utils.set_random_seed(1024) # defining random seed 

"""
    Random seed is to generate same dataset in each shuffel while training.
    src: https://stackoverflow.com/questions/51249811/reproducible-results-in-tensorflow-with-tf-set-random-seed
"""


def get_compiled_model():
    sequence_length = MAX_SEQ_LENGTH
    embed_dim = NUM_FEATURES
    dense_dim = 4  # definig dense layer on dense 4 model gives 87% acc and 88% acc on 512 dense
    num_heads = 4 # defining  number of MultiHeadAttention layer
    classes = len(label_processor.get_vocabulary())

    inputs = keras.Input(shape=(None, None))
    x = PositionalEmbedding(
        sequence_length, embed_dim, name="frame_position_embedding"
    )(inputs)
    x = TransformerEncoder(embed_dim, dense_dim, num_heads, name="transformer_layer")(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(0.5)(x)
    
    
    outputs = layers.Dense(classes, activation="softmax")(x)
    model = keras.Model(inputs, outputs)

    model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )
    model.summary()
    return model


# def run_experiment():
#     filepath = "models/vgg16/ckpt/"
#     checkpoint = keras.callbacks.ModelCheckpoint(
#         filepath, save_weights_only=True, save_best_only=True, verbose=1
#     )

#     model = get_compiled_model()
#     history = model.fit(
#         train_data,
#         train_labels,
#         validation_split=0.10,
#         epochs=20,
#         callbacks=[checkpoint],
#         batch_size= 64
#     )

#     model.load_weights(filepath)
#     _, accuracy = model.evaluate(test_data, test_labels)
#     print(f"Test accuracy: {round(accuracy * 100, 2)}%")

#     return model, history

In [67]:
# Placeholder for your actual data preparation logic
num_samples = 100  # For example, if you want 100 samples
sequence_length = 55
feature_dimension = 1024

# Simulated data (replace this with your actual data loading logic)
train_data_reshaped = np.random.random((num_samples, sequence_length, feature_dimension))
train_labels = np.random.randint(0, 2, num_samples)  # Assuming binary classification

# Confirm the shapes again
print("Train data shape:", train_data_reshaped.shape)  # Expected: (num_samples, 55, 1024)
print("Train labels shape:", train_labels.shape)   

Train data shape: (100, 55, 1024)
Train labels shape: (100,)


In [71]:
data = np.load("models/dense121/train_dense121_seq55.npz")
train_data = data["data"]  # This should represent the combined train/test data
train_labels = data["labels"]  # This should represent the combined train/test labels


In [80]:
# Print the number of samples for each dataset
print("Number of samples in train_data:", train_data.shape[0])
print("Number of samples in mask_input:", mask_input.shape[0])
print("Number of samples in train_labels:", train_labels.shape[0])



Number of samples in train_data: 36
Number of samples in mask_input: 100
Number of samples in train_labels: 36


In [86]:

# Set random seed for reproducibility
tf.random.set_seed(42)

# Constants
MAX_SEQ_LENGTH = 55  # Example sequence length
NUM_FEATURES = 1024  # Number of features in each frame
NUM_CLASSES = 46  # Number of classes for output, based on your vocabulary size

# Sample label processor; replace with your actual implementation
class LabelProcessor:
    def get_vocabulary(self):
        return range(NUM_CLASSES)

label_processor = LabelProcessor()

# Define the LSTM model
def get_sequence_model():
    class_vocab = label_processor.get_vocabulary()

    frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES))
    mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool")  # Mask input for padding

    # Model architecture
    x = keras.layers.LSTM(256, return_sequences=True)(frame_features_input, mask=mask_input)
    x = keras.layers.LSTM(128)(x)
    x = keras.layers.Dropout(0.5)(x)
    x = keras.layers.Dense(64, activation="relu")(x)

    output = keras.layers.Dense(len(class_vocab), activation="softmax")(x)

    # Create and compile the model
    rnn_model = keras.Model(inputs=[frame_features_input, mask_input], outputs=output)
    rnn_model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=["accuracy"])

    return rnn_model

# Utility for running experiments
def run_experiment():
    # Define directory and file paths
    directory = 'models/weights'
    filepath = "models/weights/video_classifier.weights.h5"

    # Ensure the directory exists
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Ensure the parent directory of the file exists
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    # Define callbacks
    checkpoint = keras.callbacks.ModelCheckpoint(filepath, save_weights_only=True, save_best_only=True, verbose=1)
    early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=10)

    # Get the model
    seq_model = get_sequence_model()

    # Start training
    history = seq_model.fit(
        [train_data, mask_input],  # Input data
        train_labels,  # Target labels
        validation_split=0.0,  # No validation split
        epochs=50,  # Number of epochs
        callbacks=[checkpoint, early_stop],  # Callbacks for saving weights and early stopping
        batch_size=64
    )

    # Load the best weights after training
    seq_model.load_weights(filepath)

    # Simulate test data (use your actual test data here)
    test_data = np.random.random((20, MAX_SEQ_LENGTH, NUM_FEATURES))  # Example test data
    test_labels = np.random.randint(0, NUM_CLASSES, size=(20,))  # Example test labels
    test_mask = np.ones((20, MAX_SEQ_LENGTH), dtype=bool)  # Example mask for test data

    # Evaluate the model on test data
    _, accuracy = seq_model.evaluate([test_data, test_mask], test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, seq_model

# Measure training time
train_time = time.time()

# Load your training data from the .npz file
data = np.load("models/dense121/train_dense121_seq55.npz")
all_data = data["data"]  # Combined data for both train and test
all_labels = data["labels"]  # Combined labels for both train and test

# Split data into training and testing sets
split_index = int(0.8 * len(all_data))  # 80% training data, 20% testing data
train_data = all_data[:split_index]
train_labels = all_labels[:split_index]
test_data = all_data[split_index:]
test_labels = all_labels[split_index:]

# Prepare mask input (if needed)
mask_input = np.ones((train_data.shape[0], MAX_SEQ_LENGTH), dtype=bool)  # Example mask input

# Run the experiment to train the model and save weights
model, history = run_experiment()

train_endTime = time.time() - train_time
print(f"Total time taken for training is : {train_endTime:.2f} seconds")


Epoch 1/50




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step - accuracy: 0.1667 - loss: 3.7092
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 416ms/step - accuracy: 0.3333 - loss: 3.2845
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 460ms/step - accuracy: 0.5556 - loss: 2.8324
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 366ms/step - accuracy: 0.5000 - loss: 2.3125
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 351ms/step - accuracy: 0.5833 - loss: 1.8421
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 404ms/step - accuracy: 0.6111 - loss: 1.3799
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 395ms/step - accuracy: 0.8056 - loss: 1.0373
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 383ms/step - accuracy: 0.8611 - loss: 0.7923
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'models/weights/video_classifier.weights.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [87]:
training_time = time.time()
trained_model, history = run_experiment()
end_time = time.time()-training_time
print(f"Total time take for trainning is {end_time/60} minnute")

Epoch 1/50




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.0000e+00 - loss: 3.7860
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 351ms/step - accuracy: 0.5556 - loss: 3.3329
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 353ms/step - accuracy: 0.5833 - loss: 3.0183
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 382ms/step - accuracy: 0.8889 - loss: 2.5255
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 442ms/step - accuracy: 0.8611 - loss: 2.0494
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 374ms/step - accuracy: 0.8611 - loss: 1.5835
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 429ms/step - accuracy: 0.9167 - loss: 1.0435
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 393ms/step - accuracy: 0.8611 - loss: 0.7281
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'models/weights/video_classifier.weights.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [33]:


if history:
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.savefig("models/inceptionv3/Vit_acc_81.png")
    plt.show()
    
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.savefig("models/inceptionv3/Vit_loss_81.png")
    plt.show()
else:
    print("Training did not produce a history object.")


NameError: name 'history' is not defined

In [34]:
def prepare_single_video(frames):
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    # Pad shorter videos.
    if len(frames) < MAX_SEQ_LENGTH:
        diff = MAX_SEQ_LENGTH - len(frames)
        padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
        frames = np.concatenate((frames, padding))

    frames = frames[None, ...]

    # Extract features from the frames of the current video.
    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            if np.mean(batch[j, :]) > 0.0:
                frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
            else:
                frame_features[i, j, :] = 0.0

    return frame_features


def predict_action(path):
    class_vocab = label_processor.get_vocabulary()

    frames = load_video(os.path.join("dataset", path))
    frame_features = prepare_single_video(frames)
    probabilities = trained_model.predict(frame_features)[0]
#     print(f"Predicted label: {class_vocab[np.argmax(probabilities)]}: {probabilities[np.argmax(probabilities)] * 100:5.2f}")


    for i in np.argsort(probabilities)[::-1]:
        print(f"  {class_vocab[i]}: {probabilities[i] * 100:5.2f}%")
    return frames, class_vocab[np.argmax(probabilities)] 


# This utility is for visualization.
# Referenced from:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub
def to_gif(images):
    converted_images = images.astype(np.uint8)
    imageio.mimsave("animation.gif", converted_images, fps=10)
    return embed.embed_file("animation.gif")


test_video = np.random.choice(test_df["video_name"].values.tolist())
print(f"Test video path: {test_video}")
test_frames = predict_action(test_video)[0]
to_gif(test_frames[:MAX_SEQ_LENGTH])

Test video path: Amilo\20241008_091534_noise.mp4


ValueError: all the input arrays must have same number of dimensions, but the array at index 0 has 1 dimension(s) and the array at index 1 has 4 dimension(s)

In [None]:
def check_res(video_file):
    frames = predict_action(video_file)
    print(video_file,frames[1])
    
    return frames[1]

In [None]:
test_df['predicted'] = test_df['video_name'].apply(check_res)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
report = classification_report(test_df["tag"],test_df["predicted"])
confusion_matrix = confusion_matrix(test_df["tag"],test_df["predicted"])
import seaborn as sn
df_cm = pd.DataFrame(confusion_matrix, range(25), range(25))
plt.figure(figsize=(8,8))
sn.set(font_scale=1.4) # for label size
sn.heatmap(df_cm, annot=True, annot_kws={"size": 16}) # font size
plt.savefig("models/inceptionv3/vit82.png")
plt.show()