In [1]:
import os
import tensorflow as tf
import cv2
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models
import itertools
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.models import model_from_json

In [73]:
# Direktori dataset
VIDEO_TRAIN_DIR = '../data/videos/train/'
VIDEO_VAL_DIR = '../data/videos/validation/'

In [74]:
# Function to extract frames from video
def extract_frames_from_video(video_path, num_frames=30, target_size=(224, 224)):
    cap = cv2.VideoCapture(video_path)
    frames = []
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_interval = frame_count // num_frames

    for i in range(num_frames):
        cap.set(cv2.CAP_PROP_POS_FRAMES, i * frame_interval)
        ret, frame = cap.read()
        if ret:
            frame = cv2.resize(frame, target_size)
            frame = frame / 255.0
            frames.append(frame)

    cap.release()
    return np.array(frames)

In [None]:
# Function to get all video files and labels based on subfolder
def get_video_files_and_labels(video_dir):
    video_files = []
    labels = {}
    for subfolder in os.listdir(video_dir):
        subfolder_path = os.path.join(video_dir, subfolder)
        if os.path.isdir(subfolder_path):
            for file in os.listdir(subfolder_path):
                if file.endswith(".mp4"):
                    video_path = os.path.join(subfolder_path, file)
                    video_files.append(video_path)
                    labels[video_path] = subfolder
    return video_files, labels

In [76]:
# Function to generate batches of video data with labels
def video_data_generator(video_dir, labels, batch_size=16, num_frames=30, target_size=(224, 224)):
    video_files, _ = get_video_files_and_labels(video_dir)
    while True:
        batch_videos = []
        batch_labels = []
        for video_path in itertools.islice(video_files, 0, batch_size):
            frames = extract_frames_from_video(video_path, num_frames, target_size)
            label = labels.get(video_path)
            batch_videos.append(frames)
            batch_labels.append(label)

        batch_videos = np.array(batch_videos)
        batch_labels = np.array(batch_labels)
        yield batch_videos, batch_labels


In [77]:
# Get all video files and labels for training and validation
video_train_files, labels_train = get_video_files_and_labels(VIDEO_TRAIN_DIR)
video_val_files, labels_val = get_video_files_and_labels(VIDEO_VAL_DIR)

# Initialize LabelEncoder to convert string labels to integer labels
label_encoder = LabelEncoder()

# Fit the encoder on the training labels and transform labels
train_labels_encoded = label_encoder.fit_transform(list(labels_train.values()))
val_labels_encoded = label_encoder.transform(list(labels_val.values()))

# Map the labels back to their respective video files
labels_train_encoded = {video_file: train_labels_encoded[i] for i, video_file in enumerate(video_train_files)}
labels_val_encoded = {video_file: val_labels_encoded[i] for i, video_file in enumerate(video_val_files)}

# Calculate steps per epoch and validation steps
steps_per_epoch = len(video_train_files) // 32
validation_steps = len(video_val_files) // 32

# Create video data generators for training and validation
train_video_generator = video_data_generator(VIDEO_TRAIN_DIR, labels_train_encoded, batch_size=16)
validation_video_generator = video_data_generator(VIDEO_VAL_DIR, labels_val_encoded, batch_size=16)

print(f"Train generator and validation generator prepared.")
print(f"Jumlah video pelatihan: {len(video_train_files)}")
print(f"Jumlah video validasi: {len(video_val_files)}")

Train generator and validation generator prepared.
Jumlah video pelatihan: 52
Jumlah video validasi: 52


In [78]:
layers.Dropout(0.5)  # Dropout layer with 50% probability

data_augmentation = tf.keras.Sequential([
    layers.RandomFlip('horizontal'),
    layers.RandomRotation(0.2),
    layers.RandomZoom(0.2),
])

lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-3,
    decay_steps=100000,
    decay_rate=0.96,
    staircase=True)

early_stopping = EarlyStopping(monitor='val_loss', patience=3)
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)


In [79]:
# Build the CNN 3D model for sign language recognition
model = models.Sequential([
    layers.Input(shape=(30, 224, 224, 3)),
    layers.Conv3D(32, (3, 3, 3), activation='relu'),
    layers.MaxPooling3D((2, 2, 2)),
    layers.Conv3D(64, (3, 3, 3), activation='relu'),
    layers.MaxPooling3D((2, 2, 2)),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(len(label_encoder.classes_), activation='softmax')
])

optimizer = Adam(learning_rate=1e-4)  # Lower learning rate
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [80]:
# Train the model with video data
history = model.fit(
    train_video_generator,
    validation_data=validation_video_generator,
    epochs=10,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps
)

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 54s/step - accuracy: 0.0000e+00 - loss: 3.2643 - val_accuracy: 0.1250 - val_loss: 4.0727
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 34s/step - accuracy: 0.1250 - loss: 4.0727 - val_accuracy: 0.3750 - val_loss: 4.2594
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 32s/step - accuracy: 0.3750 - loss: 4.2594 - val_accuracy: 0.1875 - val_loss: 3.0400
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 30s/step - accuracy: 0.1875 - loss: 3.0400 - val_accuracy: 0.4375 - val_loss: 2.5409
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 31s/step - accuracy: 0.4375 - loss: 2.5409 - val_accuracy: 0.3750 - val_loss: 2.5287
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 35s/step - accuracy: 0.3750 - loss: 2.5287 - val_accuracy: 0.5625 - val_loss: 2.1370
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━

In [None]:
# Create the directory if it doesn't exist
os.makedirs('../models/models_video', exist_ok=True)

# Now save the model
model.save('../models/models/video_model.h5')

# Menyimpan arsitektur model ke format JSON
model_json = model.to_json()
with open('../models/models/video_model.json', 'w') as json_file:
    json_file.write(model_json)



## Test Prediction

In [4]:
# Load the pretrained model
model_path = '../models/models_video/video_model.h5'
model = load_model(model_path)

# Print the input shape of the model
print(model.input_shape)



(None, 30, 224, 224, 3)


In [13]:
# Preprocessing function for the video
def preprocess_video(video_path, target_size=(224, 224), max_frames=30):
    cap = cv2.VideoCapture(video_path)
    frames = []

    count = 0
    while cap.isOpened() and count < max_frames:
        ret, frame = cap.read()
        if not ret:
            break
        frame = cv2.resize(frame, target_size)
        frame = frame / 255.0
        frames.append(frame)
        count += 1

    cap.release()

    # Pad or truncate the frames to exactly match the required number of frames (30 in this case)
    if len(frames) < max_frames:
        # If there are fewer than 30 frames, pad with empty frames
        frames += [np.zeros_like(frames[0])] * (max_frames - len(frames))
    frames = np.array(frames)

    # Add batch dimension, making shape (1, 30, 224, 224, 3)
    frames = np.expand_dims(frames, axis=0)

    return frames

In [21]:
# Define the class labels
class_labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

# Path to your video
video_path = '../data/videos/train/C/c1.mp4'

# Preprocess the video
preprocessed_video = preprocess_video(video_path)

# Check the shape of the preprocessed video
print("Preprocessed video shape:", preprocessed_video.shape)

# Predict using the model
prediction = model.predict(preprocessed_video)

# Print the prediction
print("Prediction:", prediction)

# Get the index of the class with the highest probability
predicted_index = np.argmax(prediction)

# Map the predicted index to the corresponding class label
predicted_label = class_labels[predicted_index]

# Print the predicted label
print(f"Predicted label: {predicted_label}")

Preprocessed video shape: (1, 30, 224, 224, 3)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
Prediction: [[2.86035985e-02 3.13805610e-01 1.85759723e-01 7.16920570e-02
  1.29557149e-02 1.10213503e-01 1.06744289e-01 1.19678535e-01
  1.81934040e-04 1.80583569e-07 2.91786011e-04 4.00133540e-05
  2.39438901e-04 2.41743913e-03 1.03944685e-05 1.49587882e-04
  1.01852493e-05 4.20578499e-06 2.22209748e-03 1.62031240e-04
  6.04126370e-04 4.03868034e-05 6.88902655e-05 3.16705591e-05
  1.84414536e-03 4.22284752e-02]]
Predicted label: B
