In [7]:
from tensorflow_docs.vis import embed
from tensorflow import keras
from imutils import paths

from extract_face import extract_face

import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import numpy as np
import imageio
from time import time
import cv2
import os

In [6]:
IMG_SIZE = 224
BATCH_SIZE = 64
FRAME_RATE = 10
SOURCE_FRAME_RATE = 30
FRAME_STEP = (30 / FRAME_RATE)
EPOCHS = 10
FRAMES_PER_CLIP = 8

NUM_FEATURES = 2048

In [5]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print(f"Total videos for training: {len(train_df)}")
print(f"Total videos for testing: {len(test_df)}")

Total videos for training: 594
Total videos for testing: 224


In [66]:
import json

with open('bite_frame_indexes.json') as frame:
    bite_frame_indexes = json.load(frame)

def crop_center_square(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y+min_dim, start_x : start_x+min_dim]

def extract_all_frames():
    video_capture = cv2.VideoCapture('../raw_session.mp4')
    frames = {}
    success, frame = video_capture.read()
    count = 0
    while success:
        if count % FRAME_STEP == 0:  # Extract frames at the frame rate
            # frame = crop_center_square(frame)
            # Extract face
            face = extract_face(frame)
            if face is not None and face.size != 0:
                face = cv2.resize(face, (IMG_SIZE, IMG_SIZE))
                face = face[:, :, [2, 1, 0]]
                frames[count] = face
            else:
                print('skipping frame', count)
                # cv2.imwrite(f"test_frames/frame_{count}.jpg", frame)
        if count % 1000 == 0:
            print(count)
        success, frame = video_capture.read()
        count += 1
    video_capture.release()
    return frames

all_frames = extract_all_frames()

0
1000
2000
3000
4000
skipping frame 4365
skipping frame 4368
skipping frame 4371
skipping frame 4374
skipping frame 4377
skipping frame 4380
skipping frame 4383
skipping frame 4386
skipping frame 4389
skipping frame 4431
skipping frame 4434
skipping frame 4437
skipping frame 4440
skipping frame 4443
skipping frame 4446
skipping frame 4449
skipping frame 4452
skipping frame 4455
skipping frame 4458
skipping frame 4461
skipping frame 4464
skipping frame 4467
skipping frame 4470
skipping frame 4473
skipping frame 4476
skipping frame 4479
skipping frame 4482
skipping frame 4485
skipping frame 4488
skipping frame 4491
skipping frame 4494
skipping frame 4497
skipping frame 4500
skipping frame 4503
skipping frame 4506
skipping frame 4509
skipping frame 4512
skipping frame 4515
skipping frame 4518
skipping frame 4521
skipping frame 4524
skipping frame 4527
skipping frame 4530
skipping frame 4533
skipping frame 4536
skipping frame 4539
skipping frame 4542
skipping frame 4545
skipping frame 454

In [67]:
for i, frame in enumerate(list(all_frames.values())[:10]):
    cv2.imwrite(f"test_frames/frame_{i}.jpg", frame[:, :, [2, 1, 0]])

In [88]:
bites = []
non_bites = []
current_clip = []
for i, frame in all_frames.items():
    if len(current_clip) < FRAMES_PER_CLIP:
        current_clip.append(frame)  # Collect frames for the current clip
    else:  # If the current clip is full, add it to the list of clips and start a new clip
        first_index = i - (FRAMES_PER_CLIP * FRAME_STEP)
        last_index = i - 1
        is_bite = False
        for bite_frame_index in bite_frame_indexes:
            if first_index <= bite_frame_index <= last_index:
                is_bite = True
                break
        if is_bite:
            bites.append(np.array(current_clip))
        else:
            non_bites.append(np.array(current_clip))
        current_clip = [frame]

print('Bites:', len(bites))
print('Non bites:', len(non_bites))

train_bites = bites[:int(len(bites) * 0.8)]
train_non_bites = non_bites[:int(len(non_bites) * 0.8)]
test_bites = bites[int(len(bites) * 0.8):]
test_non_bites = non_bites[int(len(non_bites) * 0.8):]

print('Train bites:', len(train_bites))
print('Train non bites:', len(train_non_bites))
print('Test bites:', len(test_bites))
print('Test non bites:', len(test_non_bites))

Bites: 13
Non bites: 278
Train bites: 10
Train non bites: 222
Test bites: 3
Test non bites: 56


In [20]:
# The following two methods are taken from this tutorial:
# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub

def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frame = frame[:, :, [2, 1, 0]]
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

In [3]:
def build_feature_extractor():
    feature_extractor = keras.applications.InceptionV3(weights="imagenet",
                                                       include_top=False, pooling="avg",
                                                       input_shape=(IMG_SIZE, IMG_SIZE, 3))
    preprocess_input = keras.applications.inception_v3.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")

feature_extractor = build_feature_extractor()

In [31]:
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["tag"])
)
print(label_processor.get_vocabulary())

['CricketShot', 'PlayingCello', 'Punch', 'ShavingBeard', 'TennisSwing']


In [97]:
def prepare_all_videos(bites_list, non_bites_list):
    num_samples = len(bites_list) + len(non_bites_list)

    # `frame_masks` and `frame_features` are what we will feed to our sequence model.
    # `frame_masks` will contain a bunch of booleans denoting if a timestep is
    # masked with padding or not.
    # frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(shape=(num_samples, FRAMES_PER_CLIP, NUM_FEATURES),
                                dtype="float32")
    labels = np.zeros(shape=(num_samples), dtype="int")

    for is_bite, sequences in enumerate([non_bites_list, bites_list]):
        for idx, frames in enumerate(sequences):
            frames = frames[None, ...]

            # Initialize placeholders to store the features of the current video.
            temp_frame_features = np.zeros(shape=(1, FRAMES_PER_CLIP, NUM_FEATURES),
                                    dtype="float32")

            # Extract features from the frames of the current video.
            for i, frame in enumerate(frames):
                for j in range(FRAMES_PER_CLIP):
                    temp_frame_features[i, j, :] = feature_extractor.predict(frame[None, j, :])

            print(is_bite, idx, '    ', temp_frame_features.shape)
            frame_features[idx, ] = temp_frame_features.squeeze()
            labels[idx] = is_bite

    return frame_features, labels

In [99]:
train_data, train_labels = prepare_all_videos(train_bites, train_non_bites)
test_data, test_labels = prepare_all_videos(test_bites, test_non_bites)

print(f"Frame features in train set: {train_data.shape} and {test_data.shape}")

Frame features in train set: (232, 8, 2048) and (59, 8, 2048)


In [122]:
# Utility for our sequence model.
def get_sequence_model():
    frame_features_input = keras.Input((FRAMES_PER_CLIP, NUM_FEATURES))

    x = keras.layers.GRU(16, return_sequences=True)(frame_features_input)
    x = keras.layers.GRU(8)(x)
    x = keras.layers.Dropout(0.4)(x)
    x = keras.layers.Dense(8, activation="relu")(x)
    output = keras.layers.Dense(1, activation="sigmoid")(x)

    rnn_model = keras.Model(frame_features_input, output)

    rnn_model.compile(
        loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    return rnn_model


# Utility for running experiments.
def run_experiment():
    filepath = "./model.keras"
    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath, save_best_only=True, verbose=1
    )

    seq_model = get_sequence_model()
    history = seq_model.fit(
        train_data,
        train_labels,
        validation_split=0.3,
        epochs=EPOCHS,
        callbacks=[checkpoint],
        class_weight={0: 13 / 278, 1: 1},
    )

    seq_model.load_weights(filepath)
    _, accuracy = seq_model.evaluate(test_data, test_labels)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

    return history, seq_model


history, sequence_model = run_experiment()

Epoch 1/10
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9511 - loss: 0.0801
Epoch 1: val_loss improved from inf to 0.62868, saving model to ./model.keras
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 141ms/step - accuracy: 0.9493 - loss: 0.0817 - val_accuracy: 1.0000 - val_loss: 0.6287
Epoch 2/10
[1m1/6[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m0s[0m 45ms/step - accuracy: 0.9688 - loss: 0.0249
Epoch 2: val_loss did not improve from 0.62868
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.9224 - loss: 0.0705 - val_accuracy: 0.8571 - val_loss: 0.6547
Epoch 3/10
[1m5/6[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 15ms/step - accuracy: 0.9399 - loss: 0.0773
Epoch 3: val_loss did not improve from 0.62868
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.9377 - loss: 0.0776 - val_accuracy: 0.8571 - val_loss: 0.6441
Epoch 4/10
[1m6/6[0m [32m━━━━

In [13]:
# clip = bites[2]
# features = np.zeros(shape=(1, FRAMES_PER_CLIP, NUM_FEATURES),
#                                     dtype="float32")
# for j in range(FRAMES_PER_CLIP):
#     features[0, j, :] = feature_extractor.predict(clip[None, j, :])
# sequence_model.predict(features)
sequence_model = keras.models.load_model('./model.keras')


features = feature_extractor.predict(bites[0])
sequence_model.predict(features[None, ...])

NameError: name 'bites' is not defined

In [15]:
video_capture = cv2.VideoCapture(0)
success, frame = video_capture.read()
count = 0
current_clip = []
while success:
    # Extract face
    cv2.imshow('frame', frame)

    face = extract_face(frame)
    if face is not None and face.size != 0:
        face = cv2.resize(face, (IMG_SIZE, IMG_SIZE))
        face = face[:, :, [2, 1, 0]]
        if len(current_clip) < FRAMES_PER_CLIP:
            current_clip.append(face)
        else:
            frames = np.array(current_clip)
            features = feature_extractor.predict(frames)
            prediction = sequence_model.predict(features[None, ...])
            print(prediction)
            current_clip = []

    else:
        print('skipping frame because no face')
        # cv2.imwrite(f"test_frames/frame_{count}.jpg", frame)

    cv2.waitKey(1000 // FRAME_RATE)

    success, frame = video_capture.read()
    count += 1
video_capture.release()

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 489ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[[0.49062577]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 380ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[[0.50065696]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 372ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[[0.52434826]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 385ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[[0.52913594]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 364ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
[[0.52666116]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 388ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[[0.516014]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

KeyboardInterrupt: 

In [16]:
video_capture.release()
