In [1]:
import cv2
import numpy as np
import random
import tensorflow as tf

from official.projects.movinet.modeling import movinet
from official.projects.movinet.modeling import movinet_model

frame_step = 8
frames_per_clip = 8
output_size = (224, 224)


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.10.1 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extract_all_frames():
    video_capture = cv2.VideoCapture('../raw_session.mp4')
    frames = {}
    success, frame = video_capture.read()
    count = 0
    while success:
        if count % frame_step == 0:  # Extract frame every 'frame_rate' frames
            # frame = frame[0:720, 280:1000]
            # frame = cv2.resize(frame, (224, 224))
            frame = tf.image.convert_image_dtype(frame, tf.float32)
            frame = tf.image.resize_with_pad(frame, *output_size)
            frames[count] = frame
            if count % 800 == 0:
                print(count)
            # cv2.imshow('frame', frame)
            # if cv2.waitKey(10) & 0xFF == ord('q'):
            #     break
        success, frame = video_capture.read()
        count += 1
    video_capture.release()
    return frames

In [3]:
all_frames = extract_all_frames()

0
800
1600
2400
3200
4000
4800
5600
6400
7200


In [4]:
import json
with open('bite_frame_indexes.json') as f:
    bite_frame_indexes = json.load(f)
bite_frame_indexes

[508, 947, 1201, 1578, 1959, 2492, 2881, 3136, 3377, 3727, 5149, 6310, 6848]

In [5]:
bites = []
non_bites = []
current_clip = []
for i, frame in all_frames.items():
    if len(current_clip) < frames_per_clip:
        current_clip.append(frame)  # Collect frames for the current clip
    else:  # If the current clip is full, add it to the list of clips and start a new clip
        first_index = i - (frames_per_clip * frame_step)
        last_index = i - 1
        is_bite = False
        for bite_frame_index in bite_frame_indexes:
            if first_index <= bite_frame_index <= last_index:
                is_bite = True
                break
        if is_bite:
            bites.append(current_clip)
        else:
            non_bites.append(current_clip)
        current_clip = [frame]

print('Bites:', len(bites))
print('Non bites:', len(non_bites))

train_bites = bites[:int(len(bites) * 0.8)]
train_non_bites = non_bites[:int(len(non_bites) * 0.8)]
test_bites = bites[int(len(bites) * 0.8):]
test_non_bites = non_bites[int(len(non_bites) * 0.8):]

print('Train bites:', len(train_bites))
print('Train non bites:', len(train_non_bites))
print('Test bites:', len(test_bites))
print('Test non bites:', len(test_non_bites))


Bites: 13
Non bites: 110
Train bites: 10
Train non bites: 88
Test bites: 3
Test non bites: 22


In [6]:
def save_video(images, output_path, fps=30):
    # Get the height, width, and number of channels of the images
    height, width, layers = images[0].shape

    # Define the codec and create a VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # or 'XVID', 'MJPG', etc.
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    for image in images:
        out.write(image)  # Write each image to the video

    out.release()  # Release the VideoWriter object

In [7]:
# images = [np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8) for _ in range(60)]  # 60 random images
images = non_bites[1]
images = np.array(images)[..., [2, 1, 0]]
save_video(images, 'output_video.mp4', fps=4)

In [8]:
batch_size = 8
num_frames = 8

class ClipGenerator:
    def __init__(self, training):
        self.training = training

    def __call__(self):
        if self.training:
            all_clips = [(clip, 0) for clip in train_non_bites] + [(clip, 1) for clip in train_bites]
        else:
            all_clips = [(clip, 0) for clip in test_non_bites] + [(clip, 1) for clip in test_bites]
        random.shuffle(all_clips)
        for clip, label in all_clips:
            yield clip, label


output_signature = (tf.TensorSpec(shape = (None, None, None, 3), dtype = tf.float32),
                    tf.TensorSpec(shape = (), dtype = tf.int16))

train_ds = tf.data.Dataset.from_generator(ClipGenerator(True), output_signature = output_signature)
train_ds = train_ds.batch(batch_size)

test_ds = tf.data.Dataset.from_generator(ClipGenerator(False), output_signature = output_signature)
test_ds = test_ds.batch(batch_size)

In [9]:
for frames, labels in train_ds.take(10):
  print(labels)
print(f"Shape: {frames.shape}")
print(f"Label: {labels.shape}")

tf.Tensor([0 0 0 0 0 0 0 0], shape=(8,), dtype=int16)
tf.Tensor([0 0 0 0 1 1 0 0], shape=(8,), dtype=int16)
tf.Tensor([0 0 0 0 0 0 1 0], shape=(8,), dtype=int16)
tf.Tensor([0 0 0 0 0 0 1 1], shape=(8,), dtype=int16)
tf.Tensor([0 0 0 0 0 0 0 0], shape=(8,), dtype=int16)
tf.Tensor([0 0 0 1 0 0 0 0], shape=(8,), dtype=int16)
tf.Tensor([0 1 1 0 0 0 0 0], shape=(8,), dtype=int16)
tf.Tensor([0 0 0 0 0 0 0 0], shape=(8,), dtype=int16)
tf.Tensor([0 0 0 0 0 0 0 0], shape=(8,), dtype=int16)
tf.Tensor([0 0 0 0 0 0 1 0], shape=(8,), dtype=int16)
Shape: (8, 8, 224, 224, 3)
Label: (8,)


In [10]:
# !wget https://storage.googleapis.com/tf_model_garden/vision/movinet/movinet_a0_base.tar.gz -O movinet_a0_base.tar.gz
# !tar -xvf movinet_a0_base.tar.gz

model_id = 'a0'
resolution = 224

tf.keras.backend.clear_session()

backbone = movinet.Movinet(model_id=model_id)
backbone.trainable = False

# Set num_classes=600 to load the pre-trained weights from the original model
model = movinet_model.MovinetClassifier(backbone=backbone, num_classes=600)
model.build([None, None, None, None, 3])

checkpoint_dir = f'movinet_{model_id}_base'
checkpoint_path = tf.train.latest_checkpoint(checkpoint_dir)
checkpoint = tf.train.Checkpoint(model=model)
status = checkpoint.restore(checkpoint_path)
status.assert_existing_objects_matched()

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x190d8f14610>

In [11]:
def build_classifier(batch_size, num_frames, resolution, backbone, num_classes):
  """Builds a classifier on top of a backbone model."""
  model = movinet_model.MovinetClassifier(
      backbone=backbone,
      num_classes=num_classes)
  model.build([batch_size, frames_per_clip, resolution, resolution, 3])

  return model

In [12]:
model = build_classifier(batch_size, frames_per_clip, resolution, backbone, 2)

In [13]:
num_epochs = 2

loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)

model.compile(loss=loss_obj, optimizer=optimizer, metrics=['accuracy'])

class_weights = {0: 1.0, 1: len(non_bites) / len(bites)}
class_weights

{0: 1.0, 1: 8.461538461538462}

In [14]:
results = model.fit(train_ds,
                    validation_data=test_ds,
                    epochs=num_epochs,
                    validation_freq=1,
                    class_weight=class_weights,
                    verbose=1)

Epoch 1/2
Epoch 2/2


In [17]:
model.save('movinet_fine_tuned.h5')

In [25]:
def preprocess_frame(frame):
    # Resize the frame to the input size expected by the MoViNet model
    frame_resized = cv2.resize(frame, (224, 224))  # Adjust size if your model uses different dimensions
    # Normalize the frame (0-255 to 0-1)
    frame_normalized = frame_resized / 255.0
    return frame_normalized

def get_clips_from_stream(cap, num_frames=8, stride=8):
    frames = []
    clip_count = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame = cv2.flip(frame, 1)

        clip_count += 1
        if clip_count % stride == 0:
            processed_frame = preprocess_frame(frame)
            frames.append(processed_frame)

        # Show the real-time camera feed
        cv2.imshow('Camera Feed', frame)

        # Once we've gathered enough frames for one clip
        if len(frames) == num_frames:
            yield np.array(frames)
            frames = []  # Reset for the next clip

        # Exit if 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

In [31]:
import cv2

# Initialize video capture (0 for the first camera, or provide a filename for video file)
cap = cv2.VideoCapture(0)

print('starting')

for clip in get_clips_from_stream(cap):
    # Add batch dimension and predict
    clip_input = np.expand_dims(clip, axis=0)  # Shape should be (1, 8, 224, 224, 3)
    print(clip_input.shape)

    # Predict with the model
    prediction = model.predict(clip_input)

    # Analyze the prediction (for binary classification, for instance)
    print(prediction[0])
    if prediction[0][0] < prediction[0][1]:  # Adjust threshold if necessary
        print("Action detected!")
    else:
        print("No action detected.")

    # You can add visualization or further processing here

# Release the capture when done
cap.release()
cv2.destroyAllWindows()

starting
(1, 8, 224, 224, 3)
[ 0.6878443 -0.6533364]
No action detected.
(1, 8, 224, 224, 3)
[-0.15973273  0.27262238]
Action detected!
(1, 8, 224, 224, 3)
[-2.335931   2.4199948]
Action detected!
(1, 8, 224, 224, 3)
[ 0.8604256  -0.76669306]
No action detected.
(1, 8, 224, 224, 3)
[-1.252853   1.4119458]
Action detected!
(1, 8, 224, 224, 3)
[ 0.9320724  -0.83366436]
No action detected.
(1, 8, 224, 224, 3)
[ 0.5351406 -0.4519203]
No action detected.
(1, 8, 224, 224, 3)
[-0.82155776  0.9903483 ]
Action detected!
(1, 8, 224, 224, 3)
[-0.26073048  0.3499933 ]
Action detected!


In [None]:
    video_capture.release()


In [55]:
# Suppose the input shape is (batch_size, num_frames, height, width, channels)
# For your case with 8 frames of size 224x224 and 3 channels (RGB):
input_shape = (1, 8, 224, 224, 3)  # 1 for batch_size, can be any batch size

# Build the model with a dummy input to ensure all layers are initialized
model.build(input_shape)

# Now convert the model to TFLite
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.experimental_new_converter = True
tflite_model = converter.convert()

# Save the TFLite model
with open('model.tflite', 'wb') as f:
    f.write(tflite_model)





































































INFO:tensorflow:Assets written to: C:\Users\EVANMA~1\AppData\Local\Temp\tmptoidoco5\assets


INFO:tensorflow:Assets written to: C:\Users\EVANMA~1\AppData\Local\Temp\tmptoidoco5\assets


ConverterError: c:\Users\Evan Mazor\code\repos\bite-recognition\.venv310\lib\site-packages\keras\engine\base_layer.py:1097:0: error: 'tf.AvgPool3D' op is neither a custom op nor a flex op
<unknown>:0: note: loc(fused["StatefulPartitionedCall:", "StatefulPartitionedCall"]): called from
c:\Users\Evan Mazor\code\repos\bite-recognition\.venv310\lib\site-packages\keras\engine\base_layer.py:1097:0: note: Error code: ERROR_NEEDS_FLEX_OPS
c:\Users\Evan Mazor\code\repos\bite-recognition\.venv310\lib\site-packages\keras\engine\base_layer.py:1097:0: error: 'tf.AvgPool3D' op is neither a custom op nor a flex op
<unknown>:0: note: loc(fused["StatefulPartitionedCall:", "StatefulPartitionedCall"]): called from
c:\Users\Evan Mazor\code\repos\bite-recognition\.venv310\lib\site-packages\keras\engine\base_layer.py:1097:0: note: Error code: ERROR_NEEDS_FLEX_OPS
c:\Users\Evan Mazor\code\repos\bite-recognition\.venv310\lib\site-packages\keras\engine\base_layer.py:1097:0: error: 'tf.AvgPool3D' op is neither a custom op nor a flex op
<unknown>:0: note: loc(fused["StatefulPartitionedCall:", "StatefulPartitionedCall"]): called from
c:\Users\Evan Mazor\code\repos\bite-recognition\.venv310\lib\site-packages\keras\engine\base_layer.py:1097:0: note: Error code: ERROR_NEEDS_FLEX_OPS
c:\Users\Evan Mazor\code\repos\bite-recognition\.venv310\lib\site-packages\keras\engine\base_layer.py:1097:0: error: 'tf.AvgPool3D' op is neither a custom op nor a flex op
<unknown>:0: note: loc(fused["StatefulPartitionedCall:", "StatefulPartitionedCall"]): called from
c:\Users\Evan Mazor\code\repos\bite-recognition\.venv310\lib\site-packages\keras\engine\base_layer.py:1097:0: note: Error code: ERROR_NEEDS_FLEX_OPS
<unknown>:0: error: failed while converting: 'main': 
Some ops are not supported by the native TFLite runtime, you can enable TF kernels fallback using TF Select. See instructions: https://www.tensorflow.org/lite/guide/ops_select 
TF Select ops: AvgPool3D
Details:
	tf.AvgPool3D(tensor<?x?x?x?x32xf32>) -> (tensor<?x?x?x?x32xf32>) : {data_format = "NDHWC", device = "", ksize = [1, 1, 3, 3, 1], padding = "SAME", strides = [1, 1, 2, 2, 1]}
	tf.AvgPool3D(tensor<?x?x?x?x56xf32>) -> (tensor<?x?x?x?x56xf32>) : {data_format = "NDHWC", device = "", ksize = [1, 1, 3, 3, 1], padding = "SAME", strides = [1, 1, 2, 2, 1]}
	tf.AvgPool3D(tensor<?x?x?x?x8xf32>) -> (tensor<?x?x?x?x8xf32>) : {data_format = "NDHWC", device = "", ksize = [1, 1, 3, 3, 1], padding = "SAME", strides = [1, 1, 2, 2, 1]}



In [63]:
# print(np.array(bites[0]).shape)

bite_clip = np.array(bites[0]).astype(np.float32)
bite_clip = np.expand_dims(bite_clip, axis=0)
bite_clip.tofile('bite_clip.raw')
print(bite_clip.shape)
print(model.predict(bite_clip))

(1, 8, 224, 224, 3)
[[-1.1264563  1.172403 ]]


In [58]:
model.save_weights('movinet_fine_tuned')