In [1]:
import os

import cv2
import numpy as np
from tensorflow import keras

In [2]:
# Lookup table from model output index to activity name
class_labels = dict(enumerate(
    ['cycling', 'drinking', 'eating', 'fighting', 'running', 'sleeping']
))

# Relative folders for the model, the input clips and the annotated output
path_model = 'model'
path_test_video = 'test-video'
path_predicted_video = 'avg-predicted-video'

# File name of the saved model inside `path_model`
modelname = '230206_har6.h5'

# Base name (no extension) of the clip to run inference on
filename = 'sleeping_2'

In [3]:
# Restore the pre-trained classifier from its saved file in the model folder
model_file = f'{path_model}/{modelname}'
model = keras.models.load_model(model_file)

In [4]:
# Open the input clip and fail loudly if it cannot be read; otherwise the
# processing loop would be skipped silently and an empty output file written.
video_path = path_test_video + '/' + filename + '.mp4'
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise FileNotFoundError(f'Could not open input video: {video_path}')

# Resolution of the input video (named property ids instead of magic 3 / 4)
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# The annotated output is written at half the input resolution
output_width = frame_width // 2
output_height = frame_height // 2

# Write the output at the source frame rate so playback speed matches the
# input; fall back to 20 fps when the backend reports 0 / NaN / a negative value.
source_fps = cap.get(cv2.CAP_PROP_FPS)
if not source_fps > 0:
    source_fps = 20.0

# cv2.VideoWriter does not create missing folders, so make sure it exists
os.makedirs(path_predicted_video, exist_ok=True)

# Open the output video writer
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
output_path = path_predicted_video + '/' + 'avg-predicted-' + filename + '.mp4'
out = cv2.VideoWriter(output_path, fourcc, source_fps, (output_width, output_height))
if not out.isOpened():
    cap.release()
    raise IOError(f'Could not open output video for writing: {output_path}')

# State for averaging predictions over a window of consecutive frames
num_of_frame = 18                                   # frames per averaging window
count = 0                                           # frames accumulated in the current window
prediction_sum = np.zeros((1, len(class_labels)))   # running sum, one column per class
is_first_frame = True                               # no averaged prediction is available yet

In [5]:
# Classify every frame, overlay the current label (refreshed once per window of
# `num_of_frame` frames with the window's averaged prediction) and write the
# annotated, half-resolution frame to the output video.
#
# The capture and writer are released in `finally`, so the output file is
# finalized even if inference raises or the kernel is interrupted.
# No window is ever shown by this notebook, so the former cv2.waitKey('q') check
# and cv2.destroyAllWindows() call had nothing to act on and were dropped;
# interrupt the kernel to stop early.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # No frame was returned (end of stream or a read failure)
            break

        # Build the model input: 224x224, BGR -> RGB channel order, plus a
        # leading batch axis.
        # NOTE(review): pixel values are passed through unscaled - confirm the
        # model performs any required normalization itself.
        frame224 = cv2.resize(frame, (224, 224))
        frame224 = cv2.cvtColor(frame224, cv2.COLOR_BGR2RGB)
        frame224 = np.expand_dims(frame224, axis=0)

        # verbose=0 keeps Keras from printing a progress bar for every frame
        prediction = model.predict(frame224, verbose=0)
        prediction_sum += prediction
        count += 1

        if is_first_frame:
            # Until the first window completes, show the first frame's own prediction
            prediction_class = np.argmax(prediction, axis=1)
            prediction_percent = np.max(prediction) * 100
            is_first_frame = False

        # A full window has been accumulated: switch to its averaged prediction
        if count == num_of_frame:
            prediction_avg = prediction_sum / num_of_frame
            prediction_class = np.argmax(prediction_avg, axis=1)
            prediction_percent = np.max(prediction_avg) * 100

            # Start a new window
            count = 0
            prediction_sum = np.zeros_like(prediction_sum)

        # Label shown on the frame, e.g. "<class name>: <confidence>%"
        text = f'{class_labels[prediction_class[0]]}: {prediction_percent:.2f}%'

        # Filled black box sized to the text so the label stays readable
        (text_width, text_height), _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)
        cv2.rectangle(frame, (10 - 2, 30 - text_height - 2), (10 + text_width + 2, 30 + 2), (0, 0, 0), -1)

        # Green label text on top of the box
        cv2.putText(frame, text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

        # Downscale to the writer's frame size and append to the output video
        frame = cv2.resize(frame, (output_width, output_height), interpolation=cv2.INTER_CUBIC)
        out.write(frame)
finally:
    # Always release the capture and finalize the output file
    cap.release()
    out.release()