Program that counts the number of jumping jacks in a given video

In [None]:
!cp "/content/drive/MyDrive/cv2_gpu/cv2.cpython-37m-x86_64-linux-gnu.so" .

In [None]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import models, transforms, datasets
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
import cv2
from google.colab.patches import cv2_imshow
import os
from imutils import paths
import scipy.io
import pandas as pd
import csv
import matplotlib
import matplotlib.pyplot as plt
cv2.__version__

'4.5.2-dev'

In [None]:
# Run on the first CUDA device when one is visible, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)

cuda:0


In [None]:
# Locations of the model files used by this notebook (Google Drive mount).
# State dict loaded into the Mask R-CNN detector.
MASK_RCNN_PATH = '/content/drive/MyDrive/Colab_Notebooks/mask_rcnn.pt'
# State dict loaded into the r3d_18 video classifier.
CLASSIFIER_PATH = '/content/drive/MyDrive/Colab_Notebooks/classfier-3d-6.pt'
# Caffe network definition and weights handed to cv2.dnn.readNetFromCaffe.
OPEN_POSE_PROTO = '/content/drive/MyDrive/OpenPose/pose_deploy_linevec_faster_4_stages.prototxt'
OPEN_POSE_MODEL = '/content/drive/MyDrive/OpenPose/pose_iter_160000.caffemodel'

In [None]:
# --- Person detector: Mask R-CNN with both heads resized to 2 classes -------
mask_rcnn = models.detection.maskrcnn_resnet50_fpn(pretrained=True)
in_features = mask_rcnn.roi_heads.box_predictor.cls_score.in_features
mask_rcnn.roi_heads.box_predictor = FastRCNNPredictor(in_features, 2)
in_features_mask = mask_rcnn.roi_heads.mask_predictor.conv5_mask.in_channels
hidden_layer = 256
mask_rcnn.roi_heads.mask_predictor = MaskRCNNPredictor(in_features_mask, hidden_layer, 2)
# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded on
# a CPU-only runtime as well; the model is moved to the target device later.
mask_rcnn.load_state_dict(torch.load(MASK_RCNN_PATH, map_location='cpu'))

# --- Action classifier: 3D ResNet-18 with a 2-way output layer --------------
classifier = models.video.r3d_18(pretrained=True)
classifier.fc = nn.Linear(512, 2)
classifier.load_state_dict(torch.load(CLASSIFIER_PATH, map_location='cpu'))

# --- Pose network: Caffe model run through OpenCV's dnn module --------------
pose_rcnn = cv2.dnn.readNetFromCaffe(OPEN_POSE_PROTO, OPEN_POSE_MODEL)
# Request the CUDA backend/target for the pose network.
pose_rcnn.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
pose_rcnn.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)

In [None]:
# Pairs of keypoint indices that draw_keypoints joins with a line to render
# the skeleton. Indices run from 0 to 13, i.e. they address a list of 14
# keypoints.
# NOTE(review): which body part each index denotes depends on the pose
# model's keypoint ordering — confirm against the model documentation.
edges = [
    (0, 1), (1, 2), (1, 5), (2, 3), (3, 4), (5, 6),
    (6, 7), (8, 9), (9, 10), (11, 12), (12, 13), (8, 11),
    (2, 8), (5, 11)
]

def draw_keypoints(keypoints, image):
    """Draw the skeleton described by ``edges`` onto ``image``.

    Args:
        keypoints: indexable collection of (x, y) pixel coordinates; it is
            addressed with every index that appears in ``edges``.
        image: image array that is drawn on in place.

    Returns:
        The same ``image`` object, with one coloured line per edge.
    """
    edge_count = float(len(edges))

    for idx, (start, end) in enumerate(edges):
        # Spread the hues evenly so every edge gets its own colour.
        colour = matplotlib.colors.hsv_to_rgb([idx / edge_count, 1.0, 1.0]) * 255

        start_xy = (keypoints[start][0], keypoints[start][1])
        end_xy = (keypoints[end][0], keypoints[end][1])

        # Connect the two keypoints of this edge.
        cv2.line(image, start_xy, end_xy, tuple(colour), 5, lineType=cv2.LINE_AA)

    return image


def extract_points(frame, outputs, num_points=14, threshold=0.10):
    """Locate the most confident position of each body part in ``frame``.

    Args:
        frame: image array whose first two dimensions are (height, width).
        outputs: 4-D array of confidence maps indexed as
            ``outputs[0, part, row, col]``.
        num_points: number of leading confidence maps to inspect.
        threshold: a part is kept only if its peak confidence is strictly
            greater than this value.

    Returns:
        A list of ``(x, y)`` integer pixel coordinates in ``frame`` space,
        one per part whose peak exceeded ``threshold``. Parts below the
        threshold are skipped, so the list can be shorter than
        ``num_points`` and list positions then no longer match part indices.
    """
    # The maps are addressed as outputs[0, part, :, :], so their spatial size
    # lives in the last two axes (axis 1 is the part/channel axis).
    map_height = outputs.shape[2]
    map_width = outputs.shape[3]
    frame_height = frame.shape[0]
    frame_width = frame.shape[1]

    points = []

    for part in range(num_points):
        # Confidence map of the corresponding body part.
        prob_map = outputs[0, part, :, :]

        # Global maximum of the map (first occurrence in row-major order).
        row, col = np.unravel_index(np.argmax(prob_map), prob_map.shape)
        prob = prob_map[row, col]

        if prob > threshold:
            # Scale the peak from map coordinates to frame coordinates.
            x = (col * frame_width) / map_width
            y = (row * frame_height) / map_height
            points.append((int(x), int(y)))

    return points

def check_jumping_jack(keypoints):
    r"""Return True when the pose looks like the open phase of a jumping jack.

    The pose being matched looks like this:

           /   \
           \ O /
            \_/
             V
            / \
           /   \

    Both elbows must lie above keypoint 1 (image y grows downwards, so
    "above" means a smaller y), keypoint 10 must be left of keypoint 8 and
    keypoint 13 must be right of keypoint 11.

    Args:
        keypoints: indexable collection of (x, y) coordinates; indices
            1, 3, 6, 8, 10, 11 and 13 are read.

    Returns:
        bool: True if all four conditions hold, False otherwise.
    """
    # NOTE(review): the body-part names below are the original author's
    # labels for these indices — confirm against the pose model's ordering.
    top_head = keypoints[1]
    left_elbow = keypoints[3]
    right_elbow = keypoints[6]
    left_hip = keypoints[8]
    right_hip = keypoints[11]
    left_foot = keypoints[10]
    right_foot = keypoints[13]

    arms_raised = top_head[1] > left_elbow[1] and top_head[1] > right_elbow[1]
    legs_spread = left_foot[0] < left_hip[0] and right_foot[0] > right_hip[0]

    return arms_raised and legs_spread

In [None]:
# Count the jumping jacks in VIDEO_PATH.
#
# Every frame goes through the Mask R-CNN detector. Frames with a detection
# are buffered into a clip; once the clip is full and the current frame holds
# a confident detection, the clip is classified by the 3D network and each of
# its frames is run through the pose network. The counter goes up by one each
# time the pose switches from "not open" to "open" (see check_jumping_jack)
# while the clip was classified as a jumping jack.
#
# NOTE: frames without any detection are appended to video_arr immediately,
# whereas buffered frames are appended only when their clip is processed, so
# the order of video_arr can differ from the order of the input frames.
VIDEO_PATH = '/content/test_video.mp4'
CLIP_LENGTH = 16  # minimum number of buffered frames handed to the classifier


def _draw_counter(image, count, frame_size, text_origin, font_scale):
    """Draw a black banner in the top-left corner with the current count."""
    banner_corner = (int(frame_size[0] / 2), int(frame_size[1] / 10))
    image = cv2.rectangle(image, (0, 0), banner_corner, (0, 0, 0), -1)
    return cv2.putText(image, 'Counter:{}'.format(count), text_origin,
                       cv2.FONT_HERSHEY_DUPLEX, font_scale, (255, 255, 255), 2,
                       cv2.LINE_AA)


mask_rcnn.to(device)
classifier.to(device)

action_classes = ['Jumping Jack', 'Other']
video_arr = []
current_clip = []
raw_frames = []

# True while the person is in the open pose that has already been counted.
complete_jj = False

mask_rcnn.eval()
classifier.eval()

counter = 0
cap = cv2.VideoCapture(VIDEO_PATH)
size = None

try:
    # Inference only: skip autograd bookkeeping for every forward pass.
    with torch.no_grad():
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            size = (frame.shape[1], frame.shape[0])  # width, height

            # HWC uint8 frame -> 1 x C x H x W float tensor scaled to [0, 1].
            inputs = np.transpose(frame, (2, 0, 1))
            inputs = inputs.astype(float) / 255
            inputs = torch.from_numpy(inputs)
            inputs = torch.unsqueeze(inputs, 0).float()
            inputs = inputs.to(device)
            detections = mask_rcnn(inputs)

            # No detection at all: emit the frame with the counter only.
            if detections[0]['scores'].numel() == 0:
                frame = _draw_counter(frame, counter, size,
                                      (20, int(size[1] / 10) - 20), 2)
                video_arr.append(frame)
                continue

            found_human = detections[0]['scores'][0].item() > 0.5
            current_clip.append(frame)

            # ">=" rather than "==": if the frame that fills the clip has a
            # low-confidence detection the clip keeps growing, and an exact
            # length test would then never fire again.
            if not (found_human and len(current_clip) >= CLIP_LENGTH):
                continue

            # Prepare the clip for the 3D classifier:
            # [frames, H, W, C] -> [1, C, frames, H, W].
            inputs_classifier = np.array(current_clip)
            inputs_classifier = np.expand_dims(inputs_classifier, axis=0)
            inputs_classifier = np.transpose(inputs_classifier, (0, 4, 1, 2, 3))
            inputs_classifier = torch.tensor(inputs_classifier, dtype=torch.float32)
            inputs_classifier = inputs_classifier.to(device)

            logits = classifier(inputs_classifier)
            _, result = torch.max(logits.data, 1)
            predict_class = action_classes[result.item()] == 'Jumping Jack'
            print(predict_class)

            for f in current_clip:
                inpBlob = cv2.dnn.blobFromImage(f, 1.0 / 255, (368, 368), (0, 0, 0),
                                                swapRB=False, crop=False)

                # Set the prepared blob as the input of the pose network.
                pose_rcnn.setInput(inpBlob)
                pose_out = pose_rcnn.forward()
                points = extract_points(f, pose_out)

                # Only evaluate the pose when every keypoint was found.
                if len(points) == 14:
                    check_jj = check_jumping_jack(points)

                    if not complete_jj and check_jj and predict_class:
                        # Entered the open pose: count it once.
                        counter += 1
                        complete_jj = True
                    elif complete_jj and not check_jj:
                        # Left the open pose: arm the counter again.
                        complete_jj = False

                    f = draw_keypoints(points, f)

                f = _draw_counter(f, counter, size, (0, int(size[1] / 10)), 0.8)
                video_arr.append(f)

            current_clip.clear()
finally:
    # Release the capture even if a model call raised.
    cap.release()

# Frames still buffered when the video ended never formed a processed clip;
# emit them with the counter so the tail of the video is not lost.
for f in current_clip:
    f = _draw_counter(f, counter, size, (0, int(size[1] / 10)), 0.8)
    video_arr.append(f)
current_clip.clear()

In [None]:
# Write the annotated frames collected in video_arr to an AVI file at 10 fps.
if size is None:
    # size is only set once a frame has been read from the input video.
    raise RuntimeError('No frames were read from the video; nothing to write.')

out = cv2.VideoWriter('/content/result.avi', cv2.VideoWriter_fourcc(*'DIVX'), 10, size)
try:
    for annotated in video_arr:
        out.write(annotated)
finally:
    # Always finalise the file, even if a write fails.
    out.release()