In [3]:
import cv2
import numpy as np
import os
import sys
from samples.coco import coco
from mrcnn import utils
from mrcnn import model as modellib

In [4]:
# Working locations, all resolved relative to the current working directory.
ROOT_DIR = os.getcwd()
# Pre-trained COCO weights file that is loaded into the model later on.
COCO_MODEL_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.h5")
# Directory handed to MaskRCNN as its `model_dir`.
MODEL_DIR = os.path.join(ROOT_DIR, "logs")

# Fetch the weights through mrcnn.utils only when they are not on disk yet.
if not os.path.exists(COCO_MODEL_PATH):
    utils.download_trained_weights(COCO_MODEL_PATH)

In [5]:
# Inference-time configuration
class InferenceConfig(coco.CocoConfig):
    """COCO configuration overridden for running detection in this notebook."""

    # model.detect() is called with a single image/frame per call further
    # down, so request one image per GPU on one GPU.
    # NOTE(review): presumably mrcnn derives its batch size from these two
    # values -- confirm against the library before raising either of them.
    IMAGES_PER_GPU = 1
    GPU_COUNT = 1


config = InferenceConfig()

In [6]:
# Build Mask R-CNN in inference mode, then load the pre-trained COCO weights
# into it (layers are matched by name).
model = modellib.MaskRCNN(mode="inference", config=config, model_dir=MODEL_DIR)
model.load_weights(COCO_MODEL_PATH, by_name=True)
# COCO object names. The position in this list is the class id used to look a
# label up (names[ids[i]] in display_instances), so the order must not change.
# Entry 0 is 'BG' (presumably the background class -- confirm against mrcnn).
class_names = [
    'BG',
    'person',
    'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
    'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
    'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
    'backpack', 'umbrella', 'handbag', 'tie', 'suitcase',
    'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli',
    'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet',
    'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
]

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [7]:
def apply_mask(image, mask, video, background_image='demo/background.jpg'):
    """Swap every pixel outside ``mask`` for the matching background pixel.

    The background picture is read from ``background_image`` and resized to
    the width/height of ``image``. Wherever ``mask == 0`` (the pixel does not
    belong to the kept object) the first three channels of ``image`` are
    overwritten with the background; pixels where the mask is non-zero keep
    their original values.

    Args:
        image: H x W x C array with at least three channels (OpenCV frames
            are B, G, R in channels 0, 1, 2). It is modified in place.
        mask: H x W array; zero / False marks a background pixel.
        video: Truthy when processing video frames. For a single image
            (falsy) the resized background is additionally shown in a
            'Desired_Background' window for visualisation.
        background_image: Path of the replacement background picture.

    Returns:
        The same ``image`` array, after the in-place background swap.

    Raises:
        FileNotFoundError: If the background picture cannot be read.
        ValueError: If ``mask`` does not have the frame's H x W shape.
    """
    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface later as an opaque error inside cv2.resize.
    background_img = cv2.imread(background_image)
    if background_img is None:
        raise FileNotFoundError(
            "Could not read background image: {!r}".format(background_image)
        )

    mask = np.asarray(mask)
    if mask.shape != image.shape[:2]:
        raise ValueError(
            "mask shape {} does not match image height/width {}".format(
                mask.shape, image.shape[:2]
            )
        )

    # The background must have the frame's size; cv2 expects (width, height).
    size = (image.shape[1], image.shape[0])
    resized_background_img = cv2.resize(background_img, size, interpolation=cv2.INTER_AREA)

    # One broadcasted selection covers all three colour channels; the
    # compositing is the same for video frames and single images.
    is_background = (mask == 0)[:, :, np.newaxis]
    image[:, :, :3] = np.where(
        is_background,
        resized_background_img[:, :, :3],
        image[:, :, :3]
    )

    if not video:
        # Showing the desired background for visualisation (single image only)
        cv2.imshow('Desired_Background', resized_background_img)

    return image

In [8]:
def display_instances(image, boxes, masks, ids, names, scores, video=False):
    """Keep the largest detected person and replace everything else.

    Among all detections labelled 'person', the one with the largest bounding
    box area is treated as the main subject; its mask is handed to
    ``apply_mask`` so that every other pixel (including other people) is
    replaced by the background. When no person is detected the mask is empty,
    i.e. the whole frame is treated as background.

    Args:
        image: H x W x C frame, forwarded to ``apply_mask``.
        boxes: N x 4 array of (y1, x1, y2, x2) boxes; all-zero rows are skipped.
        masks: H x W x N array holding one mask per detection.
        ids: Length-N array of class ids used to index ``names``.
        names: Sequence mapping a class id to its label.
        scores: Detection scores; accepted for interface compatibility but
            not used here.
        video: Forwarded to ``apply_mask``.

    Returns:
        Whatever ``apply_mask`` returns for the frame and the selected mask.
    """
    # n_instances saves the amount of all objects
    n_instances = boxes.shape[0]

    if not n_instances:
        print('NO INSTANCES TO DISPLAY')
    else:
        assert boxes.shape[0] == masks.shape[-1] == ids.shape[0]

    # Start from an empty 2-D mask. Seeding it with the full H x W x N stack
    # would pass a 3-D array on to apply_mask whenever no person is found.
    mask = np.zeros(image.shape[:2], dtype=bool)
    # Bounding-box area of the largest person seen so far.
    max_area = 0

    for i in range(n_instances):
        # All-zero boxes carry no detection.
        if not np.any(boxes[i]):
            continue

        # Only people are candidates for the main subject.
        if names[ids[i]] != 'person':
            continue

        y1, x1, y2, x2 = boxes[i]
        square = (y2 - y1) * (x2 - x1)

        # Keep the largest person; smaller ones stay part of the background.
        if square > max_area:
            max_area = square
            mask = masks[:, :, i]

    # apply mask for the image
    image = apply_mask(image, mask, video)

    return image

In [9]:
# Run this cell for the image input otherwise skip it

# Input the original image name
original_image = 'demo/test_data/tree/File3.jpg'
image = cv2.imread(original_image)
# cv2.imread returns None (no exception) when the file is missing/unreadable.
if image is None:
    raise FileNotFoundError("Could not read input image: {!r}".format(original_image))
# Untouched copy for the side-by-side view; `image` itself is modified in
# place by the background replacement below.
test_image = image.copy()

results = model.detect([image], verbose=0)
r = results[0]
frame = display_instances(
    image, r['rois'], r['masks'], r['class_ids'], class_names, r['scores']
)

# Showing the original and final images for visualisation
cv2.imshow('Original_Image', test_image)
cv2.imshow('Output_Image', frame)

# Wait for a key: 's' saves the result, any key (e.g. Esc) closes the windows.
# Masking with 0xFF keeps only the low byte of the key code.
key = cv2.waitKey(0) & 0xFF
if key == ord('s'):
    cv2.imwrite('save_image_tree.jpg', frame)
# Always close the windows so none is left open after an unexpected key.
cv2.destroyAllWindows()

In [11]:
# Run this cell for the video input otherwise skip it

# Input video
input_video = "demo/dual_dance_trim.mp4"
# passing the video file into openCV Video capture object
capture = cv2.VideoCapture(input_video)
if not capture.isOpened():
    capture.release()
    raise IOError("Could not open input video: {!r}".format(input_video))

# Only relevant when capturing from a 1080p camera instead of a file.
#capture.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)
#capture.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)

# Recording Video
# NOTE(review): the output frame rate is fixed at 25 fps regardless of the
# input's rate -- confirm this matches the source video.
fps = 25.0
width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
fcc = cv2.VideoWriter_fourcc('D', 'I', 'V', 'X')
# Make sure the target directory exists before the writer is created.
os.makedirs("demo_results", exist_ok=True)
out = cv2.VideoWriter("demo_results/new_dual_dance_1.avi", fcc, fps, (width, height))

try:
    while True:
        ret, frame = capture.read()
        # ret is False once no further frame can be read.
        if not ret:
            break
        results = model.detect([frame], verbose=0)
        r = results[0]
        frame = display_instances(
            frame, r['rois'], r['masks'], r['class_ids'], class_names, r['scores'], video=True
        )
        # Showing each processed frame
        cv2.imshow('video', frame)

        # Recording Video
        out.write(frame)

        # Press 'q' to stop early.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the reader AND the writer (so the output file is finalised),
    # even if detection fails part-way through.
    capture.release()
    out.release()
    cv2.destroyAllWindows()