In [None]:
import os
import cv2
import json
import matplotlib.pyplot as plt
import tensorflow as tf
from PIL import Image
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from pycocotools.coco import COCO

#ds = load_dataset("chengyenhsieh/TAO-Amodal")

In [None]:
#Extract video frames
def extract_videoframes(videoPath):
    """
    Read every frame of a video into memory
    Args:
        videoPath: path of the video passed to cv2.VideoCapture
    Returns:
        frames: a list of frames in decoding order, empty when the
                video cannot be opened or yields no frame
    """
    capture = cv2.VideoCapture(videoPath)
    frames = []
    try:
        while capture.isOpened():
            ret, frame = capture.read()
            if not ret:
                #read() also reports failure once the stream is exhausted,
                #so this message is printed at the normal end of a video too
                print("Error: cannot receive frame")
                break
            frames.append(frame)
    finally:
        #Release the capture handle even when decoding raises
        capture.release()
    return frames
    

In [None]:
#Dataset locations: set the COCO_2017_ROOT environment variable to point at
#another copy of the dataset, otherwise the original local path is used
coco_root_dir = os.environ.get("COCO_2017_ROOT", "/Users/ivanng/dataset/coco-2017")
coco_images_dir = coco_root_dir + "/train/data"
coco_ann_file_caption = coco_root_dir + "/raw/captions_train2017.json"
coco_ann_file_detection = coco_root_dir + "/raw/instances_train2017.json"

#Preprocess images
def preprocessImage(image, targetSize=(128,128), paddingType='zero'):
    """
    Resize an image with its aspect ratio kept, pad it to target size and normalise it
    Args:
        image: input image array whose first two axes are (height, width);
               the 'zero' padding path writes into a 3-channel buffer
        targetSize: (height, width) of the output image
        paddingType: 'zero' (black border), 'mirror' (cv2.BORDER_REFLECT)
                     or 'replicate' (cv2.BORDER_REPLICATE)
    Returns:
        imagePadded: padded image divided by 255
        scale: scale factor applied to both image width and height
        x_offset, y_offset: left and top padding offsets
    Raises:
        ValueError: if paddingType is not one of the supported types
    """
    #Reject unknown padding types up front instead of failing later on an unbound variable
    if paddingType not in ('zero', 'mirror', 'replicate'):
        raise ValueError("Unsupported paddingType: %r (expected 'zero', 'mirror' or 'replicate')" % (paddingType,))
    targetHeight, targetWidth = targetSize[0], targetSize[1]
    height, width = image.shape[:2]
    scale = min(targetHeight/height, targetWidth/width)
    #int() truncation can give 0 for very elongated images, which cannot be resized to;
    #keep at least one pixel per side (scale is still returned unchanged)
    scaledHeight = max(1, int(height*scale))
    scaledWidth = max(1, int(width*scale))
    x_offset = (targetWidth-scaledWidth)//2
    y_offset = (targetHeight-scaledHeight)//2
    #Resize image (cv2 expects the size as (width, height))
    resizedImage = cv2.resize(image, (scaledWidth, scaledHeight))
    #Create padding for resized image
    if paddingType == 'zero':
        imagePadded = np.zeros((targetHeight, targetWidth, 3), dtype=np.uint8)
        imagePadded[y_offset:y_offset+scaledHeight, x_offset:x_offset+scaledWidth] = resizedImage
    else:
        borderType = cv2.BORDER_REFLECT if paddingType == 'mirror' else cv2.BORDER_REPLICATE
        #Bottom/right borders take whatever remains after the top/left offsets
        imagePadded = cv2.copyMakeBorder(resizedImage, y_offset, targetHeight-scaledHeight-y_offset, x_offset, targetWidth-scaledWidth-x_offset, borderType=borderType)
    #Normalise to [0, 1]
    return imagePadded/255, scale, x_offset, y_offset

#Preprocess bounding boxes
def preprocessBboxes(bboxes, scale, x_offset, y_offset):
    """
    Map bounding boxes onto the resized and padded image
    Args:
        bboxes: iterable of [x, y, width, height] boxes in original image coordinates
        scale: scale factor that was applied to the image
        x_offset, y_offset: padding offsets added on the left and on the top
    Returns:
        a list of [xStart, yStart, xEnd, yEnd] boxes, one per input box
    """
    def _toCorners(box):
        #Shift the top-left corner, then extend it by the scaled box size
        left, top, boxWidth, boxHeight = box
        x1 = left*scale+x_offset
        y1 = top*scale+y_offset
        return [x1, y1, x1+boxWidth*scale, y1+boxHeight*scale]

    return [_toCorners(box) for box in bboxes]

#Get entire preprocess data
def getImageAnnotations(coco_ann, coco_images_dir, targetSize=(128, 128), paddingType='zero'):
    """
    Preprocess MS COCO dataset
    Args:
        coco_ann: path of the json annotation file, or an already loaded COCO object
        coco_images_dir: image directory
        targetSize: size of standardised image
        paddingType: type of padding used
    Returns:
        data: a list with one (image, bboxes, labels) tuple per readable image;
              bboxes is an array of shape (n, 4) and labels an integer array of
              shape (n,), with n == 0 for images without annotations
    Note:
        every preprocessed image is kept in the returned list, so memory use
        grows with the number of images in the annotation file
    """
    #Reuse an existing COCO index when one is passed in, otherwise build it from the json path
    coco = coco_ann if isinstance(coco_ann, COCO) else COCO(coco_ann)
    data = []
    for imageID in coco.getImgIds():
        imageInfo = coco.loadImgs(imageID)[0]
        #Load image
        imagePath = os.path.join(coco_images_dir, imageInfo['file_name'])
        image = cv2.imread(imagePath)
        #Images that could not be loaded are skipped
        if image is None:
            continue
        #Invoke image preprocess
        preprocessedImage, scale, x_offset, y_offset = preprocessImage(image, targetSize, paddingType)
        #Extract annotations for corresponding image
        annotations = coco.loadAnns(coco.getAnnIds(imgIds=imageID))
        #Invoke bboxes preprocess once for all boxes of the image
        bboxes = preprocessBboxes([ann['bbox'] for ann in annotations], scale, x_offset, y_offset)
        labels = [ann['category_id'] for ann in annotations]
        #reshape keeps a (0, 4) box array, and dtype=int an integer label array, when there are no annotations
        data.append((preprocessedImage, np.array(bboxes).reshape(-1, 4), np.array(labels, dtype=int)))
    return data

#TEMPORARY: visualisation of bboxes and labels on images
def visualise(image, bboxes, labels):
    """
    Draw bounding boxes with their labels on an image and display it
    Args:
        image: image array that is multiplied by 255 and cast to uint8 before
               drawing, and converted from BGR to RGB for display
        bboxes: boxes as [xStart, yStart, xEnd, yEnd]; coordinates are truncated to int
        labels: one label per box, indexed in the same order as bboxes
    """
    image = (image*255).astype(np.uint8)
    for i, box in enumerate(bboxes):
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 1)
        #cv2.putText needs a str, while the category ids collected by getImageAnnotations are numeric
        cv2.putText(image, str(labels[i]), (x1, y1-2), cv2.FONT_HERSHEY_SIMPLEX, 0.2, (0, 255, 0), 1)
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.axis("off")
    plt.show()

In [None]:
#Real-time detection transformer framework
def RT_DETR(numClasses, inputShape=(128,128,3), numQueries=1):
    """
    RT-DETR style framework: CNN backbone, attention encoder, query selection,
    attention decoder and detection heads
    Args:
        numClasses: number of units of the softmax classification head
        inputShape: (height, width, channels) of the input image
        numQueries: number of object queries fed to the decoder, i.e. predictions per image
    Returns:
        model: tf.keras.Model with outputs [outputBbox, outputClass] of shape
               (batch, numQueries, 4) and (batch, numQueries, numClasses)
    Raises:
        ValueError: if numQueries is smaller than 1
    """
    if numQueries < 1:
        raise ValueError("numQueries must be at least 1")
    layers = tf.keras.layers
    inputs = tf.keras.Input(shape=inputShape)
    #CNN backbone
    x = layers.Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
    x = layers.MaxPooling2D((2,2))(x)
    x = layers.Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = layers.MaxPooling2D((2,2))(x)
    #Flatten feature map into a token sequence for encoder-decoder
    featureMap = layers.Reshape((-1, x.shape[-1]))(x)
    #Encoder (Keras spells the layer LayerNormalization)
    encoder = layers.MultiHeadAttention(num_heads=4, key_dim=64)(featureMap, featureMap)
    encoder = layers.LayerNormalization()(encoder)
    #Query selection: attention needs a (batch, queries, dim) tensor, so the dense
    #projection is reshaped into numQueries tokens of width 64
    query = layers.Dense(numQueries*64)(layers.Flatten()(featureMap))
    query = layers.Reshape((numQueries, 64))(query)
    #Decoder
    decoder = layers.MultiHeadAttention(num_heads=4, key_dim=64)(query, encoder)
    decoder = layers.LayerNormalization()(decoder)
    #Detection heads
    outputBbox = layers.Dense(4, activation='sigmoid')(decoder)
    outputClass = layers.Dense(numClasses, activation='softmax')(decoder)

    model = tf.keras.Model(inputs=inputs, outputs=[outputBbox, outputClass])
    return model
    

In [None]:
#Temporary execution
#getImageAnnotations takes (coco_ann, coco_images_dir, targetSize, paddingType) and
#builds the COCO index from the annotation path itself, so the detection
#annotation file is passed directly and the default target size and padding are used.
#NOTE(review): the previous call went through loadCocoAnnotations, which is not
#defined in this notebook, and passed categoryMap in the targetSize position
data = getImageAnnotations(coco_ann_file_detection, coco_images_dir)

In [None]:
#Show only the number of samples; echoing the whole list would print every image array
len(data)