In [1]:
from data_utils import extract_rectangles_from_xml
from evaluation import mAP
import cv2
import numpy as np
import matplotlib.pyplot as plt
import tqdm
import os

In [2]:
annotation = extract_rectangles_from_xml('data/ai_challenge_s03_c010-full_annotation.xml')
parked_cars = annotation[0]

In [64]:
cap = cv2.VideoCapture('data/AICity_data/train/S03/c010/vdo.avi')
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = int(cap.get(cv2.CAP_PROP_FPS))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
print('Frame count:', frame_count, 'FPS:', fps)

split_frame = frame_count // 4

# Choose color space
color_space = "lab"
if color_space == "rgb":
    num_gaussians = 3
    color_transform = cv2.COLOR_BGR2RGB
elif color_space == "lab":
    num_gaussians = 2
    color_transform = cv2.COLOR_BGR2LAB
if color_space == "grayscale":
    num_gaussians = 1
    color_transform = cv2.COLOR_BGR2GRAY


# Initialize cumulative sum and sum of squares
cum_sum = np.zeros((height, width, num_gaussians), dtype=np.float64)
cum_sum_sq = np.zeros((height, width, num_gaussians), dtype=np.float64)

for i in tqdm.tqdm(range(split_frame)):  # Process 25% of the frames
    ret, frame = cap.read()
    if not ret:
        break
    color_frame = cv2.cvtColor(frame, color_transform).astype(np.float64)
    if color_space == "lab":
        color_frame = color_frame[:, :, 1:]
    elif color_space == "grayscale":
        color_frame = color_frame[:, :, None]
    cum_sum += color_frame
    cum_sum_sq += color_frame ** 2

cap.release()

# Calculate mean and variance
mean = cum_sum / split_frame
variance = (cum_sum_sq / split_frame) - (mean ** 2)

Frame count: 2141 FPS: 10


  0%|          | 0/535 [00:00<?, ?it/s]

100%|██████████| 535/535 [00:30<00:00, 17.46it/s]


In [65]:
def shadow_detection(image):
    # Future work, implement the shadow detection with brightness and 
    # color detection
    return 0, 0

def remove_shadow_gabor(image):
    # Gabor filter parameters
    num = 8  # Number of different orientations
    vects = 8  # Number of different wavelengths (vector sizes)

    gabor_features = np.zeros((image.shape[0], image.shape[1], num * vects), dtype=np.double)

    for i in range(num):
        theta = i / num * np.pi
        for j in range(vects):
            lamda = int(image.shape[0] / (2 ** j))
            g_kernel = cv2.getGaborKernel((lamda, lamda), sigma=4.0, theta=theta, lambd=lamda, gamma=0.5)
            filtered_img = cv2.filter2D(image, cv2.CV_8UC3, g_kernel)
            gabor_features[:, :, i * vects + j] = filtered_img

    gabor_features_binary = (gabor_features.mean(axis=2) > 2.5*gabor_features.mean(axis=2).mean()).astype(np.uint8)

    
    # Find the columns that have non-zero values
    non_zero_columns = np.where(gabor_features_binary.sum(axis=0) > 0)[0]

    if len(non_zero_columns) == 0:
        return 0, image.shape[1]
    # The minimum and maximum x values with non-zero values (bbox horizontal)
    min_x = non_zero_columns.min()
    max_x = non_zero_columns.max()

    return min_x, max_x

# Function to calculate if rectangles a and b are close
def are_close(a, b, proximity_threshold):
    left_a, top_a, right_a, bottom_a = a
    left_b, top_b, right_b, bottom_b = b

    # Check if rectangles are close based on the threshold
    horizontal_close = (left_b <= right_a + proximity_threshold and right_b >= left_a - proximity_threshold)
    vertical_close = (top_b <= bottom_a + proximity_threshold and bottom_b >= top_a - proximity_threshold)

    return horizontal_close and vertical_close

# Function to merge two rectangles
def merge_rects(a, b):
    left_a, top_a, right_a, bottom_a = a
    left_b, top_b, right_b, bottom_b = b
    return (min(left_a, left_b), min(top_a, top_b), max(right_a, right_b), max(bottom_a, bottom_b))

def merge_close_rectangles(rectangles, proximity_threshold):
    # Convert rectangles to a format that includes the bottom-right corner for easier comparison
    rects_with_br = [(x, y, x+w, y+h) for x, y, w, h in rectangles]

    merged = True
    while merged:
        merged = False
        new_rects = []
        while rects_with_br:
            current = rects_with_br.pop(0)
            for i, other in enumerate(rects_with_br):
                if are_close(current, other, proximity_threshold):
                    new_rect = merge_rects(current, other)
                    rects_with_br[i] = new_rect  # Replace the "other" rect with the merged one
                    current = new_rect  # Update current to be the merged rect
                    merged = True
                    break
            else:
                new_rects.append(current)  # Add current rect if it wasn't merged
        rects_with_br = new_rects  # Update list with merged rects

    # Convert back to original format
    merged_rectangles = [(left, top, right-left, bottom-top) for left, top, right, bottom in rects_with_br]
    return merged_rectangles

In [66]:
gt_bbox = [
    [list(np.array(r).astype(int)) for r in rect if r not in parked_cars]
    for rect in list(annotation.values())[split_frame:]
]

In [94]:
def process_frame_adaptative(frame, mean, variance, alpha, rho, color_space="lab"):
    """Process a single frame to extract foreground bounding boxes."""
    assert color_space in ["lab", "rgb", "grayscale"], 'Choose colorspace in ["lab", "rgb", "grayscale"]'
    if color_space == "lab":
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2LAB)[:, :, 1:]
        alphas, rhos = np.full(2, alpha), np.full(2, rho)
    elif color_space == "rgb":
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        alphas, rhos = np.full(3, alpha), np.full(3, rho)
    elif color_space == "grayscale":
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)[:,:,None]
        alphas, rhos = np.full(1, alpha), np.full(1, rho)

    abs_diff = np.abs(frame - mean)

    intensity = abs_diff.sum(axis=(0,1))
    for i, intensity in enumerate(intensity):
        if intensity <= 25000000:
            alphas[i] *= 0.5
        elif intensity >= 55000000:
            alphas[i] *= 1.5

    foreground_mask = abs_diff >= alphas * (np.sqrt(variance) + 2)
    foreground_binary = np.where(foreground_mask, 255, 0).astype(np.uint8)

    background_mask = ~foreground_mask  # Inverting the foreground mask to get the background
    for c in range(frame.shape[2]):
        mean[background_mask[:,:,c], c] = rhos[c] * frame[background_mask[:,:,c], c] + (1 - rhos[c]) * mean[background_mask[:,:,c], c]
        variance[background_mask[:,:,c], c] = rhos[c] * ((frame[background_mask[:,:,c], c] - mean[background_mask[:,:,c], c]) ** 2) + (1 - rhos[c]) * variance[background_mask[:,:,c], c]
    
    foreground_clean = np.zeros(frame.shape)
    for c in range(frame.shape[2]):
        foreground_clean[:, :, c] = cv2.morphologyEx(foreground_binary[:, :, c], cv2.MORPH_CLOSE, np.ones((5, 5), np.uint8))
        foreground_clean[:, :, c] = cv2.morphologyEx(foreground_clean[:, :, c], cv2.MORPH_OPEN, np.ones((7,7), np.uint8))

    if color_space == "lab":
        foreground_clean = cv2.bitwise_and(foreground_clean[:,:,0], foreground_clean[:,:,1]).astype(np.uint8)
    elif color_space == "rgb":
        foreground_clean_aux = cv2.bitwise_and(foreground_clean[:,:,0], foreground_clean[:,:,1])
        foreground_clean = cv2.bitwise_and(foreground_clean_aux, foreground_clean[:,:,2]).astype(np.uint8)
    elif color_space == "grayscale":
        foreground_clean = foreground_clean[:,:,0].astype(np.uint8)

    contours, _ = cv2.findContours(foreground_clean, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    rectangles_merged = merge_close_rectangles([cv2.boundingRect(contour) for contour in contours], 20)
    
    rectangles_output = []
    for i, (x, y, w, h) in enumerate(rectangles_merged):
        if w < 80 or h < 80:
            continue
        
        if color_space == "grayscale":
            new_xmin = remove_shadow_gabor(frame[y:y+h, x:x+w])[0]
            rectangles_output.append([x + new_xmin, y, x + w, y + h])
            foreground_clean[y:y+h, x:x+new_xmin] = 0

        elif color_space == "lab":
            rectangles_output.append([x, y, x + w, y + h])

        elif color_space == "rgb":
            new_xmin = shadow_detection(frame[y:y+h, x:x+w])[0]
            rectangles_output.append([x + new_xmin, y, x + w, y + h])
            foreground_clean[y:y+h, x:x+new_xmin] = 0

    return rectangles_output, foreground_binary, foreground_clean, mean, variance

def process_frame(frame, mean, variance, alpha):
    """Process a single frame to extract foreground bounding boxes."""
    gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    abs_diff = np.abs(gray_frame - mean)
    
    intensity = abs_diff.sum()
    if intensity <= 25000000:
        alpha *= 0.5
    elif intensity >= 55000000:
        alpha *= 1.5

    foreground_mask = abs_diff >= alpha * (np.sqrt(variance) + 2)
    foreground_binary = np.where(foreground_mask, 255, 0).astype(np.uint8)
    
    foreground_clean = cv2.morphologyEx(foreground_binary, cv2.MORPH_CLOSE, np.ones((5, 5), np.uint8))
    foreground_clean = cv2.morphologyEx(foreground_clean, cv2.MORPH_OPEN, np.ones((7,7), np.uint8))

    contours, _ = cv2.findContours(foreground_clean, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    rectangles_merged = merge_close_rectangles([cv2.boundingRect(contour) for contour in contours], 20)
    
    rectangles_output = []
    for i, (x, y, w, h) in enumerate(rectangles_merged):

        if w < 80 or h < 80:
            continue

        new_xmin = remove_shadow_gabor(gray_frame[y:y+h, x:x+w])[0]
        rectangles_output.append([x + new_xmin, y, x + w, y + h])

        foreground_clean[y:y+h, x:x+new_xmin] = 0

    return rectangles_output, foreground_binary, foreground_clean

def process_video(video_path, split_frame, frame_count, mean, variance, gt_bbox, alpha, rho=None, adaptative=False, color_space="lab"):
    """Process the video to overlay predicted and ground truth bounding boxes."""
    cap = cv2.VideoCapture(video_path)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    mAPs = []
    
    for n in tqdm.tqdm(range(split_frame, (frame_count//100)*100, 100)):
        print(f'Processing frames {n-split_frame} to {min(n+100, frame_count)-split_frame}...')
        out = cv2.VideoWriter(f'media/{color_space.upper()}_output_{n-split_frame}.mp4', cv2.VideoWriter_fourcc(*'mp4v'), 10, (frame_width, frame_height))
        out3 = cv2.VideoWriter(f'media/{color_space.upper()}_output_{n-split_frame}_clean.mp4', cv2.VideoWriter_fourcc(*'mp4v'), 10, (frame_width, frame_height), isColor=False)
        cap.set(cv2.CAP_PROP_POS_FRAMES, n)
        pred_bbox = []

        for _ in range(n, min(n+100, frame_count)):
            ret, frame = cap.read()
            if not ret:
                break

            if adaptative:
                bbox, binary, clean, mean, variance = process_frame_adaptative(frame, mean, variance, alpha, rho, color_space)
            else:
                bbox, binary, clean = process_frame(frame, mean, variance, alpha)
            
            pred_bbox.append(bbox)
            out3.write(clean)

        cap.set(cv2.CAP_PROP_POS_FRAMES, n)
        for i, _ in enumerate(pred_bbox):
            ret, frame = cap.read()
            if not ret:
                break

            for rect in pred_bbox[i]:
                cv2.rectangle(frame, (rect[0], rect[1]), (rect[2], rect[3]), (0, 0, 255), 2)
            # Assuming gt_bbox is defined elsewhere and accessible here
            for rect in gt_bbox[n + i - split_frame]:
                cv2.rectangle(frame, (rect[0], rect[1]), (rect[2], rect[3]), (0, 255, 0), 2)

            out.write(frame)
        m = mAP(gt_bbox[(n - split_frame):(min(n+100, frame_count)- split_frame)], pred_bbox)
        mAPs.append(m)
        
        out.release()
        #out2.release()
        out3.release()
    cap.release()
    return mAPs

mean_mAPs = []
for rho in [0.005]:#, 0.1, 0.2, 0.3, 0.4]:
    for alpha in [3]:#, 3, 4, 5]:
        print(f'Alpha: {alpha}')
        print(f'Rho: {rho}')
        mean_mAPs.append(np.mean(process_video('data/AICity_data/train/S03/c010/vdo.avi', split_frame, frame_count, mean, variance, gt_bbox, alpha, rho, adaptative=True, color_space=color_space)))

Alpha: 3
Rho: 0.005


  0%|          | 0/16 [00:00<?, ?it/s]

Processing frames 0 to 100...


  6%|▋         | 1/16 [01:00<15:13, 60.89s/it]

Processing frames 100 to 200...


 12%|█▎        | 2/16 [01:58<13:42, 58.77s/it]

Processing frames 200 to 300...


 19%|█▉        | 3/16 [02:56<12:40, 58.49s/it]

Processing frames 300 to 400...


 25%|██▌       | 4/16 [03:52<11:33, 57.76s/it]

Processing frames 400 to 500...


 31%|███▏      | 5/16 [04:51<10:38, 58.08s/it]

Processing frames 500 to 600...


 38%|███▊      | 6/16 [05:57<10:06, 60.65s/it]

Processing frames 600 to 700...


 44%|████▍     | 7/16 [07:07<09:33, 63.68s/it]

Processing frames 700 to 800...


 50%|█████     | 8/16 [08:14<08:38, 64.86s/it]

Processing frames 800 to 900...


 56%|█████▋    | 9/16 [09:20<07:36, 65.19s/it]

Processing frames 900 to 1000...


 62%|██████▎   | 10/16 [10:24<06:28, 64.77s/it]

Processing frames 1000 to 1100...


 69%|██████▉   | 11/16 [11:29<05:24, 64.94s/it]

Processing frames 1100 to 1200...


 75%|███████▌  | 12/16 [12:39<04:25, 66.32s/it]

Processing frames 1200 to 1300...


 81%|████████▏ | 13/16 [13:49<03:22, 67.57s/it]

Processing frames 1300 to 1400...


 88%|████████▊ | 14/16 [14:57<02:15, 67.70s/it]

Processing frames 1400 to 1500...


 94%|█████████▍| 15/16 [16:04<01:07, 67.45s/it]

Processing frames 1500 to 1600...


100%|██████████| 16/16 [17:12<00:00, 64.50s/it]


In [95]:
mean_mAPs

[0.2601229030526333]