## Project Structure
```
2DIP_exercise/
│-- data/             # Contains images & videos
│   │-- input/        # 1 image and 1 video for each phase respectively
│   │-- output/       # All output images/videos must be stored here
│-- notebooks/        # Jupyter Notebooks for each phase
│   │-- part1.ipynb   # Image processing & feature extraction
│   │-- part2.ipynb   # Optical flow, object detection and tracking 
│-- README.md         # Project instructions
```

In [1]:
# imports
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# define paths
base_path = os.path.abspath(os.path.join(os.getcwd(), ".."))
inputs = os.path.join(base_path, 'data','input')
outputs = os.path.join(base_path, 'data','output')

## Supplementary Code for Visualization

In [3]:
def display_images(image):
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    plt.figure(figsize=(8, 6))
    plt.imshow(image_rgb)
    plt.axis('off')
    plt.show()

In [4]:
def get_frames(video_path):
    # Re-open the video
    cap = cv2.VideoCapture(video_path)

    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        # Convert BGR to RGB for matplotlib
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append(frame_rgb)

    cap.release()

    return frames

In [8]:
import matplotlib.animation as animation
from IPython.display import HTML

import matplotlib as mpl
mpl.rcParams['animation.embed_limit'] = 100

def display_video(video_path):
    
    frames = get_frames(video_path)

    fig, ax = plt.subplots()
    plt.subplots_adjust(left=0, right=1, top=1, bottom=0)
    im = ax.imshow(np.zeros_like(frames[0]))
    ax.axis('off')

    def update(frame):
        im.set_array(frame)
        return [im]

    ani = animation.FuncAnimation(fig, update, frames=frames, interval=50, blit=True, repeat=False)

    plt.close(fig)

    return ani

## Task 1 : Analyze movement patterns in a video sequence. **(6)**

a) Compute dense optical flow for each frame in a video of a moving crowd. **(2)**

b) Visualize the movement patterns in 2 different ways. **(2+2)**

In [9]:
import cv2
import numpy as np

def optical_flow(video_path, output_path1, output_path2):
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error opening video file")
        return
    
    # Get frame dimensions and fps
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)

    # Define output video writers
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out_hsv = cv2.VideoWriter(output_path1, fourcc, fps, (width, height))
    out_draw = cv2.VideoWriter(output_path2, fourcc, fps, (width, height))

    ret, frame1 = cap.read()
    if not ret:
        print("Error reading first frame")
        return
    prev_gray = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)

    # Prepare HSV image for visualization
    hsv = np.zeros_like(frame1)
    hsv[...,1] = 255  # saturation set to max

    while True:
        ret, frame2 = cap.read()
        if not ret:
            break
        gray = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)

        # Calculate dense optical flow using Farneback method
        flow = cv2.calcOpticalFlowFarneback(prev_gray, gray, 
                                            None, 0.5, 3, 15, 3, 5, 1.2, 0)

        # Compute magnitude and angle of flow
        mag, ang = cv2.cartToPolar(flow[...,0], flow[...,1])

        # Set hue according to flow direction
        hsv[...,0] = ang * 180 / np.pi / 2
        # Set value according to flow magnitude (normalize to 0-255)
        hsv[...,2] = cv2.normalize(mag, None, 0, 255, cv2.NORM_MINMAX)

        # Convert HSV to BGR for visualization
        flow_bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)

        # Draw flow vectors on the original frame for output2
        step = 16
        vis = frame2.copy()
        for y in range(0, height, step):
            for x in range(0, width, step):
                fx, fy = flow[y, x]
                cv2.arrowedLine(vis, (x, y), (int(x+fx), int(y+fy)), (0, 255, 0), 1, tipLength=0.3)

        # Write frames to output videos
        out_hsv.write(flow_bgr)
        out_draw.write(vis)

        prev_gray = gray

    cap.release()
    out_hsv.release()
    out_draw.release()
    print("Optical flow processing completed and videos saved.")

# Example usage:
# optical_flow('input_video.mp4', 'output_flow_hsv.mp4', 'output_flow_arrows.mp4')


In [6]:
video_path = os.path.join(inputs, 'part2.mp4')  # Replace with your input video path
output_path1 = os.path.join(outputs, 'optical_flow_1.mp4')  # Output visualization video path
output_path2 = os.path.join(outputs, 'optical_flow_2.mp4')  # Output visualization video path

optical_flow(video_path, output_path1, output_path2)

Optical flow processing completed and videos saved.


In [20]:
ani = display_video(output_path1)
HTML(ani.to_jshtml())

In [19]:
ani = display_video(output_path2)
HTML(ani.to_jshtml())

## Task 2 : Identify and track a moving object in a video sequence. **(9)**

a) Detect an object using template matching. The output would be the first frame where it appears, with a bounding box around the detected object. **(2)**

In [11]:


def locate_object(video_path, template_path, output_path):
    # Read the template image and convert to grayscale
    template = cv2.imread(template_path, cv2.IMREAD_GRAYSCALE)
    w, h = template.shape[::-1]

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error opening video file")
        return None

    last_frame = None
    last_top_left = None
    last_bottom_right = None

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # Perform template matching
        res = cv2.matchTemplate(gray_frame, template, cv2.TM_CCOEFF_NORMED)
        min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(res)

        # Threshold for detection - adjust as needed
        threshold = 0.7
        if max_val >= threshold:
            last_top_left = max_loc
            last_bottom_right = (max_loc[0] + w, max_loc[1] + h)
            last_frame = frame.copy()

    cap.release()

    if last_frame is not None and last_top_left and last_bottom_right:
        # Draw rectangle around detected template
        cv2.rectangle(last_frame, last_top_left, last_bottom_right, (0, 255, 0), 2)
        # Save the result image
        cv2.imwrite(output_path, last_frame)
        return last_frame
    else:
        print("Template not found in any frame.")
        return None

# Usage example:
# image = locate_object(video_path, template_path, output_path)
# display_images(image)


In [18]:
video_path = os.path.join(inputs, 'part2.mp4')  # Replace with your input video path
template_path = os.path.join(inputs, 'template.png')  # Replace with your template image path
output_path = os.path.join(outputs, 'detected_object.jpg')  # Output video path

image = locate_object(video_path, template_path, output_path)
display_images(image)

b) Implement a Kalman filter to predict the object's position in subsequent frames. **(5)**

In [14]:
import cv2
import numpy as np

def track(video_path, template_path, output_path):
    # Load template
    template = cv2.imread(template_path, cv2.IMREAD_COLOR)
    if template is None:
        raise ValueError("Template image not found.")
    template_gray = cv2.cvtColor(template, cv2.COLOR_BGR2GRAY)
    w, h = template_gray.shape[::-1]

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError("Error opening video file.")

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    # Kalman filter setup
    kf = cv2.KalmanFilter(4, 2)
    kf.transitionMatrix = np.array([[1, 0, 1, 0],
                                    [0, 1, 0, 1],
                                    [0, 0, 1, 0],
                                    [0, 0, 0, 1]], np.float32)
    kf.measurementMatrix = np.array([[1, 0, 0, 0],
                                     [0, 1, 0, 0]], np.float32)
    kf.processNoiseCov = np.eye(4, dtype=np.float32) * 1e-1
    kf.measurementNoiseCov = np.eye(2, dtype=np.float32) * 1e-1

    detected = False
    threshold = 0.6
    redetect_threshold = 0.6
    base_search_margin = 150
    max_search_margin = 400  # maximum margin for adaptive search window
    missed_count = 0
    max_missed = 30

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        # Adaptive search margin during tracking (increases with missed count)
        if missed_count > 5:
            search_margin = min(base_search_margin + 20 * (missed_count - 5), max_search_margin)
        else:
            search_margin = base_search_margin

        # If lost tracking or missed too many frames, do multi-scale re-detection
        if not detected or missed_count >= max_missed:
            scales = [1.0, 0.9, 1.1, 0.8, 1.2]  # scales to try for multi-scale template matching
            best_val = -1
            best_loc = None
            best_scale = 1.0
            best_w, best_h = w, h

            for scale in scales:
                scaled_template = cv2.resize(template_gray, (int(w * scale), int(h * scale)))
                if scaled_template.shape[0] > frame_gray.shape[0] or scaled_template.shape[1] > frame_gray.shape[1]:
                    continue

                result = cv2.matchTemplate(frame_gray, scaled_template, cv2.TM_CCOEFF_NORMED)
                min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)

                if max_val > best_val:
                    best_val = max_val
                    best_loc = max_loc
                    best_scale = scale
                    best_w, best_h = scaled_template.shape[::-1]

            print(f"Redetection max_val: {best_val:.3f}")

            if best_val >= redetect_threshold:
                x, y = best_loc
                # Initialize Kalman state with center of detected bounding box
                kf.statePre = np.array([[np.float32(x + best_w / 2)],
                                        [np.float32(y + best_h / 2)],
                                        [0],
                                        [0]], dtype=np.float32)
                kf.statePost = kf.statePre.copy()
                detected = True
                missed_count = 0
                # Update template to detected scale and region
                template_gray = frame_gray[y:y+best_h, x:x+best_w].copy()
                w, h = best_w, best_h
                cv2.rectangle(frame, (x, y), (x + best_w, y + best_h), (255, 0, 0), 2)
                print("Redetection at:", (x, y), "Scale:", best_scale)
                out.write(frame)
                continue
            else:
                print("Redetection failed.")
                out.write(frame)
                continue

        # Prediction step
        prediction = kf.predict()
        pred_x, pred_y = int(prediction[0, 0]), int(prediction[1, 0])
        pred_x = np.clip(pred_x, 0, width - w)
        pred_y = np.clip(pred_y, 0, height - h)

        search_top = max(0, pred_y - search_margin)
        search_bottom = min(height, pred_y + search_margin + h)
        search_left = max(0, pred_x - search_margin)
        search_right = min(width, pred_x + search_margin + w)
        search_roi = frame_gray[search_top:search_bottom, search_left:search_right]

        measurement = None
        if search_roi.shape[0] >= h and search_roi.shape[1] >= w:
            result = cv2.matchTemplate(search_roi, template_gray, cv2.TM_CCOEFF_NORMED)
            min_val, max_val, min_loc, max_loc = cv2.minMaxLoc(result)

            print(f"Tracking max_val: {max_val:.3f}")

            if max_val >= threshold:
                meas_x = search_left + max_loc[0] + w / 2
                meas_y = search_top + max_loc[1] + h / 2
                measurement = np.array([[np.float32(meas_x)], [np.float32(meas_y)]])
                matched_region = frame_gray[search_top + max_loc[1]:search_top + max_loc[1] + h,
                                            search_left + max_loc[0]:search_left + max_loc[0] + w]
                # Update template only if confidence is very high to avoid drift
                if matched_region.shape == template_gray.shape and max_val >= 0.90:
                    template_gray = matched_region.copy()

        if measurement is not None:
            kf.correct(measurement)
            missed_count = 0
            tracked_x = int(measurement[0, 0] - w / 2)
            tracked_y = int(measurement[1, 0] - h / 2)
            print(f"Predicted: ({pred_x}, {pred_y}), Measured: ({tracked_x}, {tracked_y})")
        else:
            missed_count += 1
            tracked_x = pred_x
            tracked_y = pred_y
            print(f"Predicted: ({pred_x}, {pred_y}), Measured: None")

        if missed_count >= max_missed:
            print("Tracking lost. Reinitializing detection...")
            detected = False
            continue

        cv2.rectangle(frame, (tracked_x, tracked_y), (tracked_x + w, tracked_y + h), (0, 255, 0), 2)
        out.write(frame)

    cap.release()
    out.release()


In [15]:
video_path = os.path.join(inputs, 'part2.mp4')  # Replace with your input video path
template_path = os.path.join(inputs, 'template.png')  # Replace with your template image path
output_path = os.path.join(outputs, 'tracked_object.mp4')  # Output video path

track(video_path, template_path, output_path)

Redetection max_val: 0.797
Redetection at: (1318, 0) Scale: 1.0
Tracking max_val: 0.978
Predicted: (1465, 112), Measured: (1315, 6)
Tracking max_val: 0.985
Predicted: (1463, 115), Measured: (1313, 10)
Tracking max_val: 0.985
Predicted: (1460, 122), Measured: (1310, 15)
Tracking max_val: 0.989
Predicted: (1455, 130), Measured: (1307, 20)
Tracking max_val: 0.986
Predicted: (1451, 136), Measured: (1304, 26)
Tracking max_val: 0.980
Predicted: (1448, 143), Measured: (1300, 32)
Tracking max_val: 0.989
Predicted: (1443, 149), Measured: (1297, 38)
Tracking max_val: 0.975
Predicted: (1440, 155), Measured: (1294, 45)
Tracking max_val: 0.973
Predicted: (1437, 163), Measured: (1291, 52)
Tracking max_val: 0.967
Predicted: (1434, 170), Measured: (1288, 60)
Tracking max_val: 0.981
Predicted: (1431, 179), Measured: (1285, 66)
Tracking max_val: 0.893
Predicted: (1428, 185), Measured: (1281, 72)
Tracking max_val: 0.951
Predicted: (1424, 190), Measured: (1278, 80)
Tracking max_val: 0.941
Predicted: (1421

In [17]:
ani = display_video(output_path)
HTML(ani.to_jshtml())

c) Compare Bayesian filtering and Kalman filtering (theoretically). **(2)**

In [None]:
#TODO c):