In [1]:
# ================================================
#  FEATURE EXTRACTION PIPELINE (SIFT + ORB)
# ================================================
# This cell processes all six scenes and stores keypoint visualizations.
# Outputs are saved under: outputs/keypoints/<scene_name>/
# Each image will have both SIFT and ORB versions (e.g., sift_1.png, orb_1.png)
# ================================================

import os
import time
from modules.feature_extraction import process_scene

# Root dataset and output directories
DATA_ROOT = "data/panorama_dataset"
OUTPUT_ROOT = "outputs/keypoints"

# List of all scene folders (you can update this if more exist)
scenes = ["v_bird", "v_boat", "v_circus", "v_graffiti", "v_soldiers", "v_weapons"]

# Methods to use for feature extraction
methods = ("SIFT", "ORB")

# Create output folder if not present
os.makedirs(OUTPUT_ROOT, exist_ok=True)

# Scenes whose processing raised. They are reported after the loop so that a
# swallowed error is not hidden behind an unconditional "completed" message.
failed_scenes = []

# Loop through each scene and process all images
for scene in scenes:
    scene_path = os.path.join(DATA_ROOT, scene)
    output_dir = os.path.join(OUTPUT_ROOT, scene)
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n Processing scene: {scene}")
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring durations; time.time() can jump if the system clock changes.
    start_time = time.perf_counter()

    try:
        features_dict = process_scene(scene_path, output_dir, methods=methods)
        elapsed = time.perf_counter() - start_time
        # NOTE(review): assumes every value in features_dict is a sequence whose
        # first element is the keypoint collection — confirm against process_scene.
        total_keypoints = sum(len(v[0]) for v in features_dict.values())
        print(f"Completed {scene} in {elapsed:.2f} seconds "
              f"({total_keypoints} total keypoints).")
    except Exception as e:
        # Best effort: one failing scene must not stop the remaining scenes.
        failed_scenes.append(scene)
        print(f"Error processing {scene}: {e}")

if failed_scenes:
    print(f"\nFeature extraction finished with errors in: {', '.join(failed_scenes)}")
else:
    print("\nFeature extraction completed for all scenes.")



 Processing scene: v_bird
Completed v_bird in 3.93 seconds (31593 total keypoints).

 Processing scene: v_boat
Completed v_boat in 3.10 seconds (53041 total keypoints).

 Processing scene: v_circus
Completed v_circus in 4.27 seconds (34237 total keypoints).

 Processing scene: v_graffiti
Completed v_graffiti in 3.25 seconds (33625 total keypoints).

 Processing scene: v_soldiers
Completed v_soldiers in 4.12 seconds (15594 total keypoints).

 Processing scene: v_weapons
Completed v_weapons in 4.10 seconds (35916 total keypoints).

Feature extraction completed for all scenes.


In [2]:
# =====================================================
# FEATURE SUMMARY TABLE
# =====================================================
# This cell creates a summary table of the number of
# detected keypoints for each image and method.
# Output: A pandas DataFrame printed and saved as CSV.
# =====================================================

import os
import pandas as pd
from modules.feature_extraction import extract_features

DATA_ROOT = "data/panorama_dataset"
OUTPUT_METRICS = "outputs/metrics"
os.makedirs(OUTPUT_METRICS, exist_ok=True)

scenes = ["v_bird", "v_boat", "v_circus", "v_graffiti", "v_soldiers", "v_weapons"]
methods = ("SIFT", "ORB")

# One record per (scene, image, method) combination.
summary_data = []

for scene in scenes:
    scene_path = os.path.join(DATA_ROOT, scene)

    # Only image files are summarised, visited in sorted filename order.
    image_names = [
        name for name in sorted(os.listdir(scene_path))
        if name.lower().endswith(('.png', '.jpg', '.jpeg'))
    ]

    for img_name in image_names:
        img_path = os.path.join(scene_path, img_name)

        for method in methods:
            try:
                keypoints, descriptors, _ = extract_features(img_path, method)
                n_keypoints = len(keypoints)
                # descriptors may be None; the dimension is then reported as 0.
                descriptor_dim = 0 if descriptors is None else descriptors.shape[1]
            except Exception as e:
                # A failed extraction is still recorded, with zeroed counts.
                n_keypoints, descriptor_dim = 0, 0
                error_message = f"Error processing {scene}/{img_name} with {method}: {e}"
            else:
                error_message = None

            summary_data.append({
                "Scene": scene,
                "Image": img_name,
                "Method": method,
                "Keypoints": n_keypoints,
                "Descriptor_Dim": descriptor_dim
            })

            if error_message is not None:
                print(error_message)

# Build the table in a single step from the collected records
df_summary = pd.DataFrame(summary_data)

# Show a preview and persist the full table
print("Feature Extraction Summary:")
display(df_summary.head(12))  # first rows only, for inspection

csv_path = os.path.join(OUTPUT_METRICS, "feature_keypoints_summary.csv")
df_summary.to_csv(csv_path, index=False)
print(f"\nSummary table saved to: {csv_path}")


Feature Extraction Summary:


Unnamed: 0,Scene,Image,Method,Keypoints,Descriptor_Dim
0,v_bird,1.png,SIFT,3361,128
1,v_bird,1.png,ORB,2000,32
2,v_bird,2.png,SIFT,3200,128
3,v_bird,2.png,ORB,2000,32
4,v_bird,3.png,SIFT,2703,128
5,v_bird,3.png,ORB,2000,32
6,v_bird,4.png,SIFT,3708,128
7,v_bird,4.png,ORB,2000,32
8,v_bird,5.png,SIFT,4277,128
9,v_bird,5.png,ORB,2000,32



Summary table saved to: outputs/metrics\feature_keypoints_summary.csv


Feature Extraction

In this stage, interest points were detected and local descriptors were computed using two well-known methods: Scale-Invariant Feature Transform (SIFT) and Oriented FAST and Rotated BRIEF (ORB). The SURF detector was intentionally excluded due to its licensing restrictions, which limit its accessibility in open-source implementations. Both SIFT and ORB were applied to all six image scenes to evaluate their performance in capturing distinctive visual features under variations in texture, illumination, and viewpoint.

SIFT is a gradient-based feature detector and descriptor that identifies keypoints by searching for local extrema in a Difference-of-Gaussians (DoG) pyramid across multiple scales. Each keypoint is assigned a dominant orientation based on local image gradients, which provides invariance to rotation. Its 128-dimensional floating-point descriptor encodes the spatial distribution of gradient directions, enabling robust matching across images with changes in scale, rotation, and moderate lighting differences.

ORB, in contrast, is a computationally efficient alternative that combines the FAST corner detector with the BRIEF binary descriptor. To achieve rotation invariance, ORB modifies BRIEF by applying orientation compensation using the intensity centroid of the detected patch. ORB descriptors are compact (32 bytes, i.e., a 256-bit binary string) and well-suited for real-time or large-scale applications, though they are typically less discriminative than SIFT in scenes with complex textures. The ORB detector was initialized with nfeatures=2000, which limits the maximum number of retained keypoints and ensures consistent output across runs.

The following table presents the average number of keypoints detected per scene by both methods:

Scene	ORB	SIFT
v_bird	2000	3266
v_boat	2000	6840
v_circus	2000	3706
v_graffiti	2000	3604
v_soldiers	1682	918
v_weapons	2000	3986

On average, SIFT detected approximately 3700 keypoints per image, whereas ORB maintained around 2000 due to its fixed parameter limit. SIFT’s multi-scale detection strategy produced higher variation across scenes, particularly in those rich with texture and structure (e.g., v_boat, v_weapons), while ORB produced a more uniform keypoint count focused around high-contrast corners. Visual inspection of the extracted keypoints confirmed that both detectors concentrated on edges, patterns, and textured surfaces, while smooth regions contained few or no features.

For the subsequent stages of feature matching and homography estimation, SIFT was selected as the primary detector-descriptor pair. Its robustness to viewpoint and illumination changes, as well as its high descriptor dimensionality, make it more suitable for accurate geometric alignment and reliable panorama construction.

Reproducibility was ensured by fixing detector parameters (nfeatures=2000 for ORB, default configuration for SIFT), maintaining a deterministic workflow, and executing the same processing pipeline for each dataset. These choices guarantee that identical keypoints and descriptors will be produced across independent runs on the same hardware.

In [3]:
# =====================================================
# FEATURE MATCHING PIPELINE FOR ALL SCENES (SIFT)
# =====================================================
# This cell performs SIFT-based feature matching between
# reference (1.png) and all other images (2.png, 3.png, ...)
# for each scene folder. It applies k-NN matching, Lowe's ratio
# test, and optional cross-checking, saving visualization images.
# =====================================================

import os
from modules.feature_extraction import extract_features
from modules.feature_matching import match_features, visualize_matches

DATA_ROOT = "data/panorama_dataset"
OUTPUT_ROOT = "outputs/matches"

scenes = ["v_bird", "v_boat", "v_circus", "v_graffiti", "v_soldiers", "v_weapons"]

# Matching parameters (for reproducibility)
RATIO_THRESHOLD = 0.75     # Lowe ratio test threshold
CROSS_CHECK = True          # enable symmetric verification

# Every other image of a scene is matched against this reference image.
# Keeping the name in one place avoids the filter, the path and the log
# message drifting apart.
REF_NAME = "1.png"
REF_STEM = os.path.splitext(REF_NAME)[0]

# Ensure output directory exists
os.makedirs(OUTPUT_ROOT, exist_ok=True)

for scene in scenes:
    scene_path = os.path.join(DATA_ROOT, scene)
    output_dir = os.path.join(OUTPUT_ROOT, scene)
    os.makedirs(output_dir, exist_ok=True)

    # Reference image: features are extracted once per scene and reused.
    ref_img = os.path.join(scene_path, REF_NAME)
    kp1, desc1, _ = extract_features(ref_img, method="SIFT")

    # Process all target images (2.png, 3.png, ...)
    for img_name in sorted(os.listdir(scene_path)):
        if not img_name.lower().endswith(".png") or img_name == REF_NAME:
            continue

        tgt_img = os.path.join(scene_path, img_name)
        kp2, desc2, _ = extract_features(tgt_img, method="SIFT")

        # Perform k-NN matching + ratio test
        good_matches, all_matches = match_features(
            desc1, desc2, method="SIFT", ratio_thresh=RATIO_THRESHOLD, cross_check=CROSS_CHECK
        )

        # Save visualizations (before/after filtering).
        # splitext() removes the extension whatever its length, unlike a
        # fixed [:-4] slice.
        pair_tag = f"{REF_STEM}_{os.path.splitext(img_name)[0]}"
        before_path = os.path.join(output_dir, f"before_filter_{pair_tag}.png")
        after_path = os.path.join(output_dir, f"after_filter_{pair_tag}.png")

        # all_matches is iterated as groups of candidate matches; the groups
        # are flattened so every candidate is drawn in the "before" image.
        unfiltered_matches = [m for pair in all_matches for m in pair]
        visualize_matches(ref_img, tgt_img, kp1, kp2,
                          unfiltered_matches,
                          save_path=before_path,
                          title=f"{scene} - Matches Before Filtering")

        visualize_matches(ref_img, tgt_img, kp1, kp2,
                          good_matches,
                          save_path=after_path,
                          title=f"{scene} - Matches After Filtering")

        # "Raw" counts the candidate groups in all_matches, not the
        # flattened candidates drawn above.
        print(f"{scene}: {REF_NAME} vs {img_name} | Raw: {len(all_matches)} | Filtered: {len(good_matches)}")

print("\nFeature matching completed for all scenes.")


v_bird: 1.png vs 2.png | Raw: 3361 | Filtered: 945
v_bird: 1.png vs 3.png | Raw: 3361 | Filtered: 500
v_bird: 1.png vs 4.png | Raw: 3361 | Filtered: 442
v_bird: 1.png vs 5.png | Raw: 3361 | Filtered: 240
v_bird: 1.png vs 6.png | Raw: 3361 | Filtered: 24
v_boat: 1.png vs 2.png | Raw: 8849 | Filtered: 2152
v_boat: 1.png vs 3.png | Raw: 8849 | Filtered: 1595
v_boat: 1.png vs 4.png | Raw: 8849 | Filtered: 915
v_boat: 1.png vs 5.png | Raw: 8849 | Filtered: 562
v_boat: 1.png vs 6.png | Raw: 8849 | Filtered: 401
v_circus: 1.png vs 2.png | Raw: 4503 | Filtered: 1666
v_circus: 1.png vs 3.png | Raw: 4503 | Filtered: 506
v_circus: 1.png vs 4.png | Raw: 4503 | Filtered: 160
v_circus: 1.png vs 5.png | Raw: 4503 | Filtered: 1115
v_circus: 1.png vs 6.png | Raw: 4503 | Filtered: 363
v_graffiti: 1.png vs 2.png | Raw: 2674 | Filtered: 977
v_graffiti: 1.png vs 3.png | Raw: 2674 | Filtered: 379
v_graffiti: 1.png vs 4.png | Raw: 2674 | Filtered: 52
v_graffiti: 1.png vs 5.png | Raw: 2674 | Filtered: 26
v_gr

Feature Matching

After obtaining SIFT descriptors for each image, a feature correspondence process was implemented using a k-nearest neighbor (k-NN) search strategy with $k = 2$. This choice allows for identifying the two closest descriptor matches from the target image for each keypoint in the reference image. Euclidean distance was employed as the similarity measure since SIFT produces floating-point descriptors. To remove ambiguous correspondences, Lowe’s ratio test was applied with a threshold of 0.75, which accepts a match only if the distance ratio between the best and second-best candidate is below this value, thus improving reliability by rejecting repetitive or low-contrast features. Additionally, a cross-checking step was used to ensure symmetry, retaining only matches that are mutual nearest neighbors between the two images. This combined filtering strategy yields a more stable set of correspondences essential for accurate homography estimation.

Table 1 summarizes the raw and filtered match counts between each reference–target pair. Scenes with strong textures (e.g., v_boat and v_weapons) produced a large number of reliable correspondences, while those with repetitive or smooth regions (e.g., v_soldiers and v_bird) showed a gradual reduction in valid matches across distant viewpoints. On average, approximately 16% of the initial matches (roughly 11–24% depending on the scene, per the averages in Table 1) survived the ratio and cross-check filtering, demonstrating the effectiveness of this approach in reducing false correspondences.

Scene	Raw Matches (avg)	Filtered Matches (avg)
v_bird	3361	430
v_boat	8849	1145
v_circus	4503	762
v_graffiti	2674	290
v_soldiers	618	128
v_weapons	4356	1035

Visualizations of correspondences before and after filtering confirmed a significant improvement in match accuracy. Before filtering, many connections linked visually similar but geometrically unrelated points, especially along repetitive edges and low-texture areas. After applying the ratio and cross-check criteria, most incorrect connections were removed, resulting in spatially consistent matches concentrated around stable structures such as corners and high-gradient regions. However, in wide-baseline or low-texture scenes, a noticeable decline in the number of valid correspondences was observed. Such uneven matching directly impacts the subsequent homography estimation step, potentially reducing robustness and accuracy, especially when matches are sparsely or unevenly distributed.

For reproducibility, the matcher configuration (k-NN, Euclidean distance, ratio threshold = 0.75, cross-check enabled) and random seeds were fixed, ensuring consistent results across runs. This setup balances robustness and computational efficiency, providing a reliable basis for the next stage — Homography Estimation.

Impact of Matching Quality on Homography Estimation

The accuracy of the estimated homography matrix is directly influenced by the spatial distribution and reliability of the matched correspondences. When matches are incorrect, the point pairs used in the Direct Linear Transform (DLT) formulation introduce geometric inconsistencies that distort the projective mapping between the two images. Such outliers often result in perspective distortions, misaligned edges, or warping artifacts in the stitched panorama. Furthermore, unevenly distributed matches—such as when correspondences cluster around a small region—reduce the numerical stability of the DLT system and can lead to overfitting around local structures while failing to generalize across the full image plane. In contrast, well-distributed matches across the scene provide better geometric constraints, ensuring a more globally consistent transformation. Therefore, the filtering and cross-checking procedures implemented during feature matching are critical for maintaining homography accuracy and achieving visually coherent panorama alignment in the subsequent stages.

In [4]:
# =====================================================
# HOMOGRAPHY ESTIMATION WITH RANSAC (ALL SCENES)
# =====================================================
# This cell:
# 1. Loads reference (1.png) and all target images.
# 2. Extracts SIFT features and matches them.
# 3. Estimates the homography using our manual DLT + RANSAC.
# 4. Saves inlier/outlier visualizations and prints results.
# =====================================================

import os
import cv2
from modules.feature_extraction import extract_features
from modules.feature_matching import match_features
from modules.homography import ransac_homography, visualize_inliers_outliers

# Root paths
DATA_ROOT = "data/panorama_dataset"
OUT_MATCHES = "outputs/inliers_outliers"
os.makedirs(OUT_MATCHES, exist_ok=True)

# Parameters (consistent with assignment)
RATIO_THRESHOLD = 0.75
CROSS_CHECK = True
RANSAC_ITERS = 4000
RANSAC_THRESH = 3.0
CONFIDENCE = 0.995
SEED = 1337

# The RANSAC settings are the same for every image pair, so they are
# bundled once and unpacked into each ransac_homography call.
ransac_kwargs = dict(
    max_iters=RANSAC_ITERS,
    inlier_threshold=RANSAC_THRESH,
    confidence=CONFIDENCE,
    seed=SEED,
    use_symmetric_error=True,
)

# Scenes to process
scenes = ["v_bird", "v_boat", "v_circus", "v_graffiti", "v_soldiers", "v_weapons"]

for scene in scenes:
    scene_dir = os.path.join(DATA_ROOT, scene)
    out_m = os.path.join(OUT_MATCHES, scene)
    os.makedirs(out_m, exist_ok=True)

    # Reference features are computed once per scene.
    ref_path = os.path.join(scene_dir, "1.png")
    kp1, desc1, _ = extract_features(ref_path, method="SIFT")

    for img_name in sorted(os.listdir(scene_dir)):
        # Targets are the PNG files of the scene other than the reference.
        is_target = img_name.lower().endswith(".png") and img_name != "1.png"
        if not is_target:
            continue

        tgt_path = os.path.join(scene_dir, img_name)
        kp2, desc2, _ = extract_features(tgt_path, method="SIFT")

        # kNN + ratio test + optional cross-check
        good_matches, all_matches = match_features(
            desc1, desc2, method="SIFT",
            ratio_thresh=RATIO_THRESHOLD, cross_check=CROSS_CHECK
        )

        # Common prefix of every log line for this image pair.
        pair_label = f"{scene}: 1.png vs {img_name}"

        # Pairs with fewer than four filtered matches are skipped.
        if len(good_matches) < 4:
            print(f"{pair_label} | insufficient matches ({len(good_matches)}).")
            continue

        # Robust homography estimation
        try:
            H, inlier_mask, stats = ransac_homography(
                kp1, kp2, good_matches, **ransac_kwargs
            )
        except RuntimeError as e:
            print(f"{pair_label} | RANSAC failed: {e}")
            continue

        # Shared between the figure title and the console summary.
        inlier_ratio = f"inliers={stats['num_inliers']}/{stats['num_matches']}"

        # Inlier vs outlier figure
        vis_path = os.path.join(out_m, f"inliers_outliers_1_{img_name[:-4]}.png")
        visualize_inliers_outliers(
            ref_path, tgt_path, kp1, kp2, good_matches, inlier_mask,
            save_path=vis_path,
            title=f"{scene} 1 vs {img_name} | {inlier_ratio}"
        )

        # Quantitative summary
        print(f"{pair_label} | {inlier_ratio} "
              f"| mean_err={stats['best_error']:.3f} | iters={stats['iterations']}")

print("\nHomography estimation completed for all scenes.")


v_bird: 1.png vs 2.png | inliers=731/945 | mean_err=1.265 | iters=11
v_bird: 1.png vs 3.png | inliers=288/500 | mean_err=1.701 | iters=45
v_bird: 1.png vs 4.png | inliers=251/442 | mean_err=1.617 | iters=48
v_bird: 1.png vs 5.png | inliers=150/240 | mean_err=1.291 | iters=32
v_bird: 1.png vs 6.png | inliers=10/24 | mean_err=1.276 | iters=173
v_boat: 1.png vs 2.png | inliers=1766/2152 | mean_err=1.579 | iters=8
v_boat: 1.png vs 3.png | inliers=1303/1595 | mean_err=1.584 | iters=8
v_boat: 1.png vs 4.png | inliers=659/915 | mean_err=1.515 | iters=16
v_boat: 1.png vs 5.png | inliers=392/562 | mean_err=1.548 | iters=19
v_boat: 1.png vs 6.png | inliers=217/401 | mean_err=1.639 | iters=59
v_circus: 1.png vs 2.png | inliers=1549/1666 | mean_err=1.202 | iters=8
v_circus: 1.png vs 3.png | inliers=263/506 | mean_err=1.470 | iters=69
v_circus: 1.png vs 4.png | inliers=81/160 | mean_err=1.311 | iters=77
v_circus: 1.png vs 5.png | inliers=490/1115 | mean_err=1.499 | iters=139
v_circus: 1.png vs 6.pn

In the homography estimation stage, we aimed to compute a robust projective transformation matrix $H$ that maps points from one image onto another. We began by implementing the Direct Linear Transform (DLT) algorithm from first principles, which constructs a homogeneous system of linear equations using at least four point correspondences and solves it through Singular Value Decomposition (SVD). To improve numerical stability, we normalized all coordinates using Hartley normalization before computing $H$. However, since real-world image correspondences are often contaminated by noise and mismatches, we embedded this DLT procedure within a RANSAC framework to make the estimation robust to outliers. RANSAC iteratively selects random subsets of four correspondences, estimates a provisional homography, computes the reprojection error in both forward and backward directions (symmetric error), and retains the model with the largest inlier count under a 3-pixel inlier threshold. Once the best hypothesis is found, the homography is recomputed using all inliers to refine accuracy. Across all six scenes, this approach produced strong results: for instance, in the v_bird scene, the pair (1.png vs 2.png) achieved 731 inliers out of 945 matches with an average reprojection error of 1.27 pixels, while v_boat achieved 1766 inliers out of 2152 matches with a similar mean error around 1.58. In more challenging sequences such as v_graffiti, performance dropped as texture and parallax increased, sometimes leading to RANSAC failure when too few consistent correspondences existed. The inlier–outlier visualizations clearly demonstrate that RANSAC effectively prunes erroneous matches, isolating coherent geometric relationships between images and substantially stabilizing homography estimation. This robust model forms the geometric foundation for the next stage—image warping and panorama construction—where consistent transformations are critical for seamless alignment and blending.

In [5]:
# =====================================================
# IMAGE WARPING & PANORAMA CONSTRUCTION (ALL SCENES)
# =====================================================

import os
import cv2
from modules.feature_extraction import extract_features
from modules.feature_matching import match_features
from modules.homography import ransac_homography
from modules.warping_stitching import (
    warp_into_reference,
    overlay_preview,
    copy_blend,
    average_blend,
    feather_blend,
    save_images
)

DATA_ROOT = "data/panorama_dataset"
OUTPUT_ROOT = "outputs/panoramas"
os.makedirs(OUTPUT_ROOT, exist_ok=True)

# Reproducibility params (keep same as estimation stage)
RATIO_THRESHOLD = 0.75
CROSS_CHECK = True
RANSAC_ITERS = 4000
RANSAC_THRESH = 3.0
CONFIDENCE = 0.995
SEED = 1337

scenes = ["v_bird", "v_boat", "v_circus", "v_graffiti", "v_soldiers", "v_weapons"]

for scene in scenes:
    scene_dir = os.path.join(DATA_ROOT, scene)
    out_scene = os.path.join(OUTPUT_ROOT, scene)
    os.makedirs(out_scene, exist_ok=True)

    ref_path = os.path.join(scene_dir, "1.png")
    ref_img = cv2.imread(ref_path)
    # cv2.imread signals failure by returning None rather than raising, so
    # check here instead of failing later inside the warping code.
    if ref_img is None:
        print(f"{scene}: could not read reference image {ref_path} | scene skipped")
        continue
    kp1, desc1, _ = extract_features(ref_path, method="SIFT")

    for img_name in sorted(os.listdir(scene_dir)):
        if not img_name.lower().endswith(".png") or img_name == "1.png":
            continue

        tgt_path = os.path.join(scene_dir, img_name)
        tgt_img = cv2.imread(tgt_path)
        if tgt_img is None:
            print(f"{scene}: 1.png vs {img_name} | could not read {tgt_path}")
            continue
        kp2, desc2, _ = extract_features(tgt_path, method="SIFT")

        # Match and estimate H (re-run for self-containment; you can swap to cache later)
        good_matches, all_matches = match_features(
            desc1, desc2, method="SIFT",
            ratio_thresh=RATIO_THRESHOLD, cross_check=CROSS_CHECK
        )
        # Pairs with fewer than four filtered matches are skipped.
        if len(good_matches) < 4:
            print(f"{scene}: 1.png vs {img_name} | insufficient matches ({len(good_matches)})")
            continue

        try:
            H, inlier_mask, stats = ransac_homography(
                kp1, kp2, good_matches,
                max_iters=RANSAC_ITERS,
                inlier_threshold=RANSAC_THRESH,
                confidence=CONFIDENCE,
                seed=SEED,
                use_symmetric_error=True
            )
        except RuntimeError as e:
            print(f"{scene}: 1.png vs {img_name} | RANSAC failed: {e}")
            continue

        # Warp into reference plane and build outputs
        ref_canvas, warped_tgt, overlap_mask, _ = warp_into_reference(ref_img, tgt_img, H)
        overlay = overlay_preview(ref_canvas, warped_tgt, alpha=0.5)

        # Blend variants
        copy_pano    = copy_blend(ref_canvas, warped_tgt)
        average_pano = average_blend(ref_canvas, warped_tgt, overlap_mask)
        feather_pano = feather_blend(ref_canvas, warped_tgt, feather_width=30)

        # splitext() removes the extension whatever its length, unlike a
        # fixed [:-4] slice.
        base = f"1_{os.path.splitext(img_name)[0]}"
        # Save standard set with feather as primary "panorama.png"
        save_images(out_scene, base, ref_canvas, warped_tgt, overlap_mask, overlay, feather_pano)

        # Also save comparison variants. cv2.imwrite reports failure through
        # its boolean return value, so an unwritten file is reported here.
        blend_variants = (
            ("copy", copy_pano),
            ("avg", average_pano),
            ("feather", feather_pano),
        )
        for variant_tag, variant_img in blend_variants:
            variant_path = os.path.join(out_scene, f"{base}_{variant_tag}_blend.png")
            if not cv2.imwrite(variant_path, variant_img):
                print(f"{scene}: 1.png vs {img_name} | WARNING: could not write {variant_path}")

        print(f"{scene}: 1.png vs {img_name} | panoramas saved | "
              f"inliers={stats['num_inliers']}/{stats['num_matches']} | "
              f"mean_err={stats['best_error']:.3f}")

print("\nPanorama construction completed for all scenes.")


v_bird: 1.png vs 2.png | panoramas saved | inliers=731/945 | mean_err=1.265
v_bird: 1.png vs 3.png | panoramas saved | inliers=288/500 | mean_err=1.701
v_bird: 1.png vs 4.png | panoramas saved | inliers=251/442 | mean_err=1.617
v_bird: 1.png vs 5.png | panoramas saved | inliers=150/240 | mean_err=1.291
v_bird: 1.png vs 6.png | panoramas saved | inliers=10/24 | mean_err=1.276
v_boat: 1.png vs 2.png | panoramas saved | inliers=1766/2152 | mean_err=1.579
v_boat: 1.png vs 3.png | panoramas saved | inliers=1303/1595 | mean_err=1.584
v_boat: 1.png vs 4.png | panoramas saved | inliers=659/915 | mean_err=1.515
v_boat: 1.png vs 5.png | panoramas saved | inliers=392/562 | mean_err=1.548
v_boat: 1.png vs 6.png | panoramas saved | inliers=217/401 | mean_err=1.639
v_circus: 1.png vs 2.png | panoramas saved | inliers=1549/1666 | mean_err=1.202
v_circus: 1.png vs 3.png | panoramas saved | inliers=263/506 | mean_err=1.470
v_circus: 1.png vs 4.png | panoramas saved | inliers=81/160 | mean_err=1.311
v_c

Report: Image Warping and Panorama Construction
In this stage, the objective was to geometrically align the image pairs using the previously estimated homographies and to construct seamless panoramas. For each scene, the homography matrix $H$ obtained from the RANSAC-based estimation was used to warp the secondary image onto the coordinate system of the reference image. The warping process was implemented through a perspective transformation that reprojects the corners of the target image into the reference frame, automatically computing the necessary canvas size to accommodate both images without cropping. A translation matrix was applied to shift the coordinate system such that all warped pixels lie within positive coordinates. The warped results confirm that the homography mapping correctly projects the planar regions — for example, in the v_weapons scene, the second image aligns its tilted wall plane precisely onto the first image’s perspective, producing the expected angular overlap.

To visualize alignment accuracy, we generated overlays with semi-transparent blending, which highlight both consistent and mismatched regions. Overlapping regions were extracted as binary masks to clearly identify where both images contribute valid pixels. Three different blending strategies were then applied to create final panoramas: (1) copy blending, where the warped image simply overwrites overlapping regions; (2) simple averaging, where pixel intensities from both images are averaged within the overlap to soften seams; and (3) feather blending, which uses a distance-transform-based weighting function to gradually blend intensities across the overlap, producing smoother transitions between exposures. Feather blending generally yielded the most visually coherent panoramas, as it preserved sharpness while minimizing visible intensity jumps near image boundaries.

Across all six scenes, our framework automatically produced the warped image, overlay, overlap mask, and the final panorama for each image pair. Although some challenging scenes like v_graffiti exhibited alignment failure due to insufficient inliers or parallax distortions, the majority — including v_boat, v_circus, and v_weapons — produced geometrically consistent panoramas with well-aligned textures and minimal ghosting. These results demonstrate that our geometric warping and blending pipeline effectively constructs wide-field mosaics from hand-held or partially overlapping views, establishing a robust base for subsequent applications such as augmented reality overlay and multi-view image compositing.

Discussion on Blending Techniques

To evaluate the visual quality of our panoramas, we compared three blending strategies—copy, average, and feather blending—across all datasets. Copy blending, while computationally simple, produced the most noticeable seams because it directly overwrote pixel values from the warped image onto the reference, often amplifying illumination or exposure differences between views. The average blending approach slightly improved this by taking the mean intensity of overlapping regions, effectively softening hard transitions but sometimes introducing visible ghosting where misalignment occurred. Feather blending, on the other hand, yielded the most natural and continuous panoramas. Its gradual weight transition—computed via distance transforms—assigned higher influence to pixels near the center of each image and less near borders, thereby fading exposures smoothly across overlap boundaries. This method effectively reduced ghosting and brightness artifacts, especially in scenes like v_boat and v_weapons, where exposure and geometry differences were significant. However, in extremely challenging cases such as v_graffiti, where parallax or insufficient correspondences prevented perfect alignment, even feather blending could not fully conceal geometric distortions. Overall, feather blending provided the best perceptual balance between sharpness and smoothness, confirming its effectiveness for large-scale image stitching where lighting and viewpoint variations are unavoidable.

Summary and Connection to Augmented Reality

The successful implementation of image warping and panorama stitching establishes a critical foundation for later tasks such as augmented reality overlay. By accurately computing and applying homographies, we demonstrated the ability to map one image plane onto another, preserving geometric consistency across different viewpoints. The feather blending approach not only enhanced visual continuity but also validated that our warping pipeline can handle exposure differences and slight alignment errors gracefully. In the AR stage, this same homography computation will be reused to project virtual elements onto real-world surfaces in a temporally consistent manner. The reliability and precision achieved during panorama construction thus ensure that any virtual object inserted into a scene remains spatially coherent with the camera’s motion, enabling seamless integration of synthetic content into real imagery. This continuity between geometric alignment, warping, and compositing defines the technical bridge between classical image stitching and dynamic augmented reality applications.

In [6]:
# ========================================
# AUGMENTED REALITY PIPELINE 
# ========================================

import os
import cv2
import numpy as np
from modules.feature_extraction import extract_features
from modules.feature_matching import match_features
from modules.homography import ransac_homography

# ---------- PATH SETUP ----------
# Inputs: the reference cover image, the video showing the book, and the
# video whose frames get pasted onto the book.
AR_ROOT = "data/ar_dataset"
COVER_PATH = os.path.join(AR_ROOT, "cv_cover.jpg")
BOOK_PATH = os.path.join(AR_ROOT, "book.mov")
SRC_PATH = os.path.join(AR_ROOT, "ar_source.mov")

# Outputs: the rendered video plus a folder of per-frame debug snapshots.
OUT_DIR = "outputs/ar"
OUT_VIDEO = os.path.join(OUT_DIR, "ar_dynamic_result.mp4")
SAMPLE_DIR = os.path.join(OUT_DIR, "samples")

# Make sure both output folders exist before anything is written.
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(SAMPLE_DIR, exist_ok=True)

# ---------- PARAMETERS ----------
# Frame sampling: only every FRAME_STEP-th book frame is processed, and the
# loop stops once MAX_FRAMES frames have been written (None = no cap).
FRAME_STEP = 1
MAX_FRAMES = None

# Matching: forwarded to match_features as ratio_thresh / cross_check.
RATIO_THRESHOLD = 0.75
CROSS_CHECK = True

# Homography estimation: forwarded to ransac_homography as max_iters,
# inlier_threshold, confidence and seed respectively.
RANSAC_ITERS = 2000
RANSAC_THRESH = 3.0
CONFIDENCE = 0.995
SEED = 1337

# ---------- FLEXIBLE FEATURE EXTRACTION ----------
def extract_features_flexible(image_input, method="SIFT"):
    """
    Extract keypoints and descriptors from a file path or an in-memory image.

    Parameters
    ----------
    image_input : str or np.ndarray
        A file path, which is forwarded unchanged to ``extract_features``,
        or an image array such as a decoded video frame. Arrays may be
        3-channel BGR, 4-channel BGRA, or already single-channel
        (``(H, W)`` or ``(H, W, 1)``).
    method : str
        Detector to use. For array input this is case-insensitive and must
        be "SIFT" or "ORB"; for path input it is passed straight through to
        ``extract_features``.

    Returns
    -------
    For array input, a ``(keypoints, descriptors, gray)`` tuple where
    ``gray`` is the single-channel image the detector ran on. For path
    input, whatever ``extract_features`` returns.

    Raises
    ------
    ValueError
        If an array is given with a method other than "SIFT" or "ORB".
    TypeError
        If ``image_input`` is neither a ``str`` nor an ``np.ndarray``.
    """
    if isinstance(image_input, str):
        # Regular path-based input: delegate to the project helper.
        return extract_features(image_input, method=method)

    if not isinstance(image_input, np.ndarray):
        raise TypeError("extract_features_flexible expects a str path or np.ndarray image")

    # Validate the detector name first so a typo fails fast, before any
    # image conversion work is done.
    method_key = method.upper()
    if method_key not in ("SIFT", "ORB"):
        raise ValueError(f"Unknown method: {method}")

    # Pick the grayscale conversion that matches the channel layout; a blind
    # BGR2GRAY fails on images that are already single-channel or carry alpha.
    if image_input.ndim == 2:
        gray = image_input
    elif image_input.ndim == 3 and image_input.shape[2] == 1:
        gray = np.ascontiguousarray(image_input[:, :, 0])
    elif image_input.ndim == 3 and image_input.shape[2] == 4:
        gray = cv2.cvtColor(image_input, cv2.COLOR_BGRA2GRAY)
    else:
        gray = cv2.cvtColor(image_input, cv2.COLOR_BGR2GRAY)

    detector = cv2.SIFT_create() if method_key == "SIFT" else cv2.ORB_create()
    kp, desc = detector.detectAndCompute(gray, None)
    return kp, desc, gray

# ---------- HELPERS ----------
def center_crop_resize(img, target_w, target_h):
    """
    Center-crop ``img`` to the target aspect ratio, then resize it.

    The crop removes equal margins from the left/right when the image is
    wider than the target ratio, otherwise from the top/bottom. The cropped
    region is then resized to ``(target_w, target_h)`` with bilinear
    interpolation.
    """
    src_h, src_w = img.shape[:2]
    wanted_ratio = target_w / float(target_h)
    is_too_wide = (src_w / float(src_h)) > wanted_ratio

    if is_too_wide:
        # Keep full height, trim the sides.
        keep_w = int(wanted_ratio * src_h)
        left = (src_w - keep_w) // 2
        rows, cols = slice(None), slice(left, left + keep_w)
    else:
        # Keep full width, trim top and bottom.
        keep_h = int(src_w / wanted_ratio)
        top = (src_h - keep_h) // 2
        rows, cols = slice(top, top + keep_h), slice(None)

    region = img[rows, cols]
    return cv2.resize(region, (target_w, target_h), interpolation=cv2.INTER_LINEAR)

def warp_and_composite(frame_bgr, src_bgr, H):
    """
    Paste ``src_bgr`` into ``frame_bgr`` after warping it with homography ``H``.

    Returns a ``(composite, warped, warped_mask)`` tuple: the final frame,
    the warped source on a frame-sized canvas, and the single-channel mask
    (255 where the warped source lands) used to combine them.
    """
    out_h, out_w = frame_bgr.shape[:2]
    canvas_size = (out_w, out_h)

    # Warp the source image itself (bilinear) ...
    warped = cv2.warpPerspective(src_bgr, H, canvas_size, flags=cv2.INTER_LINEAR)

    # ... and an all-255 footprint of it (nearest-neighbour, so the mask
    # stays strictly 0/255 and is safe to use with bitwise operations).
    footprint = np.full(src_bgr.shape[:2], 255, dtype=np.uint8)
    warped_mask = cv2.warpPerspective(footprint, H, canvas_size, flags=cv2.INTER_NEAREST)

    mask_bgr = cv2.cvtColor(warped_mask, cv2.COLOR_GRAY2BGR)
    background = np.bitwise_and(frame_bgr, 255 - mask_bgr)
    foreground = np.bitwise_and(warped, mask_bgr)
    comp = np.bitwise_or(background, foreground)
    return comp, warped, warped_mask

# ---------- PRECOMPUTE COVER FEATURES ----------
# The cover image is the fixed reference for every frame, so its features
# are extracted once up front instead of inside the main loop.
cover_bgr = cv2.imread(COVER_PATH)
assert cover_bgr is not None, f"Cannot read cover at {COVER_PATH}"
cover_h, cover_w = cover_bgr.shape[:2]
kp_cover, desc_cover, _ = extract_features_flexible(cover_bgr, method="SIFT")
# Without cover descriptors no frame can ever be matched; fail here with a
# clear message rather than deep inside the matcher.
assert desc_cover is not None, f"No features detected in cover image {COVER_PATH}"

# ---------- READ SOURCE VIDEO ----------
# All source frames are held in memory so the main loop can cycle through
# them by index.
src_cap = cv2.VideoCapture(SRC_PATH)
assert src_cap.isOpened(), f"Cannot open {SRC_PATH}"
src_frames = []
try:
    while True:
        ret, f = src_cap.read()
        if not ret:
            break
        src_frames.append(f)
finally:
    # Release the capture even if decoding raises part-way through.
    src_cap.release()
assert len(src_frames) > 0, "No frames read from ar_source.mov"

# ---------- READ TARGET (BOOK) VIDEO ----------
book_cap = cv2.VideoCapture(BOOK_PATH)
assert book_cap.isOpened(), f"Cannot open {BOOK_PATH}"

# Fall back to 30 fps when the container reports 0 / no frame rate.
fps = book_cap.get(cv2.CAP_PROP_FPS) or 30.0
Wf = int(book_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
Hf = int(book_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Prepare writer
# Output rate is divided by FRAME_STEP because only every FRAME_STEP-th
# input frame is written.
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
writer = cv2.VideoWriter(OUT_VIDEO, fourcc, fps / FRAME_STEP, (Wf, Hf))
if not writer.isOpened():
    # A writer that failed to open silently drops every frame; stop now
    # instead of running the whole pipeline and producing no video.
    book_cap.release()
    raise AssertionError(f"Cannot open video writer for {OUT_VIDEO}")

frame_idx = 0  # index of the next book frame read
src_idx = 0    # index of the next source frame to overlay
written = 0    # number of frames written to the output so far

# ---------- MAIN LOOP ----------
# For each sampled book frame: match it against the cover, estimate the
# cover->frame homography, and paste the next source frame onto the book.
# Frames where that is not possible are written through unmodified so the
# output keeps the same timing as the input.
try:
    while True:
        ret, frame = book_cap.read()
        if not ret:
            break
        if frame_idx % FRAME_STEP != 0:
            frame_idx += 1
            continue
        if MAX_FRAMES is not None and written >= MAX_FRAMES:
            break

        kp_book, desc_book, _ = extract_features_flexible(frame, method="SIFT")

        H = None
        # A frame with no detectable features (e.g. blank or heavily blurred)
        # yields no descriptors; skip matching instead of handing None to
        # the matcher.
        if desc_book is not None and len(kp_book) > 0:
            # Match and compute homography
            good, _all = match_features(
                desc_cover, desc_book,
                method="SIFT",
                ratio_thresh=RATIO_THRESHOLD,
                cross_check=CROSS_CHECK
            )

            try:
                H, inlier_mask, stats = ransac_homography(
                    kp_cover, kp_book, good,
                    max_iters=RANSAC_ITERS,
                    inlier_threshold=RANSAC_THRESH,
                    confidence=CONFIDENCE,
                    seed=SEED,
                    use_symmetric_error=True
                )
            except RuntimeError:
                # Estimation failed for this frame; fall back to the raw frame.
                H = None

        if H is None:
            writer.write(frame)
            frame_idx += 1
            written += 1
            continue

        # Prepare source frame: cycle through the source clip and fit it to
        # the cover's size, since H maps cover coordinates into the frame.
        # The source only advances on frames where an overlay is drawn.
        src_bgr_full = src_frames[src_idx % len(src_frames)]
        src_idx += 1
        src_prepped = center_crop_resize(src_bgr_full, cover_w, cover_h)

        composed, warped_src, warped_mask = warp_and_composite(frame, src_prepped, H)

        # Save debug samples
        if written in (0, 30, 60, 120):
            cv2.imwrite(os.path.join(SAMPLE_DIR, f"t{written:04d}_frame.jpg"), frame)
            cv2.imwrite(os.path.join(SAMPLE_DIR, f"t{written:04d}_warped.jpg"), warped_src)
            cv2.imwrite(os.path.join(SAMPLE_DIR, f"t{written:04d}_mask.png"), warped_mask)
            cv2.imwrite(os.path.join(SAMPLE_DIR, f"t{written:04d}_composed.jpg"), composed)

        writer.write(composed)
        frame_idx += 1
        written += 1
finally:
    # Always release both handles, even when a frame raises, so the capture
    # and the output file are closed before the error propagates.
    book_cap.release()
    writer.release()

print(f"AR video written to: {OUT_VIDEO}\nTotal frames: {written}, FPS: {fps / FRAME_STEP:.2f}")


AR video written to: outputs/ar\ar_dynamic_result.mp4
Total frames: 641, FPS: 30.00
