# Image Matching and Homography Estimation with OpenCV and LightGlue

In [1]:
import os
import cv2 
import time
import json
import math
import copy
import torch
import numpy as np
from vidstab import VidStab
import matplotlib.pyplot as plt

from lightglue import viz2d
from lightglue import LightGlue, SuperPoint, DISK
from lightglue.utils import load_image, rbd, load_image_from_path
import CSRansac

from vidstab import VidStab

In [2]:
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # 'mps', 'cpu'

extractor = SuperPoint(max_num_keypoints=2048).eval().to(device)  # load the extractor
#matcher = LightGlue(features='superpoint', depth_confidence=0.9, width_confidence=0.95).eval().to(device)
matcher = LightGlue(features='superpoint', depth_confidence=-1, width_confidence=-1).eval().to(device)
#matcher.compile(mode='reduce-overhead')

In [4]:
print(torch.__version__)
print(device)

2.1.2
cuda


## Dataset 전처리

In [8]:
aircraft_datasets = "datasets"

lables = os.path.join(aircraft_datasets + "/label")

In [9]:
# 원본 좌표값과 실수형 좌표값을 불러옴
origin_coordinate = []
float_origin_coordinate = []

# 원점 좌표값 불러오기
for label_file in os.listdir(lables):
    label_path = os.path.join(lables, label_file)
    with open(label_path, "r") as f:
        json_file = json.load(f)
        coord = json_file["targetAnnotation"]
        float_coord = copy.deepcopy(coord)
        float_origin_coordinate.append(float_coord)
        
        coord[0] = round(coord[0] * 640, 4)
        coord[1] = round(coord[1] * 480, 4)
        origin_coordinate.append(coord)

print(origin_coordinate)
print(len(origin_coordinate))

print(float_origin_coordinate)
print(len(float_origin_coordinate))

[[319.172, 270.5525], [320.0, 265.2454], [344.4649, 256.0291], [313.5761, 257.2958], [325.4817, 168.0838], [315.9396, 202.4891], [325.4792, 168.0804], [312.3912, 306.4268], [320.0, 265.2386], [331.4872, 26.9028], [316.5232, 203.0878], [329.4775, 59.023], [320.0, 337.5758], [324.1364, 161.3599], [309.3466, 253.7444], [321.2631, 248.8727], [332.8524, 236.0226], [326.0481, 203.8017], [318.4895, 251.0605], [320.9647, 255.8256], [321.2552, 215.7061], [319.4533, 225.7516], [319.4534, 180.869], [321.2005, 215.6378], [321.2277, 215.6717], [316.3752, 230.084], [316.2056, 231.4328], [320.8982, 312.2862], [320.9509, 198.6214], [315.9281, 231.4998], [320.8952, 257.6141], [320.8216, 257.4771], [320.8206, 257.478], [320.6533, 290.0106], [320.7295, 257.2927], [320.0, 291.9199], [320.0, 257.736], [320.0, 485.3593], [318.0135, 279.4593], [314.6762, 328.5291]]
40
[[0.4987062, 0.563651], [0.5, 0.5525945], [0.5382264, 0.5333939999999999], [0.4899627, 0.5360328999999999], [0.5085652, 0.3501746], [0.4936557

In [10]:
video_dir = os.path.join(aircraft_datasets, "video")
target_image_dir = os.path.join(aircraft_datasets, "target_image")
# 비디오 안정화 객체 생성
stabilizer = VidStab()

In [11]:
#원본 이미지를 가져오는 코드
len_coord = len(origin_coordinate)

target_images = []
for image_file in os.listdir(target_image_dir):
    image_path = os.path.join(target_image_dir, image_file)
    target_images.append(image_path)

# 에러를 저장할 리스트
disappear_errors = []
misannotate_errors = []
pixel_errors = []

missing_inlier = 0
failed_inliers = 0

In [12]:
def matching_keypoints(target_img, video_img, stabilizing=False):
    # 이미지를 불러옴
    img0 = load_image(target_img, grayscale=True)
    if stabilizing == True:
        img1 = stabilizer.stabilize_frame(video_img)
        img1 = load_image(img1, grayscale=True)
    else:
        img1 = load_image(video_img , grayscale=True)

    # extract local features
    feats0 = extractor.extract(img0.to(device))  # auto-resize the image, disable with resize=None
    feats1 = extractor.extract(img1.to(device))

    # match the features
    matches01 = matcher({'image0': feats0, 'image1': feats1})
    feats0, feats1, matches01 = [rbd(x) for x in [feats0, feats1, matches01]]  # remove batch dimension

    # get results
    kpts0 = feats0["keypoints"]
    kpts1 = feats1["keypoints"]
    matches = matches01['matches']  # indices with shape (K,2)
    points0 = kpts0[matches[..., 0]]  # coordinates in img0, shape (K,2)
    points1 = kpts1[matches[..., 1]]  # coordinates in img1, shape (K,2)

    return {
        "points0": points0,
        "points1": points1,
    }
    
#에러 측정 
def get_errors(disappear_error, misannotate_error, pixel_error, keypoint, mask, coord, x, y):
    if len(keypoint) < 6:
        missing_inlier += 1
    
    if mask == 0.3:
        failed_inliers += 1
    
    _x = coord[0] / 640
    _y = coord[1] / 480
    
    if _x < 0 or _x > 1 or _y < 0 or _y > 1:
        disappear_error += 1
        
    distance = math.sqrt((x - _x)**2 + (y - _y)**2)
    
    if distance > 0.1:
        misannotate_error += 1
        
    if distance > pixel_error:
        pixel_error = distance
        
    return {
        "disappear_error": disappear_error,
        "misannotate_error": misannotate_error,
        "pixel_error": pixel_error,
        "missing_inlier": missing_inlier,
        "failed_inliers": failed_inliers
    }

In [None]:
# 매칭되는 호모그래피를 구하여 원점을 투영 변환한 후, 에러를 측정
# 에러 측정을 위해 총 10번 반복
for i in range(10):
    disappear_error = 0
    misannotate_error = 0
    pixel_error = 0
    
    for i in range(len_coord):
        target_image = target_images[i]
        len_target_image = len(target_images)
        
        x = origin_coordinate[i][0]
        y = origin_coordinate[i][1]
        
        target_image = load_image(target_image, grayscale=True)
        
        # vide_dir에 있는 모든 비디오를 가져옴
        for video_file in os.listdir(video_dir):
            video_path = os.path.join(video_dir, video_file)
            cap = cv2.VideoCapture(video_path)
            
            # 각 프레임 처리, 에러처리도 동시에 진행
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                
                # 특징점 매칭
                results = matching_keypoints(target_image, frame)
                target_keypoint = results["points0"].cpu().numpy()
                frame_keypoint = results["points1"].cpu().numpy()
                
               # 호모그래피 추정
                homography, mask = CSRansac.csransac(target_keypoint, frame_keypoint))
                projected_pts = CSRansac.perspective_transform(np.array([x, y]), homography)
                
                # 에러 측정
                error_results = get_errors(disappear_error, misannotate_error, pixel_error, target_keypoint, mask, projected_pts, x, y)
                    
            cap.release()

    disappear_errors.append(disappear_error)
    misannotate_errors.append(misannotate_error)
    pixel_errors.append(pixel_error)
  

In [21]:
error1 = sum(disappear_errors) / len(disappear_errors)
error2 = sum(misannotate_errors) / len(misannotate_errors)
error3 = sum(pixel_errors) / len(pixel_errors)

print("disappear_error:", error1)
print("num_error:", error2)
print("pixel_error:", error3)
print("missing_inlier:", missing_inlier)
print("failed_inliers:", failed_inliers)   

disappear_error: 279.5
num_error: 867.2
pixel_error: 10.09973137927756


## check speed

In [None]:
# video_frames 폴더에서 프레임 파일 리스트 가져오기
video_frames = os.listdir('video')

# 프레임 수 초기화
frame_count = 0

# 프레임 별 처리 시간 리스트 초기화
frame_processing_times = []

x = 637 // 2
y = 367 // 2

image0 = load_image_from_path("img1.png", grayscale=True)
cap = cv2.VideoCapture('demo_video_resized.mp4')

# 각 프레임 처리
while True:
    ret, frame = cap.read()
    if not ret:
        break
    
    start_time = time.time()
    
    feats0 = extractor.extract(image0.to(device))
    image1 = stabilizer.stabilize_frame(input_frame = frame)
    image1 = load_image(frame, grayscale=True)
    feats1 = extractor.extract(image1.to(device))
    
    matches01 = matcher({"image0": feats0, "image1": feats1})
    
    feats0, feats1, matches01 = [
        rbd(x) for x in [feats0, feats1, matches01]
    ]  # remove batch dimension
    
    kpts0, kpts1, matches = feats0["keypoints"], feats1["keypoints"], matches01["matches"]
    m_kpts0, m_kpts1 = kpts0[matches[..., 0]], kpts1[matches[..., 1]]
    
    homography, _ = CSRansac.csransac(m_kpts0.cpu().numpy(), m_kpts1.cpu().numpy())
    projected_pts = CSRansac.perspective_transform(np.array([x, y]), homography)
    
    cv2.circle(frame, (int(projected_pts[0]), int(projected_pts[1])), 5, (0, 0, 255), -1)
    
    # 현재 시간 측정
    current_time = time.time()
    
    # 프레임 처리 시간 계산
    frame_processing_time = current_time - start_time
    frame_processing_times.append(frame_processing_time)
    
    # 이전 프레임 처리 시간 업데이트
    prev_frame_time = current_time

    # FPS 계산
    fps = 1.0 / frame_processing_time

    # 프레임 수 증가
    frame_count += 1

    
    cv2.imshow('frame', frame)
    
    key = cv2.waitKey(5)
    if key == 27:
        break
    
    
cap.release()
cv2.destroyAllWindows()

# 전체 처리 시간 계산
total_processing_time = sum(frame_processing_times)

# 전체 프레임 수와 전체 처리 시간을 사용하여 평균 FPS 계산
average_fps = frame_count / total_processing_time

print(f"Total Frames Processed: {frame_count}")
print(f"Average FPS: {average_fps:.2f}")

In [35]:
from vidstab import VidStab

# Using defaults
stabilizer = VidStab()
stabilizer.stabilize(input_path='demo_video_resized.mp4', output_path='stable_demo_video.mp4')

In [36]:
cap = cv2.VideoCapture('datasets/unstable_version.mp4')

x = 319
y = 238

disappear_error = 0
misannotate_error = 0
pixel_error = 0

image0 = load_image("datasets/origin.png", grayscale=True)

fourcc = cv2.VideoWriter_fourcc(*'DIVX')
out = cv2.VideoWriter('before_stabilize.avi', fourcc, 30, (640, 480))

while True:
    ret, frame = cap.read()
    if not ret:
        break
    
    feats0 = extractor.extract(image0.to(device))
    #image1 = stabilizer.stabilize_frame(input_frame = frame)
    image1 = load_image(frame, grayscale=True)
    feats1 = extractor.extract(image1.to(device))
    
    matches01 = matcher({"image0": feats0, "image1": feats1})
    
    feats0, feats1, matches01 = [
        rbd(x) for x in [feats0, feats1, matches01]
    ]  # remove batch dimension
    
    kpts0, kpts1, matches = feats0["keypoints"], feats1["keypoints"], matches01["matches"]
    m_kpts0, m_kpts1 = kpts0[matches[..., 0]], kpts1[matches[..., 1]]
    
    homography, mask = CSRansac.csransac(m_kpts0.cpu().numpy(), m_kpts1.cpu().numpy())
    if mask < 0.3:
        cv2.imshow('frame', frame)
        continue
    projected_pts = CSRansac.perspective_transform(np.array([x, y]), homography)
    
    _x = projected_pts[0] / 640
    _y = projected_pts[1] / 480
    
    # 에러 측정
    if _x < 0 or _x > 1 or _y < 0 or _y > 1:
        disappear_error += 1
        
    distance = math.sqrt((x / 640 - _x)**2 + (y / 640 - _y)**2)
    
    if distance > 0.1:
        misannotate_error += 1
        
    if distance > pixel_error:
        pixel_error = distance
    
    cv2.circle(frame, (int(projected_pts[0]), int(projected_pts[1])), 5, (0, 0, 255), -1)
    
    cv2.imshow('frame', frame)
    
    out.write(frame)
    
    key = cv2.waitKey(5)
    if key == 27:
        break

cap.release()
out.release()
cv2.destroyAllWindows()

print("disappear_error:", disappear_error)
print("misannotate_error:", misannotate_error)
print("pixel_error:", pixel_error)

disappear_error: 0
misannotate_error: 334
pixel_error: 0.14165977565377297


In [39]:
cap = cv2.VideoCapture('datasets/unstable_version.mp4')

x = 319
y = 238

disappear_error = 0
misannotate_error = 0
pixel_error = 0

image0 = load_image("datasets/origin.png", grayscale=True)

fourcc = cv2.VideoWriter_fourcc(*'DIVX')
out = cv2.VideoWriter('after_stabilize.avi', fourcc, 30, (640, 480))

while True:
    ret, frame = cap.read()
    if not ret:
        break
    
    feats0 = extractor.extract(image0.to(device))
    image1 = stabilizer.stabilize_frame(input_frame = frame)
    image1 = load_image(image1, grayscale=True)
    feats1 = extractor.extract(image1.to(device))
    
    matches01 = matcher({"image0": feats0, "image1": feats1})
    
    feats0, feats1, matches01 = [
        rbd(x) for x in [feats0, feats1, matches01]
    ]  # remove batch dimension
    
    kpts0, kpts1, matches = feats0["keypoints"], feats1["keypoints"], matches01["matches"]
    m_kpts0, m_kpts1 = kpts0[matches[..., 0]], kpts1[matches[..., 1]]
    
    homography, mask = CSRansac.csransac(m_kpts0.cpu().numpy(), m_kpts1.cpu().numpy())
    if mask < 0.3:
        cv2.imshow('frame', frame)
        continue
    projected_pts = CSRansac.perspective_transform(np.array([x, y]), homography)
    
    _x = projected_pts[0] / 640
    _y = projected_pts[1] / 480
    
    # 에러 측정
    if _x < 0 or _x > 1 or _y < 0 or _y > 1:
        disappear_error += 1
        
    distance = math.sqrt((x / 640 - _x)**2 + (y / 640 - _y)**2)
    
    if distance > 0.1:
        misannotate_error += 1
        
    if distance > pixel_error:
        pixel_error = distance
    
    cv2.circle(frame, (int(projected_pts[0]), int(projected_pts[1])), 5, (0, 0, 255), -1)
    
    cv2.imshow('frame', frame)
    
    out.write(frame)
    
    key = cv2.waitKey(5)
    if key == 27:
        break

cap.release()
out.release()
cv2.destroyAllWindows()

print("disappear_error:", disappear_error)
print("misannotate_error:", misannotate_error)
print("pixel_error:", pixel_error)

error: OpenCV(4.9.0) D:\a\opencv-python\opencv-python\opencv\modules\highgui\src\window_w32.cpp:124: error: (-215:Assertion failed) bmi && width >= 0 && height >= 0 && (bpp == 8 || bpp == 24 || bpp == 32) in function 'FillBitmapInfo'
