# Image Matching and Homography Estimation with OpenCV and LightGlue

In [1]:
import os
import cv2 
import time
import json
import math
import torch
import numpy as np
from vidstab import VidStab
import matplotlib.pyplot as plt

from lightglue import viz2d
from lightglue import LightGlue, SuperPoint, DISK
from lightglue.utils import load_image, rbd, load_image_from_path
import CSRansac

In [2]:
# Set KMP_DUPLICATE_LIB_OK for this process.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort some torch/numpy installs hit — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

In [3]:
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
# (The author also noted 'mps' and 'cpu' as possible device strings.)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# SuperPoint feature extractor limited to 2048 keypoints, in eval mode on `device`.
extractor = SuperPoint(max_num_keypoints=2048).eval().to(device)

# LightGlue matcher for SuperPoint features with both confidences set to -1.
# NOTE(review): -1 presumably disables the adaptive depth/width behaviour —
# confirm against the LightGlue documentation.
# Alternative kept from the original cell:
#   LightGlue(features='superpoint', depth_confidence=0.9, width_confidence=0.95)
matcher = LightGlue(
    features='superpoint',
    depth_confidence=-1,
    width_confidence=-1,
).eval().to(device)
# Optional, disabled in the original cell: matcher.compile(mode='reduce-overhead')

In [4]:
# Show the installed torch version and the selected device, one per line.
print(torch.__version__, device, sep="\n")

2.1.2
cuda


In [5]:
def match_lightglue(img0, img1):
    """Match two images with the module-level SuperPoint extractor and LightGlue matcher.

    Both arguments are handed to ``load_image`` unchanged, moved to ``device``
    and passed through ``extractor`` and ``matcher``.

    Args:
        img0: First image, in whatever form ``load_image`` accepts.
        img1: Second image, in whatever form ``load_image`` accepts.

    Returns:
        dict with two entries whose rows correspond one-to-one:
            "points0": matched keypoint coordinates in ``img0``, shape (K, 2).
            "points1": matched keypoint coordinates in ``img1``, shape (K, 2).
    """
    tensor0 = load_image(img0)
    tensor1 = load_image(img1)

    # Local features for each image (the extractor auto-resizes the image;
    # per the original note this can be disabled with resize=None).
    raw_feats0 = extractor.extract(tensor0.to(device))
    raw_feats1 = extractor.extract(tensor1.to(device))

    # Match the two feature sets.
    raw_matches = matcher({'image0': raw_feats0, 'image1': raw_feats1})

    # Drop the batch dimension from all three results.
    feats0, feats1, matches01 = (
        rbd(item) for item in (raw_feats0, raw_feats1, raw_matches)
    )

    # `pairs` holds index pairs of shape (K, 2): column 0 indexes the keypoints
    # of img0, column 1 the keypoints of img1.
    pairs = matches01['matches']
    return {
        "points0": feats0["keypoints"][pairs[..., 0]],
        "points1": feats1["keypoints"][pairs[..., 1]],
    }

In [6]:
# Module-level stabilizer used by matching_keypoints() when stabilizing is requested.
stabilizer = VidStab()


def matching_keypoints(target_img, video_img, stabilizing=False):
    """Match a target image against a video frame, optionally stabilizing the frame first.

    Args:
        target_img: Target image, passed to ``load_image(..., grayscale=True)``.
        video_img: Video frame. When ``stabilizing`` is True it is read with
            ``cv2.imread`` and run through the module-level ``stabilizer``;
            otherwise it is passed straight to ``load_image``.
        stabilizing: Set to True to stabilize ``video_img`` before matching.

    Returns:
        dict with two entries whose rows correspond one-to-one:
            "points0": matched keypoint coordinates in the target image, shape (K, 2).
            "points1": matched keypoint coordinates in the video frame, shape (K, 2).
    """
    # Load the images.
    img0 = load_image(target_img, grayscale=True)
    if stabilizing == True:
        # NOTE(review): load_image receives an in-memory frame here but a path
        # in the other branch — confirm that load_image accepts both.
        stabilized_frame = stabilizer.stabilize_frame(cv2.imread(video_img))
        img1 = load_image(stabilized_frame, grayscale=True)
    else:
        img1 = load_image(video_img, grayscale=True)

    # Local features for each image (the extractor auto-resizes the image;
    # per the original note this can be disabled with resize=None).
    raw_feats0 = extractor.extract(img0.to(device))
    raw_feats1 = extractor.extract(img1.to(device))

    # Match the two feature sets.
    raw_matches = matcher({'image0': raw_feats0, 'image1': raw_feats1})

    # Drop the batch dimension from all three results.
    feats0, feats1, matches01 = (
        rbd(item) for item in (raw_feats0, raw_feats1, raw_matches)
    )

    # `pairs` holds index pairs of shape (K, 2): column 0 indexes the keypoints
    # of the target image, column 1 the keypoints of the video frame.
    pairs = matches01['matches']
    return {
        "points0": feats0["keypoints"][pairs[..., 0]],
        "points1": feats1["keypoints"][pairs[..., 1]],
    }

## Dataset 전처리

In [7]:
# Root folder of the aircraft dataset.
aircraft_datasets = "D:/aircraft_datasets"

# Folder that holds the label JSON files. Built from path components rather
# than by concatenating a separator into a single os.path.join argument.
# (The spelling `lables` is kept because other cells refer to this name.)
lables = os.path.join(aircraft_datasets, "label")

In [8]:
# Annotated target point of every label file, scaled to a 640x480 frame.
origin_coordinate = []

# Load the annotated origin coordinate from each label file.
# NOTE(review): the listing order of os.listdir decides which index each label
# gets, and other cells pair these entries with frame folders by index —
# confirm both listings come back in matching order.
for label_file in os.listdir(lables):
    label_path = os.path.join(lables, label_file)
    with open(label_path, "r") as f:
        coord = json.load(f)["targetAnnotation"]

    # Scale the first component by 640 and the second by 480
    # (the frame size the extraction cell resizes to).
    coord[0] *= 640
    coord[1] *= 480
    origin_coordinate.append(coord)

print(origin_coordinate, len(origin_coordinate), sep="\n")


[[319.171968, 270.55248], [320.0, 265.24536], [344.464896, 256.02912], [313.576128, 257.29579199999995], [325.48172800000003, 168.083808], [315.939648, 202.48910399999997], [325.479232, 168.080352], [312.391232, 306.426768], [320.0, 265.23864], [331.487168, 26.902847999999988], [316.5232, 203.087808], [329.47750399999995, 59.02296000000001], [320.0, 337.57583999999997], [324.136448, 161.35992000000002], [309.34656, 253.744368], [321.263104, 248.872656], [332.852352, 236.02262399999998], [326.04812799999996, 203.801712], [318.48947200000003, 251.060496], [320.964672, 255.825552], [321.25523200000003, 215.70609599999997], [319.453312, 225.751632], [319.45344, 180.868992], [321.200512, 215.63779200000002], [321.227712, 215.671728], [316.37516800000003, 230.084016], [316.20556799999997, 231.432768], [320.89824, 312.286224], [320.950912, 198.62135999999998], [315.928128, 231.49977600000003], [320.895168, 257.614128], [320.82163199999997, 257.47713600000003], [320.820608, 257.477952], [320.6

In [9]:
# Dataset sub-folders used by the following cells: the source videos, the
# frames extracted from them, and the frames of the stabilized videos.
video_dir, output_dir, stabilized_frame_path = (
    os.path.join(aircraft_datasets, sub_folder)
    for sub_folder in ("video", "frames_from_video", "stabilized_frame")
)

# Fresh stabilizer instance; rebinds the module-level `stabilizer` name.
stabilizer = VidStab()

In [10]:
# Frame paths of the original (un-stabilized) videos: images[i] holds the
# frames of the i-th folder under output_dir. One slot is created per entry of
# origin_coordinate because the evaluation cells index both lists in parallel.
images = [[] for _ in range(len(origin_coordinate))]

# Walk every folder inside output_dir.
# NOTE(review): folders are paired with labels purely by listing position, and
# more folders than labels raises IndexError — confirm the folder listing and
# the label listing line up.
for i, folder_name in enumerate(os.listdir(output_dir)):
    folder_path = os.path.join(output_dir, folder_name)

    # os.listdir returns names in arbitrary order, but the evaluation cells
    # treat images[i][0] as the target frame and consecutive entries as
    # consecutive frames, so the frame names are sorted explicitly.
    images[i].extend(
        os.path.join(folder_path, name) for name in sorted(os.listdir(folder_path))
    )

# Number of frames collected for the first video.
print(len(images[0]))

368


In [19]:
# Save every frame of each stabilized video as an image file.
stabilized_frame_path = os.path.join(aircraft_datasets, "stabilized_frame")

# NOTE(review): stabilized_video_path is not defined in the part of the
# notebook visible here; it must point at the folder containing the stabilized
# videos before this cell is run.
for video_filename in os.listdir(stabilized_video_path):
    # Open the file from the same folder that was listed. Joining the name
    # with video_dir instead would read the original, un-stabilized video.
    video_path = os.path.join(stabilized_video_path, video_filename)

    # Video name = file name without its extension.
    video_name = os.path.splitext(video_filename)[0]

    # Per-video frame folder. A dedicated name is used so the module-level
    # output_dir (the frames_from_video path) is not overwritten by this loop.
    frame_dir = os.path.join(stabilized_frame_path, video_name)
    os.makedirs(frame_dir, exist_ok=True)

    video = cv2.VideoCapture(video_path)
    try:
        frame_count = 0
        while True:
            ret, frame = video.read()
            if not ret:
                break  # no more frames could be read

            # Store the frame resized to 640x480 as frame_NNNN.jpg.
            frame_filename = os.path.join(frame_dir, f'frame_{frame_count:04d}.jpg')
            frame = cv2.resize(frame, (640, 480))
            cv2.imwrite(frame_filename, frame)

            frame_count += 1
    finally:
        # Release the capture even if reading or writing a frame fails.
        video.release()

In [15]:
# Frame paths of the stabilized videos: stablized_images[i] holds the frames of
# the i-th folder under stabilized_frame_path. One slot is created per entry of
# origin_coordinate because the projection cell indexes both lists in parallel.
stablized_images = [[] for _ in range(len(origin_coordinate))]

# Walk every folder inside stabilized_frame_path.
# NOTE(review): folders are paired with labels purely by listing position, and
# more folders than labels raises IndexError — confirm the folder listing and
# the label listing line up.
for i, folder_name in enumerate(os.listdir(stabilized_frame_path)):
    folder_path = os.path.join(stabilized_frame_path, folder_name)

    # os.listdir returns names in arbitrary order, but the first entry is used
    # as the target frame and the rest as the following frames, so the frame
    # names are sorted explicitly.
    for name in sorted(os.listdir(folder_path)):
        filename = os.path.join(folder_path, name)
        if os.path.isfile(filename):  # skip sub-directories
            stablized_images[i].append(filename)

# Number of frames collected for the first video.
print(len(stablized_images[0]))

FileNotFoundError: [WinError 3] 지정된 경로를 찾을 수 없습니다: 'D:/aircraft_datasets\\stabilized_frame'

In [24]:
# For each video, estimate the homography from the first stabilized frame to
# every later stabilized frame and use it to project the annotated point.
len_coord = len(origin_coordinate)

stable_coord_list = [[] for _ in range(len_coord)]

for i in range(len_coord):
    _stablized_images = stablized_images[i]
    _len_stablized_images = len(_stablized_images)
    x = origin_coordinate[i][0]
    y = origin_coordinate[i][1]

    # One slot per frame. Slot j receives the projection into frame j + 1, so
    # the last slot stays empty.
    stable_coord_list[i] = [[] for _ in range(_len_stablized_images)]

    img0 = _stablized_images[0]  # the first frame is the target image
    for j in range(_len_stablized_images - 1):
        img1 = _stablized_images[j + 1]

        # LightGlue matching. match_lightglue takes exactly two images; the
        # extra `cfg.lightglue` argument used before referenced an undefined
        # name and did not fit the function's signature.
        results_lightglue = match_lightglue(img0, img1)
        target_keypoint = results_lightglue["points0"].cpu().numpy()
        frame_keypoint = results_lightglue["points1"].cpu().numpy()

        homography, _ = CSRansac.csransac(target_keypoint, frame_keypoint)
        projected_pts = CSRansac.perspective_transform(np.array([x, y]), homography)

        stable_coord_list[i][j].append(projected_pts)

In [27]:
def _to_jsonable(value):
    """Turn NumPy arrays and scalars into plain Python objects for json.dump.

    json calls this only for objects it cannot serialize itself. Anything that
    is not a NumPy value is rejected with the same TypeError json would raise.
    """
    if isinstance(value, (np.ndarray, np.generic)):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Give every row of stable_coord_list a name: video_1, video_2, ...
stabilized_data_dict = {
    f"video_{i + 1}": row for i, row in enumerate(stable_coord_list)
}

filename = "test_stabilized_coord_list.json"
file_path = os.path.join(aircraft_datasets, filename)
with open(file_path, "w") as f:
    # NOTE(review): the stored points come from CSRansac.perspective_transform;
    # if those are NumPy values, plain json.dump raises TypeError, hence the
    # explicit converter. Plain lists/floats are written exactly as before.
    json.dump(stabilized_data_dict, f, indent=4, default=_to_jsonable)

## Error Estimate

In [12]:
# Annotated target point of every label file, kept exactly as stored in the
# JSON (no scaling applied).
float_origin_coordinate = []

# Built from path components rather than by concatenating a separator into a
# single os.path.join argument.
lables = os.path.join(aircraft_datasets, "label")

# Load the annotated origin coordinate from each label file.
for label in os.listdir(lables):
    label_path = os.path.join(lables, label)
    with open(label_path, "r") as f:
        json_file = json.load(f)
    coord = json_file["targetAnnotation"]
    float_origin_coordinate.append(coord)

print(float_origin_coordinate)
print(len(float_origin_coordinate))
print(type(float_origin_coordinate[0][0]))

[[0.4987062, 0.563651], [0.5, 0.5525945], [0.5382264, 0.5333939999999999], [0.4899627, 0.5360328999999999], [0.5085652, 0.3501746], [0.4936557, 0.42185229999999996], [0.5085613, 0.3501674], [0.4881113, 0.6383890999999999], [0.5, 0.5525804999999999], [0.5179487, 0.056047599999999975], [0.4945675, 0.4230996], [0.5148086, 0.12296450000000003], [0.5, 0.703283], [0.5064632, 0.33616650000000003], [0.483354, 0.5286341], [0.5019736, 0.5184847], [0.5200818, 0.4917138], [0.5094502, 0.4245869], [0.4976398, 0.5230427], [0.5015073, 0.5329699], [0.5019613, 0.44938769999999995], [0.4991458, 0.4703159], [0.499146, 0.3768104], [0.5018758, 0.4492454], [0.5019183, 0.4493161], [0.4943362, 0.4793417], [0.4940712, 0.4821516], [0.5014035, 0.6505963], [0.5014858, 0.41379449999999995], [0.4936377, 0.48229120000000003], [0.5013987, 0.5366961], [0.5012838, 0.5364107], [0.5012822, 0.5364124], [0.5010208, 0.6041888], [0.5011398, 0.5360265], [0.5, 0.6081664], [0.5, 0.53695], [0.5, 1.01116514], [0.4968961, 0.5822069

## 기존 에러 평가 코드(타깃 이미지)

In [11]:
# Target-frame evaluation on the original images: for each video, estimate the
# homography from the first frame to every later frame, project the annotated
# point with it, and measure how far the projection lands from the annotation.
len_coord = len(origin_coordinate)

coord_list = [[] for _ in range(len_coord)]

# One entry per measurement run.
disappear_errors = []
misannotate_errors = []
pixel_errors = []

# Counters accumulated over all runs.
missing_inlier = 0
failed_inliers = 0

# Repeat the whole measurement 10 times.
for k in range(10):
    # One pass per coordinate, i.e. per video.
    for i in range(len_coord):
        _images = images[i]
        _len_images = len(_images)
        x = origin_coordinate[i][0]
        y = origin_coordinate[i][1]

        # One slot per frame. Slot j receives the projection into frame j + 1,
        # so the last slot stays empty.
        coord_list[i] = [[] for _ in range(_len_images)]

        img0 = _images[0]  # the first image is the target image
        for j in range(_len_images - 1):
            img1 = _images[j + 1]

            # LightGlue
            results_lightglue = match_lightglue(img0, img1)
            target_keypoint = results_lightglue["points0"].cpu().numpy()
            frame_keypoint = results_lightglue["points1"].cpu().numpy()
            if len(target_keypoint) < 6:
                missing_inlier += 1

            homography, mask = CSRansac.csransac(target_keypoint, frame_keypoint)
            # Count estimates whose score is below 0.3, the same threshold test
            # the adjacent-frame and playback cells use. An exact `== 0.3`
            # comparison practically never matches a computed float.
            # NOTE(review): `mask` looks like an inlier ratio, with -1 on
            # failure, judging by the values printed by the playback cell.
            if mask < 0.3:
                failed_inliers += 1
            projected_pts = CSRansac.perspective_transform(np.array([x, y]), homography)

            coord_list[i][j].append(projected_pts)

    # Error measurement for this run.
    disappear_error = 0
    misannotate_error = 0
    pixel_error = 0

    for i in range(len_coord):
        float_origin_x = float_origin_coordinate[i][0]
        float_origin_y = float_origin_coordinate[i][1]

        # The last slot is empty (see above), so it is skipped.
        for j in range(len(coord_list[i]) - 1):
            _coord = coord_list[i][j]

            # Bring the projected point back to the annotation's scale by
            # dividing by the 640x480 frame size, rounded to 4 decimals.
            x = round(_coord[0][0] / 640, 4)
            y = round(_coord[0][1] / 480, 4)

            # disappear_error: the projection fell outside the [0, 1] range.
            if x < 0 or x > 1 or y < 0 or y > 1:
                disappear_error += 1

            distance = math.sqrt((float_origin_x - x)**2 + (float_origin_y - y)**2)

            # num_error: the projection is more than 0.1 away from the annotation.
            if distance > 0.1:
                misannotate_error += 1

            # pixel_error: the largest distance seen in this run.
            pixel_error = max(pixel_error, distance)

    print("disappear_error:", disappear_error)
    print("misannotate_error:", misannotate_error)
    print("pixel_error:", pixel_error)

    disappear_errors.append(disappear_error)
    misannotate_errors.append(misannotate_error)
    pixel_errors.append(pixel_error)


print("missing_inlier:", missing_inlier)
print("failed_inliers:", failed_inliers)

In [None]:
# Average each metric over the measurement runs collected above.
error1, error2, error3 = (
    sum(run_values) / len(run_values)
    for run_values in (disappear_errors, misannotate_errors, pixel_errors)
)

print("disappear_error:", error1)
print("num_error:", error2)
print("pixel_error:", error3)

disappear_error: 267.4
num_error: 855.6
pixel_error: 6.883285535124935


## 기존 에러 평가 코드(인접 프레임)

In [12]:
# Adjacent-frame evaluation on the original images: the homography is
# estimated between each frame and the next one, the annotated point is
# projected with it, and the projection is compared with the annotation.
# NOTE(review): the point that gets projected is always the original (x, y);
# the per-pair homographies are not chained — confirm this is intended.
len_coord = len(origin_coordinate)

coord_list = [[] for _ in range(len_coord)]

# One entry per measurement run.
disappear_errors, misannotate_errors, pixel_errors = [], [], []

# Counters accumulated over all runs.
missing_inlier = failed_inliers = 0

# Repeat the whole measurement 10 times.
for k in range(10):
    # One pass per coordinate, i.e. per video.
    for i in range(len_coord):
        _images = images[i]
        _len_images = len(_images)
        x, y = origin_coordinate[i][0], origin_coordinate[i][1]

        # One slot per frame; the last slot is never filled.
        coord_list[i] = [[] for _ in range(_len_images)]

        img0 = _images[0]  # start from the first image
        for j in range(_len_images - 1):
            img1 = _images[j + 1]

            # LightGlue
            results_lightglue = matching_keypoints(img0, img1, stabilizing=False)
            target_keypoint = results_lightglue["points0"].cpu().numpy()
            frame_keypoint = results_lightglue["points1"].cpu().numpy()
            if len(target_keypoint) < 6:
                missing_inlier += 1

            homography, mask = CSRansac.csransac(target_keypoint, frame_keypoint)
            if mask < 0.3:
                failed_inliers += 1

            projected_pts = CSRansac.perspective_transform(np.array([x, y]), homography)
            coord_list[i][j].append(projected_pts)

            # The current frame becomes the reference for the next pair.
            img0 = img1

    # Error measurement for this run.
    disappear_error = misannotate_error = pixel_error = 0

    for i in range(len_coord):
        origin_x, origin_y = float_origin_coordinate[i][0], float_origin_coordinate[i][1]

        # Every slot except the last (empty) one.
        for _coord in coord_list[i][:-1]:
            # Divide by the 640x480 frame size and round to 4 decimals.
            x = round(_coord[0][0] / 640, 4)
            y = round(_coord[0][1] / 480, 4)

            # disappear_error: the projection fell outside the [0, 1] range.
            if not (0 <= x <= 1 and 0 <= y <= 1):
                disappear_error += 1

            distance = math.sqrt((origin_x - x)**2 + (origin_y - y)**2)

            # num_error: the projection is more than 0.1 away from the annotation.
            if distance > 0.1:
                misannotate_error += 1

            # pixel_error: the largest distance seen in this run.
            pixel_error = max(pixel_error, distance)

    print("disappear_error:", disappear_error)
    print("misannotate_error:", misannotate_error)
    print("pixel_error:", pixel_error)

    disappear_errors.append(disappear_error)
    misannotate_errors.append(misannotate_error)
    pixel_errors.append(pixel_error)


print("missing_inlier:", missing_inlier)
print("failed_inliers:", failed_inliers)

disappear_error: 357
misannotate_error: 0
pixel_error: 0.07490546927441015
disappear_error: 362
misannotate_error: 0
pixel_error: 0.09205622522795283
disappear_error: 361
misannotate_error: 0
pixel_error: 0.07700923927821908
disappear_error: 352
misannotate_error: 0
pixel_error: 0.0550737224020641
disappear_error: 360
misannotate_error: 0
pixel_error: 0.056552205946806405
disappear_error: 355
misannotate_error: 0
pixel_error: 0.04089657769568996
disappear_error: 356
misannotate_error: 0
pixel_error: 0.06489294521918093
disappear_error: 345
misannotate_error: 0
pixel_error: 0.06422107827823807
disappear_error: 356
misannotate_error: 0
pixel_error: 0.06827897080514612
disappear_error: 354
misannotate_error: 0
pixel_error: 0.04955823252719581
missing_inlier: 0
failed_inliers: 9302


In [13]:
# Average each metric over the measurement runs collected above.
error1, error2, error3 = (
    sum(run_values) / len(run_values)
    for run_values in (disappear_errors, misannotate_errors, pixel_errors)
)

print("disappear_error:", error1)
print("num_error:", error2)
print("pixel_error:", error3)

disappear_error: 355.8
num_error: 0.0
pixel_error: 0.06434446666549035


## check speed

In [None]:
# Speed test: time feature extraction, matching, homography estimation and
# point projection for every frame in the 'video' folder.

# Frame file names found in the 'video' folder.
video_frames = os.listdir('video')

# Number of processed frames.
frame_count = 0

# Processing time of each frame, in seconds.
frame_processing_times = []

# Point to project.
x = 637 // 2
y = 367 // 2
image0 = load_image("img1.png", grayscale=True)

# Process every frame.
for frame in video_frames:
    # perf_counter is the clock intended for measuring short intervals;
    # time.time() can be too coarse and yield a zero-length interval.
    start_time = time.perf_counter()

    # The target features are extracted inside the timed section on purpose,
    # as in the original measurement.
    feats0 = extractor.extract(image0.to(device))
    image1 = load_image_from_path(os.path.join('video', frame), grayscale=True)
    feats1 = extractor.extract(image1.to(device))

    matches01 = matcher({"image0": feats0, "image1": feats1})
    feats0, feats1, matches01 = [
        rbd(item) for item in [feats0, feats1, matches01]
    ]  # remove batch dimension

    kpts0, kpts1, matches = feats0["keypoints"], feats1["keypoints"], matches01["matches"]
    m_kpts0, m_kpts1 = kpts0[matches[..., 0]], kpts1[matches[..., 1]]

    homography, _ = CSRansac.csransac(m_kpts0.cpu().numpy(), m_kpts1.cpu().numpy())
    projected_pts = CSRansac.perspective_transform(np.array([x, y]), homography)

    # Time spent on this frame.
    frame_processing_time = time.perf_counter() - start_time
    frame_processing_times.append(frame_processing_time)

    frame_count += 1

    # Image and match visualization code (omitted).

# Total processing time over all frames.
total_processing_time = sum(frame_processing_times)

# Average FPS from the frame count and the total time; reported as 0 when
# nothing was processed instead of dividing by zero.
average_fps = frame_count / total_processing_time if total_processing_time > 0 else 0.0

print(f"Total Frames Processed: {frame_count}")
print(f"Average FPS: {average_fps:.2f}")

In [35]:
from vidstab import VidStab

# Stabilize the demo clip with a default-configured VidStab instance and write
# the result next to it. This also rebinds the module-level `stabilizer` name.
stabilizer = VidStab()
stabilizer.stabilize(
    input_path='demo_video_resized.mp4',
    output_path='stable_demo_video.mp4',
)

## 실험 데이터셋 재생 코드

In [16]:
# Play back the first video of the dataset, drawing the projected annotation
# on every frame whose homography estimate is accepted.
failed_inliers = 0

_images = images[0]
_len_images = len(_images)
x = origin_coordinate[0][0]
y = origin_coordinate[0][1]

img0 = _images[0]  # start from the first image
try:
    for j in range(1, _len_images):
        img1 = _images[j]
        _img1 = cv2.imread(img1)

        # LightGlue
        results_lightglue = matching_keypoints(img0, img1, stabilizing=False)
        target_keypoint = results_lightglue["points0"].cpu().numpy()
        frame_keypoint = results_lightglue["points1"].cpu().numpy()

        homography, mask = CSRansac.csransac(target_keypoint, frame_keypoint)
        print(mask)

        # The current frame becomes the reference for the next pair, whether
        # or not this estimate is accepted.
        img0 = img1

        if mask < 0.3:
            # Rejected estimate: show the frame without a marker.
            failed_inliers += 1
        else:
            projected_pts = CSRansac.perspective_transform(np.array([x, y]), homography)
            center = (int(projected_pts[0]), int(projected_pts[1]))
            cv2.circle(_img1, center, 15, (0, 0, 255), -1)
            cv2.circle(_img1, center, 3, (0, 0, 0), -1)

        cv2.imshow('frame', _img1)

        # waitKey runs for every frame, rejected ones included: the window is
        # only repainted while waitKey is processing events, and this is also
        # where ESC (27) is detected.
        key = cv2.waitKey(5)
        if key == 27:
            break
finally:
    # Close the preview window even when a frame fails to process.
    cv2.destroyAllWindows()

print("failed_inliers:", failed_inliers)
            

0.9755434782608695
0.6622983870967742
0.8769551616266945
-1
0.9988439306358381
-1
0.8955223880597015
-1
0.9721485411140584
0.9660056657223796
0.9686162624821684
0.3789173789173789
0.8782234957020058
0.9985250737463127
0.7752293577981652
0.9968992248062015
0.9968553459119497
0.9542682926829268
-1
0.37285491419656785
0.8927444794952681
0.913961038961039
0.9984202211690363
0.9984615384615385
-1
0.9984177215189873
0.9403225806451613
0.9268680445151033
0.9616564417177914
0.9667721518987342
0.8056338028169014
0.8379160636758322
0.5329428989751098
0.9357664233576642
0.9954476479514416
0.9204892966360856
0.5672782874617737
0.9405646359583952
0.2556732223903177
0.974124809741248
0.9969135802469136
0.9827856025039123
0.9953846153846154
0.9426356589147287
-1
-1
-1
0.9984375
0.9301948051948052
-1
0.9984472049689441
0.998468606431853
0.8180404354587869
-1
0.9165378670788253
-1
0.9877862595419847
0.9909502262443439
0.4391371340523883
0.9983766233766234
0.9986225895316805
0.992867332382311
0.99146514