In [1]:
import os
import cv2 
import time
import json
import math
import copy
import torch
import numpy as np
import kornia as K
import kornia.feature as KF


from lightglue import LightGlue, SuperPoint
from lightglue.utils import load_image, rbd, load_image_from_path
import CSRansac

In [2]:
# Workaround flag: presumably lets the process continue when more than one copy
# of the OpenMP runtime gets loaded (e.g. by torch and OpenCV/NumPy builds).
# NOTE(review): confirm this is still required in the target environment.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Log the torch version and the selected device for reproducibility.
print(torch.__version__)
print(device)

# SuperPoint feature extractor limited to 2048 keypoints, in eval mode on `device`.
extractor = SuperPoint(max_num_keypoints=2048).eval().to(device)  # load the extractor
# Alternative matcher configuration with explicit confidence thresholds (disabled).
#matcher = LightGlue(features='superpoint', depth_confidence=0.9, width_confidence=0.95).eval().to(device)
# LightGlue matcher for SuperPoint features, in eval mode on `device`.
# NOTE(review): depth_confidence=-1 / width_confidence=-1 presumably turn off
# LightGlue's adaptive early-exit and point pruning -- confirm against its docs.
matcher = LightGlue(features='superpoint', depth_confidence=-1, width_confidence=-1).eval().to(device)
# Optional compilation of the matcher (disabled).
#matcher.compile(mode='reduce-overhead')

2.1.2
cuda


In [4]:
def match_lightglue(img0, img1):
    """Match SuperPoint keypoints between two images with LightGlue.

    Relies on the module-level ``extractor``, ``matcher`` and ``device``.

    Args:
        img0: First image, in whatever form ``load_image`` accepts.
            NOTE(review): upstream LightGlue's ``load_image`` expects a file
            path -- confirm what the version imported here accepts.
        img1: Second image, same form as ``img0``.

    Returns:
        dict with two entries:
            "points0": matched keypoint coordinates in ``img0``, shape (K, 2).
            "points1": corresponding coordinates in ``img1``, shape (K, 2).
    """
    img0 = load_image(img0)
    img1 = load_image(img1)

    # Pure inference: disable autograd so the matcher does not record a graph
    # (and hold on to activations) on every call.
    with torch.no_grad():
        # extract local features
        feats0 = extractor.extract(img0.to(device))  # auto-resize the image, disable with resize=None
        feats1 = extractor.extract(img1.to(device))

        # match the features
        matches01 = matcher({'image0': feats0, 'image1': feats1})

    feats0, feats1, matches01 = [rbd(x) for x in [feats0, feats1, matches01]]  # remove batch dimension

    # get results
    kpts0 = feats0["keypoints"]
    kpts1 = feats1["keypoints"]
    matches = matches01['matches']  # indices with shape (K,2)
    points0 = kpts0[matches[..., 0]]  # coordinates in img0, shape (K,2)
    points1 = kpts1[matches[..., 1]]  # coordinates in img1, shape (K,2)

    return {
        "points0": points0,
        "points1": points1,
    }

In [5]:
def load_and_preprocess_image(image_path, size=(640, 480)):
    """Load an image as a normalized grayscale tensor on ``device``.

    Args:
        image_path: Path of the image file to read, or an already-decoded
            image as a NumPy array (grayscale HxW, or an OpenCV-style
            HxWx1 / BGR / BGRA array).
        size: Target ``(width, height)`` handed to ``cv2.resize``.

    Returns:
        Float tensor produced by ``K.image_to_tensor(image, False)``, divided
        by 255 and moved to the module-level ``device``.

    Raises:
        FileNotFoundError: If ``image_path`` is a path that OpenCV cannot read.
    """
    if isinstance(image_path, np.ndarray):
        # Already decoded (e.g. a frame from cv2.VideoCapture): only drop colour.
        image = image_path
        if image.ndim == 3:
            channels = image.shape[2]
            if channels == 1:
                image = image[..., 0]
            elif channels == 4:
                image = cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
            else:
                image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f"Could not read image: {image_path!r}")

    image = cv2.resize(image, size)  # resize to the working resolution
    image = K.image_to_tensor(image, False).float() / 255.0
    image = image.to(device)
    return image

In [6]:
def matching_keypoints(target_img, video_img):
    """Match SuperPoint keypoints between a target image and a video frame.

    Both inputs are loaded with ``load_image(..., grayscale=True)`` and matched
    with the module-level ``extractor`` and ``matcher`` on ``device``.

    Args:
        target_img: Reference image, in whatever form ``load_image`` accepts.
            NOTE(review): the notebook passes decoded OpenCV frames here --
            confirm the imported ``load_image`` supports that.
        video_img: Current video frame, same form as ``target_img``.

    Returns:
        dict with two entries:
            "points0": matched keypoint coordinates in ``target_img``, shape (K, 2).
            "points1": corresponding coordinates in ``video_img``, shape (K, 2).
    """
    # Load the images
    img0 = load_image(target_img, grayscale=True)
    img1 = load_image(video_img , grayscale=True)

    # Pure inference: disable autograd so the matcher does not record a graph
    # (and hold on to activations) for every processed frame.
    with torch.no_grad():
        # extract local features
        feats0 = extractor.extract(img0.to(device))  # auto-resize the image, disable with resize=None
        feats1 = extractor.extract(img1.to(device))

        # match the features
        matches01 = matcher({'image0': feats0, 'image1': feats1})

    feats0, feats1, matches01 = [rbd(x) for x in [feats0, feats1, matches01]]  # remove batch dimension

    # get results
    kpts0 = feats0["keypoints"]
    kpts1 = feats1["keypoints"]
    matches = matches01['matches']  # indices with shape (K,2)
    points0 = kpts0[matches[..., 0]]  # coordinates in img0, shape (K,2)
    points1 = kpts1[matches[..., 1]]  # coordinates in img1, shape (K,2)

    return {
        "points0": points0,
        "points1": points1,
    }

In [7]:
def get_errors(coord_list, float_origin_coordinate, len_coord, len_videos,
               width=640, height=480, threshold=0.1):
    """Compare projected pixel coordinates with normalized reference coordinates.

    For every video ``index`` and point ``i`` the projected point
    ``coord_list[index][i][0]`` (pixels) is divided by ``width`` / ``height``,
    rounded to 4 decimals and compared with the reference point
    ``float_origin_coordinate[index][i]`` using the Euclidean distance.

    Entries that are missing or not indexable (e.g. ``None`` or a list that is
    too short) are skipped, so the result is a best-effort summary.

    Args:
        coord_list: Per video, per point: a sequence whose first element is
            an ``(x, y)`` pair in pixels.
        float_origin_coordinate: Per video, per point: an ``(x, y)`` pair
            compared directly against the normalized projected point.
        len_coord: Number of points to inspect per video.
        len_videos: Number of videos to inspect.
        width: Pixel width used to normalize x.
        height: Pixel height used to normalize y.
        threshold: A point counts as mis-annotated when its distance exceeds
            this value.

    Returns:
        Tuple ``(misannotate_error, pixel_error)``: the number of points whose
        distance exceeds ``threshold`` and the largest distance seen (``0``
        when no distance was greater than zero).
    """
    misannotate_error = 0
    pixel_error = 0

    for index in range(len_videos):
        for i in range(len_coord):
            try:
                origin_x = float_origin_coordinate[index][i][0]
                origin_y = float_origin_coordinate[index][i][1]

                point = coord_list[index][i][0]
                x = round(point[0] / width, 4)
                y = round(point[1] / height, 4)

                distance = math.sqrt((origin_x - x) ** 2 + (origin_y - y) ** 2)
            except (IndexError, KeyError, TypeError):
                # Missing or malformed entry: skip it, but let unrelated
                # errors (and KeyboardInterrupt) propagate.
                continue

            if distance > threshold:
                misannotate_error += 1

            if distance > pixel_error:
                pixel_error = distance

    return misannotate_error, pixel_error

In [8]:
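# NOTE: disabled webcam variant of the annotation loop, kept for reference only.
# It calls matching_keypoints(..., stabilizing=False), but matching_keypoints
# takes no `stabilizing` argument, so update that call before re-enabling.
# # Initialize the VideoWriter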
# fourcc = cv2.VideoWriter_fourcc(*'DIVX')
# out = cv2.VideoWriter('unity_annotated_lightglue.mp4', fourcc, 30, (640, 480))

# # 마우스 클릭 이벤트 핸들러
# clicked_coords = []

# def mouse_callback(event, x, y, flags, param):
#     global clicked_coords
#     if event == cv2.EVENT_LBUTTONDOWN:
#         clicked_coords = [x, y]

# # 웹캠 초기화
# cap = cv2.VideoCapture(0)

# if not cap.isOpened():
#     print("웹캠을 열 수 없습니다.")
#     exit()

# cv2.namedWindow('Webcam')
# cv2.setMouseCallback('Webcam', mouse_callback)

# frame_count = 0
# total_time = 0
# coords_set = False

# while True:
#     ret, frame = cap.read()
#     if not ret:
#         print("프레임을 읽을 수 없습니다.")
#         break
    
#     cv2.imshow('Webcam', frame)
    
#     if len(clicked_coords) == 2 and not coords_set:
#         x, y = clicked_coords
#         coords_set = True
#         print(f"클릭된 좌표: ({x}, {y})")
    
#     key = cv2.waitKey(1)
#     if key == 27:  # ESC 키
#         break
    
#     if coords_set:
#         img0 = frame  # 첫 프레임을 target image로 사용
#         break

# start_time = time.time()

# # 각 프레임 처리
# while True:
#     ret, img1 = cap.read()
#     if not ret:
#         break

#     results_lightglue = matching_keypoints(img0, img1, stabilizing=False)
#     target_keypoint = results_lightglue["points0"].cpu().numpy()
#     frame_keypoint = results_lightglue["points1"].cpu().numpy()

#     homography, mask = CSRansac.csransac(target_keypoint, frame_keypoint)
#     projected_pts = CSRansac.perspective_transform(np.array([x, y]), homography)
    
#     # 결과를 비디오에 기록
#     out.write(img1)
    
#     img0 = img1
    
#     frame_count += 1

#     cv2.imshow('Webcam', img1)
#     key = cv2.waitKey(1)
#     if key == 27:  # ESC 키
#         break

# end_time = time.time()

# cap.release()
# out.release()
# cv2.destroyAllWindows()

# total_time += end_time - start_time   
# average_time = frame_count / total_time
# print("FPS : ", average_time)


In [11]:
# Annotate a video with LightGlue: points clicked on the first frame are
# re-projected onto every following frame through a homography estimated
# from SuperPoint + LightGlue matches against that first frame.
#
# Controls (window 'frame'): left click adds a point, SPACE toggles
# play/pause, ESC quits. The cell starts paused on the first frame.

video_path = "demo_video.mp4"

# Points clicked by the user, in the coordinates of the 640x480 window.
clicked_coords = []

def mouse_callback(event, x, y, flags, param):
    """OpenCV mouse handler: store and print every left-button click position."""
    global clicked_coords
    if event == cv2.EVENT_LBUTTONDOWN:
        clicked_coords.append((x, y))
        print(f"클릭된 좌표: ({x}, {y})")

paused = True

# Initialize the VideoWriter for the annotated output
fourcc = cv2.VideoWriter_fourcc(*'DIVX')
out = cv2.VideoWriter('unity_annotated_lightglue.mp4', fourcc, 30, (640, 480))

cap = cv2.VideoCapture(video_path)

try:
    # Raise instead of calling exit(): exit() would take the notebook kernel
    # down, and the finally block below still releases every resource.
    if not cap.isOpened():
        raise RuntimeError("비디오를 열 수 없습니다.")

    # Use the first frame as the target image
    ret, img0 = cap.read()
    if not ret:
        raise RuntimeError("첫 번째 프레임을 읽을 수 없습니다.")

    img0 = cv2.resize(img0, (640, 480))
    cv2.namedWindow('frame')
    cv2.setMouseCallback('frame', mouse_callback)

    cv2.imshow('frame', img0)

    # Process frames; while paused the user can click points to annotate.
    while True:
        if paused:
            # Mask to the low byte: some platforms set extra bits in the key code.
            key = cv2.waitKey(0) & 0xFF
            if key == 32:  # SPACE: start / resume playback
                paused = False
            elif key == 27:  # ESC: quit
                break
            continue

        ret, img1 = cap.read()
        if not ret:
            break

        img1 = cv2.resize(img1, (640, 480))
        frame = img1.copy()

        # LightGlue
        results_lightglue = matching_keypoints(img0, img1)
        target_keypoint = results_lightglue["points0"].cpu().numpy()
        frame_keypoint = results_lightglue["points1"].cpu().numpy()

        homography, _ = CSRansac.csransac(target_keypoint, frame_keypoint)

        # Project every clicked point into the current frame and draw it.
        # NOTE(review): assumes csransac may return None when no homography
        # is found -- confirm against CSRansac.
        if homography is not None:
            for click_x, click_y in clicked_coords:
                projected_pts = CSRansac.perspective_transform(
                    np.array([click_x, click_y]), homography)
                center = (int(projected_pts[0]), int(projected_pts[1]))

                cv2.circle(frame, center, 15, (0, 0, 255), -1)
                cv2.circle(frame, center, 3, (0, 0, 0), -1)

        cv2.imshow('frame', frame)
        # Record the annotated frame; previously the writer was opened but
        # nothing was ever written to it.
        out.write(frame)

        key = cv2.waitKey(5) & 0xFF
        if key == 27:  # ESC: quit
            break
        elif key == 32:  # SPACE: pause
            paused = True
finally:
    # Always free the capture, the writer and the window, even on errors.
    cap.release()
    out.release()
    cv2.destroyAllWindows()


클릭된 좌표: (317, 175)
클릭된 좌표: (289, 233)
클릭된 좌표: (399, 148)


In [None]:
# Annotate a video with LoFTR keypoints + Lucas-Kanade optical flow: points
# clicked on the first frame are re-projected onto every following frame
# through a homography estimated from the tracked keypoints.
#
# Controls (window 'frame'): left click adds a point, SPACE toggles
# play/pause, ESC quits. The cell starts paused on the first frame.

video_path = "demo_video.mp4"

# Kalman filter setup
# NOTE(review): `kalman` is configured here but not used anywhere in this cell.
kalman = cv2.KalmanFilter(8, 4)
kalman.measurementMatrix = np.eye(4, 8, dtype=np.float32)
kalman.transitionMatrix = np.eye(8, dtype=np.float32)
kalman.processNoiseCov = np.eye(8, dtype=np.float32) * 0.01
kalman.measurementNoiseCov = np.eye(4, dtype=np.float32) * 0.1
kalman.errorCovPost = np.eye(8, dtype=np.float32)

# LoFTR model setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
loftr = KF.LoFTR(pretrained='outdoor').to(device)

# Lucas-Kanade optical flow parameters
lk_params = dict(winSize=(15, 15), maxLevel=2,
                 criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03))

# Points clicked by the user, in the coordinates of the 640x480 window.
clicked_coords = []

def mouse_callback(event, x, y, flags, param):
    """OpenCV mouse handler: store and print every left-button click position."""
    global clicked_coords
    if event == cv2.EVENT_LBUTTONDOWN:
        clicked_coords.append((x, y))
        print(f"클릭된 좌표: ({x}, {y})")

paused = True

# Initialize the VideoWriter for the annotated output
fourcc = cv2.VideoWriter_fourcc(*'DIVX')
out = cv2.VideoWriter('unity_annotated_lightglue.mp4', fourcc, 30, (640, 480))

cap = cv2.VideoCapture(video_path)

try:
    # Raise instead of calling exit(): exit() would take the notebook kernel
    # down, and the finally block below still releases every resource.
    if not cap.isOpened():
        raise RuntimeError("비디오를 열 수 없습니다.")

    # Use the first frame as the target image
    ret, img0 = cap.read()
    if not ret:
        raise RuntimeError("첫 번째 프레임을 읽을 수 없습니다.")

    img0 = cv2.resize(img0, (640, 480))
    cv2.namedWindow('frame')
    cv2.setMouseCallback('frame', mouse_callback)

    cv2.imshow('frame', img0)

    # img0 is an already-decoded BGR frame, so convert it directly;
    # cv2.imread only accepts file paths.
    prev_gray = cv2.cvtColor(img0, cv2.COLOR_BGR2GRAY)

    # LoFTR input: grayscale frame as a float tensor scaled by 1/255.
    image = K.image_to_tensor(prev_gray, False).float() / 255.0
    image = image.to(device)

    # Keypoint extraction: the frame is matched against itself and the
    # resulting 'keypoints0' are used as the points to track.
    with torch.no_grad():
        input_dict = {"image0": image, "image1": image}
        correspondences = loftr(input_dict)

    prev_points = correspondences['keypoints0'].cpu().numpy()
    # calcOpticalFlowPyrLK expects float32 points shaped (N, 1, 2).
    prev_points = prev_points.reshape(-1, 1, 2).astype(np.float32)

    # NOTE(review): prev_gray / prev_points are never refreshed, so the flow is
    # always measured from the first frame to the current one. That keeps H
    # mapping first-frame clicks to the current frame, but Lucas-Kanade may
    # lose points once the motion gets large -- consider incremental tracking.

    # Process frames; while paused the user can click points to annotate.
    while True:
        if paused:
            # Mask to the low byte: some platforms set extra bits in the key code.
            key = cv2.waitKey(0) & 0xFF
            if key == 32:  # SPACE: start / resume playback
                paused = False
            elif key == 27:  # ESC: quit
                break
            continue

        ret, img1 = cap.read()
        if not ret:
            break

        img1 = cv2.resize(img1, (640, 480))
        frame = img1.copy()

        # img1 is a decoded frame as well: convert, do not cv2.imread it.
        gray = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)

        H = None
        if len(prev_points) > 0:
            next_points, status, _ = cv2.calcOpticalFlowPyrLK(
                prev_gray, gray, prev_points, None, **lk_params)
            if next_points is not None:
                good_old = prev_points[status == 1]
                good_new = next_points[status == 1]
                # A homography needs at least 4 point correspondences.
                if len(good_old) >= 4:
                    H, _ = cv2.findHomography(good_old, good_new, cv2.RANSAC, 5.0)

        # Project every clicked point with the homography computed for THIS
        # frame (the original referenced `homography`, a leftover variable
        # from the previous cell).
        if H is not None:
            for click_x, click_y in clicked_coords:
                projected_pts = CSRansac.perspective_transform(
                    np.array([click_x, click_y]), H)
                center = (int(projected_pts[0]), int(projected_pts[1]))

                cv2.circle(frame, center, 15, (0, 0, 255), -1)
                cv2.circle(frame, center, 3, (0, 0, 0), -1)

        cv2.imshow('frame', frame)
        # Record the annotated frame; previously the writer was opened but
        # nothing was ever written to it.
        out.write(frame)

        key = cv2.waitKey(5) & 0xFF
        if key == 27:  # ESC: quit
            break
        elif key == 32:  # SPACE: pause
            paused = True
finally:
    # Always free the capture, the writer and the window, even on errors.
    cap.release()
    out.release()
    cv2.destroyAllWindows()
