In [1]:
import os

from modules.extra_utils import (get_video_info, change_fps, print_total_durations, extract_key_frames)
from modules.utils import load_config
from modules.scene_utils import (save_timestamps_to_txt, save_all_video_scenes_by_timestamps)
from modules.audio_utils import (save_all_mono_audio_from_scene_folder, transcribe_and_save_scene_information_into_json)
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the preprocessing configuration.
config_path = "./config/preprocessing_config.yaml"
config = load_config(config_path)

# Folder holding the original, unprocessed videos.
original_video_folder = config["general"]["video_folder"]

# Paths used by the scene-caption stage.
preprocessed_video_folder = config["scene_caption"]["fps_adjusted_video_folder"]
timestamp_file = config["scene_caption"]["timestamp_file"]
scene_folder = config["scene_caption"]["scene_folder"]
mono_audio_folder = config["scene_caption"]["audio"]["mono_audio_folder"]

# Path used by the frame-caption stage.
key_frames_folder = config["frame_caption"]["key_frames_folder"]

# The FPS-adjusted folder is written by the change_fps cell further down, so on
# a fresh run it may not exist yet. Guard the listing instead of crashing here,
# and sort it because os.listdir returns entries in arbitrary order.
if os.path.isdir(preprocessed_video_folder):
    video_files = sorted(
        f for f in os.listdir(preprocessed_video_folder) if f.endswith(".mp4")
    )
else:
    video_files = []
    print(
        f"[WARN] {preprocessed_video_folder} does not exist yet; "
        "run the FPS adjustment cell first, then re-run this cell."
    )

print("[INFO] Configuration loaded successfully.")
print(f"[INFO] Found {len(video_files)} video files in the input directory.")

In [2]:
# Print information about the original video files
# (the recorded output lists resolution, FPS, frame count and duration per file).
get_video_info(original_video_folder)

File Name                      Resolution           FPS        Frames          Duration (s)   
------------------------------------------------------------------------------------------
-ncFDuKdgNE.mp4                640x360             23.976      3900            162.662        
5qlG1ODkRWw.mp4                640x286             23.976      3474            144.895        
6ZMZYrdXtP0.mp4                640x360             23.976      3660            152.652        
7DfNc-wxnBM.mp4                640x360             23.976      3204            133.633        
94AnEUa_z8U.mp4                640x360             23.976      3804            158.658        
9iZFtT4aShI.mp4                640x360             23.976      3618            150.901        
AHHH770W4Wk.mp4                640x360             23.976      3876            161.661        
C4y_tu3LYlo.mp4                640x360             24.000      3197            133.208        
Fz9HnTVx52g.mp4                640x360             23.

In [None]:
# Adjust the FPS of every original video and save the result into the
# preprocessed video folder (the recorded run reports FPS 25 for each file).
change_fps(original_video_folder, preprocessed_video_folder)

[INFO] Processed and saved: ./preprocessed_video/7DfNc-wxnBM.mp4 with FPS 25
[INFO] Processed and saved: ./preprocessed_video/s2wBtcmE5W8.mp4 with FPS 25
[INFO] Processed and saved: ./preprocessed_video/6ZMZYrdXtP0.mp4 with FPS 25
[INFO] Processed and saved: ./preprocessed_video/oyYuYNnSq9E.mp4 with FPS 25
[INFO] Processed and saved: ./preprocessed_video/v8HrbX0hzX8.mp4 with FPS 25
[INFO] Processed and saved: ./preprocessed_video/AHHH770W4Wk.mp4 with FPS 25
[INFO] Processed and saved: ./preprocessed_video/tBDHJCVi7_0.mp4 with FPS 25
[INFO] Processed and saved: ./preprocessed_video/C4y_tu3LYlo.mp4 with FPS 25
[INFO] Processed and saved: ./preprocessed_video/94AnEUa_z8U.mp4 with FPS 25
[INFO] Processed and saved: ./preprocessed_video/xqsDUwDwdUM.mp4 with FPS 25
[INFO] Processed and saved: ./preprocessed_video/mDUSjBiHYeY.mp4 with FPS 25
[INFO] Processed and saved: ./preprocessed_video/wFtBmw4cINY.mp4 with FPS 25
[INFO] Processed and saved: ./preprocessed_video/9iZFtT4aShI.mp4 with FPS 25

In [3]:
# Print information about the FPS-adjusted videos
# (the frame counts are the same as for the originals).
get_video_info(preprocessed_video_folder)

File Name                      Resolution           FPS        Frames          Duration (s)   
------------------------------------------------------------------------------------------
-ncFDuKdgNE.mp4                640x360             25.000      3900            156.000        
5qlG1ODkRWw.mp4                640x286             25.000      3474            138.960        
6ZMZYrdXtP0.mp4                640x360             25.000      3660            146.400        
7DfNc-wxnBM.mp4                640x360             25.000      3204            128.160        
94AnEUa_z8U.mp4                640x360             25.000      3804            152.160        
9iZFtT4aShI.mp4                640x360             25.000      3618            144.720        
AHHH770W4Wk.mp4                640x360             25.000      3876            155.040        
C4y_tu3LYlo.mp4                640x360             25.000      3197            127.880        
Fz9HnTVx52g.mp4                640x360             25.

In [5]:
# Extract scene timestamps for every video in the preprocessed folder and
# write them to the timestamp txt file. Detection settings come from the config.
scene_detection_options = {
    "threshold": config["scene_caption"]["PySceneDetect_threshold"],
    "min_scene_len": config["scene_caption"]["PySceneDetect_min_scene_len"],
}
save_timestamps_to_txt(
    preprocessed_video_folder, timestamp_file, **scene_detection_options
)

print(f"[INFO] Scene timestamps have been saved to {timestamp_file}")

타임스탬프를 추출하여 txt 파일에 저장하는 중: 100%|██████████| 22/22 [00:47<00:00,  2.14s/it]

[INFO] Scene timestamps have been saved to ./timestamps.txt





In [6]:
# Cut each video into scene clips (mp4) using the timestamp txt file and save
# them into scene_folder.
# Multiprocessing is used for speed, so videos are not processed in order and
# the tqdm timing is not consistent (the recorded run took 6 min 33 s).
save_all_video_scenes_by_timestamps(
    preprocessed_video_folder, scene_folder, timestamp_file
)

print(f"[INFO] Scenes have been successfully split and saved to {scene_folder}")

Processing Videos: 100%|██████████| 22/22 [06:33<00:00, 17.90s/it]

[INFO] Scenes have been successfully split and saved to ./scenes





In [10]:
# Save the mono audio of every scene in the scene folder into mono_audio_folder.
save_all_mono_audio_from_scene_folder(scene_folder, mono_audio_folder)

print(f"[INFO] Audio has been successfully extracted and saved to {mono_audio_folder}")

Saving mono audio: 100%|██████████| 650/650 [00:48<00:00, 13.31it/s]

[INFO] Audio has been successfully extracted and saved to mono_audio





In [11]:
# Transcribe the audio of every scene stored in mono_audio_folder to text and
# save the scene information as a JSON file.
scene_info_file = config['scene_caption']['audio']['scene_info_with_audio_scripts_file']
transcribe_and_save_scene_information_into_json(
    mono_audio_folder, scene_info_file, timestamp_file
)

print(f"[INFO] Successfully transcribed and saved to {scene_info_file}")

Loading SST model...


  checkpoint = torch.load(fp, map_location=device)
100%|██████████| 22/22 [09:56<00:00, 27.13s/it]

Results saved to scene_info_with_audio_scripts.json
[INFO] Successfully transcribed and saved to scene_info_with_audio_scripts.json





In [14]:

import torch
from transformers import CLIPProcessor, CLIPModel

# Thresholds forwarded to extract_key_frames, named here so they are easy to
# find and tune. Their exact semantics are defined in modules.extra_utils.
KEY_FRAME_SIMILARITY_THRESHOLD = 0.85
KEY_FRAME_STDDEV_THRESHOLD = 10

# Load the CLIP model and its processor.
model_path = 'openai/clip-vit-large-patch14'
processor = CLIPProcessor.from_pretrained(model_path)

# Pick the device the same way the embedding cell does instead of calling
# .cuda() unconditionally, so loading does not fail on a machine without a GPU.
# NOTE(review): extract_key_frames may itself assume CUDA - confirm.
clip_device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model = CLIPModel.from_pretrained(model_path).to(clip_device)
clip_model.requires_grad_(False)

# Re-scan the FPS-adjusted folder here: the list built in the config cell may
# have been captured before change_fps wrote its output. Sorted because
# os.listdir returns entries in arbitrary order.
video_files = sorted(
    f for f in os.listdir(preprocessed_video_folder) if f.endswith(".mp4")
)

# Extract key frames from each video and save them into key_frames_folder.
for video_file in tqdm(video_files, desc="키 프레임 추출 중"):
    video_path = os.path.join(preprocessed_video_folder, video_file)
    extract_key_frames(
        video_path,
        key_frames_folder,
        processor,
        clip_model,
        similarity_threshold=KEY_FRAME_SIMILARITY_THRESHOLD,
        stddev_threshold=KEY_FRAME_STDDEV_THRESHOLD,
    )

print(f"[INFO] Key frames have been successfully extracted and saved to {key_frames_folder}")

키 프레임 추출 중:   5%|▍         | 1/22 [01:28<30:57, 88.46s/it]

[INFO] 7DfNc-wxnBM: Extracted 82 key frames.


키 프레임 추출 중:   9%|▉         | 2/22 [03:12<32:30, 97.55s/it]

[INFO] s2wBtcmE5W8: Extracted 49 key frames.


키 프레임 추출 중:  14%|█▎        | 3/22 [04:52<31:15, 98.72s/it]

[INFO] 6ZMZYrdXtP0: Extracted 25 key frames.


키 프레임 추출 중:  18%|█▊        | 4/22 [06:27<29:10, 97.24s/it]

[INFO] oyYuYNnSq9E: Extracted 20 key frames.


키 프레임 추출 중:  23%|██▎       | 5/22 [07:45<25:32, 90.17s/it]

[INFO] v8HrbX0hzX8: Extracted 87 key frames.


키 프레임 추출 중:  27%|██▋       | 6/22 [09:31<25:31, 95.69s/it]

[INFO] AHHH770W4Wk: Extracted 53 key frames.


키 프레임 추출 중:  32%|███▏      | 7/22 [10:51<22:39, 90.63s/it]

[INFO] tBDHJCVi7_0: Extracted 139 key frames.


키 프레임 추출 중:  36%|███▋      | 8/22 [12:19<20:58, 89.86s/it]

[INFO] C4y_tu3LYlo: Extracted 46 key frames.


키 프레임 추출 중:  41%|████      | 9/22 [14:04<20:28, 94.48s/it]

[INFO] 94AnEUa_z8U: Extracted 56 key frames.


키 프레임 추출 중:  45%|████▌     | 10/22 [15:47<19:23, 96.99s/it]

[INFO] xqsDUwDwdUM: Extracted 52 key frames.


키 프레임 추출 중:  50%|█████     | 11/22 [17:30<18:07, 98.85s/it]

[INFO] mDUSjBiHYeY: Extracted 58 key frames.


키 프레임 추출 중:  55%|█████▍    | 12/22 [20:11<19:38, 117.87s/it]

[INFO] wFtBmw4cINY: Extracted 83 key frames.


키 프레임 추출 중:  59%|█████▉    | 13/22 [21:43<16:30, 110.10s/it]

[INFO] 9iZFtT4aShI: Extracted 118 key frames.


키 프레임 추출 중:  64%|██████▎   | 14/22 [23:12<13:48, 103.58s/it]

[INFO] 5qlG1ODkRWw: Extracted 121 key frames.


키 프레임 추출 중:  68%|██████▊   | 15/22 [25:13<12:42, 109.00s/it]

[INFO] q-H62GgHjeg: Extracted 15 key frames.


키 프레임 추출 중:  73%|███████▎  | 16/22 [26:33<10:00, 100.07s/it]

[INFO] j8fcNsJOtQo: Extracted 77 key frames.


키 프레임 추출 중:  77%|███████▋  | 17/22 [28:15<08:23, 100.72s/it]

[INFO] UdZuHyttXbw: Extracted 28 key frames.


키 프레임 추출 중:  82%|████████▏ | 18/22 [29:51<06:37, 99.43s/it] 

[INFO] Pwv4avomXYo: Extracted 61 key frames.


키 프레임 추출 중:  86%|████████▋ | 19/22 [31:36<05:02, 100.97s/it]

[INFO] n1lbpj6868o: Extracted 86 key frames.


키 프레임 추출 중:  91%|█████████ | 20/22 [33:24<03:26, 103.07s/it]

[INFO] -ncFDuKdgNE: Extracted 37 key frames.


키 프레임 추출 중:  95%|█████████▌| 21/22 [35:39<01:52, 112.76s/it]

[INFO] zjwBNUXCA-M: Extracted 60 key frames.


키 프레임 추출 중: 100%|██████████| 22/22 [37:26<00:00, 102.10s/it]

[INFO] Fz9HnTVx52g: Extracted 49 key frames.
[INFO] Key frames have been successfully extracted and saved to ./key_frames





In [4]:
# Print the total duration of the video, scene and mono-audio folders, plus the
# number of key frames.
print_total_durations(preprocessed_video_folder, scene_folder, mono_audio_folder, key_frames_folder)

# Plain string: nothing is interpolated, so no f-prefix is needed.
print("\n[INFO] Total durations and key frame counts have been printed.")

[INFO] Total video duration: 0h 55m 24s 240ms
[INFO] Total scene duration: 0h 55m 24s 240ms
[INFO] Total audio duration: 0h 55m 24s 240ms
[INFO] Total key frames: 1402 frames

[INFO] Total durations and key frame counts have been printed.


In [3]:
import os
import clip
import torch
import pickle
import numpy as np
from PIL import Image
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor


device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the CLIP model and its preprocessing transform.
model, preprocess = clip.load("ViT-B/32", device=device)

# Collect the key-frame image paths.
# Sorted so that the row order of the embedding matrix is reproducible:
# os.listdir returns entries in arbitrary order, and the cache written below
# stores only the matrix, not the paths each row belongs to.
image_dir = "./key_frames"
image_paths = sorted(
    os.path.join(image_dir, fname)
    for fname in os.listdir(image_dir)
    if fname.endswith((".png", ".jpg", ".jpeg"))
)

# Cache file path, kept next to the images it describes.
cache_path = os.path.join(image_dir, "image_embeddings_cache.pkl")

# 이미지 임베딩 추출 함수 (병렬 처리 적용)
def process_image(path):
    """Encode a single image file with the CLIP image encoder.

    Args:
        path: Path of the image file to open.

    Returns:
        The output of ``model.encode_image`` for a batch holding this one
        image, moved to the CPU and converted to a NumPy array.
    """
    # Open through a context manager so the file handle is closed promptly
    # rather than whenever the Image object is garbage-collected. convert()
    # returns a separate in-memory image, so it stays usable after the close.
    with Image.open(path) as source_image:
        rgb_image = source_image.convert("RGB")

    # unsqueeze(0) adds the leading batch dimension before moving to the device.
    image = preprocess(rgb_image).unsqueeze(0).to(device)
    with torch.no_grad():
        embedding = model.encode_image(image).cpu().numpy()
    return embedding


def get_image_embeddings(image_paths):
    """Embed every image in ``image_paths`` and stack the results.

    Each path is handed to ``process_image`` on a thread pool while a tqdm bar
    reports progress. ``executor.map`` yields results in input order, so the
    rows of the returned array follow the order of ``image_paths``.

    Args:
        image_paths: Sequence of image file paths (its length sizes the bar).

    Returns:
        ``np.vstack`` of the per-image arrays returned by ``process_image``.
    """
    with ThreadPoolExecutor() as pool:
        progress = tqdm(
            pool.map(process_image, image_paths),
            desc="Extracting image embeddings",
            total=len(image_paths),
        )
        # Drain the iterator inside the pool context so every task completes.
        per_image = [vector for vector in progress]
    return np.vstack(per_image)


# Compute the embeddings for all key frames and write them to the cache file.
# The cache is always overwritten here; it is never read back in this cell.
image_embeddings = get_image_embeddings(image_paths)
with open(cache_path, "wb") as cache_file:
    pickle.dump(image_embeddings, cache_file)
print("Image embeddings have been updated and saved.")

100%|███████████████████████████████████████| 338M/338M [00:04<00:00, 73.6MiB/s]
Extracting image embeddings: 100%|██████████| 1402/1402 [00:36<00:00, 38.75it/s]

Image embeddings have been updated and saved.



