In [28]:
# Python packages: pytesseract (Python wrapper around the Tesseract OCR CLI)
# and ipywidgets.
!pip install pytesseract
!pip install ipywidgets
# Tesseract OCR engine plus its Korean language data; the OCR step below runs
# with `-l kor+eng`, so the Korean data package is required.
!apt-get install tesseract-ocr-kor -y
!apt-get install tesseract-ocr -y
# System packages ffmpeg / libsm6 / libxext6.
# NOTE(review): presumably runtime dependencies of OpenCV video decoding — confirm.
!apt install ffmpeg libsm6 libxext6 -y

Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Using cached jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Using cached jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
Installing collected packages: jedi
Successfully installed jedi-0.19.2
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr-kor is already the newest version (1:4.00~git30-7274cfa-1.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libsm6 is already the newest version (2:1.2.3-1build2).
libxext6 is already the newest version (2:1.3.4-1build1).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 ne

In [10]:
from google.colab import drive
# Mount Google Drive at /content/drive so that files stored on Drive can be
# addressed through ordinary filesystem paths under that directory.
drive.mount('/content/drive')

Mounted at /content/drive


In [46]:
import cv2
import numpy as np
import pytesseract
from PIL import Image
import json
import time
from google.colab import drive
import os
from tqdm.notebook import tqdm
from IPython.display import clear_output, display
import ipywidgets as widgets

class DialogueExtractor:
    """Extract dialogue text from a video by running OCR on a fixed frame region.

    Each processed frame is cropped to ``dialogue_region``, preprocessed and
    passed to Tesseract. A text is recorded as a dialogue only after the OCR
    result has stayed identical (and non-empty) across several consecutive
    processed frames, so OCR results that do not persist are ignored.

    Attributes:
        dialogue_region: ``(x1, y1, x2, y2)`` crop rectangle expressed as
            fractions of the frame width (x values) and height (y values).
        required_stable_frames: Number of consecutive "same text as the
            previously processed frame" matches needed before recording.
        previous_text: OCR result of the most recently processed frame.
        current_dialogue: Text of the most recently recorded dialogue.
        stable_frame_count: Current run length of consecutive matches.
        dialogues: Recorded entries, each ``{"text": ..., "timestamp": ...}``.
        seen_dialogues: Set of every distinct text recorded so far.
    """

    def __init__(self, dialogue_region=(0.0, 0.8, 1.0, 1.0)):
        """Create an extractor.

        Args:
            dialogue_region: ``(x1, y1, x2, y2)`` as fractions of the frame
                size. The default selects the full width and the bottom 20%
                of the frame.
        """
        self.dialogue_region = dialogue_region
        self.required_stable_frames = 5
        # All per-run state is initialised in one place.
        self.reset()

    def get_dialogue_region(self, frame):
        """Return the crop of ``frame`` selected by ``self.dialogue_region``.

        Args:
            frame: Array whose first two dimensions are (height, width).

        Returns:
            The slice ``frame[y1:y2, x1:x2]`` with the fractional coordinates
            converted to integer pixel indices (truncated towards zero).
        """
        height, width = frame.shape[:2]
        x1 = int(width * self.dialogue_region[0])
        y1 = int(height * self.dialogue_region[1])
        x2 = int(width * self.dialogue_region[2])
        y2 = int(height * self.dialogue_region[3])
        return frame[y1:y2, x1:x2]

    def preprocess_image(self, image):
        """Prepare a cropped BGR image for OCR.

        Steps: grayscale conversion, non-local-means denoising, CLAHE contrast
        enhancement, then an inverted binary threshold.

        Args:
            image: BGR image (it is converted with ``cv2.COLOR_BGR2GRAY``).

        Returns:
            Single-channel image in which pixels brighter than 180 are 0 and
            all other pixels are 255.
        """
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Noise removal.
        denoised = cv2.fastNlMeansDenoising(gray)

        # Local contrast enhancement.
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(denoised)

        # Inverted binarisation: bright pixels (> 180) become black.
        _, threshold = cv2.threshold(enhanced, 180, 255, cv2.THRESH_BINARY_INV)

        return threshold

    def extract_text(self, image):
        """Run Tesseract on ``image`` and return the recognised text, stripped.

        The OCR call uses Korean + English language data (``-l kor+eng``) with
        ``--oem 3`` and ``--psm 6``.
        """
        custom_config = r'--oem 3 --psm 6 -l kor+eng'
        text = pytesseract.image_to_string(image, config=custom_config)
        return text.strip()

    def process_frame(self, frame):
        """OCR one frame and record its text once it has been stable long enough.

        A text is appended to ``self.dialogues`` when it is non-empty, has
        matched the previously processed frame's text
        ``required_stable_frames`` times in a row, and differs from the most
        recently recorded dialogue. A message is printed only the first time
        a given text is recorded.

        Args:
            frame: Full video frame (BGR image).
        """
        dialogue_region = self.get_dialogue_region(frame)
        processed = self.preprocess_image(dialogue_region)
        current_text = self.extract_text(processed)

        if current_text and current_text == self.previous_text:
            self.stable_frame_count += 1
            is_stable = self.stable_frame_count >= self.required_stable_frames
            if is_stable and current_text != self.current_dialogue:
                self.current_dialogue = current_text
                # Announce a text only the first time it is ever recorded.
                if current_text not in self.seen_dialogues:
                    self.seen_dialogues.add(current_text)
                    print(f"\n[새로운 대화 발견!] {current_text}")
                    print("-" * 50)

                # NOTE(review): "timestamp" is the wall-clock time at which the
                # text was detected, not the position inside the video.
                self.dialogues.append({
                    "text": current_text,
                    "timestamp": time.time()
                })
        else:
            # Text changed or is empty: restart the stability run.
            self.stable_frame_count = 0

        self.previous_text = current_text

    def process_video(self, video_path, output_path, frame_skip=2):
        """Extract dialogues from a video file and save them as JSON.

        Every ``frame_skip``-th frame is passed to :meth:`process_frame`.
        After the video ends, the first occurrence of each distinct text in
        ``self.dialogues`` is written to ``output_path`` together with basic
        video information. State from earlier calls is not cleared; call
        :meth:`reset` first when reusing an instance.

        Args:
            video_path: Path of the video to read.
            output_path: Path of the JSON file to write (UTF-8).
            frame_skip: Process one frame out of every ``frame_skip`` frames;
                must be at least 1.

        Raises:
            ValueError: If ``frame_skip`` is smaller than 1.
            OSError: If the video cannot be opened.
        """
        # Validate before touching the video: 0 would otherwise surface as a
        # ZeroDivisionError deep inside the frame loop.
        if frame_skip < 1:
            raise ValueError(f"frame_skip must be >= 1, got {frame_skip!r}")

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            # Without this check a wrong path would silently produce an empty
            # result file and a "completed" message.
            cap.release()
            raise OSError(f"Could not open video: {video_path}")

        pbar = None
        frame_count = 0
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            # The reported FPS is truncated to an integer (e.g. 29.97 -> 29).
            fps = int(cap.get(cv2.CAP_PROP_FPS))

            print("\n[처리 시작]")
            print("=" * 50)
            print(f"총 프레임: {total_frames}")
            print(f"FPS: {fps}")
            print("=" * 50)

            pbar = tqdm(total=total_frames, desc="Processing frames")

            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                if frame_count % frame_skip == 0:
                    self.process_frame(frame)

                frame_count += 1
                pbar.update(1)
        finally:
            # Always free the capture handle and the progress bar, including
            # when the run is interrupted or OCR raises.
            cap.release()
            if pbar is not None:
                pbar.close()

        # Keep only the first occurrence of each text. A local set is used so
        # that self.seen_dialogues is left intact.
        unique_dialogues = []
        emitted_texts = set()
        for dialogue in self.dialogues:
            print("found dialogue: ", dialogue)
            if dialogue['text'] not in emitted_texts:
                emitted_texts.add(dialogue['text'])
                unique_dialogues.append(dialogue)
        print("unique dialogues: ", unique_dialogues)

        # Save as JSON.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump({
                "video_info": {
                    "total_frames": total_frames,
                    "fps": fps,
                    "processed_frames": frame_count,
                    "frame_skip": frame_skip
                },
                "dialogues": unique_dialogues,
                "video_path": video_path,
                "extraction_time": time.strftime("%Y-%m-%d %H:%M:%S")
            }, f, ensure_ascii=False, indent=2)

        print("\n[처리 완료]")
        print("=" * 50)
        print(f"총 추출된 대화 수: {len(unique_dialogues)}")
        print(f"결과가 저장된 경로: {output_path}")
        print("=" * 50)

    def reset(self):
        """Clear all per-run state (texts, counters, recorded dialogues)."""
        self.previous_text = ""
        self.current_dialogue = ""
        self.stable_frame_count = 0
        self.dialogues = []
        self.seen_dialogues = set()  # used for duplicate detection


# main() and the guard below are defined at module level. Inside the class
# body, main() would run while the class is still being created, when the name
# DialogueExtractor is not yet bound (or still refers to an older definition
# left in the kernel).
def main():
    """Run dialogue extraction on the configured Google Drive video."""
    # Video file path and output path (inside Google Drive).
    video_path = '/content/drive/MyDrive/Colab Notebooks/Dataset Station/Genshin Impact Dialogue Dataset/[원신] 마신 임무 제4장 제1막 『흰 이슬과 검은 물결의 서시』 (루미네 ver.).mp4'
    output_path = '/content/drive/MyDrive/Colab Notebooks/Dataset Station/Genshin Impact Dialogue Dataset/dialogues.json'

    # Dialogue box region (adjust as needed).
    extractor = DialogueExtractor(dialogue_region=(0.1, 0.8, 0.9, 0.95))

    # Process the video, handling every 2nd frame.
    extractor.process_video(video_path, output_path, frame_skip=2)


if __name__ == "__main__":
    main()


[처리 시작]
총 프레임: 288702
FPS: 30


Processing frames:   0%|          | 0/288702 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [45]:
def main(video_path=None, output_path=None,
         dialogue_region=(0.1, 0.8, 0.9, 0.95), frame_skip=2):
    """Run dialogue extraction on one video and write the result as JSON.

    Called without arguments, this processes the default video stored on
    Google Drive, exactly as before. Every argument can be overridden to
    process another file or tune the extraction without editing the function.

    Args:
        video_path: Video to process. Defaults to the Genshin Impact quest
            video inside the mounted Google Drive.
        output_path: Destination JSON file. Defaults to ``dialogues.json`` in
            the same Drive folder.
        dialogue_region: ``(x1, y1, x2, y2)`` crop rectangle, as fractions of
            the frame size, passed to ``DialogueExtractor``.
        frame_skip: Passed to ``DialogueExtractor.process_video``; one frame
            out of every ``frame_skip`` frames is processed.
    """
    # Default paths (inside Google Drive).
    if video_path is None:
        video_path = '/content/drive/MyDrive/Colab Notebooks/Dataset Station/Genshin Impact Dialogue Dataset/[원신] 마신 임무 제4장 제1막 『흰 이슬과 검은 물결의 서시』 (루미네 ver.).mp4'
    if output_path is None:
        output_path = '/content/drive/MyDrive/Colab Notebooks/Dataset Station/Genshin Impact Dialogue Dataset/dialogues.json'

    # Dialogue box region (adjust as needed).
    extractor = DialogueExtractor(dialogue_region=dialogue_region)

    # Process the video, handling one frame out of every `frame_skip`.
    extractor.process_video(video_path, output_path, frame_skip=frame_skip)

# Start the extraction only when this code is executed as the main module
# (i.e. __name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()


[처리 시작]
총 프레임: 288702
FPS: 30


Processing frames:   0%|          | 0/288702 [00:00<?, ?it/s]

KeyboardInterrupt: 