In [1]:
import os
import json
import shutil
import cv2
import torch
from torchvision import transforms
from PIL import Image
from transformers import BertTokenizer
import concurrent.futures

In [2]:
# # 'a' 디렉토리 경로 설정
# a_directory = "/root/lip-to-speech/TL163"

# # 'a' 디렉토리 하위의 첫 번째 디렉토리로 들어가서 그 내부의 모든 파일을 'a' 디렉토리로 이동
# current_path = a_directory

# # 1단계: 첫 번째 디렉토리로 이동
# subdirs = [d for d in os.listdir(current_path) if os.path.isdir(os.path.join(current_path, d))]
# if not subdirs:
#     raise Exception("첫 번째 디렉토리를 찾을 수 없습니다.")
# first_subdir = subdirs[0]
# current_path = os.path.join(current_path, first_subdir)

# # 2단계: 계속해서 하위 디렉토리로 이동 (가장 안쪽 디렉토리를 찾기)
# while True:
#     subdirs = [d for d in os.listdir(current_path) if os.path.isdir(os.path.join(current_path, d))]
#     if not subdirs:
#         break  # 더 이상 하위 디렉토리가 없으면 중단
#     current_path = os.path.join(current_path, subdirs[0])

# # 3단계: 가장 안쪽 디렉토리 안의 모든 파일을 'a' 디렉토리로 이동
# for file_name in os.listdir(current_path):
#     file_path = os.path.join(current_path, file_name)
#     if os.path.isfile(file_path):
#         shutil.move(file_path, a_directory)

# # 4단계: 중간 디렉토리들 모두 삭제 (첫 번째 디렉토리부터 시작해서 가장 안쪽까지 삭제)
# shutil.rmtree(os.path.join(a_directory, first_subdir))

# print(f"모든 파일을 '{a_directory}'로 이동하고 중간 디렉토리를 삭제했습니다.")


In [3]:
def get_video_info(json_file_path, video_file_path):
    with open(json_file_path, 'r') as f:
        data = json.load(f)

    video_name = data[0]['Video_info']['video_Name']
    video_path = f'{video_file_path}/{video_name}'
    
    FPS = int(data[0]['Audio_info']['Sampling_rate'].replace('Khz',''))
    Sentence_info = data[0]['Sentence_info']
    
    annotations = []
    for sentence in Sentence_info:
        annotations.append( (sentence['start_time'], sentence['end_time'], sentence['sentence_text']) )

    return video_name, FPS, annotations

# 3. 전처리 함수 정의
def preprocess_video_frame(frame_path, transform):
    img = Image.open(frame_path).convert('RGB')
    img_tensor = transform(img)
    return img_tensor

# 비디오 파일 경로 및 저장할 디렉토리 설정
def save_frame(video_path, transform):
    output_dir = video_path.replace('/TS', '/split_TS')
    os.makedirs(output_dir, exist_ok=True)
    
    cap = cv2.VideoCapture(video_path)
    
    # 총 프레임 수 확인
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(f"총 프레임 수: {total_frames}")
    
    frame_idx = 0
    while cap.isOpened():
        ret, frame = cap.read()
        
        if not ret:
            break  # 비디오가 끝나면 종료
        
        print(f"처리 중인 프레임: {frame_idx + 1}/{total_frames}")
        
        # OpenCV에서 BGR로 읽어오기 때문에 RGB로 변환
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        # NumPy 배열을 PIL 이미지로 변환
        pil_image = Image.fromarray(frame)
        
        # 변환 적용
        transformed_frame = transform(pil_image)
    
        # 프레임 저장 (원본 이미지로 저장할 경우)
        frame_filename = os.path.join(output_dir, f'frame_{frame_idx:04d}.jpg')
        pil_image.save(frame_filename)
    
        # 텐서로 변환된 프레임을 저장하거나 활용하고 싶다면,
        # torch.save(transformed_frame, 'path_to_save_tensor')
    
        frame_idx += 1
    
    cap.release()
    print(f"저장된 프레임 수: {frame_idx}")

    return output_dir
    
def tokenize_text(text, tokenizer):
    tokens = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=50)
    return tokens

def get_all_file_paths(directory):
    file_paths = []
    
    for root, dirs, files in os.walk(directory):
        for file in files:
            file_paths.append(os.path.join(root, file))
        break
    
    return file_paths
    
def process_video(json_file_name):
    cur_file_path = f'{json_file_path}/{json_file_name}'
    video_name, FPS, annotations = get_video_info(cur_file_path, video_file_path)

    video_path = f'{video_file_path}/{video_name}'
    print(f'{video_name} is started')
    output_dir = save_frame(video_path, transform)
    return output_dir

In [4]:
# json_file_path = '/root/lip-to-speech/TL163'
# video_file_path = json_file_path.replace('/TL', '/TS')

# json_file_names = os.listdir(json_file_path)

# # 멀티스레딩으로 비디오 처리
# with concurrent.futures.ThreadPoolExecutor() as executor:
#     results = list(executor.map(process_video, json_file_names))

In [5]:
# json_file_path = '/root/lip-to-speech/TL164'
# video_file_path = json_file_path.replace('/TL', '/TS')

# json_file_names = os.listdir(json_file_path)

# # 멀티스레딩으로 비디오 처리
# with concurrent.futures.ThreadPoolExecutor() as executor:
#     results = list(executor.map(process_video, json_file_names))


# json_file_path = '/root/lip-to-speech/TL165'
# video_file_path = json_file_path.replace('/TL', '/TS')

# json_file_names = os.listdir(json_file_path)

# # 멀티스레딩으로 비디오 처리
# with concurrent.futures.ThreadPoolExecutor() as executor:
#     results = list(executor.map(process_video, json_file_names))


In [7]:
json_file_name = json_file_names[0]
tec = f'{json_file_path}/{json_file_name}'
tec

'/root/lip-to-speech/TL163/lip_K_5_F_06_C651_A_001.json'

In [9]:
annotations = []
frame_paths = []
json_file_path = '/root/lip-to-speech/TL163'

json_file_names = os.listdir(json_file_path)
for json_file_name in json_file_names:
    cur_file_path = f'{json_file_path}/{json_file_name}'
    video_file_path = cur_file_path.replace('TL', 'split_TS').replace('.json', '.mp4')
    video_name, FPS, annotation = get_video_info(cur_file_path, video_file_path)
    annotations.append(annotation)
    
    cur_frame_path = json_file_path.replace('TL', 'split_TS') + '/' + video_name
    frame_list =  os.listdir(cur_frame_path)
    cur_frame_list = [ f'{cur_frame_path}/{frame_file}' for frame_file in frame_list ]
    frame_paths.append(cur_frame_list)

In [28]:
import cv2
import os

# def map_frames_to_text(video_path, annotations, fps, num_frames):
#     frame_text_mapping = {}
#     for frame in range(num_frames):
#         time_sec = frame / fps  # Convert frame to seconds
#         for (start, end, text) in annotations:
#             if start <= time_sec < end:
#                 frame_text_mapping[frame] = text
#                 break
#         else:
#             frame_text_mapping[frame] = ""  # If no matching text, assign empty string
#     return frame_text_mapping
import torch
from torch.utils.data import Dataset

# PyTorch Dataset class to handle 5-frame sequences
class LipReadingDataset(Dataset):
    def __init__(self, frame_paths, annotations, fps, frames_per_chunk=5):
        self.frame_paths = frame_paths  # Frame paths for video
        self.annotations = annotations  # (start, end, text) annotations
        self.fps = fps  # Frames per second
        self.frames_per_chunk = frames_per_chunk  # Length of frame sequences
        
        # Prepare the frame-text mapping
        num_frames = len(frame_paths[0])  # Number of frames in the video
        self.frame_text_mapping = map_frames_to_text(frame_paths[0], annotations[0], fps, num_frames, frames_per_chunk)

    def __len__(self):
        return len(self.frame_text_mapping)  # Total number of 5-frame sequences

    def __getitem__(self, idx):
        start_frame, end_frame, text = self.frame_text_mapping[idx]
        
        # Load the frames corresponding to this 5-frame sequence
        frames = []
        for frame_idx in range(start_frame, end_frame + 1):
            frame_path = self.frame_paths[0][frame_idx]  # 실제 프레임 경로를 사용
            frame = preprocess_video_frame(frame_path)  # 프레임 로드 및 전처리
            frames.append(frame)
        
        # Combine frames and text
        frames_tensor = torch.stack(frames)  # (5, C, H, W)
        
        return {
            'frames': frames_tensor,  # 5프레임 묶음
            'text': text  # 5프레임에 대응하는 텍스트
        }


In [53]:
def map_frames_to_text(frame_paths, annotations, fps, num_frames, max_sequence_length=10):
    frame_text_mapping = []
    
    for start, end, text in annotations:
        # 전체 프레임 중 해당 텍스트가 걸리는 프레임 수
        total_text_frames = int((end - start) * fps)
        if total_text_frames == 0:
            continue

        # 텍스트를 띄어쓰기 단위로 분리 (단어 단위)
        words = text.split()  # 띄어쓰기 기준으로 텍스트를 나눔
        total_words = len(words)

        # 각 단어에 할당할 프레임 수 계산 (단어 길이와 비례하게)
        # 'max_frame'이라는 새로운 이름을 사용하여 'max'와 충돌 방지
        frames_per_word = [max_frame(1, total_text_frames // total_words)] * total_words
        leftover_frames = total_text_frames - sum(frames_per_word)

        # 나머지 프레임을 고르게 분배
        for i in range(leftover_frames):
            frames_per_word[i % total_words] += 1

        # 유동적인 시퀀스 생성
        current_frame = int(start * fps)
        for word, frame_count in zip(words, frames_per_word):
            # 각 단어에 대해 프레임을 할당
            start_frame = current_frame
            end_frame = min(current_frame + frame_count - 1, num_frames - 1)
            frame_text_mapping.append((start_frame, end_frame, word))
            current_frame += frame_count

    return frame_text_mapping

# 'max_frame' 함수로 'max' 대체
def max_frame(a, b):
    return a if a > b else b

In [54]:
fps = 30
num_frames = len(frame_paths[0])

frame_text_mapping = map_frames_to_text(frame_paths[0], annotations[0], fps, num_frames, max_sequence_length=30)
frame_text_mapping

[(117, 131, '나한테'),
 (132, 145, '맞는'),
 (146, 159, '화장품을'),
 (160, 173, '찾기가'),
 (174, 187, '어려운'),
 (188, 201, '것'),
 (202, 215, '같아.'),
 (249, 264, '다음'),
 (265, 280, '주에'),
 (281, 295, '친구'),
 (296, 310, '생일이어서'),
 (311, 325, '오늘'),
 (326, 340, '다른'),
 (341, 355, '친구들과'),
 (356, 370, '생일'),
 (371, 385, '선물을'),
 (386, 400, '사러'),
 (401, 415, '갈'),
 (416, 430, '거야.'),
 (629, 637, '안'),
 (638, 646, '넘어가.'),
 (749, 767, '작년'),
 (768, 786, '가을에'),
 (787, 805, '우리'),
 (806, 824, '아이랑'),
 (825, 843, '함께'),
 (844, 862, '공원에서'),
 (863, 881, '낙엽'),
 (882, 900, '가지고'),
 (901, 919, '놀았는데.'),
 (920, 938, '새벽마다'),
 (939, 957, '목이'),
 (958, 976, '말라서'),
 (977, 995, '잠에서'),
 (996, 1014, '깨는데'),
 (1015, 1033, '이제는'),
 (1034, 1052, '침대'),
 (1053, 1071, '옆에'),
 (1072, 1090, '물을'),
 (1091, 1109, '두고'),
 (1110, 1127, '자야겠어.'),
 (1427, 1448, '요즘'),
 (1449, 1470, '해가'),
 (1471, 1491, '짧아져서'),
 (1492, 1512, '퇴근하면'),
 (1513, 1533, '어둡더라고.'),
 (1565, 1584, '시간이'),
 (1585, 1604, '나면'),
 (1605, 1624, '티브이를'),


In [55]:
from torch.utils.data import Dataset

def preprocess_video_frame(frame_path, transform):
    img = Image.open(frame_path).convert('RGB')
    print(f"Loaded frame: {frame_path}, Size: {img.size}")  # 프레임 경로와 크기 출력
    img_tensor = transform(img)
    return img_tensor


def tokenize_text(text, tokenizer):
    tokens = tokenizer(text, return_tensors="pt", padding="max_length", max_length=20, truncation=True)
    return {
        'input_ids': tokens['input_ids'],  # Actual tokenized input
        'attention_mask': tokens['attention_mask']  # Actual attention mask
    }

class LipReadingDataset(Dataset):
    def __init__(self, video_paths, annotations, tokenizer, transform, fps, sequence_length=30):
        self.video_paths = video_paths
        self.annotations = annotations
        self.tokenizer = tokenizer
        self.transform = transform
        self.sequence_length = sequence_length  # Set to 30 frames
        self.fps = fps

        # Frame-text mapping creation using map_frames_to_text function
        self.frame_text_mapping = []
        for video_annotation in annotations:
            num_frames = len(video_paths[0])  # Number of frames in each video
            mapping = map_frames_to_text(video_paths[0], video_annotation, fps, num_frames, sequence_length)
            self.frame_text_mapping.append(mapping)

    def __len__(self):
        return sum([len(mapping) for mapping in self.frame_text_mapping])  # Total number of 30-frame chunks

    def __getitem__(self, idx):
        # Find the video and frame index based on idx
        video_idx, frame_mapping_idx = self.get_video_frame_index(idx)

        # Get the frame-text mapping for this sequence
        start_frame, end_frame, text = self.frame_text_mapping[video_idx][frame_mapping_idx]

        # Load the frames for this sequence
        frames = []
        for frame_idx in range(start_frame, min(end_frame + 1, len(self.video_paths[video_idx]))):
            frame_path = self.video_paths[video_idx][frame_idx]
            frame = preprocess_video_frame(frame_path, self.transform)
            frames.append(frame)

        # Padding for remaining frames if sequence is shorter than 30 frames
        while len(frames) < self.sequence_length:
            frames.append(torch.zeros_like(frames[0]))  # Add zero padding for frames

        # Combine and tokenize the text
        tokens = tokenize_text(text, self.tokenizer)

        return {
            'video': torch.stack(frames),  # (sequence_length, C, H, W) - 30 frames
            'input_ids': tokens['input_ids'].squeeze(0),  # (max_length,)
            'attention_mask': tokens['attention_mask'].squeeze(0)
        }

    def get_video_frame_index(self, idx):
        """Find the video and the 30-frame chunk index"""
        cum_frames = 0
        for i, mapping in enumerate(self.frame_text_mapping):
            if idx < cum_frames + len(mapping):
                return i, idx - cum_frames
            cum_frames += len(mapping)
        raise IndexError("Index out of range")

In [56]:
tokenizer = BertTokenizer.from_pretrained("beomi/kobert")

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

fps = 30

dataset = LipReadingDataset(frame_paths, annotations, tokenizer, transform, fps, sequence_length=5)

for i in range(len(dataset)):
    data = dataset[i]
    print(f"Video: {data['video'].shape}, Input IDs: {data['input_ids']}, Attention Mask: {data['attention_mask']}")

Loaded frame: /root/lip-to-speech/split_TS163/lip_K_5_F_06_C651_A_001.mp4/frame_0117.jpg, Size: (1920, 1080)
Loaded frame: /root/lip-to-speech/split_TS163/lip_K_5_F_06_C651_A_001.mp4/frame_0118.jpg, Size: (1920, 1080)
Loaded frame: /root/lip-to-speech/split_TS163/lip_K_5_F_06_C651_A_001.mp4/frame_0119.jpg, Size: (1920, 1080)
Loaded frame: /root/lip-to-speech/split_TS163/lip_K_5_F_06_C651_A_001.mp4/frame_0120.jpg, Size: (1920, 1080)
Loaded frame: /root/lip-to-speech/split_TS163/lip_K_5_F_06_C651_A_001.mp4/frame_0121.jpg, Size: (1920, 1080)
Loaded frame: /root/lip-to-speech/split_TS163/lip_K_5_F_06_C651_A_001.mp4/frame_0122.jpg, Size: (1920, 1080)
Loaded frame: /root/lip-to-speech/split_TS163/lip_K_5_F_06_C651_A_001.mp4/frame_0123.jpg, Size: (1920, 1080)
Loaded frame: /root/lip-to-speech/split_TS163/lip_K_5_F_06_C651_A_001.mp4/frame_0124.jpg, Size: (1920, 1080)
Loaded frame: /root/lip-to-speech/split_TS163/lip_K_5_F_06_C651_A_001.mp4/frame_0125.jpg, Size: (1920, 1080)
Loaded frame: /root

IsADirectoryError: [Errno 21] Is a directory: '/root/lip-to-speech/split_TS163/lip_K_5_F_06_C651_A_001.mp4/.ipynb_checkpoints'