In [None]:
pip install av

Collecting av
  Downloading av-12.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.8/33.8 MB[0m [31m41.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av
Successfully installed av-12.0.0


In [None]:
# Mount Google Drive at /content/drive so the video folder used later in this
# notebook (a path under /content/drive/MyDrive) is readable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import av
import numpy as np
import torch
from transformers import VivitImageProcessor, VivitForVideoClassification
from collections import defaultdict

In [None]:
# Seed NumPy's global RNG: sample_frame_indices() draws its clip position from
# it via np.random.randint, so this makes the sampled frames repeatable.
np.random.seed(0)

def read_video_pyav(container, indices):
    """Decode the requested frames of a video and stack them into one array.

    Args:
        container: An opened PyAV container; it is rewound with ``seek(0)``
            and its first video stream is decoded.
        indices: Ascending frame positions to keep. Only the first and last
            entries bound the scan; membership is tested for every position
            in between.

    Returns:
        ``np.ndarray`` built by stacking each kept frame's
        ``to_ndarray(format="rgb24")`` result along a new leading axis
        (presumably ``(num_frames, height, width, 3)`` -- confirm against PyAV).
    """
    container.seek(0)
    first_wanted, last_wanted = indices[0], indices[-1]

    kept_frames = []
    for position, decoded in enumerate(container.decode(video=0)):
        # Nothing past the last requested index is needed; stop decoding early.
        if position > last_wanted:
            break
        if position < first_wanted or position not in indices:
            continue
        kept_frames.append(decoded)

    rgb_arrays = [decoded.to_ndarray(format="rgb24") for decoded in kept_frames]
    return np.stack(rgb_arrays)

def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    """Choose ``clip_len`` frame indices from a randomly placed window of a video.

    A window of ``int(clip_len * frame_sample_rate)`` frames is positioned at
    random (using NumPy's global RNG, so ``np.random.seed`` controls it) and
    ``clip_len`` evenly spaced indices are taken from it.

    Args:
        clip_len: Number of indices to return.
        frame_sample_rate: Stride between sampled frames; the window spans
            ``clip_len * frame_sample_rate`` frames.
        seg_len: Total number of frames available in the video.

    Returns:
        ``np.ndarray`` of dtype ``int64`` with ``clip_len`` indices, each in
        ``[start_idx, end_idx - 1]`` of the chosen window.

    Raises:
        ValueError: If ``seg_len`` is not strictly greater than the window
            length, i.e. the video is too short to place a window.
    """
    converted_len = int(clip_len * frame_sample_rate)

    # np.random.randint(low, high) needs low < high; without this check a short
    # video fails with an opaque "low >= high" message from NumPy.
    if seg_len <= converted_len:
        raise ValueError(
            f"seg_len ({seg_len}) must be greater than clip_len * frame_sample_rate "
            f"({converted_len}) to sample a clip"
        )

    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    # linspace includes end_idx itself; clip so the last index stays inside the window.
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

def process_video_files(folder_path, model_name="kkumtori/vivit-b-16x2-kinetics400-finetuned-0505-mediapipe"):
    """Run ViViT on every ``.mp4`` in a folder and group hidden states by predicted class.

    For each video, 32 frames are sampled, classified, and the last entry of
    the model's ``hidden_states`` is stored under the predicted class id.

    Args:
        folder_path: Directory scanned (non-recursively) for files whose name
            ends with ``.mp4``.
        model_name: Hugging Face checkpoint id (or local path) loaded for both
            the image processor and the classification model.

    Returns:
        ``defaultdict(list)`` mapping each predicted class id (``int``) to a
        list of NumPy arrays, one per video assigned to that class. Each array
        is that video's slice of ``outputs.hidden_states[-1]``.

    Notes:
        Videos with no video stream, or with too few frames to place a
        32-frame window, are skipped with a printed message instead of
        aborting the whole run.
    """
    clip_len = 32          # frames sampled per video
    frame_sample_rate = 1  # stride between sampled frames

    # Load the processor and the classification model from the same checkpoint.
    # The model must be the module itself (not a `pipeline`), because the loop
    # below calls it with tensors and reads `.logits` / `.hidden_states`.
    image_processor = VivitImageProcessor.from_pretrained(model_name)
    model = VivitForVideoClassification.from_pretrained(model_name)

    feature_dict = defaultdict(list)

    # Sorted so that, together with the seeded RNG, each file always receives
    # the same random draw regardless of the filesystem's listing order.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".mp4"):  # keep only video files
            continue
        file_path = os.path.join(folder_path, filename)

        # The context manager closes the container even if decoding fails.
        with av.open(file_path) as container:
            if not container.streams.video:
                print(f"Skipping {filename}: no video stream")
                continue

            seg_len = container.streams.video[0].frames
            if seg_len <= clip_len * frame_sample_rate:
                print(f"Skipping {filename}: only {seg_len} frames, need more than {clip_len * frame_sample_rate}")
                continue

            # Sample the frame indices and decode just those frames.
            indices = sample_frame_indices(clip_len=clip_len, frame_sample_rate=frame_sample_rate, seg_len=seg_len)
            video = read_video_pyav(container=container, indices=indices)

        # Convert the frames into the tensor inputs the model expects.
        inputs = image_processor(list(video), return_tensors="pt")

        # Forward pass without gradient tracking.
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)

        last_hidden = outputs.hidden_states[-1]
        predictions = torch.argmax(outputs.logits, dim=-1)

        # Store each item's last hidden state under its predicted class id.
        for idx, prediction in enumerate(predictions):
            feature_dict[prediction.item()].append(last_hidden[idx].cpu().numpy())

    return feature_dict

# Set the input folder and run the feature extraction.
# Alternative location: '/content/drive/MyDrive/temp'
folder_path = '/content/drive/MyDrive/기컴비_텀프/data/temp'
all_features = process_video_files(folder_path)

# Print a compact per-class summary rather than dumping the raw feature arrays.
for class_id, class_features in sorted(all_features.items()):
    print(f"class {class_id}: {len(class_features)} video(s), feature shape {class_features[0].shape}")


defaultdict(<class 'list'>, {0: [array([[-7.9534988e+00, -4.3826671e+00,  3.4476194e+00, ...,
        -4.7930422e+00, -7.5185823e+00,  3.0407934e+00],
       [-1.6844152e+01, -3.0463426e+00,  4.3826237e+00, ...,
        -1.2342141e+00,  8.0614477e-02, -6.6756001e+00],
       [-6.3012667e+00,  3.2564924e+00,  2.4743316e+00, ...,
        -7.5133653e+00, -1.4035599e+00, -1.8838451e+01],
       ...,
       [-1.6991383e+01, -2.7283037e+00,  7.3920298e+00, ...,
         2.4419069e-01, -2.1484270e+00, -7.0126429e+00],
       [-1.6931517e+01, -2.1895020e+00,  6.9496880e+00, ...,
        -1.2084007e-02, -2.4033704e+00, -7.1066146e+00],
       [-1.6302195e+01, -3.4971147e+00,  8.7466049e+00, ...,
         5.2495599e-01, -1.2600405e+00, -7.2345486e+00]], dtype=float32)]})
