In [None]:
# Mount Google Drive at /content/drive; the dataset and pickle paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install av

Collecting av
  Downloading av-12.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av
Successfully installed av-12.1.0


In [None]:
import os
import pickle
import numpy as np
import torch
import av
from collections import defaultdict
from tqdm import tqdm
from transformers import VivitImageProcessor, VivitForVideoClassification
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import polynomial_kernel
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics.pairwise import laplacian_kernel

## **1. 학습 모델 불러오기**

In [None]:
from transformers import pipeline
from transformers import VivitImageProcessor

model_name = "kkumtori/vivit-b-16x2-kinetics400-0511-mediapipe" # Hugging Face link; see the 9th meeting minutes

# Load the checkpoint's image processor and attach it to the pipeline explicitly;
# process_video_files later reads both video_cls.image_processor and video_cls.model.
image_processor = VivitImageProcessor.from_pretrained(model_name)
video_cls = pipeline(model = model_name)
video_cls.image_processor = image_processor

preprocessor_config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.64k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/355M [00:00<?, ?B/s]

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## **2. functions**

In [None]:
def read_video_pyav(container, indices):
    """Decode the frames at the requested positions and stack them into one array.

    Args:
        container: Opened PyAV container. It is rewound with ``seek(0)`` and
            its first video stream is decoded from the start.
        indices: Frame positions to keep. The first and last entries bound the
            range that is scanned, so they are expected to be in ascending order.

    Returns:
        np.ndarray: The selected frames converted to ``rgb24`` arrays and
        stacked along a new leading axis, in decode order.
    """
    container.seek(0)
    first_wanted, last_wanted = indices[0], indices[-1]

    selected = []
    for position, frame in enumerate(container.decode(video=0)):
        # Nothing past the last requested position is needed; stop decoding.
        if position > last_wanted:
            break
        if position < first_wanted:
            continue
        if position in indices:
            selected.append(frame)

    rgb_frames = [frame.to_ndarray(format="rgb24") for frame in selected]
    return np.stack(rgb_frames)

def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    """Pick ``clip_len`` frame indices from a randomly placed window of the video.

    A window of ``int(clip_len * frame_sample_rate)`` frames is placed at a
    random position inside a video of ``seg_len`` frames, and ``clip_len``
    evenly spaced indices are taken from it.

    Args:
        clip_len: Number of frame indices to return.
        frame_sample_rate: Stride between sampled frames; the window spans
            ``clip_len * frame_sample_rate`` frames.
        seg_len: Total number of frames available in the video.

    Returns:
        np.ndarray: ``clip_len`` ascending ``int64`` indices inside the window.

    Raises:
        ValueError: If the video has fewer frames than the window needs
            (this also covers a frame count of 0).
    """
    converted_len = int(clip_len * frame_sample_rate)
    if seg_len < converted_len:
        raise ValueError(
            f"video has {seg_len} frames but at least {converted_len} are needed "
            f"(clip_len={clip_len}, frame_sample_rate={frame_sample_rate})"
        )

    if seg_len == converted_len:
        # Exactly one window fits. np.random.randint(low, high) rejects low == high,
        # so the only valid window end is chosen directly.
        end_idx = converted_len
    else:
        end_idx = np.random.randint(converted_len, seg_len)

    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    # end_idx is exclusive, so clamp the last sample back inside the window.
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

def load_word_dictionary(pkl_file_path):
    """Load and return the pickled word dictionary stored at ``pkl_file_path``.

    NOTE: unpickling can execute arbitrary code, so only pass files that come
    from a trusted source.
    """
    with open(pkl_file_path, mode='rb') as pkl_file:
        return pickle.load(pkl_file)

In [None]:
def process_video_files(folder_paths):
    """Extract one last-hidden-layer feature vector per video, grouped by class.

    Every sub-folder of each path in ``folder_paths`` is treated as a class
    (folder name = class label). For each ``.mp4`` file in it, 32 frames are
    sampled, run through ``video_cls.model``, and the vector at sequence
    position 0 of the last hidden state is stored.

    Args:
        folder_paths: Iterable of directories whose sub-folders are class folders.

    Returns:
        defaultdict(list): Maps each class label to a list of 1-D NumPy
        feature vectors, one per video.
    """
    image_processor = video_cls.image_processor
    model = video_cls.model
    model.to(device)
    feature_dict = defaultdict(list)

    for folder_path in folder_paths:
        # The folder name is used as the class label. Entries are sorted so the
        # processing order does not depend on the file system.
        for class_label in tqdm(sorted(os.listdir(folder_path))):
            class_path = os.path.join(folder_path, class_label)
            if not os.path.isdir(class_path):
                continue

            # Visit every video file inside the class folder.
            for filename in sorted(os.listdir(class_path)):
                if not filename.endswith(".mp4"):
                    continue
                file_path = os.path.join(class_path, filename)

                # Sample 32 frames; the context manager closes the container
                # even if sampling or decoding fails.
                with av.open(file_path) as container:
                    indices = sample_frame_indices(
                        clip_len=32,
                        frame_sample_rate=1,
                        seg_len=container.streams.video[0].frames,
                    )
                    video = read_video_pyav(container=container, indices=indices)

                # Prepare the frames in the format the model expects.
                inputs = image_processor(list(video), return_tensors="pt").to(device)

                # Forward pass without gradient tracking.
                with torch.no_grad():
                    outputs = model(**inputs, output_hidden_states=True)
                last_hidden = outputs.hidden_states[-1]

                # Keep the vector at sequence position 0 for every item in the batch
                # (presumably the ViViT classification token; confirm against the
                # model docs). The earlier code reached the same vector only by
                # indexing twice with the batch index, which works solely for a
                # batch of one.
                for sample_feature in last_hidden[:, 0].cpu().numpy():
                    feature_dict[class_label].append(sample_feature)
    return feature_dict

In [None]:
def diagonal_average(matrix):
    """
    Average the main-diagonal entries of a square matrix.

    Parameters:
    matrix (list of list of int/float): The nxn matrix.

    Returns:
    float: Mean of matrix[i][i] over all rows; 0 when the matrix has no rows.
    """
    size = len(matrix)  # The matrix is assumed to be square (nxn)
    if size == 0:
        return 0  # Nothing to average

    total = 0
    for i in range(size):
        total += matrix[i][i]
    return total / size

# # Example usage:
# matrix = [
#     [1, 2, 3],
#     [4, 5, 6],
#     [7, 8, 9]
# ]

# average = diagonal_average(matrix)
# print(f"Average of diagonal elements: {average}")  # Output: Average of diagonal elements: 5.0


In [None]:
def row_max_average(matrix):
    """
    Take the maximum of every row of a matrix and return the mean of those maxima.

    Accepts nested lists as well as 2-D NumPy arrays (such as the output of the
    sklearn pairwise kernels). Emptiness is checked with ``len`` rather than
    truthiness, because the truth value of a NumPy array with more than one
    element is ambiguous and raises ``ValueError``.

    Parameters:
    matrix (list of list of int/float, or 2-D array): The matrix.

    Returns:
    float: The average of the per-row maxima; 0 for ``None``, a matrix with no
    rows, or a matrix whose first row is empty.
    """
    if matrix is None or len(matrix) == 0 or len(matrix[0]) == 0:
        return 0  # Handle a missing/empty matrix or empty rows

    max_values = [max(row) for row in matrix]

    average = sum(max_values) / len(max_values)
    return average

# # Example usage:
# matrix = [
#     [1, 2, 3],
#     [4, 5, 6],
#     [7, 8, 9]
# ]

# average = row_max_average(matrix)
# print(f"Average of the maximum values from each row: {average}")  # Output: Average of the maximum values from each row: 6.0


In [None]:
def predict_with_dictionary(feature_dict, word_dictionary, similarity_fn=None):
    """Predict each video's word via the word dictionary and report the accuracy.

    Every feature vector in ``feature_dict`` is compared with every entry of
    ``word_dictionary``; the word with the highest similarity is the prediction.
    One line is printed per video, a separator per class, and the final
    ``correct / total`` count.

    Args:
        feature_dict: Maps a class label to a list of feature vectors.
        word_dictionary: Maps a word to its reference feature vector.
        similarity_fn: Optional callable ``(pred_feature, word_feature) -> score``
            where a larger score means more similar. Defaults to the Laplacian
            kernel between the two vectors. Other measures (cosine similarity,
            negative Manhattan distance, linear/polynomial/sigmoid/RBF kernels)
            can be passed here instead of editing the function.

    Returns:
        float: Fraction of videos whose predicted word equals their class label;
        0.0 when ``feature_dict`` contains no videos.
    """
    def _laplacian_similarity(pred_feature, word_feature):
        # The kernel returns a 1x1 matrix for a single pair; reduce it to a scalar.
        return row_max_average(laplacian_kernel([pred_feature], [word_feature]))

    score = _laplacian_similarity if similarity_fn is None else similarity_fn

    total_videos = 0
    correct_predictions = 0

    for class_label, features in feature_dict.items():
        for pred_feature in features:
            similarities = {
                word: score(pred_feature, word_feature)
                for word, word_feature in word_dictionary.items()
            }
            most_similar_word = max(similarities, key=similarities.get)
            print(f"단어(class_label): {class_label}, 예측(most_similar_word): {most_similar_word}")

            total_videos += 1
            if most_similar_word == class_label:
                correct_predictions += 1
        print('---------------------------------')
    print(correct_predictions, '/', total_videos)

    # Avoid dividing by zero when there was nothing to evaluate.
    if total_videos == 0:
        return 0.0
    return correct_predictions / total_videos

## **3. 단어사전 불러오기**

In [None]:
# Word dictionaries (max-pooled and average-pooled feature pickles, per the file names)
pkl_file_path_max = '/content/drive/MyDrive/기컴비_텀프/code/feature_dictionary/pkl/M_max_pooled_features.pkl'
pkl_file_path_average = '/content/drive/MyDrive/기컴비_텀프/code/feature_dictionary/pkl/M_average_pooled_features.pkl'

word_dictionary_max = load_word_dictionary(pkl_file_path_max)
word_dictionary_average = load_word_dictionary(pkl_file_path_average)

In [None]:
# Sanity check: print, per word, whether the max and average dictionaries hold identical values.
# Max and mean pooling would normally give different vectors, so True here is suspicious.
for k in word_dictionary_max.keys():
    print(list(word_dictionary_max[k])==list(word_dictionary_average[k]))

True
True
True
True
True
True
True
True
True
True


↳ 이상하다! max 사전과 average 사전이 모든 단어에서 동일하게 나온다. 피클이 잘못 저장된 것으로 보이므로, 아래에서 `M_all_features.pkl`로부터 max/average 풀링을 다시 계산한다.

In [None]:
# Re-derive the avg/max features from all_features.pkl
pkl_file_path_all = '/content/drive/MyDrive/기컴비_텀프/code/feature_dictionary/pkl/M_all_features.pkl'
word_dictionary_all = load_word_dictionary(pkl_file_path_all)

In [None]:
# Rebuild the pooled dictionaries: for every word, reduce its stored features
# along axis 0 with an element-wise max and an element-wise mean.
tmp_max = dict()
tmp_avg = dict()
for k,v in word_dictionary_all.items():
    print(k)
    tmp_max[k]=np.array(v).max(axis=0)
    tmp_avg[k]=np.array(v).mean(axis=0)

붕대
구급차
의사
골절
쓰러지다
가렵다
배고프다
친구
병원
다리


In [None]:
# Repeat the sanity check on the rebuilt dictionaries; max and mean should now differ (False).
for k in tmp_max.keys():
    print(list(tmp_max[k])==list(tmp_avg[k]))

False
False
False
False
False
False
False
False
False
False


## **4. 평가**

In [None]:
# Set the folder path and extract features from every video under it
folder_paths = ['/content/drive/MyDrive/기컴비_텀프/data/train_dataset/mediapipe/test']
all_features = process_video_files(folder_paths)

100%|██████████| 10/10 [02:41<00:00, 16.11s/it]


In [None]:
# Per class: label, number of extracted feature vectors, and the shape of the first one.
for k,v in all_features.items():
    print(k,len(v),v[0].shape)

붕대 10 (768,)
구급차 10 (768,)
의사 10 (768,)
골절 10 (768,)
쓰러지다 10 (768,)
가렵다 10 (768,)
배고프다 10 (768,)
친구 10 (768,)
병원 10 (768,)
다리 10 (768,)


In [None]:
# Evaluate with the max-pooled dictionary rebuilt from all_features.pkl (tmp_max);
# the stored max/average pickles compared as identical, so the original call is kept only for reference.
# acc_max = predict_with_dictionary(all_features, word_dictionary_max)
acc_max = predict_with_dictionary(all_features, tmp_max)

단어(class_label): 붕대, 예측(most_similar_word): 다리
단어(class_label): 붕대, 예측(most_similar_word): 다리
단어(class_label): 붕대, 예측(most_similar_word): 다리
단어(class_label): 붕대, 예측(most_similar_word): 다리
단어(class_label): 붕대, 예측(most_similar_word): 다리
단어(class_label): 붕대, 예측(most_similar_word): 다리
단어(class_label): 붕대, 예측(most_similar_word): 다리
단어(class_label): 붕대, 예측(most_similar_word): 다리
단어(class_label): 붕대, 예측(most_similar_word): 다리
단어(class_label): 붕대, 예측(most_similar_word): 다리
---------------------------------
단어(class_label): 구급차, 예측(most_similar_word): 다리
단어(class_label): 구급차, 예측(most_similar_word): 다리
단어(class_label): 구급차, 예측(most_similar_word): 다리
단어(class_label): 구급차, 예측(most_similar_word): 다리
단어(class_label): 구급차, 예측(most_similar_word): 다리
단어(class_label): 구급차, 예측(most_similar_word): 다리
단어(class_label): 구급차, 예측(most_similar_word): 다리
단어(class_label): 구급차, 예측(most_similar_word): 다리
단어(class_label

In [None]:
# Evaluate with the average-pooled dictionary rebuilt from all_features.pkl (tmp_avg);
# the stored max/average pickles compared as identical, so the original call is kept only for reference.
# acc_average = predict_with_dictionary(all_features, word_dictionary_average)
acc_average = predict_with_dictionary(all_features, tmp_avg)

단어(class_label): 붕대, 예측(most_similar_word): 배고프다
단어(class_label): 붕대, 예측(most_similar_word): 친구
단어(class_label): 붕대, 예측(most_similar_word): 붕대
단어(class_label): 붕대, 예측(most_similar_word): 쓰러지다
단어(class_label): 붕대, 예측(most_similar_word): 붕대
단어(class_label): 붕대, 예측(most_similar_word): 골절
단어(class_label): 붕대, 예측(most_similar_word): 가렵다
단어(class_label): 붕대, 예측(most_similar_word): 의사
단어(class_label): 붕대, 예측(most_similar_word): 의사
단어(class_label): 붕대, 예측(most_similar_word): 다리
---------------------------------
단어(class_label): 구급차, 예측(most_similar_word): 친구
단어(class_label): 구급차, 예측(most_similar_word): 가렵다
단어(class_label): 구급차, 예측(most_similar_word): 쓰러지다
단어(class_label): 구급차, 예측(most_similar_word): 구급차
단어(class_label): 구급차, 예측(most_similar_word): 가렵다
단어(class_label): 구급차, 예측(most_similar_word): 배고프다
단어(class_label): 구급차, 예측(most_similar_word): 구급차
단어(class_label): 구급차, 예측(mo

In [None]:
# Report the accuracy obtained with each pooled dictionary.
print(f'max_pooling : {acc_max}')
print(f'average_pooling : {acc_average}')

max_pooling : 0.12
average_pooling : 0.43
