# Resizing

In [1]:
import subprocess
def convert_resize_video(input_path, output_path, target_resolution=(256, 256), bitrate='1000k'):
    try:
        # 임시 파일 경로 설정
        temp_mp4_path = 'temp.mp4'

        # Step 1: .avi를 .mp4로 변환
        cmd1 = [
            'ffmpeg',
            '-i', input_path,       
            # 입력 파일 경로
            '-c:v', 'mpeg4',          # 비디오 코덱 설정 
            '-f', 'rawvideo',          
#             '-b:v', bitrate,            # 비트레이트 설정
            '-y',                       # 덮어쓰기 허용
            output_path
#             temp_mp4_path                # 임시 출력 파일 경로 (.mp4)
        ]

        # FFmpeg 명령어 실행 (Step 1)
        subprocess.run(cmd1, check=True)

#         # Step 2: 리사이징 및 재인코딩
#         cmd2 = [
#             'ffmpeg',
#             '-i', temp_mp4_path,                 # 입력 파일 경로 (임시 .mp4 파일)
#             '-vf', f'scale={target_resolution[0]}:{target_resolution[1]}',  # 리사이징
#             '-c:v', 'mpeg4',                   # 비디오 코덱 설정 
#             '-f', 'rawvideo',          
# #             '-b:v', bitrate,                     # 비트레이트 설정
#             '-y',                                # 덮어쓰기 허용
#             output_path                          # 출력 파일 경로
#         ]

#         # FFmpeg 명령어 실행 (Step 2)
#         subprocess.run(cmd2, check=True)

#         # 임시 파일 삭제
#         subprocess.run(['rm', temp_mp4_path])

        print(f'동영상 변환 및 리사이징이 완료되었습니다. 저장 경로: {output_path}')
    except subprocess.CalledProcessError as e:
        print(f'오류 발생: {e}')
    except Exception as e:
        print(f'알 수 없는 오류 발생: {e}')

# 사용 예시
input_video_path = '/data_1/seclab_nahyun/UCF101_subset/train/SkyDiving/v_SkyDiving_g17_c05.avi'
output_video_path = './resized_video.mp4'
convert_resize_video(input_video_path, output_video_path)
# !ls ./~/jupyter_notebook/UCF101_subset/train/SkyDiving/v_SkyDiving_g17_c05.avi

오류 발생: Command '['ffmpeg', '-i', '/data_1/seclab_nahyun/UCF101_subset/train/SkyDiving/v_SkyDiving_g17_c05.avi', '-c:v', 'mpeg4', '-f', 'rawvideo', '-y', './resized_video.mp4']' returned non-zero exit status 1.


ffmpeg version 3.4.11-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 7 (Ubuntu 7.5.0-3ubuntu1~18.04)
  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --enable-gpl --disable-stripping --enable-avresample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librubberband --enable-librsvg --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-li

In [18]:
from tensorflow.python.client import device_lib


import os
# 0번 gpu 만을 사용하고 싶은 경우
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Motion Vector

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import cv2
import torch
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.append('./dist/coviar-0.1-py3.9-linux-x86_64.egg')
from coviar import load, get_num_frames
import os

def visualize_motion_vector(mv, stride=20):
    Y, X, _ = mv.shape
    x, y = np.meshgrid(np.arange(X), np.arange(Y))
    u = mv[..., 0]
    v = mv[..., 1]

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.quiver(x[::stride, ::stride], y[::stride, ::stride], u[::stride, ::stride], v[::stride, ::stride], angles='xy', scale_units='xy', scale=1, color='r')
    ax.set_aspect('equal')
    plt.gca().invert_yaxis()
    plt.show()
    
def compute_motion_vectors_and_weight_maps(video_path, GOP_SIZE, height, width, num_frame):
    representation_type = 1  # 모션 벡터를 위한 값
    accumulate = True  # 누적된 표현을 위한 값

    """
    비디오에서 모션 벡터와 가중치 맵을 계산하는 함수.

    Parameters:
    - video_path: 비디오 파일의 경로
    - GOP_SIZE: GOP의 크기

    Returns:
    - all_motion_vectors: 각 프레임의 모션 벡터 텐서
    - weight_maps: 각 GOP의 가중치 맵 텐서
    """

    total_frames = num_frame
#     print('total_frames!', total_frames)
#     print('video_path!', video_path, height, width)
    
    if num_frame % GOP_SIZE == 0:
        num_iframe = num_frame // GOP_SIZE
    else:
        num_iframe = num_frame // GOP_SIZE + 1
    
    # 각 프레임에 대한 가중치 맵과 모션 벡터 텐서 초기화
    weight_maps_tensor = torch.zeros((num_iframe, 3, height, width))
    all_motion_vectors_tensor = torch.zeros((total_frames, height, width, 2))
    gop_start = 0
    frame_index = 0

    for gop_start in range(0, total_frames, GOP_SIZE):
        weight_map = np.zeros((height, width), dtype=np.float32)

        for offset in range(1, GOP_SIZE):  # I 프레임 제외
            frame_index = gop_start + offset
            
            if frame_index >= total_frames:
                break
            
#             print('frame_index:', frame_index)
            try:
#                 print(video_path, frame_index // GOP_SIZE, frame_index % GOP_SIZE, representation_type, accumulate)
                mv = load(video_path, frame_index // GOP_SIZE, frame_index % GOP_SIZE, representation_type, accumulate)
#                 visualize_motion_vector(mv)
                # mv의 유효성 확인
                if mv is None or mv.size == 0:
                    raise ValueError(f"Invalid motion vector data at frame {frame_index}")

                # mv의 각 채널을 float32로 변환하고 개별적으로 리사이즈
                mv_x = cv2.resize(mv[..., 0].astype(np.float32), (height, width))
                mv_y = cv2.resize(mv[..., 1].astype(np.float32), (height, width))

                # 리사이즈된 채널들을 다시 합치기
                mv = np.stack([mv_x, mv_y], axis=-1)

                # 모션 벡터의 크기 (움직임의 거리)를 계산하고 가중치 맵에 누적합니다.
                magnitude = np.sqrt(mv[:,:,0]**2 + mv[:,:,1]**2)
                weight_map += magnitude

                # 모션 벡터 저장
                all_motion_vectors_tensor[frame_index] = torch.tensor(mv)

            except Exception as e:
                print(f"Error occurred at frame {frame_index}:", e)
                mv = np.zeros((height, width, 2), dtype=np.int32)  # 0벡터 할당

        # RGB 채널 각각에 동일한 가중치 맵을 복사
        for channel in range(3):
            weight_maps_tensor[gop_start // GOP_SIZE, channel] = torch.tensor(weight_map)
    
#     print('weight_maps_tensor', weight_maps_tensor.shape, 'all_motion_vectors_tensor', all_motion_vectors_tensor.shape)
    
#     visualize_weight_map(weight_maps_tensor)
#     visualize_motion_vectors(all_motion_vectors_tensor)
    return all_motion_vectors_tensor, weight_maps_tensor


# Segmentation

In [4]:
import torch
from PIL import Image
import torch
import torchvision.transforms as T

def perform_segmentation_with_interval(segmentation_module, original_tensor, interval=12, device='cuda:1'):
    
    # Define the image transform
    transform_img = T.Compose([
        T.Resize(512),
        T.ToTensor(), 
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Ensure the tensor is on the GPU
    original_tensor = original_tensor.to(device)

    # Initialize the scores tensor on CPU
    num_frames = original_tensor.size(0)
    selected_frames = list(range(0, num_frames, interval))
    scores = torch.zeros(len(selected_frames), 150, original_tensor.size(2), original_tensor.size(3))

    # Perform segmentation for selected frames
    with torch.no_grad():  # Use no_grad to save GPU memory
        for i, frame_idx in enumerate(selected_frames):
            img = original_tensor[frame_idx].cpu().numpy()
            img = np.transpose(img, (1, 2, 0))
            img = Image.fromarray(np.uint8(img))
            img = img.convert('RGB')
            segSize = (img.size[1], img.size[0])
            
            # 이미지를 텐서로 변환
            img_transformed = transform_img(img)
            img_transformed = img_transformed.unsqueeze(0).to(device)

            feed_dict = {'img_data': img_transformed}
            pred_tmp = segmentation_module(feed_dict, segSize=segSize)
            scores[i] = pred_tmp

    # Move the scores tensor to GPU if needed
    scores = scores.to(device)

    _, pred = torch.max(scores, dim=1)

    return pred.cpu().numpy()

# Mask

In [5]:
import numpy as np
import pandas as pd
import torch

def create_mask_for_stuff_pixels(segmentation_tensor, color_encoding_path='./color_coding_semantic_segmentation_classes.csv'):
    """
    Create a mask to remove pixels with 'Stuff' values in the segmentation tensor.

    Parameters:
    - segmentation_tensor: A tensor of shape (num_frames, height, width) containing the segmentation results
    - color_encoding_path: Path to the CSV file containing color encodings for semantic segmentation classes

    Returns:
    - masks: A tensor of shape (num_frames, height, width) containing the masks
    """
    
    # Read the color encoding CSV
    color_df = pd.read_csv(color_encoding_path) 

    # Extract the Idx values where Stuff is 1
    stuff_pixel_values = color_df[color_df['Stuff'] == 1]['Idx'].tolist()

    # Create a mask where pixels with stuff values are set to 0, others are set to 1
    masks = np.ones_like(segmentation_tensor)
    for value in stuff_pixel_values:
        masks[segmentation_tensor == value - 1] = 0

    # Convert the numpy array to a PyTorch tensor
    masks_tensor = torch.tensor(masks, dtype=torch.float32)

    return masks_tensor


# Visualization

In [6]:
import matplotlib.pyplot as plt
import torch
import math
import numpy as np

def visualize_tensor_image(image_tensor):
    """
    Visualize a given image tensor.

    Parameters:
    - image_tensor: A tensor of shape [3, H, W]

    Returns:
    None
    """
    # Check if the tensor is on GPU, if yes, move it to CPU
    if image_tensor.device != torch.device('cpu'):
        image_tensor = image_tensor.cpu()

    # Convert the tensor to numpy array
    image_array = image_tensor.numpy()

    # Transpose the array to shape [H, W, 3] for visualization
    image_array = image_array.transpose(1, 2, 0)

    # Display the image
    plt.imshow(image_array)
    plt.axis('off')  # Hide axes for better visualization
    plt.show()


def visualize_weight_map(weight_maps_tensor, stride=5):
    num_iframes, num_channels, height, width = weight_maps_tensor.shape
    
    cols = 7  # 한 행에 7개의 프레임
    rows = math.ceil(num_iframes*num_channels / cols)  # 필요한 행의 수 계산

    # figsize의 값을 조절하여 전체 그림의 크기를 늘립니다.
    plt.figure(figsize=(cols * 3, rows * 3))  # 각 프레임의 크기를 대략 3x3으로 설정

    

    for iframe in range(num_iframes):
        for channel in range(num_channels):
            plt.subplot(rows, cols, iframe * num_channels + channel + 1)
            weight_map = weight_maps_tensor[iframe, channel].numpy()
            plt.imshow(weight_map, cmap='viridis')
            plt.title(f'I-Frame {iframe + 1}, Channel {channel + 1}')

    plt.tight_layout()
    plt.show()

    
def visualize_masks(masks_tensor):
    """
    Visualize the masks.

    Parameters:
    - masks_tensor: A tensor of shape (num_frames, height, width) containing the masks
    """
    
    num_frames, height, width = masks_tensor.shape
    
    cols = 7  # 한 행에 7개의 프레임
    rows = math.ceil(num_frames / cols)  # 필요한 행의 수 계산

    # figsize의 값을 조절하여 전체 그림의 크기를 늘립니다.
    plt.figure(figsize=(cols * 3, rows * 3))  # 각 프레임의 크기를 대략 3x3으로 설정

    
    for frame_idx in range(num_frames):
        plt.subplot(rows, cols, frame_idx+1)
        mask = masks_tensor[frame_idx].numpy()

#         plt.figure(figsize=(8, 8))
        plt.imshow(mask, cmap='gray')  # You can choose a different colormap
        plt.title(f'Mask - Frame {frame_idx}')
        plt.axis('off')
    plt.tight_layout()
    plt.show()


def visualize_motion_vectors(motion_tensor):
    """
    Visualize motion vectors for each GOP and frame.

    Parameters:
    - motion_tensor: A tensor of shape [num_frames, 3, H, W]

    Returns:
    None
    """
    print(motion_tensor.shape)
    num_frames, _, _, _ = motion_tensor.shape
    
    cols = 7  # 한 행에 7개의 프레임
    rows = math.ceil(num_frames / cols)  # 필요한 행의 수 계산

    # figsize의 값을 조절하여 전체 그림의 크기를 늘립니다.
    plt.figure(figsize=(cols * 3, rows * 3))  # 각 프레임의 크기를 대략 3x3으로 설정

    stride = 20

    # Check if the tensor is on GPU, if yes, move it to CPU
    if motion_tensor.device != torch.device('cpu'):
        motion_tensor = motion_tensor.cpu()

    for frame_idx in range(num_frames):
        mv = motion_tensor[frame_idx]  # .numpy()
        Y, X, _ = mv.shape
        x, y = np.meshgrid(np.arange(X), np.arange(Y))
        u = mv[..., 0]
        v = mv[..., 1]

        plt.subplot(rows, cols, frame_idx + 1)
        plt.quiver(x[::stride, ::stride], y[::stride, ::stride], u[::stride, ::stride], v[::stride, ::stride],
                   angles='xy', scale_units='xy', scale=1, color='r')
        plt.gca().set_aspect('equal')
        plt.gca().invert_yaxis()

    plt.tight_layout()
    plt.show()
    
def visualize_motion_vector(mv, stride=20):
    Y, X, _ = mv.shape
    x, y = np.meshgrid(np.arange(X), np.arange(Y))
    u = mv[..., 0]
    v = mv[..., 1]

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.quiver(x[::stride, ::stride], y[::stride, ::stride], u[::stride, ::stride], v[::stride, ::stride], angles='xy', scale_units='xy', scale=1, color='r')
    ax.set_aspect('equal')
    plt.gca().invert_yaxis()
    plt.show()

# 최종 노이즈 시각화
def visualize_warped_frames(warped_frames):
    """
    Visualize the given warped frames (noise).

    Parameters:
    - warped_frames: Tensor of shape [num_frames, C, H, W]
    """
    
    num_frames, C, H, W = warped_frames.shape
    
    # Ensure the tensor is on CPU for visualization
    warped_frames = warped_frames.cpu()

    cols = 7  # 한 행에 7개의 프레임
    rows = math.ceil(num_frames / cols)  # 필요한 행의 수 계산

    # figsize의 값을 조절하여 전체 그림의 크기를 늘립니다.
    plt.figure(figsize=(cols * 3, rows * 3))  # 각 프레임의 크기를 대략 3x3으로 설정

#     for idx in range(num_frames):
    for idx in range(7):
        plt.subplot(rows, cols, idx+1)
        frame = warped_frames[idx].numpy()
        
        frame = warped_frames[idx].numpy()
        
        # Transpose the array to shape [H, W, 3] for visualization
        frame = frame.transpose(1, 2, 0)

        # Clip the values to the range [0, 1]
        frame = np.clip(frame, 0, 1)

        # Normalize the frame for visualization
#         frame = (frame - frame.min()) / (frame.max() - frame.min())
        
        plt.imshow(frame)
        plt.axis('off')
        plt.title(f"Frame {idx}")

    plt.tight_layout()
    plt.show()
    
    
    
def visualize_video_frames_tf(video_frames):
    """
    동영상 프레임을 시각화하는 함수

    Parameters:
    - video_frames: 동영상 프레임의 텐서 (프레임수, 높이, 너비, 채널)

    Returns:
    - None
    """
    print("!!!!, ", video_frames.shape)
    # 텐서를 넘파이 배열로 변환
    frames_np = video_frames.numpy()

    # 프레임수
    num_frames = frames_np.shape[0]
    cols = 7  # 한 행에 7개의 프레임
    rows = math.ceil(num_frames / cols)  # 필요한 행의 수 계산

    # figsize의 값을 조절하여 전체 그림의 크기를 늘립니다.
    plt.figure(figsize=(cols * 3, rows * 3))

    # 각 프레임을 시각화
#     for i in range(num_frames):
    for i in range(7):
        plt.subplot(rows, cols, i + 1)
        plt.imshow(frames_np[i])
#         plt.title(f'프레임 {i+1}')
        plt.axis('off')
        
    plt.tight_layout()
    plt.show()

import os
from PIL import Image
import math
import tensorflow as tf

def save_video_frames_tf(video_frames, save_path='./adversarial_examples/'):
    """
    동영상 프레임을 이미지 파일로 저장하는 함수

    Parameters:
    - video_frames: 동영상 프레임의 텐서 (프레임수, 높이, 너비, 채널)
    - save_path: 이미지를 저장할 경로

    Returns:
    - None
    """
    # 텐서를 넘파이 배열로 변환
    frames_np = video_frames.numpy()
    
    
    # 데이터 타입을 uint8로 변환
    frames_np = (frames_np * 255).astype(np.uint8)

    # 프레임수
    num_frames = frames_np.shape[0]

    # 저장 경로 확인 및 생성
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # 각 프레임을 이미지 파일로 저장
    for i in range(num_frames):
        frame_image = Image.fromarray(frames_np[i])
        frame_image.save(os.path.join(save_path, f'frame_{i:03d}.png'))
    
    
def visualize_video(video_array):
    """
    주어진 numpy.ndarray 동영상을 시각화하는 함수.

    Parameters:
    - video_array: (num_frame, height, width, 3) shape의 numpy.ndarray 동영상

    Returns:
    - None
    """
    
    num_frames, height, width, _ = video_array.shape

    for frame_idx in range(num_frames):
        plt.figure(figsize=(10, 6))
        plt.imshow(video_array[frame_idx])
        plt.axis('off')
        plt.title(f"Frame {frame_idx + 1}/{num_frames}")
        plt.show()
#         input("Press Enter to continue to the next frame...")



# Apply Motion Vectors

In [7]:
import torch
import torch.nn.functional as FF

def normalize_grid(grid):
    # 그리드의 최대값과 최소값을 가져옵니다.
    max_val = grid.max()
    min_val = grid.min()

    # 그리드 값을 [-1, 1] 범위로 정규화
    normalized_grid = 2 * (grid - min_val) / (max_val - min_val) - 1

    return normalized_grid

# 이 함수는 exp_noise를 각 GOP의 첫 번째 프레임에 설정하고, 나머지 프레임들에 대해 해당 프레임의 모션 벡터를 사용하여 노이즈를 변형합니다. 
# 결과적으로, 각 GOP의 모든 프레임에 대한 노이즈가 생성됩니다.
def apply_motion_vector_to_iframe_tensor(iframe, motion_vector, scale_factor, grid_density):
    C, H, W = iframe.shape
#     print(iframe.shape, motion_vector.shape)
#     visualize_tensor_image(iframe)

    # Create a dense grid
    x = torch.linspace(0, W-1, W*grid_density).unsqueeze(0).repeat(H*grid_density, 1)
    y = torch.linspace(0, H-1, H*grid_density).unsqueeze(1).repeat(1, W*grid_density)

    # Resize motion vectors to match the dense grid
    mv_x = FF.interpolate(motion_vector[..., 0].unsqueeze(0).unsqueeze(0), size=(H*grid_density, W*grid_density), mode='bilinear', align_corners=True).squeeze()
    mv_y = FF.interpolate(motion_vector[..., 1].unsqueeze(0).unsqueeze(0), size=(H*grid_density, W*grid_density), mode='bilinear', align_corners=True).squeeze()

    # Apply motion vectors to the grid
    x_new = x - scale_factor * mv_x
    y_new = y - scale_factor * mv_y

    grid = torch.stack((x_new, y_new), dim=2).unsqueeze(0)
    
    # 그리드 값을 확인하고 필요한 경우 정규화
#     print("Before normalization:", grid.min(), grid.max())
    grid = normalize_grid(grid)
#     print("After normalization:", grid.min(), grid.max())
    
    # Move the grid tensor to the same device as iframe
    grid = grid.to(iframe.device)
    
    # Use grid_sample to warp the frame
    warped_frame = FF.grid_sample(iframe.unsqueeze(0), grid, mode='bilinear', padding_mode='reflection', align_corners=True).squeeze()
    

    # Resize the warped frame to the original resolution
    warped_frame = FF.interpolate(warped_frame.unsqueeze(0), size=(H, W), mode='bilinear', align_corners=True).squeeze()

#     visualize_tensor_image(warped_frame)
    return warped_frame


def apply_motion_vectors_to_all_iframes(iframes, motion_vectors, scale_factor=1.0, grid_density=4):
    """
    Apply motion vectors to all I-frames.

    Parameters:
    - iframes: Tensor of shape [num_gops, C, H, W]
    - motion_vectors: Tensor of shape [total_frames, H, W, 2]

    Returns:
    - warped_frames: Tensor of shape [total_frames, C, H, W]
    """
    
    num_gops, C, H, W = iframes.shape
    total_frames = motion_vectors.shape[0]
    GOP_SIZE = total_frames // num_gops
    
    # Initialize the result tensor
    warped_frames = torch.zeros((total_frames, C, H, W), device=iframes.device)

    frame_counter = 0
    for gop_idx in range(num_gops):
        # Set the first frame of each GOP as the I-frame
        warped_frames[frame_counter] = iframes[gop_idx]
        frame_counter += 1
        
        for _ in range(GOP_SIZE - 1):
            if frame_counter < total_frames:  # Ensure we don't exceed the total number of frames
                mv = motion_vectors[frame_counter]
                iframe = iframes[gop_idx]
                
                warped = apply_motion_vector_to_iframe_tensor(iframe, mv, scale_factor, grid_density)
                warped_frames[frame_counter] = warped
                frame_counter += 1

    return warped_frames



# Utils

In [8]:
import sys
import json
import pickle
import argparse
sys.path.append('/home/ksknh7/.local/bin/pip')
# import mxnet as mx
# from mxnet import nd
import torch
import shutil
import glob
import numpy as np
import pdb
# from gluoncv.data.transforms import video
import torch.nn.functional as F
import torch.nn as nn
import torchvision
import imageio
import subprocess

# transform_post = video.VideoNormalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])

import torch
import torchvision.transforms as transforms
from PIL import Image
import tensorflow as tf
def preprocess_images(frames):
    processed_frames = []

    for i in range(frames.shape[0]):
        frame = frames[i]

        # Transpose to move the channel dimension to the last axis
        frame = tf.transpose(frame, perm=[1, 2, 0])

        # Convert to PIL Image equivalent
        frame = tf.image.convert_image_dtype(frame, dtype=tf.uint8) # 얘를 안하면 이미지 출력이 안됨 
#         frame = tf.cast(frame, dtype=tf.uint8)
        
        # Resize
#         frame = tf.image.resize(frame, (224, 224))
        
        # Normalize
        frame = tf.cast(frame, dtype=tf.float32) / 255.0
#         frame = frame / 255.0
        mean = [0.485, 0.456, 0.406]
        std = [0.229, 0.224, 0.225]
        frame = (frame - mean) / std
        
        frame = tf.transpose(frame, perm=[2, 0, 1])

        processed_frames.append(frame)

    return tf.stack(processed_frames)

def inverse_preprocess_images(frames, device):
    processed_frames = []

    for i in range(frames.shape[0]):
        frame = frames[i]

        # Transpose to move the channel dimension to the last axis
        frame = frame.permute(1, 2, 0)
        
#         start_time = time.time()

        # Denormalize
        mean = torch.tensor([0.485, 0.456, 0.406]).to('cpu')
        std = torch.tensor([0.229, 0.224, 0.225]).to('cpu')
        frame = frame.to('cpu')
#         end_time = time.time()
#         elapsed_time = end_time - start_time
#         print(f"  -CPU옮기는 시간: {elapsed_time}초")
#         start_time = time.time()


#         mean = torch.tensor([0.485, 0.456, 0.406]).to(device)
#         std = torch.tensor([0.229, 0.224, 0.225]).to(device)
#         frame = frame.to(device)
        frame = ((frame * std) + mean) * 255.0
        
        frame = tf.cast(frame, dtype=tf.uint8)
#         frame = frame.byte()
        frame = tf.image.convert_image_dtype(frame, dtype=tf.float32)
#         frame = frame.to(torch.float32)
#         frame = tf.image.resize(frame, (256, 256))

        # convert to JPEG 
#         frame = tf.cast(frame, dtype=tf.uint8)
#         frame = tf.cast(frame, dtype=tf.float32)
#         frame = tf.image.convert_image_dtype(frame , dtype=tf.uint8)
#         frame = tf.image.encode_jpeg(frame)
#         frame = tf.image.decode_jpeg(frame, channels=3)
#         end_time = time.time()
#         elapsed_time = end_time - start_time
#         print(f"  -나머지 시간: {elapsed_time}초")

        
        processed_frames.append(frame)
        
    return tf.stack(processed_frames)


# def transform_input_to_mxnet(input_tensor):
# #     # PyTorch 텐서를 MXNet NDArray로 변환
# #     input_ndarray = mgs_ = transform_post(input_tensor)
# #     data = input_tensor.reshape((-1,) + (args.new_length, 3, args.input_size, args.input_size))
# #     data = data.reshape((-1,) + (imgs.shape[0] , 3, 256, 256))

#     # 입력 텐서의 shape를 가져옵니다.
#     clip_length, num_channels, height, width = input_tensor.shape
    
#     # 배치 차원과 깊이 차원을 추가합니다.
#     input_data = input_tensor.reshape((1, num_channels, clip_length, height, width))
    
#     min_val = np.min(input_data, axis=(2, 3, 4), keepdims=True)
#     max_val = np.max(input_data, axis=(2, 3, 4), keepdims=True)
#     normalized_data = (input_data - min_val) / (max_val - min_val)
    
#     return normalized_data

def save_images_with_imageio(num_frame, original_images, adv_images, save_path):
    for i in range(num_frame):
        original_image = (original_images[i] * 255).astype('uint8')
        adv_image = (adv_images[i] * 255).astype('uint8')
        
        original_image_path = f"{save_path}/original_{i}.png"
        adv_image_path = f"{save_path}/adv_{i}.png"
        
        imageio.imwrite(original_image_path, original_image)
        imageio.imwrite(adv_image_path, adv_image)
        
def normalization(imgs):
    batch_size = imgs.shape[0]
    input_data = mx.nd.array(imgs)
    data = np.stack(imgs, axis=0)
    data = data.reshape((-1,) + (imgs.shape[0] , 3, 256, 256))
    data = np.transpose(data, (0, 2, 1, 3, 4))
    print("normalized imgs shape:", imgs.shape)

    return data

def Cross_Entropy(logits, target, kappa=0):
    criterion = nn.CrossEntropyLoss()
    other, other_class = logits.max(1)
    loss = criterion(logits, other_class.long())
    #pdb.set_trace()
    return loss, loss.item(), loss.item(), loss.item(), loss.item(), loss.item(), loss.item()


def CWLoss(logits, target, device, kappa=0):
    logits = F.softmax(logits, dim=1)

    target_onehot = torch.zeros(1, 101).to(device)
    target_onehot[0, target] = 1
    real = (target_onehot * logits).sum(1)[0]
    tmp_logit = ((1. - target_onehot) * logits - target_onehot*10000.)

    other, other_class = logits.max(1)
    sort_prob, sort_class = logits.sort()
    second_logit = sort_prob[0][-2].unsqueeze(0)
    second_class = sort_class[0][-2].unsqueeze(0)
    
    return torch.clamp(torch.sum(logits)-second_logit, kappa), target.item(), real.item(), other.item(), other_class.item(), second_logit.item(), second_class.item() # untargeted
#     return torch.clamp(other-5*real, kappa), target.item(), real.item(), other.item(), other_class.item(), second_logit.item(), second_class.item() # targeted


def norm2(x):
    assert len(x.shape) == 4
    norm_vec = torch.sqrt(x.float().pow(2).sum(dim=[1,2,3])).view(-1, 1, 1, 1)
    norm_vec += (norm_vec == 0).float()*1e-8
    return norm_vec

def _pert_loss(logits, ori_label, target_label, delta_motion, device):
    cw_loss = CWLoss
    #cw_loss = Cross_Entropy
    loss, target, real, other, other_class, second_logit, second_class = cw_loss(logits, target_label, device)
    loss = loss.squeeze(0)
    return loss, target, real, other, other_class, second_logit, second_class


# Black Box Attack

In [9]:
from coviar import get_num_frames
from coviar import load
import numpy as np
import torch
import copy
import pdb
import time
import torch.nn.functional as F
# import mxnet.ndarray as FF
import random

GOP_SIZE = 12

def _perturbation_image(model,
                      original_image,
                      ori_label,
                      video_path,
                      save_path,
                      transform_post,
                      args,
                      config,
                      device, segmentation_module):
    

    original_image = original_image.to(device)

#     total_frames = get_num_frames(video_path)
    original_image_ = original_image.clone()
    
    
    num_frame, channel, height, width = original_image.shape
    dim = height * width * channel
    loop = 0
    inner_loop = 0
    success = False
    num_query = 0
    num_pframe = 0

    max_query = 60000
    base_exploration = 0.1
    fd_eta = 0.1
    online_lr = 0.1
    flow_lr = 0.005
    
    
    ori_label = ori_label.to(device)
    target_label = torch.tensor([random.sample(range(101), 1)[0]]).to(device)
    while target_label == ori_label:
        target_label = torch.tensor([random.sample(range(101), 1)[0]]).to(device)
    print('target_label', target_label, 'ori_label', ori_label)
    
    if num_frame % GOP_SIZE == 0:
        num_iframe = num_frame // GOP_SIZE
    else:
        num_iframe = num_frame // GOP_SIZE + 1

    prior = torch.zeros(num_frame, channel, height, width).to(device)
    delta = torch.zeros(num_frame, channel, height, width).to(device)
    est_grad = torch.zeros(num_frame, channel, height, width).to(device)
    adv_img = torch.zeros(3, num_frame, channel, height, width).to(device)
    iframe = torch.zeros(num_frame, height, width, channel).to(device)
    noise_frames = torch.zeros(num_frame, channel, height, width).to(device)
    noise_iframes = torch.zeros(num_frame, channel, height, width).to(device)
    segmentation_result = torch.zeros(num_iframe, height, width).to(device)
    dynamic_exploration = torch.zeros(height, width).to(device)

    index_visual = torch.zeros(num_frame, 2, height, width).to(device)
    index_motion = torch.zeros(num_frame, height, width, 2).to(device)
    
#     print('original_image:', original_image.shape)
#     visualize_warped_frames(original_image)
    
    segmentation_result = perform_segmentation_with_interval(segmentation_module, original_image, interval=12, device=device)
#     print('segmentation', segmentation_result.shape)
    mask = create_mask_for_stuff_pixels(segmentation_result)
#     print('mask', mask.shape)
#     visualize_masks(mask)
    
    motion_vectors, weight_maps = compute_motion_vectors_and_weight_maps(video_path, GOP_SIZE=12, height=height, width=width, num_frame=num_frame)
#     print('motion_vectors', motion_vectors.shape, 'weight_maps', weight_maps.shape)
    
#     # I-frame과 각 프레임 간의 Optical Flow 계산
#     motion_vectors = calculate_optical_flow(video_path, cap, gop_size=GOP_SIZE)
#     cap.release()

    # exploration 값을 동적으로 조절, dynamic_exploration_tensor: torch.Size([9, 3, 256, 256])
    dynamic_exploration = base_exploration * weight_maps 
#     print(dynamic_exploration.shape)
    # dynamic_exploration 값을 사용하여 노이즈 프레임의 크기를 조절 노이즈 프레임을 모션 벡터와 결합하여 적대적 섭동을 생성
    dynamic_exploration_tensor = dynamic_exploration.clone().detach().to(device)
    
    while not (num_query > max_query):
        pred_adv_logit = list()
        start1 = time.time()
            
        gop_index = loop // GOP_SIZE

        # 노이즈 프레임 생성: iframe에 대한 랜덤 노이즈 프레임(noise_frames)을 생성
        noise_frames = torch.randn(1, 3, height, width).repeat(num_iframe, 1, 1, 1).to(device)
#         visualize_warped_frames(noise_frames)
#         print('noise_frames.shape', noise_frames.shape, mask.shape)
        
        # masks를 noise_frames와 동일한 shape으로 확장 
        expanded_masks = mask.unsqueeze(1).expand_as(noise_frames)
#         print('mask', mask.shape, expanded_masks.shape)

        # noise_frames에 마스크를 적용
#         masked_noise_frames = noise_frames * expanded_masks.to(device)
        masked_noise_frames = noise_frames

#         print('dynamic_exploration_tensor:', dynamic_exploration_tensor.shape, 'masked_noise_frames:', masked_noise_frames.shape)
        iframe_noise = dynamic_exploration_tensor * masked_noise_frames
#         iframe_noise = dynamic_exploration_tensor * noise_frames # seg X
        applied_noise = apply_motion_vectors_to_all_iframes(iframe_noise, motion_vectors)
#         print('iframe_noise', iframe_noise.shape, 'applied_noise', applied_noise.shape)
        
        q1 = prior + applied_noise
        q2 = prior - applied_noise
#         visualize_warped_frames(q1)

        adv_img[0] = original_image + fd_eta*q1/norm2(q1)
        adv_img[1] = original_image + fd_eta*q2/norm2(q2)
        adv_img[2] = original_image
        
        for i in range(3):
#             data = adv_img[i].clone()
            
            adv_img_np = adv_img[i].clone().cpu().numpy()
            data = np.stack(adv_img_np, axis=0)
            data = torch.tensor(np.array(data)).to(device)
            
#             start_time = time.time()
            restored_frames = tf.expand_dims(inverse_preprocess_images(data, device), axis=0)
#             restored_frames = inverse_preprocess_images(data, device).unsqueeze(0)
#             print('!1!!!!!! data shape: ', restored_frames.shape, '// type', type(restored_frames))
            
#             end_time = time.time()
#             elapsed_time = end_time - start_time
#             print(f"전처리 시간: {elapsed_time}초")
            
#             print(restored_frames.device) 
            
#             start_time = time.time()
#             with tf.device('/GPU:0'):
            pred = model.predict(restored_frames, verbose=0)
#             pred = model(restored_frames)
#             end_time = time.time()
#             elapsed_time = end_time - start_time
#             print(f"모델 쿼리 실행 시간: {elapsed_time}초")
#             start_time = time.time()
            
#             pred = model(restored_frames)
            pred_classes = np.argmax(pred)
            pred = torch.tensor(pred).to(device)            
            
#             print('pred shape: ', pred.shape ,'pred type: ', type(pred) , 'pred_classes: ', pred_classes)
            
            pred_adv_logit.append(pred)
            
#             loss = criterion(logits, target_label)  # 손실 함수 정의 필요
#             losses.append(loss.item())
            
#         losses = torch.tensor(losses).to(device)
#         print('losses: ', losses, losses.shape)
#         weighted_noises = (losses - losses.mean()) / (losses.std() + 1e-10)  # 정규화
#         est_grad = torch.mean(weighted_noises.view(population_size, 1, 1, 1, 1) * noises, dim=0)

#         # 그래디언트를 사용하여 적대적 이미지 업데이트
#         delta = online_lr * est_grad.sign()
#         original_image = torch.clamp(original_image + delta, 0, 1)


        l1, _, _, _, _, _, _ = _pert_loss(pred_adv_logit[0].to(device), ori_label, target_label, delta, device)
        l2, _, _, _, _, _, _ = _pert_loss(pred_adv_logit[1].to(device), ori_label, target_label, delta, device)
        loss, target, real, other, other_class, second_logit, second_class = _pert_loss(pred_adv_logit[2].to(device), ori_label, target_label, delta, device)

        
#         print('loss:', loss, 'other', other, 'other_class', other_class,'second_logit', second_logit, 'second_class', second_class)
        
        num_query += 3
        
        est_deriv = (l1-l2)/(fd_eta*base_exploration*base_exploration)
        est_grad = est_deriv.item() * applied_noise
        prior += online_lr * est_grad
        
#         visualize_warped_frames(applied_noise)

        original_image = original_image - flow_lr*prior.sign()
        delta = original_image_ - original_image
        original_image = torch.max(torch.min(original_image, original_image_ + 0.03), original_image_ - 0.03)
        original_image = torch.clamp(original_image, 0, 1)

#         end_time = time.time()
#         elapsed_time = end_time - start_time
#         print(f"나머지 시간: {elapsed_time}초\n")

        # 적대적 공격의 성공 여부 판단:
        pred_adv_label = pred_adv_logit[2].argmax()
        if (loop % 100 ==0) or (loop == max_query) or pred_adv_label != ori_label: # untargeted attack.
#         if (loop % 10 ==0) or (loop == max_query) or pred_adv_label == target_label:
            print('[T2]{:.3f}s for [{}]-th loop\t'
                  'Queries {:03d}\t'
                  'Overall loss {:.3f}\t'
                  'est_deriv {:.3f}\t'
                  'Target {}\t'
                  'Target logit {:.3f}\t'
                  'ori logit {:.3f}\t'
                  'ori class {}\t'
                  'second logit {:.3f}\t'
                  'second class {}\t'.format(time.time() - start1, loop,
                                            num_query, loss, est_deriv.item(), target,
                                            real, other, other_class, second_logit, second_class))
#             print(prior.shape, delta.shape)
            visualize_warped_frames(q1)
#             visualize_warped_frames(prior) 
#             visualize_warped_frames(original_image)
#             visualize_video_frames_tf(restored_frames)
            
            
        loop += 1
        if pred_adv_label != ori_label:  # untargeted attack.
#         if pred_adv_label == target_label:
            print('Predicted label is {}\t'.format(pred_adv_label))
            diff = adv_img[2] - original_image_
            print('diff max {:.3f}, diff min {:.3f}'.format(diff.max(), diff.min()))
            success = True
#             visualize_warped_frames(original_image_) # 색 이상
            
#             visualize_video_frames_tf(tf.squeeze(test_frames, axis=0))
            save_video_frames_tf(tf.squeeze(test_frames, axis=0))
#             visualize_video_frames_tf(tf.squeeze(restored_frames, axis=0)) # 흐림
            
            #save_images(num_frame, original_image_.cpu().permute(0,2,3,1).numpy(), adv_img[2].cpu().permute(0,2,3,1).numpy(), save_path)
            break

        if num_query >= max_query:
#             visualize_warped_frames(original_image_)
            save_video_frames_tf(tf.squeeze(test_frames, axis=0))
            #save_images(num_frame, original_image_.cpu().permute(0,2,3,1).numpy(), adv_img[2].cpu().permute(0,2,3,1).numpy(), save_path)
            break
            
    return pred_adv_label, num_query, success

# Attack!

In [10]:
import pathlib
download_dir = pathlib.Path('/data_1/seclab_nahyun/UCF101_subset/')
subset_paths = {'train': pathlib.Path('/data_1/seclab_nahyun/UCF101_subset/train'), 'val': pathlib.Path('/data_1/seclab_nahyun/UCF101_subset/val'), 'test': pathlib.Path('/data_1/seclab_nahyun/UCF101_subset/test')}
print(subset_paths)

# video_count_train = len(list(download_dir.glob('train/*/*.avi')))
# video_count_val = len(list(download_dir.glob('val/*/*.avi')))
video_count_test = len(list(download_dir.glob('test/*/*.avi')))
# video_total = video_count_train + video_count_val + video_count_test
print(f"Total videos: {video_count_test}")

{'train': PosixPath('/data_1/seclab_nahyun/UCF101_subset/train'), 'val': PosixPath('/data_1/seclab_nahyun/UCF101_subset/val'), 'test': PosixPath('/data_1/seclab_nahyun/UCF101_subset/test')}
Total videos: 1010


In [16]:
import cv2
import os
import random
import numpy as np
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

def format_frames(frame, output_size):
    frame = tf.image.convert_image_dtype(frame, tf.float32)
    frame = tf.image.resize_with_pad(frame, *output_size)
    return frame


# 이전에 정의한 format_frames 함수와 FrameGenerator 클래스를 그대로 사용합니다.
def frames_from_video_file(video_path, output_size=(256, 256)):
    """
    동영상 파일에서 모든 프레임을 생성합니다.

    Args:
        video_path: 동영상 파일 경로.
        output_size: 프레임 이미지의 크기.

    Returns:
        모든 프레임으로 이루어진 NumPy 배열과 video_path.
    """
    result = []
    src = cv2.VideoCapture(str(video_path))

    while True:
        ret, frame = src.read()
        if not ret:
            break
        frame = format_frames(frame, output_size)
        result.append(frame)

    src.release()
    result = np.array(result)[..., [2, 1, 0]]
#     print(result)

    return result, str(video_path)  # video_path도 반환합니다.

# 이제 frames_from_video_file 함수를 사용하여 모든 프레임을 생성하도록 FrameGenerator 클래스를 수정합니다.
class FrameGenerator:
    def __init__(self, path):
        """ 레이블링된 동영상 파일의 프레임을 생성합니다.

        Args:
            path: 동영상 파일 경로.
        """
        self.path = path
        self.class_names = sorted(set(p.name for p in self.path.iterdir() if p.is_dir()))
        self.class_ids_for_name = dict((name, idx) for idx, name in enumerate(self.class_names))

    def get_files_and_class_names(self):
        video_paths = list(self.path.glob('*/*.avi'))
        classes = [p.parent.name for p in video_paths]
        return video_paths, classes

    def __call__(self):
        video_paths, classes = self.get_files_and_class_names()

        pairs = list(zip(video_paths, classes))

        random.shuffle(pairs)

        for path, name in pairs:
            video_frames, video_path = frames_from_video_file(path)
            label = self.class_ids_for_name[name]  # 레이블 인코딩
            yield video_frames, label, video_path  # video_path도 반환합니다.

#     def generator_function(self, num_batches):
#         video_paths, classes = self.get_files_and_class_names()

#         pairs = list(zip(video_paths, classes))

#         random.shuffle(pairs)

#         for path, name in pairs:
#             video_frames, video_path = frames_from_video_file(path)
#             label = self.class_ids_for_name[name]  # 레이블 인코딩
#             yield video_frames, label, video_path  # video_path도 반환합니다.

#             num_batches -= 1
#             if num_batches == 0:
#                 break
           
        
# Create the training set
output_signature = (tf.TensorSpec(shape=(None, None, None, 3), dtype=tf.float32),
                    tf.TensorSpec(shape=(), dtype=tf.int16),
                    tf.TensorSpec(shape=(), dtype=tf.string))  # video_path를 추가
batch_size = 1
# Create the test set
# frame_generator = FrameGenerator(subset_paths['test'])
# generator_function = lambda: frame_generator.generator_function(1)  # 원하는 배치 개수를 인자로 전달
# test_ds = tf.data.Dataset.from_generator(generator_function, output_signature=output_signature)
test_ds = tf.data.Dataset.from_generator(FrameGenerator(subset_paths['test']), output_signature=output_signature)
AUTOTUNE = tf.data.AUTOTUNE
# test_ds = test_ds.shuffle(1000).batch(batch_size).cache().prefetch(buffer_size=AUTOTUNE) 이거 개오래걸림

# test_ds = test_ds.take(1).batch(batch_size).cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.batch(batch_size).cache().prefetch(buffer_size=AUTOTUNE)

In [12]:
loaded_model = tf.keras.models.load_model('my_model')
# loaded_model.summary()

In [13]:
import torch

torch.cuda.get_device_name() # CUDA를 실행하고 있는 기기 이름을 나타낸다.
torch.cuda.is_available() # CUDA의 활성 여부를 나타낸다.
# print(torch.__version__)

True

In [14]:
test_frames, test_labels, video_path = next(iter(test_ds))
print(test_frames.shape)

(1, 97, 256, 256, 3)


2024-02-26 16:47:50.622741: W tensorflow/core/kernels/data/cache_dataset_ops.cc:768] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [19]:
import torch
import sys
sys.path.append('/home/seclab_nahyun/jupyter_notebook/semantic-segmentation-pytorch/')
from mit_semseg.models import ModelBuilder, SegmentationModule


# os.environ["CUDA_VISIBLE_DEVICES"] = '0, 1'

# 모델 디바이스 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the pre-trained model
builder = ModelBuilder()
net_encoder = builder.build_encoder(
    arch='resnet50dilated',
    fc_dim=2048,
    weights='/home/seclab_nahyun/jupyter_notebook/semantic-segmentation-pytorch/ckpt/encoder_epoch_20.pth')
net_decoder = builder.build_decoder(
    arch='ppm_deepsup',
    fc_dim=2048,
    num_class=150,
    weights='/home/seclab_nahyun/jupyter_notebook/semantic-segmentation-pytorch/ckpt/decoder_epoch_20.pth',
    use_softmax=True)

crit = torch.nn.NLLLoss(ignore_index=-1)
segmentation_module = SegmentationModule(net_encoder, net_decoder, crit)
segmentation_module.eval()
segmentation_module.to(device)

# i = 0
# for test_frames, test_labels, video_path in test_ds:
#     if test_frames is None:
#         continue
    
visualize_video_frames_tf(tf.squeeze(test_frames, axis=0))
# 각 배치에 대한 예측 수행
tf.keras.backend.clear_session()

# print(test_frames.shape, type(test_frames), test_labels, video_path)
# predictions = loaded_model.predict(test_frames)
# predicted_class_index = np.argmax(predictions)
# print(predictions, predicted_class_index, )

save_path = 'saved_images'  # 저장 경로 
video_frames = tf.transpose(test_frames, perm=[0, 1, 4, 2, 3])
preprocessed_image = preprocess_images(video_frames[0])
#     print(preprocessed_image.shape, type(preprocessed_image))
video_frames = torch.tensor(preprocessed_image.numpy())

label = torch.tensor(test_labels.numpy())
input_video_path = video_path[0].numpy().decode('utf-8')
print(video_frames.shape, label, input_video_path)

output_video_path = "./resized_video.mp4"  # 출력 비디오 파일 경로
target_size = (256, 256)  # 목표 크기
convert_resize_video(input_video_path, output_video_path)


start_time = time.time()
_perturbation_image(loaded_model, video_frames, label, output_video_path, save_path, None, None, None, device, segmentation_module)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"모델 쿼리 실행 시간: {elapsed_time}초")


#     i+=1
#     if (i == 5) :
#         break


Loading weights for net_encoder
Loading weights for net_decoder


RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 23.65 GiB total capacity; 20.64 MiB already allocated; 1.56 MiB free; 32.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [16]:
loaded_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 rescaling_1 (Rescaling)     (None, None, None, None,  0         
                              3)                                 
                                                                 
 time_distributed (TimeDistr  (None, None, None, None,  4049571  
 ibuted)                      1280)                              
                                                                 
 dense (Dense)               (None, None, None, None,  129381    
                              101)                               
                                                                 
 global_average_pooling3d (G  (None, 101)              0         
 lobalAveragePooling3D)                                          
                                                                 
Total params: 4,178,952
Trainable params: 129,381
Non-tr

In [None]:
import tensorflow as tf
import numpy as np

def make_gradcam_heatmap_video(video_array, model, last_conv_layer_name, pred_index=None):
    grad_model = tf.keras.models.Model(
        [model.inputs], [model.get_layer(last_conv_layer_name).output, model.output]
    )

    with tf.GradientTape() as tape:
        last_conv_layer_outputs, preds = grad_model(video_array)
        if pred_index is None:
            pred_index = tf.argmax(preds[0])
        class_channel = preds[:, pred_index]

    heatmaps = []
    for i in range(video_array.shape[1]):  # Loop through frames
        grads = tape.gradient(class_channel, last_conv_layer_outputs[:, i])

        # Assuming 3D input with dimensions (batch_size, frames, height, width, channels)
        # Adjust the axis parameter based on the actual dimensions of your input
        pooled_grads = tf.reduce_mean(grads, axis=(1, 2, 3, 4))

        last_conv_layer_output = last_conv_layer_outputs[0, i]
        heatmap = last_conv_layer_output @ pooled_grads[..., tf.newaxis]
        heatmap = tf.squeeze(heatmap)

        heatmap = tf.maximum(heatmap, 0) / tf.math.reduce_max(heatmap)
        heatmaps.append(heatmap.numpy())

    return np.array(heatmaps)


In [None]:
print(test_frames.shape)
# video_frames = tf.transpose(test_frames, perm=[0, 1, 4, 2, 3])
heatmap = make_gradcam_heatmap(test_frames, loaded_model, 'global_average_pooling3d')
plt.matshow(heatmap)
plt.show()

In [None]:
# # # device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')
# import matplotlib.pyplot as plt
# import tensorflow as tf
        

# # def check_normalization(video_tensor):
# #     # 주어진 텐서의 크기 확인
# #     assert len(video_tensor.size()) == 4, "텐서는 4차원이어야 합니다."

# #     # 텐서의 데이터 타입을 torch.float32로 변환
# #     video_tensor = video_tensor.float()

# #     # 프레임 수, 채널 수, 높이, 너비 추출
# #     num_frames, num_channels, height, width = video_tensor.size()

# #     # 각 프레임의 최솟값과 최댓값 확인
# #     for frame_idx in range(num_frames):
# #         frame = video_tensor[frame_idx]
# #         min_value = frame.min().item()
# #         max_value = frame.max().item()

# #         print(f"Frame {frame_idx + 1}: Min Value = {min_value}, Max Value = {max_value}")



# with tf.device("/GPU:3"):
#     for test_frames, test_labels, video_path in test_ds:
        
#         predictions = loaded_model.predict(test_frames)
#         predicted_class_index = np.argmax(predictions)
#         print(predictions, predicted_class_index)
        
# #         print(test_frames.shape, type(test_frames))
# #         visualize_video_frames_tf(tf.squeeze(test_frames, axis=0))
#         video_frames = tf.transpose(test_frames, perm=[0, 1, 4, 2, 3])
#         preprocessed_image = preprocess_images(video_frames[0])
#         video_frames = torch.tensor(preprocessed_image.numpy())
#         print(video_frames.dtype, type(video_frames))
# #         visualize_warped_frames(video_frames)
# #         check_normalization(video_frames)
        
#         adv_img_np = video_frames.clone().cpu().numpy()
#         data = np.stack(video_frames, axis=0)
#         data = torch.tensor(np.array(data)).to(device)
#         restored_frames = inverse_preprocess_images(data, device)
#         restored_frames = tf.expand_dims(restored_frames, axis=0)

#         predictions = loaded_model.predict(restored_frames)
#         predicted_class_index = np.argmax(predictions)
#         print(predictions, predicted_class_index)
        
#         break
        

In [None]:
import torchvision.models as models
import torch.nn as nn
import torch
import os
import cv2
import PIL
import torchvision
import torchvision.transforms as transforms
import datetime
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt
import matplotlib
from torchvision.utils import make_grid, save_image
import sys
sys.path.append('./pytorch-resnet3d/')
from models import resnet
import torch
import torch.nn as nn

net = resnet.i3_res50(400) # vanilla I3D ResNet50
net.eval()
# net = resnet.i3_res50_nl() # Nonlocal version
inp = {'frames': torch.rand(4, 3, 32, 224, 224)}
pred, losses = net(inp)
print(pred, losses)
print(net)


device = 'cuda' if torch.cuda.is_available() else 'cpu'

# resnet50 = models.resnet50(pretrained = True).to(device)
# resnet50.eval()
# print(resnet50)

for name, _ in net.named_modules():
    print(name)
    
def normalize(tensor, mean, std):
    if not tensor.ndimension() == 4:
        raise TypeError('tensor should be 4D')

    mean = torch.FloatTensor(mean).view(1, 3, 1, 1).expand_as(tensor).to(tensor.device)
    std = torch.FloatTensor(std).view(1, 3, 1, 1).expand_as(tensor).to(tensor.device)

    return tensor.sub(mean).div(std)


class Normalize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        return self.do(tensor)
    
    def do(self, tensor):
        return normalize(tensor, self.mean, self.std)
    
    def undo(self, tensor):
        return denormalize(tensor, self.mean, self.std)

    def __repr__(self):
        return self.__class__.__name__ + '(mean={0}, std={1})'.format(self.mean, self.std)

In [None]:
# video_frames = test_frames[0]
# print(video_frames.shape)
# normalizer = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
# torch_img = torch.from_numpy(np.asarray(video_frames)).permute(0, 3, 1, 2).float().div(255).cuda()
# print(torch_img.shape, type(torch_img))
# torch_img = F.interpolate(torch_img, size=(224, 224), mode='bilinear', align_corners=False) # (1, 3, 224, 224)
# normed_torch_img = normalizer(torch_img) # (1, 3, 224, 224)

In [19]:
current_time = datetime.datetime.now() + datetime.timedelta(hours= 9)
current_time = current_time.strftime('%Y-%m-%d-%H:%M')

saved_loc = os.path.join('./', current_time)
if os.path.exists(saved_loc):
    shutil.rmtree(saved_loc)
os.mkdir(saved_loc)

print("결과 저장 위치: ", saved_loc)

결과 저장 위치:  ./2024-02-20-04:32


In [20]:
from torch.autograd import Function


class GuidedBackpropReLU(Function):
    @staticmethod
    def forward(self, input_img):
        # input image 기준으로 양수인 부분만 1로 만드는 positive_mask 생성
        positive_mask = (input_img > 0).type_as(input_img)
        
        # torch.addcmul(input, tensor1, tensor2) => output = input + tensor1 x tensor 2
        # input image와 동일한 사이즈의 torch.zeros를 만든 뒤, input image와 positive_mask를 곱해서 output 생성
        output = torch.addcmul(torch.zeros(input_img.size()).type_as(input_img), input_img, positive_mask)
        
        # backward에서 사용될 forward의 input이나 output을 저장
        self.save_for_backward(input_img, output)
        return output

    @staticmethod
    def backward(self, grad_output):
        
        # forward에서 저장된 saved tensor를 불러오기
        input_img, output = self.saved_tensors
        grad_input = None

        # input image 기준으로 양수인 부분만 1로 만드는 positive_mask 생성
        positive_mask_1 = (input_img > 0).type_as(grad_output)
        
        # 모델의 결과가 양수인 부분만 1로 만드는 positive_mask 생성
        positive_mask_2 = (grad_output > 0).type_as(grad_output)
        
        # 먼저 모델의 결과와 positive_mask_1과 곱해주고,
        # 다음으로는 positive_mask_2와 곱해줘서 
        # 모델의 결과가 양수이면서 input image가 양수인 부분만 남도록 만들어줌
        grad_input = torch.addcmul(torch.zeros(input_img.size()).type_as(input_img),
                                   torch.addcmul(torch.zeros(input_img.size()).type_as(input_img), grad_output,
                                                 positive_mask_1), positive_mask_2)
        return grad_input


In [21]:
class GuidedBackpropReLUModel:
    def __init__(self, model, use_cuda):
        self.model = model
        self.model.eval()
        self.cuda = use_cuda
        if self.cuda:
            self.model = model.cuda()

        def recursive_relu_apply(module_top):
            for idx, module in module_top._modules.items():
                recursive_relu_apply(module)
                if module.__class__.__name__ == 'ReLU':
                    module_top._modules[idx] = GuidedBackpropReLU.apply

        # replace ReLU with GuidedBackpropReLU
        recursive_relu_apply(self.model)

    def forward(self, input_img):
        return self.model(input_img)

    def __call__(self, input_img, target_category=None):
        if self.cuda:
            input_img = input_img.cuda()

        input_img = input_img.requires_grad_(True)

        output = self.forward(input_img)

        if target_category is None:
            target_category = np.argmax(output.cpu().data.numpy())

        one_hot = np.zeros((1, output.size()[-1]), dtype=np.float32)
        one_hot[0][target_category] = 1
        one_hot = torch.from_numpy(one_hot).requires_grad_(True)
        if self.cuda:
            one_hot = one_hot.cuda()

        one_hot = torch.sum(one_hot * output)
        # 모델이 예측한 결과값을 기준으로 backward 진행
        one_hot.backward(retain_graph=True)

        # input image의 gradient를 저장
        output = input_img.grad.cpu().data.numpy()
        output = output[0, :, :, :]
        output = output.transpose((1, 2, 0))
        return output

# pytorch에서는 224x224 짜리 RGB 이미지를 (3, 224, 224)로 저장하게 되는데, numpy에서는 (224, 224, 3)으로 저장
def deprocess_image(img):
    """ see https://github.com/jacobgil/keras-grad-cam/blob/master/grad-cam.py#L65 """
    img = img - np.mean(img)
    img = img / (np.std(img) + 1e-5)
    img = img * 0.1
    img = img + 0.5
    img = np.clip(img, 0, 1)
    return np.uint8(img * 255)

# final conv layer name 
finalconv_name = 'layer4'

# activations
feature_blobs = []

# gradients
backward_feature = []

# output으로 나오는 feature를 feature_blobs에 append하도록
def forward_hook(module, input, output):
#     feature_blobs.append(output.cpu().data.numpy())
    feature_blobs.append(output.cpu().data)
    

# Grad-CAM
def backward_hook(module, input, output):
    backward_feature.append(output[0])
#     backward_feature.append(output[0])
    

# resnet50._modules.get(finalconv_name).register_forward_hook(hook_feature)
# resnet50._modules.get(finalconv_name).register_backward_hook(backward_hook)
# final_layer = net.layer4[-1]
# final_layer.register_forward_hook(forward_hook)
# final_layer.register_backward_hook(backward_hook)


In [22]:
# get the softmax weight
params = list(net.parameters())
weight_softmax = np.squeeze(params[-2].cpu().detach().numpy()) # [1000, 512]
print(weight_softmax.shape)

NameError: name 'net' is not defined

In [23]:
# score = logit[:, 30] # 예측값 y^
# score = tf.squeeze(score)
# score.backward(retain_graph = True) # 예측값 y^c에 대해서 backprop 진행

# I3D, Kinetics

In [24]:
import os
import torch
import numpy as np
from PIL import Image
import torch.utils.data as data
from torchvision import transforms
import tqdm
import torch.nn as nn
import numpy as np
import argparse
import collections
import torchnet as tnt
import sys
sys.path.append('./pytorch-resnet3d/')
from models import resnet
import torch
import torch.nn as nn
import cv2
import torch.nn.functional as FF
import torchvision
import random
from PIL import Image
import numbers
from utils import util
import tensorflow as tf
import torchvision.transforms.functional as F
sys.path.append('./semantic-segmentation-pytorch/')
from mit_semseg.models import ModelBuilder, SegmentationModule
import time

import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel
sys.path.append('./pytorch-grad-cam/')
from pytorch_grad_cam import GradCAM
from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
from pytorch_grad_cam.utils.image import show_cam_on_image, \
    deprocess_image, \
    preprocess_image


# os.environ["CUDA_VISIBLE_DEVICES"] = '0, 1, 2, 3'
os.environ["CUDA_VISIBLE_DEVICES"] = '2'

# visualize_video_frames_tf(tf.squeeze(test_frames, axis=0))


# 모델 디바이스 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cpu')
# Load the pre-trained model
builder = ModelBuilder()
net_encoder = builder.build_encoder(
    arch='resnet50dilated',
    fc_dim=2048,
    weights='./semantic-segmentation-pytorch/ckpt/encoder_epoch_20.pth')
net_decoder = builder.build_decoder(
    arch='ppm_deepsup',
    fc_dim=2048,
    num_class=150,
    weights='./semantic-segmentation-pytorch/ckpt/decoder_epoch_20.pth',
    use_softmax=True)

crit = torch.nn.NLLLoss(ignore_index=-1)
segmentation_module = SegmentationModule(net_encoder, net_decoder, crit)
segmentation_module.eval()
segmentation_module.to(device)




def batch_cuda(batch, device):
    _batch = {}
    for k, v in batch.items():
        if isinstance(v, torch.Tensor):
            v = v.to(device)  # 모든 텐서를 cuda:2로 이동
        elif isinstance(v, list) and isinstance(v[0], torch.Tensor):
            v = [item.to(device) for item in v]  # 리스트 내의 텐서들을 cuda:2로 이동
        _batch[k] = v
    return _batch

class GroupResize(object):
    def __init__(self, size, interpolation=Image.BILINEAR):
        self.worker = torchvision.transforms.Resize(size, interpolation)
        
    def __call__(self, img_group):
        return [self.worker(img) for img in img_group]

class GroupRandomCrop(object):
    def __init__(self, size):
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size

    def __call__(self, img_group):
        if not img_group:
            return img_group  # 리스트가 비어 있으면 입력을 반환합니다.

        w, h = img_group[0].size
        th, tw = self.size

        out_images = list()

        x1 = random.randint(0, w - tw)
        y1 = random.randint(0, h - th)

        for img in img_group:
            assert(img.size[0] == w and img.size[1] == h)
            if w == tw and h == th:
                out_images.append(img)
            else:
                out_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))

        return out_images

class GroupCenterCrop(object):
    def __init__(self, size):
        self.worker = torchvision.transforms.CenterCrop(size)

    def __call__(self, img_group):
        return [self.worker(img) for img in img_group]

class GroupRandomHorizontalFlip(object):
    def __call__(self, img_group):
        if random.random() < 0.5:
            img_group = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]
        return img_group

class GroupNormalize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, tensor): # (T, 3, 224, 224)
        for b in range(tensor.size(0)):
            for t, m, s in zip(tensor[b], self.mean, self.std):
                t.sub_(m).div_(s)
        return tensor
class LoopPad(object):

    def __init__(self, max_len):
        self.max_len = max_len

    def __call__(self, tensor):
        length = tensor.size(0)

        if length==self.max_len:
            return tensor

        # repeat the clip as many times as is necessary
        n_pad = self.max_len - length
        pad = [tensor]*(n_pad//length)
        if n_pad%length>0:
            pad += [tensor[0:n_pad%length]]

        tensor = torch.cat([tensor]+pad, 0)
        return tensor

# NOTE: Returns [0-255] rather than torchvision's [0-1]
class ToTensor(object):
    def __init__(self):
        self.worker = lambda x: F.to_tensor(x)*255

    def __call__(self, img_group):
        img_group = [self.worker(img) for img in img_group]
        return torch.stack(img_group, 0)
    
def kinetics_mean_std():
    mean = [114.75, 114.75, 114.75]
    std = [57.375, 57.375, 57.375]
    return mean, std

def clip_transform(max_len):

    mean, std = kinetics_mean_std()
    transform = transforms.Compose([
                GroupResize(256),
                GroupCenterCrop(256),
                ToTensor(),
                GroupNormalize(mean, std),
                LoopPad(max_len),
        ])
    return transform

def extract_frames(video_path, num_frames):
    cap = cv2.VideoCapture(video_path)
    frames = []

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # 이미지로 변환
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        img_pil = Image.fromarray(frame_rgb)

        frames.append(img_pil)

        if len(frames) == num_frames:
            break

    cap.release()
    return frames

class KineticsTest(data.Dataset):

    def __init__(self, root, clip_len):
        super(KineticsTest, self).__init__()

        self.root = root
        self.clip_len = clip_len

        if not os.path.exists('/home/seclab_nahyun/jupyter_notebook/pytorch-resnet3d/data/kinetics_data.pth'):
            self.parse_annotations()
            print('Annotations created!')

        annotations = torch.load('/home/seclab_nahyun/jupyter_notebook/pytorch-resnet3d/data/kinetics_data.pth')
        self.labels = annotations['labels']
        self.test_data = annotations['val_data']
        print('%d test clips' % len(self.test_data))
        
        self.clip_transform = clip_transform(self.clip_len)
#         self.loader = lambda fl: Image.open(os.path.join(self.root, fl)).convert('RGB')
        self.loader = lambda fl: os.path.join(self.root, fl)
    
    def parse_annotations(self):

        def parse(annotation_csv):
            annotations = open(annotation_csv, 'r').read().strip().split('\n')[1:]
            annotations = [line.split(',') for line in annotations]
            clip_labels, yt_ids, start_times, end_times, _, _ = zip(*annotations)

            labels = map(lambda l: l.strip('"'), clip_labels)
            labels = np.unique(list(labels)).tolist()

            data = []
            for yt_id, start, end, label in tqdm.tqdm(zip(yt_ids, start_times, end_times, clip_labels), total=len(yt_ids)):
                label = label.strip('"')
                file_name = f"{yt_id}_{int(start):06d}_{int(end):06d}.mp4"
                file_path = os.path.join(frame_dir, file_name)
            
                # 파일이 존재하는지 확인
                if os.path.exists(file_path):
                    data.append({'frames': file_path, 'label': labels.index(label)})
                # else:
                    # print(f"파일을 찾을 수 없음: {file_path}")

            return data, labels


        frame_dir = '%s/frames/' % self.root
        val_data, labels = parse('%s/annotations/test.csv' % self.root)
        annotations = {'val_data': val_data, 'labels': labels}
        torch.save(annotations, '/home/seclab_nahyun/jupyter_notebook/pytorch-resnet3d/data/kinetics_data.pth')


    def sample(self, video_path):
        # video_path에서 직접 프레임을 추출하여 로드
        frames = extract_frames(video_path, num_frames=self.clip_len)
#         print(frames)

        # 필요한 경우 중앙 정렬 등의 추가적인 프레임 선택 및 조정 로직 적용

        # 이미지로드 및 변환
#         imgs = [self.loader(frame) for frame in frames]

        return frames
    

    def __getitem__(self, index):
        entry = self.test_data[index]
        video_path = entry['frames']  # 비디오 경로 추가
        frames = self.sample(video_path)
        if frames is None or len(frames) == 0:
            return self.__getitem__((index + 1) % len(self.test_data))  # 다음 인덱스로 이동
        frames = self.clip_transform(frames)
        frames = frames.permute(1, 0, 2, 3)  # (3, T, 224, 224)
        instance = {'frames': frames, 'label': entry['label'], 'video_path': video_path}
        return instance

    def __len__(self):
        return len(self.test_data)

def accuracy(output, target):
    """Computes the accuracy of the model"""
    with torch.no_grad():
        pred = torch.argmax(output, dim=1)
        assert pred.shape[0] == len(target)
        correct = torch.sum(pred == target).item()
    return correct / len(target)

def main():
    args = parser.parse_args()

    ngpus_per_node = torch.cuda.device_count()
    args.world_size = ngpus_per_node * args.world_size
    mp.spawn(main_worker, nprocs=ngpus_per_node, 
             args=(ngpus_per_node, args))
    
    
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu
    torch.cuda.set_device(args.gpu)
    
    print("Use GPU: {} for training".format(args.gpu))
    args.rank = args.rank * ngpus_per_node + gpu
    dist.init_process_group(backend='nccl', 
                            init_method='tcp://127.0.0.1:FREEPORT',
                            world_size=args.world_size, 
                            rank=args.rank)
    

test_dataset = KineticsTest('/home/seclab_nahyun/jupyter_notebook/kinetics-dataset/test_ds', clip_len=32)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=True, num_workers=36, pin_memory=True)


# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

save_path = 'saved_images'  # 저장 경로 
    

loaded_model = resnet.i3_res50(400).to(device)  # vanilla I3D ResNet50
# final_layer = loaded_model.layer4[-1]
# final_layer.register_forward_hook(forward_hook)
# final_layer.register_backward_hook(backward_hook)

# loaded_model._modules['layer4']._modules.get('2').register_forward_hook(forward_hook)
# loaded_model._modules['layer4']._modules.get('2').register_backward_hook(backward_hook)
# loaded_model._modules.get('layer4').register_forward_hook(forward_hook)
# loaded_model._modules.get('layer4').register_backward_hook(backward_hook)


# 시작 시간 기록
start_time = time.time()

i = 0
# with torch.no_grad():
loaded_model.eval()
total_correct = 0
total_images = 0
loss_meters = collections.defaultdict(lambda: tnt.meter.AverageValueMeter())


for idx, batch in enumerate(test_dataloader):
    i += 1
    batch = batch_cuda(batch, device)

#     batch['frames'] = batch['frames'].requires_grad_(True)
#     print(batch['frames'].shape)

    pred, loss_dict = loaded_model(batch)
    pred = pred.clone().requires_grad_(True)

    loss_dict = {k: v.mean() for k, v in loss_dict.items() if v.numel() > 0}
    loss = sum(loss_dict.values())

    for k, v in loss_dict.items():
        loss_meters[k].add(v.item())

    # 정확도 계산
    acc = accuracy(pred, batch['label'])
    total_correct += acc
    total_images += 1

#         # 예측된 레이블과 실제 레이블 출력
    _, predicted_labels = torch.max(pred, 1)
    print(f"Batch {idx}:")
    print(f"Predicted labels: {predicted_labels}")
    print(f"Actual labels: {batch['label']}")
    
    if  predicted_labels != batch['label']:
        continue

    if i > 10:
        break
        
    print(batch['frames'].shape, batch['label'], batch['video_path'])
    video_path = batch['video_path']
    test_labels = batch['label'].to('cpu')
    test_frames = batch['frames'].to('cpu') # [1, 3, 32, 256, 256]
    video_frames = tf.transpose(test_frames, perm=[0,  2,  1, 3, 4]) # [1, 32, 3, 256, 256]
    preprocessed_image = preprocess_images(video_frames[0])
    #     print(preprocessed_image.shape, type(preprocessed_image))
    video_frames = torch.tensor(preprocessed_image.numpy())

    label = torch.tensor(test_labels.numpy())
    input_video_path = video_path[0]
    print(video_frames.shape, label, input_video_path)

    output_video_path = "/home/seclab_nahyun/jupyter_notebook/resized_video.mp4"  # 출력 비디오 파일 경로
    target_size = (256, 256)  # 목표 크기
    convert_resize_video(input_video_path, output_video_path)

    video_frames=video_frames.to(device)

#     _perturbation_image(loaded_model, video_frames, label, output_video_path, save_path, None, None, None, device, segmentation_module)

print(total_correct, total_images)
print(f"전체 정확도: {total_correct / total_images}%")

end_time = time.time()
elapsed_time = end_time - start_time

print(f"코드 실행 시간: {elapsed_time}초")


ModuleNotFoundError: No module named 'pytorch_grad_cam'

In [None]:
# model = get_model(name = 'i3d_resnet50_v1_ucf101', 
#                       nclass = 101, 
#                       pretrained = True, 
#                       num_segments = 2)
# model.cast('float32')
# # model.hybridize(static_alloc=True, static_shape=True)

# model.collect_params().reset_ctx(context)
# 모델 로딩
model = get_model(name='i3d_resnet50_v1_ucf101',
                  nclass=101,
                  pretrained=True,
                  num_segments=2)

# 모델 디바이스 설정
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')