In [None]:
import os
from pathlib import Path

# Use a pipeline as a high-level helper
from transformers import pipeline

# Checkpoint id handed to the video-classification pipeline.
model_ckpt = "omermazig/videomae-base-finetuned-kinetics-finetuned-nba-binary-data-2-batch-50-epochs-new-database"

# Root folder of the dataset; the evaluation cell reads its "val" and "test"
# sub-folders. Set the DL_WORKSHOP_DATASET_ROOT environment variable to point
# at a different location without editing the notebook; when it is unset the
# original local path is used.
dataset_root_path = Path(
    os.environ.get(
        "DL_WORKSHOP_DATASET_ROOT",
        r"C:\Users\User\PycharmProjects\DL-Workshop\new_dataset",
    )
)

In [None]:
# Build the video-classification pipeline once, then pull out the two parts
# that the following cells use directly.
pipe = pipeline(task="video-classification", model=model_ckpt)
trained_model, image_processor = pipe.model, pipe.image_processor

In [None]:
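# Alternative to the pipeline helper: load the image processor and the model
# directly from the checkpoint. Uncomment to use.
#
# from transformers import AutoImageProcessor, VideoMAEForVideoClassification
#
# image_processor = AutoImageProcessor.from_pretrained(model_ckpt)
# trained_model = VideoMAEForVideoClassification.from_pretrained(model_ckpt)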

In [None]:
import os
import pytorchvideo.data

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)

# Normalisation statistics and the target frame size are read from the image
# processor, so preprocessing follows the loaded checkpoint's configuration.
mean, std = image_processor.image_mean, image_processor.image_std

processor_size = image_processor.size
if "shortest_edge" in processor_size:
    # Only one edge length is configured: use it for both dimensions.
    height = width = processor_size["shortest_edge"]
else:
    height, width = processor_size["height"], processor_size["width"]
resize_to = (height, width)

# Frames sampled per clip, taken from the model config
# (should be 16 for VideoMAE-based models).
num_frames_to_sample = trained_model.config.num_frames

# Preprocessing for the validation and evaluation datasets. The steps below are
# applied, in order, to the "video" entry of each sample:
#   1. subsample `num_frames_to_sample` frames uniformly over time,
#   2. divide the values by 255,
#   3. normalise with the image processor's mean / std,
#   4. resize to `resize_to`.
_video_steps = Compose(
    [
        UniformTemporalSubsample(num_frames_to_sample),
        Lambda(lambda frames: frames / 255.0),
        Normalize(mean, std),
        Resize(resize_to),
    ]
)

inference_transform = Compose(
    [ApplyTransformToKey(key="video", transform=_video_steps)]
)

In [None]:
import torch


def run_inference(model, video, label=None):
    """Run one preprocessed video through ``model`` and return its logits.

    The video is assumed to be preprocessed already.

    Args:
        model: Module called as ``model(pixel_values=..., labels=...)`` whose
            output exposes a ``logits`` attribute.
        video: 4-D tensor whose first two dimensions are swapped before the
            forward pass (presumably (channels, frames, height, width) in,
            (frames, channels, height, width) out -- confirm against the
            dataset transform).
        label: Optional class index. When given it is forwarded to the model
            as ``labels``; when ``None`` the ``labels`` argument is omitted.

    Returns:
        The ``logits`` of the model output for a batch of size one, left on
        the device the forward pass ran on.

    Side effects:
        ``model`` is moved to CUDA when available, otherwise to the CPU. The
        forward pass runs in eval mode; the model's previous train/eval mode
        is restored before returning.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # (num_frames, num_channels, height, width), plus a leading batch dimension.
    permuted_video = video.permute(1, 0, 2, 3)
    inputs = {"pixel_values": permuted_video.unsqueeze(0).to(device)}
    if label is not None:
        inputs["labels"] = torch.tensor([label]).to(device)

    model = model.to(device)

    # Inference must not depend on whatever mode the caller left the model in
    # (e.g. active dropout), so force eval mode and restore the mode afterwards.
    was_training = model.training
    model.eval()
    try:
        # forward pass
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        model.train(was_training)

    return logits

In [None]:
from tqdm import tqdm

# Clip duration handed to the uniform clip sampler (seconds, per the
# pytorchvideo clip-sampler API).
MAX_CLIP_DURATION = 6

# Evaluate the model on each split: run every sample through the model, report
# each misclassified video, then print the split's accuracy.
for dataset_type in ['val', 'test']:
    # build datasets.
    inference_dataset = pytorchvideo.data.Ucf101(
        data_path=os.path.join(dataset_root_path, dataset_type),
        # Deliberate abuse of the sampler: with the maximum duration as the clip
        # length, the uniform sampler is expected to return the whole video as
        # one clip (assumes no video is longer than MAX_CLIP_DURATION -- confirm).
        clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", MAX_CLIP_DURATION),
        decode_audio=False,
        transform=inference_transform,
    )

    correct = 0
    total = 0
    for sample_val_video in tqdm(inference_dataset):
        real_class_idx = sample_val_video["label"]
        logits = run_inference(trained_model, sample_val_video["video"], real_class_idx)
        video_name = sample_val_video['video_name']
        predicted_class_idx = logits.argmax(-1).item()
        predicted_class = trained_model.config.id2label[predicted_class_idx]
        real_class = trained_model.config.id2label[real_class_idx]
        if real_class_idx == predicted_class_idx:
            correct += 1
        else:
            # Red text; the trailing reset stops the colour from bleeding into
            # later output (e.g. the progress bar).
            print(
                f"\033[91m Failed on {video_name}. "
                f"Predicted {predicted_class}, but real label is {real_class}\033[0m"
            )
        total += 1

    if total == 0:
        # An empty split would otherwise crash with ZeroDivisionError.
        print(f"\033[93m No samples found for {dataset_type.upper()}; accuracy is undefined\033[0m")
    else:
        print(f"\033[92m Accuracy on {dataset_type.upper()} is {correct / total}\033[0m")