In [None]:
import os
from pathlib import Path

# Use a pipeline as a high-level helper
from transformers import pipeline

# Checkpoint identifier of the fine-tuned video-classification model; it is
# handed to `pipeline(...)` in the next cell.
model_ckpt = "omermazig/videomae-finetuned-nba-2-class-2-batch-5-epochs-399-vid-multiclass"

# Root folder of the dataset. Later cells glob it as <split>/<class>/<video>.
# The location is machine-specific, so it can be overridden by setting the
# DATASET_ROOT_PATH environment variable; when that variable is unset or
# empty, the original hard-coded location is used.
dataset_root_path = Path(
    os.environ.get("DATASET_ROOT_PATH")
    or r"C:\Users\User\Google Drive\dataset_2_classes_small"
)

In [None]:
# Load the checkpoint through a video-classification pipeline, then pull out
# the two components that the following cells reuse.
pipe = pipeline(task="video-classification", model=model_ckpt)
trained_model, image_processor = pipe.model, pipe.image_processor

In [None]:
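# Alternative to the pipeline-based loading: load the image processor and the
# model directly from the checkpoint.
#
# from transformers import AutoImageProcessor, VideoMAEForVideoClassification
#
# image_processor = AutoImageProcessor.from_pretrained(model_ckpt)
# trained_model = VideoMAEForVideoClassification.from_pretrained(model_ckpt)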

In [None]:
video_extension = "avi"

# Gather every video stored as <split>/<class>/<file>.<ext>, walking the
# train, val and test splits in that order.
all_video_file_paths = [
    video_path
    for pattern in (
        f"train/*/*.{video_extension}",
        f"val/*/*.{video_extension}",
        f"test/*/*.{video_extension}",
    )
    for video_path in dataset_root_path.glob(pattern)
]
# Preview the first few paths as the cell output.
all_video_file_paths[:5]

In [None]:
# A class label is the stem of the folder that directly contains a video.
# Duplicates are dropped and the labels are sorted.
class_labels = sorted(
    set(video_path.parent.stem for video_path in all_video_file_paths)
)
# Label -> index and index -> label lookups, numbered in sorted-label order.
label2id = dict(zip(class_labels, range(len(class_labels))))
id2label = dict(enumerate(class_labels))

print(f"Unique classes: {list(label2id.keys())}.")

In [None]:
import pytorchvideo.data

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)

In [None]:
import os

# Normalisation statistics stored on the image processor.
mean, std = image_processor.image_mean, image_processor.image_std

# Target frame size: a "shortest_edge" entry is used for both dimensions;
# otherwise the processor's separate "height" and "width" entries are used.
resize_to = (
    (image_processor.size["shortest_edge"],) * 2
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)
height, width = resize_to

# Number of frames to sample from each clip, read from the model config.
num_frames_to_sample = trained_model.config.num_frames

# Preprocessing applied to the "video" entry of every validation/test sample.
inference_transform = Compose(
    [
        ApplyTransformToKey(
            key="video",
            transform=Compose(
                [
                    # Subsample the clip to `num_frames_to_sample` frames.
                    UniformTemporalSubsample(num_samples=num_frames_to_sample),
                    # Divide the pixel values by 255.
                    Lambda(lambda frames: frames / 255.0),
                    # Normalise with the image processor's mean and std.
                    Normalize(mean=mean, std=std),
                    # Resize the frames to `resize_to`.
                    Resize(size=resize_to),
                ]
            ),
        ),
    ]
)

# Clip duration passed to the uniform clip sampler in `build_evaluate_dataset`.
MAX_VIDEO_DURATION = 6


def build_evaluate_dataset(dataset_type: str):
    """Build the pytorchvideo dataset for one split of the dataset root.

    Args:
        dataset_type: Name of the split sub-folder under `dataset_root_path`
            (the notebook uses "val" and "test").

    Returns:
        The dataset created by `pytorchvideo.data.Ucf101`, using a "uniform"
        clip sampler, no audio decoding and `inference_transform`.
    """
    split_dir = os.path.join(dataset_root_path, dataset_type)
    uniform_sampler = pytorchvideo.data.make_clip_sampler("uniform", MAX_VIDEO_DURATION)
    return pytorchvideo.data.Ucf101(
        data_path=split_dir,
        clip_sampler=uniform_sampler,
        decode_audio=False,
        transform=inference_transform,
    )

In [None]:
import torch


def collate_fn(examples):
    """Collate dataset samples into the batch dict consumed by `Trainer`.

    The first two axes of each sample's "video" tensor are swapped, and the
    results are stacked along a new leading batch axis. The "label" values
    are gathered into a single tensor.

    Returns:
        A dict with the keys "pixel_values" and "labels".
    """
    videos = []
    for sample in examples:
        # Reorder to (num_frames, num_channels, height, width).
        videos.append(sample["video"].permute(1, 0, 2, 3))
    pixel_values = torch.stack(videos)

    targets = []
    for sample in examples:
        targets.append(sample["label"])
    labels = torch.tensor(targets)

    return {"pixel_values": pixel_values, "labels": labels}

In [None]:
from transformers import TrainingArguments, Trainer

# Number of samples per device in each evaluation batch.
batch_size = 4

# Arguments for the evaluation-only `Trainer` built in `build_trainer`.
args = TrainingArguments(
    output_dir="kuku",
    remove_unused_columns=False,
    per_device_eval_batch_size=batch_size,
)

In [None]:
import evaluate

# Accuracy metric from the `evaluate` library; `compute_metrics` calls its
# `compute` method on the predicted and reference labels.
metric = evaluate.load("accuracy")

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Score predictions with the module-level accuracy `metric`.

    Args:
        eval_pred: Object exposing `predictions` (the model's logits) and
            `label_ids` (the ground-truth labels), both as NumPy arrays.

    Returns:
        Whatever `metric.compute` returns for the predicted class ids, taken
        as the argmax of the logits along axis 1, against the reference labels.
    """
    logits = eval_pred.predictions
    predicted_ids = np.argmax(logits, axis=1)
    return metric.compute(predictions=predicted_ids, references=eval_pred.label_ids)

In [None]:
def build_trainer(dataset):
    """Create a `Trainer` around the loaded model for evaluating `dataset`.

    The trainer is given the module-level `trained_model`, `args`,
    `image_processor`, `compute_metrics` and `collate_fn`; `dataset` becomes
    its evaluation dataset.
    """
    return Trainer(
        model=trained_model,
        args=args,
        data_collator=collate_fn,
        eval_dataset=dataset,
        tokenizer=image_processor,
        compute_metrics=compute_metrics,
    )

In [None]:
# Evaluate the model on the validation split and then on the test split.
for inference_dataset_type in ("val", "test"):
    # Each split gets its own dataset and its own trainer.
    inference_dataset = build_evaluate_dataset(inference_dataset_type)
    inference_trainer = build_trainer(inference_dataset)

    # Print a header naming the split, then show its evaluation results.
    print(f"---------{inference_dataset_type}---------")
    results = inference_trainer.evaluate(eval_dataset=inference_dataset)
    display(results)