In [None]:
import os
from pathlib import Path

# Use a pipeline as a high-level helper
from transformers import pipeline

# Hugging Face Hub id of the fine-tuned video-classification checkpoint to evaluate.
model_ckpt = "omermazig/5-class-8-batch-50-epochs-2000-vid-multiclass"

# Root folder that holds the train/, val/ and test/ split directories.
# Set the VIDEO_DATASET_ROOT environment variable to point the notebook at another
# location; when it is unset the original local path is used.
dataset_root_path = Path(
    os.environ.get("VIDEO_DATASET_ROOT", r"C:\Users\User\Google Drive\dataset")
)

In [None]:
# Build the video-classification pipeline once, then reuse the model and the
# image processor it loaded for the manual evaluation below.
pipe = pipeline(task="video-classification", model=model_ckpt)
trained_model, image_processor = pipe.model, pipe.image_processor

In [None]:
# Alternative to the pipeline cell above: load the image processor and the model directly.
# from transformers import AutoImageProcessor, VideoMAEForVideoClassification
#
# image_processor = AutoImageProcessor.from_pretrained(model_ckpt)
# trained_model = VideoMAEForVideoClassification.from_pretrained(model_ckpt)

In [None]:
video_extension = "avi"

# Gather every video under <split>/<class>/, keeping the train -> val -> test order.
all_video_file_paths = [
    video_path
    for split_name in ("train", "val", "test")
    for video_path in dataset_root_path.glob(f"{split_name}/*/*.{video_extension}")
]
# Show a small sample as a sanity check.
all_video_file_paths[:5]

In [None]:
# Class names are the names of the directories that directly contain the videos.
# `.name` is used instead of `.stem` so that a directory name containing a dot
# (e.g. "v1.5") is kept whole rather than being cut at the last dot.
class_labels = sorted({path.parent.name for path in all_video_file_paths})
# Ids follow the alphabetical order of the class names.
label2id = {label: i for i, label in enumerate(class_labels)}
id2label = {i: label for label, i in label2id.items()}

print(f"Unique classes: {list(label2id.keys())}.")

In [None]:
import torch
from typing import Callable, Dict, List
import pytorchvideo.data

from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    RemoveKey,
    ShortSideScale,
    UniformTemporalSubsample,
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize,
)


class ApplyTransformToListUnderKey:
    """
    Dictionary transform that maps a callable over the list stored under one key.

    The list is updated in place, element by element, and the very same
    dictionary object is returned. Other keys are not touched.

    Args:
        key (str): dictionary key whose value is the list of items to transform
        transform (callable): function applied to every item of that list

    Example:
            ApplyTransformToListUnderKey(
                key='video',
                transform=UniformTemporalSubsample(num_video_samples),
            )
    """

    def __init__(self, key: str, transform: Callable):
        self._transform = transform
        self._key = key

    def __call__(self, x: Dict[str, List[torch.Tensor]]) -> Dict[str, List[torch.Tensor]]:
        items = x[self._key]
        # Overwrite each slot with its transformed value; the list object itself is kept.
        for position, item in enumerate(items):
            items[position] = self._transform(item)
        return x

In [None]:
import os

# Normalisation statistics and the target frame size are read from the image processor.
mean, std = image_processor.image_mean, image_processor.image_std
if "shortest_edge" not in image_processor.size:
    height, width = image_processor.size["height"], image_processor.size["width"]
else:
    # Only one edge length is configured: use it for both sides.
    height = width = image_processor.size["shortest_edge"]
resize_to = (height, width)

# Clip length in seconds = frames per clip * frame stride / frames per second.
num_frames_to_sample = trained_model.config.num_frames
sample_rate, fps = 4, 30  # NOTE(review): fps is hard-coded - confirm it matches the videos
clip_duration = num_frames_to_sample * sample_rate / fps

# Validation and Test datasets' transformations, applied to each clip in the "video" list.
inference_transform = Compose(
    [
        ApplyTransformToListUnderKey(
            key="video",
            transform=Compose(
                [
                    UniformTemporalSubsample(num_frames_to_sample),
                    Lambda(lambda x: x / 255.0),  # scale by 1/255 before normalising
                    Normalize(mean, std),
                    Resize(resize_to),
                ]
            ),
        ),
    ]
)

# NOTE(review): not referenced anywhere in the visible cells.
MAX_VIDEO_DURATION = 6


def build_evaluate_dataset(dataset_type: str):
    """Create the evaluation dataset for one split of the dataset folder.

    Args:
        dataset_type: name of the split sub-directory under ``dataset_root_path``
            (the notebook uses ``"val"`` and ``"test"``).

    Returns:
        A ``pytorchvideo.data.Ucf101`` dataset that uses a ``"random_multi"`` clip
        sampler (``clip_duration`` seconds, 5 clips), does not decode audio and
        applies ``inference_transform``.
    """
    split_dir = os.path.join(dataset_root_path, dataset_type)
    multi_clip_sampler = pytorchvideo.data.make_clip_sampler("random_multi", clip_duration, 5)
    return pytorchvideo.data.Ucf101(
        data_path=split_dir,
        clip_sampler=multi_clip_sampler,
        decode_audio=False,
        transform=inference_transform,
    )

In [None]:
import torch


def collate_fn(examples):
    """The collation function to be used by `Trainer` to prepare data batches.

    Every example carries a list of clip tensors under ``"video"`` and one class
    id under ``"label"``. All clips of all examples are flattened into a single
    batch dimension, and each example's label is repeated once per clip of that
    same example so that ``labels`` lines up row-for-row with ``pixel_values``.

    Args:
        examples: list of dicts with the keys ``"video"`` (list of 4-D tensors)
            and ``"label"``.

    Returns:
        dict with ``"pixel_values"`` (one row per clip, each clip with its first
        two axes swapped) and ``"labels"`` (one entry per clip).
    """
    # permute to (num_frames, num_channels, height, width)
    pixel_values = torch.cat(
        [
            torch.stack([clip.permute(1, 0, 2, 3) for clip in example["video"]])
            for example in examples
        ]
    )
    labels = torch.tensor([example["label"] for example in examples])
    # Repeat each label by the clip count of its own example (not the first
    # example's), so examples with differing numbers of clips stay aligned.
    # TODO - Maybe find a way to not duplicate the labels at all: they are only
    # repeated so their length matches pixel_values, because a loss gets computed.
    clips_per_example = torch.tensor([len(example["video"]) for example in examples])
    labels = labels.repeat_interleave(clips_per_example)
    return {"pixel_values": pixel_values, "labels": labels}

In [None]:
from transformers import TrainingArguments, Trainer

# Number of dataset examples per evaluation batch on each device; collate_fn
# expands every example into one row per clip.
batch_size = 1

# Evaluation-only arguments for the Trainer built further down.
args = TrainingArguments(
    # NOTE(review): "kuku" looks like a placeholder output directory name - confirm.
    output_dir="kuku",
    per_device_eval_batch_size=batch_size,
    # Presumably required so the "video" / "label" keys that collate_fn reads are
    # not dropped before collation - TODO confirm.
    remove_unused_columns=False,
)

In [None]:
import statistics
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, classification_report
import torch


# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def get_classification_report(predictions, labels):
    """Print a per-class report and return the summary metrics.

    Args:
        predictions: predicted class ids, one per evaluated item.
        labels: ground-truth class ids, in the same order as ``predictions``.

    Returns:
        dict with ``'f1'`` (micro-averaged F1 score) and ``'accuracy'``.
    """
    f1_micro_average = f1_score(y_true=labels, y_pred=predictions, average='micro')
    # roc_auc = roc_auc_score(y_true, y_pred, average='micro')
    accuracy = accuracy_score(labels, predictions)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               # 'roc_auc': roc_auc,
               'accuracy': accuracy}
    # Pass the full id range explicitly: without `labels`, the report infers the
    # classes from the data and raises when a class is absent from this split,
    # because the inferred count no longer matches len(target_names).
    report = classification_report(
        labels,
        predictions,
        labels=list(range(len(class_labels))),
        target_names=class_labels,
    )
    print(report)
    return metrics


# the compute_metrics function takes a Named Tuple as input:
# predictions, which are the logits of the model as Numpy arrays,
# and label_ids, which are the ground-truth labels as Numpy arrays.
def compute_metrics(eval_pred, clips_per_video: int = 5):
    """Turn clip-level predictions into video-level metrics by majority vote.

    Consecutive groups of ``clips_per_video`` rows are treated as the clips of
    one video. The video's prediction is the most common arg-max class among
    its clips, and its label is the label of its first clip.

    Args:
        eval_pred: object exposing ``predictions`` (clip logits, one row per
            clip) and ``label_ids`` (one ground-truth id per clip).
        clips_per_video: number of consecutive clips that belong to one video.
            Must match the clip count used by the dataset's clip sampler; the
            default of 5 is the value this function previously hard-coded.

    Returns:
        The metrics dict produced by ``get_classification_report``.

    Raises:
        ValueError: if ``clips_per_video`` is not positive, or if the number of
            clips is zero or not a multiple of ``clips_per_video`` (grouping
            would otherwise silently mix clips of different videos).
    """
    if clips_per_video <= 0:
        raise ValueError(f"clips_per_video must be positive, got {clips_per_video}")

    clip_label_ids = np.asarray(eval_pred.label_ids)
    num_clips = clip_label_ids.shape[0]
    if num_clips == 0 or num_clips % clips_per_video != 0:
        raise ValueError(
            f"Got {num_clips} clips, which cannot be split into videos of "
            f"{clips_per_video} clips each"
        )

    # One row per video, one column per clip of that video.
    clip_predictions = np.argmax(eval_pred.predictions, axis=1).reshape(-1, clips_per_video)
    predictions = [statistics.mode(video_clips) for video_clips in clip_predictions]
    # All clips of a video share its label, so the first clip's label is used.
    labels = list(clip_label_ids.reshape(-1, clips_per_video)[:, 0])
    return get_classification_report(predictions=predictions, labels=labels)

In [None]:
def build_trainer(dataset):
    """Create a `Trainer` for evaluating `trained_model` on `dataset`.

    Args:
        dataset: dataset registered as the trainer's evaluation dataset.

    Returns:
        A `Trainer` wired with the notebook's `args`, `image_processor`,
        `compute_metrics` and `collate_fn`.
    """
    return Trainer(
        model=trained_model,
        args=args,
        eval_dataset=dataset,
        tokenizer=image_processor,
        compute_metrics=compute_metrics,
        data_collator=collate_fn,
    )

In [None]:
# Evaluate the checkpoint on the validation split, then on the test split.
for inference_dataset_type in ['val', 'test']:
    # Build the multi-clip dataset for this split.
    inference_dataset = build_evaluate_dataset(inference_dataset_type)
    # Build a trainer bound to that dataset.
    inference_trainer = build_trainer(inference_dataset)
    # Print a header so the output of each split can be told apart.
    print(f"---------{inference_dataset_type}---------")
    # compute_metrics prints the per-class report while evaluate() runs.
    # NOTE(review): the dataset is passed again here although the trainer was
    # already constructed with it as eval_dataset.
    results = inference_trainer.evaluate(inference_dataset)
    # Show whatever evaluate() returned for this split.
    display(results)