In [1]:
!pip install mediapy
!pip install -U gdown
!pip install transformers



In [1]:
import os
from collections import defaultdict
from typing import Callable
from zipfile import ZipFile

import albumentations as A
import cv2
import matplotlib.pyplot as plt
import mediapy as media
import numpy as np
import pandas as pd
import torch
import torchvision.transforms as transforms
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

from transformers import (
    AutoImageProcessor,
    SiglipForImageClassification,
    Trainer,
    TrainingArguments,
)

  from .autonotebook import tqdm as notebook_tqdm


## Get raw videos from google drive

In [None]:
!gdown https://drive.google.com/drive/folders/1Y8W5yiAxTDiRzx_9-caqa9dNhmo4dYcS --folder

In [2]:
PATH_CWD = os.getcwd()
PATH_DATA = os.path.join(PATH_CWD + "/AI in PWF/data/")

In [8]:
PATH_DATA

'/home/bobby/repos/medication-intake-detection-master-challenge/AI in PWF/data/'

In [None]:
video_zip_file = (
    PATH_DATA + "raw videos/Dataset_AI_Masterchallange 2025-20250424T063117Z-001.zip"
)
with ZipFile(video_zip_file, "r") as video_zip:
    video_zip.extractall(path=PATH_DATA + "raw videos")

In [None]:
os.remove(video_zip_file)

## Helper functions

In [None]:
def play_video(fname):
    fname = PATH_DATA + fname
    video = media.read_video(fname)
    media.show_video(video, title=fname.split("/")[-1].split(".")[0], fps=60, width=500)

In [None]:
play_video("raw videos/Dataset_AI_Masterchallange 2025/video_20250328_011625.mp4")

# Dataset

In [12]:
class VideoDataset(torch.utils.data.Dataset):
    def __init__(
        self,
        data_dir,
        frames_per_video: int = 60,
        transform: Callable = None,
        type_: str = "train",
        split_: float = 0.8,
        frame_based: bool = True,
    ):

        self.data_dir = data_dir
        self.transform = transform
        self.type_ = type_
        self.split_ = split_
        self.frame_based = (
            frame_based  # determmines if dataset returns frames or videos
        )
        self.video_files, self.video_labels = self._load_video_files(data_dir)
        self.frames_per_video = frames_per_video
        self.set_mean = None
        self.set_std = None
        self.data = [
            {"video_path": fname, "label": self.video_labels[fname][0]}
            for fname in self.video_files
        ]

    def _load_video_files(self, data_dir):
        self.video_dir = os.path.join(
            data_dir, "raw videos/Dataset_AI_Masterchallange 2025"
        )

        video_files = [
            name.split("/")[-1]
            for name in os.listdir(self.video_dir)
            if name.endswith(".mp4")
        ]

        labels_df = pd.read_csv(os.path.join(data_dir, "annotations_mid.csv"))

        labels = defaultdict(tuple)
        for _, row in labels_df.iterrows():
            labels[row["filename"]] = (row["label"], row["edge_case"])

        np.random.seed(42)
        np.random.shuffle(video_files)
        split_idx = int(len(video_files) * self.split_)
        if self.type_ == "train":
            self.video_files = video_files[:split_idx]
        else:  # validation or test
            self.video_files = video_files[split_idx:]
        labels = {k: labels[k] for k in video_files}

        return video_files, labels

    def __getitem__(self, index):
        assert (
            self.set_mean is not None and self.set_std is not None
        ), "Call get_statistics() first"

        if self.frame_based:
            # Frame-based training (individual frames)
            video_idx = index // self.frames_per_video
            frame_idx = index % self.frames_per_video
            frames, label, _ = self.__load(video_idx)
            frame = frames[frame_idx]
            # frame = (frame - self.set_mean) / self.set_std
            if self.transform:
                frame = self.transform(images=frame, return_tensors="pt")[
                    "pixel_values"
                ].squeeze(0)
            return {
                "pixel_values": frame,  # Single frame
                "label": torch.tensor(
                    label, dtype=torch.long
                ),  # Convert label to tensor
            }
        else:
            # Video-based evaluation (all frames)
            frames, label, _ = self.__load(index)
            frames = (frames - self.set_mean) / self.set_std
            processed_frames = (
                [
                    self.transform(images=frame, return_tensors="pt")[
                        "pixel_values"
                    ].squeeze(0)
                    for frame in frames
                ]
                if self.transform
                else frames
            )
            return {
                "pixel_values": torch.stack(processed_frames),  # All frames stacked
                "label": torch.tensor(label, dtype=torch.long),
            }

    def __len__(self):
        if self.frame_based:
            return len(self.video_files) * self.frames_per_video
        else:
            return len(self.video_files)

    def __load(self, index):
        video_path = os.path.join(self.video_dir, self.video_files[index])
        label, edge_case = self.video_labels[self.video_files[index]]
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            print(f"Warning: Could not open video {video_path}")
            return np.zeros(
                (self.frames_per_video, 224, 224, 3), label, edge_case
            )  # Return dummy data

        ret = True
        frames = []
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        sample_interval = max(frame_count // self.frames_per_video, 1)
        pbar = tqdm(total=frame_count)
        current_idx = 0
        last_valid_frame = None

        while len(frames) < self.frames_per_video:
            ret, frame = cap.read()
            if not ret:
                # Reached end of video early - pad with last valid frame
                if last_valid_frame is not None:
                    frames.extend(
                        [last_valid_frame] * (self.frames_per_video - len(frames))
                    )
                break
            if current_idx % sample_interval == 0:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                # Basic corruption check
                if np.mean(frame) < 1e-6:  # Mostly black frame
                    if last_valid_frame is not None:
                        frame = last_valid_frame
                    else:
                        continue
                frames.append(frame)
                last_valid_frame = frame
            current_idx += 1
            pbar.update(1)
        pbar.close()

        # In case we collected less than required, pad with zeros using the first frame shape
        if len(frames) < self.frames_per_video:
            pad_frame = (
                np.zeros_like(frames[0])
                if frames
                else np.zeros((224, 224, 3), dtype=np.uint8)
            )
            frames += [pad_frame] * (self.frames_per_video - len(frames))

        # Convert to numpy array
        frames = np.array(frames)
        cap.release()
        return frames, label, edge_case

    def get_statistics(self):
        data_size = len(self.video_files)
        total_sum = np.zeros(3)
        total_sq_sum = np.zeros(3)
        total_count = np.zeros(3)
        for i in range(data_size):
            frames, _, _ = self.__load(i)
            frames_np = np.array(frames)  # shape: (num_frames, H, W, C)
            # Reshape to (-1, 3) to flatten all pixels, then sum per channel
            pixels = frames_np.reshape(-1, frames_np.shape[-1])
            total_sum += pixels.sum(axis=0)
            total_sq_sum += (pixels**2).sum(axis=0)
            total_count += pixels.shape[0]

        self.set_mean = total_sum / total_count
        self.set_std = np.sqrt((total_sq_sum / total_count) - (self.set_mean**2) + 1e-8)

In [13]:
processor = AutoImageProcessor.from_pretrained("prithivMLmods/Human-Action-Recognition")

In [14]:
training_set = VideoDataset(
    data_dir=PATH_DATA, transform=processor, type_="train", split_=0.8, frame_based=True
)
training_set.get_statistics()  # running separately because we should only take training set statistics for normalization

  0%|          | 0/761 [00:00<?, ?it/s]

 93%|█████████▎| 709/761 [00:02<00:00, 298.66it/s]
 93%|█████████▎| 709/760 [00:02<00:00, 304.77it/s]
 92%|█████████▏| 709/768 [00:02<00:00, 305.59it/s]
 98%|█████████▊| 768/782 [00:02<00:00, 310.49it/s]
 98%|█████████▊| 768/782 [00:02<00:00, 303.79it/s]
 92%|█████████▏| 709/767 [00:02<00:00, 306.39it/s]
 91%|█████████ | 709/777 [00:02<00:00, 309.31it/s]
 91%|█████████▏| 709/776 [00:02<00:00, 314.18it/s]
 91%|█████████▏| 709/775 [00:02<00:00, 308.49it/s]
 98%|█████████▊| 768/782 [00:02<00:00, 315.05it/s]
 98%|█████████▊| 768/787 [00:02<00:00, 312.06it/s]
 92%|█████████▏| 709/769 [00:02<00:00, 306.09it/s]
 98%|█████████▊| 768/782 [00:02<00:00, 303.48it/s]
 93%|█████████▎| 709/766 [00:02<00:00, 291.27it/s]
 98%|█████████▊| 768/785 [00:02<00:00, 302.92it/s]
 98%|█████████▊| 768/784 [00:02<00:00, 311.35it/s]
 92%|█████████▏| 709/768 [00:02<00:00, 303.41it/s]
 92%|█████████▏| 709/769 [00:02<00:00, 303.01it/s]
 93%|█████████▎| 709/763 [00:02<00:00, 295.28it/s]
 93%|█████████▎| 709/760 [00:02

In [15]:
val_set_frame = VideoDataset(
    data_dir=PATH_DATA,
    transform=processor,
    type_="validation",
    split_=0.8,
    frame_based=True,
)
val_set_frame.set_mean = training_set.set_mean
val_set_frame.set_std = training_set.set_std

In [16]:
val_set_video = VideoDataset(
    data_dir=PATH_DATA,
    transform=processor,
    type_="validation",
    split_=0.8,
    frame_based=False,
)
val_set_video.set_mean = training_set.set_mean
val_set_video.set_std = training_set.set_std

## Get the model

In [17]:
model = SiglipForImageClassification.from_pretrained(
    "prithivMLmods/Human-Action-Recognition"
)

In [18]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary"
    )

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [19]:
model.config.label2id = {"not_taking_medication": 0, "taking_medication": 1}
model.config.id2label = {0: "not_taking_medication", 1: "taking_medication"}

training_args = TrainingArguments(
    output_dir="./HAR-medication-finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_set,
    eval_dataset=val_set_frame,  # Frame-based for validation during training
    compute_metrics=compute_metrics,
)

In [20]:
trainer.train()

  0%|          | 0/787 [00:00<?, ?it/s]

 98%|█████████▊| 768/787 [00:02<00:00, 309.92it/s]
 98%|█████████▊| 768/787 [00:02<00:00, 306.08it/s]
 92%|█████████▏| 709/772 [00:02<00:00, 299.20it/s]
 93%|█████████▎| 709/763 [00:02<00:00, 296.24it/s]
 93%|█████████▎| 709/763 [00:02<00:00, 301.52it/s]
 91%|█████████ | 709/778 [00:02<00:00, 302.55it/s]
 92%|█████████▏| 709/772 [00:02<00:00, 302.74it/s]
 92%|█████████▏| 709/767 [00:02<00:00, 299.50it/s]
 98%|█████████▊| 768/787 [00:02<00:00, 304.72it/s]
 93%|█████████▎| 709/761 [00:02<00:00, 298.99it/s]
 98%|█████████▊| 768/787 [00:02<00:00, 313.05it/s]
 98%|█████████▊| 768/787 [00:02<00:00, 309.11it/s]
 92%|█████████▏| 709/768 [00:02<00:00, 304.42it/s]
 92%|█████████▏| 709/768 [00:02<00:00, 301.43it/s]
 91%|█████████▏| 709/776 [00:02<00:00, 305.34it/s]
 92%|█████████▏| 709/768 [00:02<00:00, 308.59it/s]


Epoch,Training Loss,Validation Loss


 93%|█████████▎| 709/765 [00:02<00:00, 305.65it/s]
 93%|█████████▎| 709/761 [00:02<00:00, 300.75it/s]
 91%|█████████ | 709/777 [00:02<00:00, 305.47it/s]
 93%|█████████▎| 709/762 [00:02<00:00, 304.88it/s]
 91%|█████████ | 709/778 [00:02<00:00, 303.27it/s]
 92%|█████████▏| 709/772 [00:02<00:00, 303.26it/s]
 93%|█████████▎| 709/762 [00:02<00:00, 303.10it/s]
 98%|█████████▊| 768/787 [00:02<00:00, 323.74it/s]
 91%|█████████▏| 709/776 [00:02<00:00, 307.74it/s]
 92%|█████████▏| 709/769 [00:02<00:00, 307.00it/s]
 98%|█████████▊| 768/782 [00:02<00:00, 314.56it/s]
 93%|█████████▎| 709/759 [00:02<00:00, 307.68it/s]
 98%|█████████▊| 768/783 [00:02<00:00, 315.24it/s]
 98%|█████████▊| 768/784 [00:02<00:00, 322.66it/s]
 93%|█████████▎| 709/761 [00:02<00:00, 305.57it/s]
 94%|█████████▍| 709/756 [00:02<00:00, 302.25it/s]
 92%|█████████▏| 709/770 [00:02<00:00, 309.08it/s]
 93%|█████████▎| 709/762 [00:02<00:00, 301.90it/s]
 91%|█████████▏| 709/775 [00:02<00:00, 298.93it/s]
 93%|█████████▎| 709/760 [00:02

KeyboardInterrupt: 

In [None]:
trainer.save_model("./HAR-med-finetuned")
processor.save_pretrained("./HAR-med-finetuned")

In [None]:
def evaluate_video_level(model, dataset, device="cuda"):
    model.eval()
    all_preds = []
    all_labels = []
    for idx in tqdm(range(len(dataset))):
        sample = dataset[idx]
        frames = sample["pixel_values"].to(device)
        label = sample["label"]
        with torch.no_grad():
            logits = model(pixel_values=frames).logits
            preds = torch.argmax(logits, dim=1)
            majority_pred = torch.mode(preds).values.item()
        all_preds.append(majority_pred)
        all_labels.append(label)
    return (
        accuracy_score(all_labels, all_preds),
        *precision_recall_fscore_support(all_labels, all_preds, average="binary")[:3],
    )

In [None]:
# Evaluate on video-based validation set
accuracy, precision, recall, f1 = evaluate_video_level(model, val_set_video)
print(
    f"Validation (Video-Level): Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1: {f1}"
)