In [1]:
import torch
import os
import numpy as np
import pandas as pd
from torchvision import transforms
from transformers import AutoModelForVideoClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, average_precision_score
from torch.utils.data import Dataset
import torch.nn as nn

# Define Data Augmentation for Training
train_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Validation/Test Transformations (No Augmentation)
val_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Dataset class with Augmentation
class PreprocessedDashcamDataset(Dataset):
    def __init__(self, df, preprocessed_dir, mode='train'):
        self.df = df
        self.preprocessed_dir = preprocessed_dir
        self.mode = mode
        self.transform = train_transforms if mode == 'train' else val_transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        video_id = self.df['id'].iloc[idx]
        preprocessed_path = os.path.join(self.preprocessed_dir, f"{video_id}.npy")
        
        # Load preprocessed video frames (shape: [16, 3, 224, 224])
        pixel_values = torch.from_numpy(np.load(preprocessed_path)).float()
        
        # Apply augmentation frame-wise
        pixel_values = torch.stack([self.transform(frame) for frame in pixel_values])

        # Include labels for both train and val modes
        label = self.df['target'].iloc[idx]
        return {
            'pixel_values': pixel_values,
            'labels': torch.tensor(label, dtype=torch.long),
            'video_id': video_id
        }

# Load data and split
train_df_full = pd.read_csv('/kaggle/input/nexar-collision-prediction/train.csv')
train_df = train_df_full.sample(frac=0.8, random_state=42)
val_df = train_df_full.drop(train_df.index)

train_dataset = PreprocessedDashcamDataset(train_df, '/kaggle/input/new-approach-train-16fps/preprocessed_train_8frames/', mode='train')
val_dataset = PreprocessedDashcamDataset(val_df, '/kaggle/input/new-approach-train-16fps/preprocessed_train_8frames/', mode='val')

In [2]:
# Load VideoMAE-2 Model
model_name = "MCG-NJU/videomae-large-finetuned-kinetics"
model = AutoModelForVideoClassification.from_pretrained(
    model_name,
    num_labels=2,
    ignore_mismatched_sizes=True 
)

# Move model to GPU
model = model.to('cuda')

# Print Model Summary
print(model)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-large-finetuned-kinetics and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([400]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([400, 1024]) in the checkpoint and torch.Size([2, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VideoMAEForVideoClassification(
  (videomae): VideoMAEModel(
    (embeddings): VideoMAEEmbeddings(
      (patch_embeddings): VideoMAEPatchEmbeddings(
        (projection): Conv3d(3, 1024, kernel_size=(2, 16, 16), stride=(2, 16, 16))
      )
    )
    (encoder): VideoMAEEncoder(
      (layer): ModuleList(
        (0-23): 24 x VideoMAELayer(
          (attention): VideoMAESdpaAttention(
            (attention): VideoMAESdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=False)
              (key): Linear(in_features=1024, out_features=1024, bias=False)
              (value): Linear(in_features=1024, out_features=1024, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): VideoMAESelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): VideoMAEIntermediate(
       

In [3]:
# Define Evaluation Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1)[:, 1]
    return {
        "eval_accuracy": accuracy_score(labels, predictions),
        "eval_average_precision": average_precision_score(labels, probs.numpy())
    }

# Training Configuration
training_args = TrainingArguments(
    output_dir="./video_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=15,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    ddp_find_unused_parameters=True,
    metric_for_best_model="eval_average_precision",
    report_to="none",
)

# Trainer Setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Start Training
trainer.train()

# Save Model
trainer.save_model("./trained_videomae_model")



Epoch,Training Loss,Validation Loss,Accuracy,Average Precision
1,0.4962,0.650593,0.58,0.802731
2,0.4254,0.605347,0.666667,0.770876
3,0.3534,0.582172,0.69,0.754409
4,0.3764,0.59045,0.676667,0.76568
5,0.4451,0.589732,0.696667,0.761397
6,0.1379,0.607356,0.686667,0.783693
7,0.2516,0.653906,0.693333,0.783127
8,0.1688,0.631707,0.713333,0.801484
9,0.1062,0.655337,0.686667,0.795637
10,0.0298,0.727266,0.676667,0.783442


