In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fight-detection/dataset/videos.txt
/kaggle/input/fight-detection/dataset/fight/fi141.mp4
/kaggle/input/fight-detection/dataset/fight/fi034.mp4
/kaggle/input/fight-detection/dataset/fight/fi067.mp4
/kaggle/input/fight-detection/dataset/fight/fi135.mp4
/kaggle/input/fight-detection/dataset/fight/fi100.mp4
/kaggle/input/fight-detection/dataset/fight/fi066.mp4
/kaggle/input/fight-detection/dataset/fight/fi124.mp4
/kaggle/input/fight-detection/dataset/fight/fi143.mp4
/kaggle/input/fight-detection/dataset/fight/fi027.mp4
/kaggle/input/fight-detection/dataset/fight/fi080.mp4
/kaggle/input/fight-detection/dataset/fight/fi116.mp4
/kaggle/input/fight-detection/dataset/fight/fi136.mp4
/kaggle/input/fight-detection/dataset/fight/fi030.mp4
/kaggle/input/fight-detection/dataset/fight/fi003.mp4
/kaggle/input/fight-detection/dataset/fight/fi062.mp4
/kaggle/input/fight-detection/dataset/fight/fi002.mp4
/kaggle/input/fight-detection/dataset/fight/fi104.mp4
/kaggle/input/fight-detection/dat

In [2]:
# Install necessary libraries
!pip install -q pytorchvideo transformers evaluate


In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
import cv2
import numpy as np
from sklearn.model_selection import train_test_split

# Define paths to your dataset
fight_path_template = '/kaggle/input/fight-detection/dataset/fight/fi{num:03d}.mp4'
nofight_path_template = '/kaggle/input/fight-detection/dataset/noFight/nofi{num:03d}.mp4'

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)


cuda


In [4]:

class DataLoading:
    def load_video_with_resizing_and_frame_handling(self, video_path, target_frames=16, target_size=(224, 224)):
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            print(f"Error: Could not open video {video_path}")
            return None

        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            resized_frame = cv2.resize(frame, target_size)
            normalized_frame = resized_frame / 255.0
            frames.append(normalized_frame)

        cap.release()
        video_tensor = np.array(frames)

        num_frames = video_tensor.shape[0]
        if num_frames > target_frames:
            indices = np.linspace(0, num_frames - 1, target_frames).astype(int)
            video_tensor = video_tensor[indices]
        elif num_frames < target_frames:
            pad_length = target_frames - num_frames
            padding = np.zeros((pad_length, target_size[0], target_size[1], 3))
            video_tensor = np.concatenate((video_tensor, padding), axis=0)
        return video_tensor  # numpy array

    def get_data(self):
        num_samples = 150
        data = []
        labels = []
        count = 0
        for i in range(1, num_samples + 1):
            video_path = fight_path_template.format(num=i)
            video_tensor = self.load_video_with_resizing_and_frame_handling(video_path)
            if video_tensor is not None:
                count += 1
                data.append(video_tensor)
                labels.append(1)  # Fight label

        for i in range(1, num_samples + 1):
            video_path = nofight_path_template.format(num=i)
            video_tensor = self.load_video_with_resizing_and_frame_handling(video_path)
            if video_tensor is not None:
                count += 1
                data.append(video_tensor)
                labels.append(0)  # NoFight label
        return data, labels

# Instantiate and load data
instance = DataLoading()
data, labels = instance.get_data()

# Convert to tensors
data = np.array(data)  # Shape: [300, 32, 112, 112, 3]
data = np.transpose(data,(0, 1, 4, 2, 3) )

labels = np.array(labels)  # Shape: [300]

# tensor_X = torch.tensor(data)
# tensor_Y = torch.tensor(labels)

# # Split into training and testing sets (80% train, 20% test)
# X_train, X_test, y_train, y_test = train_test_split(
#     tensor_X, tensor_Y, test_size=0.2, random_state=1, stratify=tensor_Y
# )

# # Permute the tensors to [batch_size, channels, depth, height, width]
# X_train = X_train.permute(0, 4, 1, 2, 3).float()  # Shape: [240, 3, 32, 112, 112]
# X_test = X_test.permute(0, 4, 1, 2, 3).float()    # Shape: [60, 3, 32, 112, 112]

# # Ensure labels are integers for CrossEntropyLoss
# y_train = y_train.long()  # Shape: [240]
# y_test = y_test.long()    # Shape: [60]

# class VideoDataset(Dataset):
#     def __init__(self, videos, labels):
#         self.videos = videos  # Tensor containing video data [batch, 3, 32, 112, 112]
#         self.labels = labels  # Tensor containing labels [batch]

#     def __len__(self):
#         return len(self.videos)

#     def __getitem__(self, idx):
#         video = self.videos[idx]    # [3, 32, 112, 112]
#         label = self.labels[idx]    # Scalar int (0 or 1)
#         return {"video": video, "label": label}

# # Create Dataset instances
# dataset_train = VideoDataset(X_train, y_train)
# dataset_test = VideoDataset(X_test, y_test)

# # Create DataLoader instances (optional, if you prefer using DataLoader instead of Hugging Face Datasets)
# batch_size = 2
# data_loader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, num_workers=2)
# data_loader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False, num_workers=2)


In [5]:
import random
from datasets import Dataset as HFDataset

# Convert tensors to numpy for Hugging Face Dataset
def tensor_to_numpy(videos, labels):
    lst = []
    for i in range(0, len(videos) - 29, 30):
        lst.append({"video": videos[i:i + 30], "label": labels[i:i + 30]})
    return lst

# Convert tensors and split with randomness
lst = tensor_to_numpy(data, labels)

# Shuffle the list randomly
random.shuffle(lst)

# Split into train and test sets
train_split = int(0.8 * len(lst))  # 80% for training, 20% for testing
train_dict_list = lst[:train_split]
test_dict_list = lst[train_split:]


In [6]:
len(test_dict_list)

2

In [7]:
hf_train_dataset_list=[]
for train_dict in train_dict_list :
    hf_train_dataset_list.append(HFDataset.from_dict(train_dict))
hf_test_dataset_list=[]
for test_dict in lst :
    hf_test_dataset_list.append(HFDataset.from_dict(test_dict))


In [8]:
from datasets import concatenate_datasets
merged_dataset_train=hf_train_dataset_list[0]
for i in range(1,len(hf_train_dataset_list)) :
    merged_dataset_train=concatenate_datasets([merged_dataset_train,hf_train_dataset_list[i]])

merged_dataset_test=hf_test_dataset_list[0]
for i in range(1,len(hf_test_dataset_list)) :
    merged_dataset_test=concatenate_datasets([merged_dataset_test,hf_test_dataset_list[i]])

In [9]:
# Rename 'video' key to match the expected input by VideoMAE
hf_train_dataset = merged_dataset_train.rename_column("video", "pixel_values")  #what does this functions do
hf_test_dataset = merged_dataset_test.rename_column("video", "pixel_values")  

In [10]:
hf_train_dataset

Dataset({
    features: ['pixel_values', 'label'],
    num_rows: 240
})

In [11]:
# Define label mappings
label2id = {"noFight": 0, "fight": 1}
id2label = {0: "noFight", 1: "fight"}


In [12]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.19.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.19.0 (from mlflow)
  Downloading mlflow_skinny-2.19.0-py3-none-any.whl.metadata (31 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting cachetools<6,>=5.0.0 (from mlflow-skinny==2.19.0->mlflow)
  Downloading cachetools-5.5.0-py3-none-any.whl.metadata (5.3 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.19.0->mlflow)
  Downloading databricks_sdk-0.39.0-py3-none-any.whl.metadata (38 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.5-py3-none-any.whl.metadata (10 kB)
Collecting graphql-relay<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_relay-3.2.0-py3-none-any.whl.metadata (12 kB)
Downloading mlflow-2.19.0-py3-none-any.whl (27.

In [13]:
import torch
import torch.nn as nn
import numpy as np  # Added missing import
import evaluate
import mlflow
import mlflow.pytorch
from transformers import TrainerCallback
from transformers import (
    VideoMAEImageProcessor,
    VideoMAEForVideoClassification,
    TrainingArguments,
    Trainer,
    default_data_collator,
    TrainerCallback,
)
# Specify the pretrained model checkpoint
model_ckpt = "MCG-NJU/videomae-base"

# Load the image processor
image_processor = VideoMAEImageProcessor.from_pretrained(model_ckpt)

# Load the VideoMAE model for video classification with 2 labels
model = VideoMAEForVideoClassification.from_pretrained(
    model_ckpt,
    num_labels=400,
    # label2id=label2id,
    # id2label=id2label,
    ignore_mismatched_sizes=False,  # Allows adding new classification head
).to(device)

preprocessor_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/377M [00:00<?, ?B/s]

Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
import torch
import torch.nn as nn
import numpy as np  # Added missing import
import evaluate
import mlflow
import mlflow.pytorch
from transformers import (
    VideoMAEImageProcessor,
    VideoMAEForVideoClassification,
    TrainingArguments,
    Trainer,
    default_data_collator,
    TrainerCallback,
)

# Load accuracy metric
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels)

class CustomClassifier(nn.Module):
    def __init__(self, model, num_labels):
        super(CustomClassifier, self).__init__()
        self.model = model  # Pre-trained VideoMAE model
        self.new_layers = nn.Sequential(
            nn.Linear(400, 512),  # Add a linear layer
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_labels)  # Final classification layer
        )

    def forward(self, pixel_values, labels=None):
        # The model directly returns logits for classification
        outputs = self.model(pixel_values=pixel_values)
        logits = self.new_layers(outputs.logits)  # Modify to use logits directly
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)
            return {"loss": loss, "logits": logits}

        return {"logits": logits}

# Define a custom callback to log metrics to MLflow
class MLflowCallback(TrainerCallback):
    def __init__(self):
        super().__init__()

    def on_epoch_end(self, args, state, control, **kwargs):
        metrics = kwargs.get('metrics', {})
        if metrics:
            epoch = state.epoch
            if 'train_loss' in metrics:
                mlflow.log_metric("train_loss", metrics.get("train_loss"), step=epoch)
            if 'eval_accuracy' in metrics:
                mlflow.log_metric("eval_accuracy", metrics.get("eval_accuracy"), step=epoch)
            if 'eval_loss' in metrics:
                mlflow.log_metric("eval_loss", metrics.get("eval_loss"), step=epoch)

# Assuming label2id and id2label are defined
# label2id = {"class0": 0, "class1": 1}
# id2label = {0: "class0", 1: "class1"}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loop over learning rates
for lr in [1e-4, 1e-5]:  # Example learning rates
    with mlflow.start_run(run_name=f"lr_{lr}"):
        print(f"Training with learning rate: {lr}")

        # Specify the pretrained model checkpoint
        model_ckpt = "MCG-NJU/videomae-base"

        # Load the image processor
        image_processor = VideoMAEImageProcessor.from_pretrained(model_ckpt)

        # Load the VideoMAE model for video classification with 2 labels
        model = VideoMAEForVideoClassification.from_pretrained(
            model_ckpt,
            num_labels=400,
            # label2id=label2id,
            # id2label=id2label,
            ignore_mismatched_sizes=False,  # Allows adding new classification head
        ).to(device)

        # Wrap the model with custom classifier
        model = CustomClassifier(model, 2)

        # Define training arguments with GPU optimizations
        training_args = TrainingArguments(
            output_dir="./video_mae_fight_noFight",
            num_train_epochs=10,
            per_device_train_batch_size=10,
            per_device_eval_batch_size=10,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=lr,
            warmup_ratio=0.1,
            logging_steps=10,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            push_to_hub=False,
            remove_unused_columns=False,  # Necessary to keep 'pixel_values'
            report_to="none",
            fp16=True,  # Enable mixed precision training
            dataloader_num_workers=4,  # Increase for faster data loading
        )

        # Instantiate the Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=hf_train_dataset,
            eval_dataset=hf_test_dataset,
            compute_metrics=compute_metrics,
            data_collator=default_data_collator,
            tokenizer=image_processor,  # Necessary for push_to_hub to include image processor
            callbacks=[MLflowCallback()],
        )

        # Log the learning rate parameter
        mlflow.log_param('learning_rate', lr)

        # Start training
        train_results = trainer.train()

        # Log training metrics
        train_metrics = train_results.metrics
        mlflow.log_metric('train_loss', train_metrics['train_loss'])
        if 'train_accuracy' in train_metrics:
            mlflow.log_metric('train_accuracy', train_metrics['train_accuracy'])

        # Evaluate the model
        eval_results = trainer.evaluate()

        # Log evaluation metrics
        mlflow.log_metric('test_loss', eval_results.get('eval_loss'))
        if 'eval_accuracy' in eval_results:
            mlflow.log_metric('test_accuracy', eval_results['eval_accuracy'])

        print(f"Training results for lr {lr}: {train_metrics}")
        print(f"Evaluation results for lr {lr}: {eval_results}")


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Training with learning rate: 0.0001


Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6949,0.664962,0.553333
2,0.675,0.682303,0.52
3,0.6345,0.511513,0.776667
4,0.4924,0.368056,0.86
5,0.3047,0.334437,0.893333
6,0.1779,0.230751,0.92
7,0.0925,0.347311,0.916667
8,0.0626,0.158294,0.946667
9,0.1006,0.391693,0.92
10,0.0318,0.308792,0.933333


  self.pid = os.fork()
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):

  self.pid = os.fork()


Training results for lr 0.0001: {'train_runtime': 4064.9408, 'train_samples_per_second': 0.59, 'train_steps_per_second': 0.03, 'total_flos': 0.0, 'train_loss': 0.3097060183684031, 'epoch': 10.0}
Evaluation results for lr 0.0001: {'eval_loss': 0.1582939624786377, 'eval_accuracy': 0.9466666666666667, 'eval_runtime': 213.3864, 'eval_samples_per_second': 1.406, 'eval_steps_per_second': 0.07, 'epoch': 10.0}
Training with learning rate: 1e-05


Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6965,0.6859,0.563333
2,0.6816,0.674339,0.683333
3,0.6719,0.6614,0.713333
4,0.6464,0.646154,0.743333
5,0.6216,0.630367,0.75
6,0.5986,0.612221,0.753333
7,0.5803,0.594577,0.786667
8,0.5549,0.581549,0.796667
9,0.5384,0.572488,0.8
10,0.5182,0.569357,0.81


  self.pid = os.fork()
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  self.pid = os.fork()
  self.pid = os.fork()
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):

  self.pid = os.fork()


Training results for lr 1e-05: {'train_runtime': 4078.2583, 'train_samples_per_second': 0.588, 'train_steps_per_second': 0.029, 'total_flos': 0.0, 'train_loss': 0.6075323621431986, 'epoch': 10.0}
Evaluation results for lr 1e-05: {'eval_loss': 0.5693569183349609, 'eval_accuracy': 0.81, 'eval_runtime': 217.9941, 'eval_samples_per_second': 1.376, 'eval_steps_per_second': 0.069, 'epoch': 10.0}


In [15]:
import mlflow
import mlflow.pytorch

In [16]:
learning_rates=[0.001]
# Initialize MLflow
mlflow.set_tracking_uri("file://" + os.path.abspath("mlruns"))  # Change as needed
mlflow.set_experiment("VideoMAE_Fight_NoFight_Classification")

2024/12/12 12:47:25 INFO mlflow.tracking.fluent: Experiment with name 'VideoMAE_Fight_NoFight_Classification' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///kaggle/working/mlruns/216868649290100681', creation_time=1734007645157, experiment_id='216868649290100681', last_update_time=1734007645157, lifecycle_stage='active', name='VideoMAE_Fight_NoFight_Classification', tags={}>

In [17]:
mlflow.end_run()