In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fight-detection/dataset/videos.txt
/kaggle/input/fight-detection/dataset/fight/fi141.mp4
/kaggle/input/fight-detection/dataset/fight/fi034.mp4
/kaggle/input/fight-detection/dataset/fight/fi067.mp4
/kaggle/input/fight-detection/dataset/fight/fi135.mp4
/kaggle/input/fight-detection/dataset/fight/fi100.mp4
/kaggle/input/fight-detection/dataset/fight/fi066.mp4
/kaggle/input/fight-detection/dataset/fight/fi124.mp4
/kaggle/input/fight-detection/dataset/fight/fi143.mp4
/kaggle/input/fight-detection/dataset/fight/fi027.mp4
/kaggle/input/fight-detection/dataset/fight/fi080.mp4
/kaggle/input/fight-detection/dataset/fight/fi116.mp4
/kaggle/input/fight-detection/dataset/fight/fi136.mp4
/kaggle/input/fight-detection/dataset/fight/fi030.mp4
/kaggle/input/fight-detection/dataset/fight/fi003.mp4
/kaggle/input/fight-detection/dataset/fight/fi062.mp4
/kaggle/input/fight-detection/dataset/fight/fi002.mp4
/kaggle/input/fight-detection/dataset/fight/fi104.mp4
/kaggle/input/fight-detection/dat

In [2]:
# Install necessary libraries
!pip install -q pytorchvideo transformers evaluate


In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
import cv2
import numpy as np
from sklearn.model_selection import train_test_split

# Define paths to your dataset
fight_path_template = '/kaggle/input/fight-detection/dataset/fight/fi{num:03d}.mp4'
nofight_path_template = '/kaggle/input/fight-detection/dataset/noFight/nofi{num:03d}.mp4'

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)


cuda


In [4]:

class DataLoading:
    def load_video_with_resizing_and_frame_handling(self, video_path, target_frames=16, target_size=(224, 224)):
        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            print(f"Error: Could not open video {video_path}")
            return None

        frames = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            resized_frame = cv2.resize(frame, target_size)
            normalized_frame = resized_frame / 255.0
            frames.append(normalized_frame)

        cap.release()
        video_tensor = np.array(frames)

        num_frames = video_tensor.shape[0]
        if num_frames > target_frames:
            indices = np.linspace(0, num_frames - 1, target_frames).astype(int)
            video_tensor = video_tensor[indices]
        elif num_frames < target_frames:
            pad_length = target_frames - num_frames
            padding = np.zeros((pad_length, target_size[0], target_size[1], 3))
            video_tensor = np.concatenate((video_tensor, padding), axis=0)
        return video_tensor  # numpy array

    def get_data(self):
        num_samples = 150
        data = []
        labels = []
        count = 0
        for i in range(1, num_samples + 1):
            video_path = fight_path_template.format(num=i)
            video_tensor = self.load_video_with_resizing_and_frame_handling(video_path)
            if video_tensor is not None:
                count += 1
                data.append(video_tensor)
                labels.append(1)  # Fight label

        for i in range(1, num_samples + 1):
            video_path = nofight_path_template.format(num=i)
            video_tensor = self.load_video_with_resizing_and_frame_handling(video_path)
            if video_tensor is not None:
                count += 1
                data.append(video_tensor)
                labels.append(0)  # NoFight label
        return data, labels

# Instantiate and load data
instance = DataLoading()
data, labels = instance.get_data()

# Convert to tensors
data = np.array(data)  # Shape: [300, 32, 112, 112, 3]
data = np.transpose(data,(0, 1, 4, 2, 3) )

labels = np.array(labels)  # Shape: [300]

# tensor_X = torch.tensor(data)
# tensor_Y = torch.tensor(labels)

# # Split into training and testing sets (80% train, 20% test)
# X_train, X_test, y_train, y_test = train_test_split(
#     tensor_X, tensor_Y, test_size=0.2, random_state=1, stratify=tensor_Y
# )

# # Permute the tensors to [batch_size, channels, depth, height, width]
# X_train = X_train.permute(0, 4, 1, 2, 3).float()  # Shape: [240, 3, 32, 112, 112]
# X_test = X_test.permute(0, 4, 1, 2, 3).float()    # Shape: [60, 3, 32, 112, 112]

# # Ensure labels are integers for CrossEntropyLoss
# y_train = y_train.long()  # Shape: [240]
# y_test = y_test.long()    # Shape: [60]

# class VideoDataset(Dataset):
#     def __init__(self, videos, labels):
#         self.videos = videos  # Tensor containing video data [batch, 3, 32, 112, 112]
#         self.labels = labels  # Tensor containing labels [batch]

#     def __len__(self):
#         return len(self.videos)

#     def __getitem__(self, idx):
#         video = self.videos[idx]    # [3, 32, 112, 112]
#         label = self.labels[idx]    # Scalar int (0 or 1)
#         return {"video": video, "label": label}

# # Create Dataset instances
# dataset_train = VideoDataset(X_train, y_train)
# dataset_test = VideoDataset(X_test, y_test)

# # Create DataLoader instances (optional, if you prefer using DataLoader instead of Hugging Face Datasets)
# batch_size = 2
# data_loader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, num_workers=2)
# data_loader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False, num_workers=2)


In [5]:
from datasets import Dataset as HFDataset

# Convert tensors to numpy for Hugging Face Dataset
def tensor_to_numpy(videos, labels):
    # videos = videos.numpy()
    # labels = labels.numpy()
    lst=[]
    for i in range(0,len(videos)-29,30) :
      lst.append({"video": videos[i:i+30], "label": labels[i:i+30]})
    return lst
lst=tensor_to_numpy(data, labels)
train_dict_list = lst[:-2]
test_dict_list = lst[-2:]


In [6]:
len(test_dict_list)

2

In [7]:
hf_train_dataset_list=[]
for train_dict in train_dict_list :
    hf_train_dataset_list.append(HFDataset.from_dict(train_dict))
hf_test_dataset_list=[]
for test_dict in test_dict_list :
    hf_test_dataset_list.append(HFDataset.from_dict(test_dict))


In [8]:
from datasets import concatenate_datasets
merged_dataset_train=hf_train_dataset_list[0]
for i in range(1,len(hf_train_dataset_list)) :
    merged_dataset_train=concatenate_datasets([merged_dataset_train,hf_train_dataset_list[i]])

merged_dataset_test=hf_test_dataset_list[0]
for i in range(1,len(hf_test_dataset_list)) :
    merged_dataset_test=concatenate_datasets([merged_dataset_test,hf_test_dataset_list[i]])

In [9]:
# Rename 'video' key to match the expected input by VideoMAE
hf_train_dataset = merged_dataset_train.rename_column("video", "pixel_values")  #what does this functions do
hf_test_dataset = merged_dataset_test.rename_column("video", "pixel_values")  

In [10]:
hf_train_dataset

Dataset({
    features: ['pixel_values', 'label'],
    num_rows: 240
})

In [11]:
# Define label mappings
label2id = {"noFight": 0, "fight": 1}
id2label = {0: "noFight", 1: "fight"}


In [12]:
from transformers import (
    VideoMAEImageProcessor,
    VideoMAEForVideoClassification,
    TrainingArguments,
    Trainer,
    default_data_collator,
    TrainerCallback,
    EarlyStoppingCallback,
    
    
)
# List of parameter names to make trainable
trainable_params = [
#     "videomae.encoder.layer.11.attention.attention.q_bias",
#     "videomae.encoder.layer.11.attention.attention.v_bias",
#     "videomae.encoder.layer.11.attention.attention.query.weight",
#     "videomae.encoder.layer.11.attention.attention.key.weight",
#     "videomae.encoder.layer.11.attention.attention.value.weight",
#     "videomae.encoder.layer.11.attention.output.dense.weight",
#     "videomae.encoder.layer.11.attention.output.dense.bias",
    # "videomae.encoder.layer.11.intermediate.dense.weight",
    # "videomae.encoder.layer.11.intermediate.dense.bias",
    # "videomae.encoder.layer.11.output.dense.weight",
    # "videomae.encoder.layer.11.output.dense.bias",
    # "videomae.encoder.layer.11.layernorm_before.weight",
    # "videomae.encoder.layer.11.layernorm_before.bias",
    # "videomae.encoder.layer.11.layernorm_after.weight",
    # "videomae.encoder.layer.11.layernorm_after.bias",
    "videomae.layernorm.weight",
    "videomae.layernorm.bias",
    "classifier.weight",
]
import evaluate
    # Load accuracy metric
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels)

from transformers import TrainingArguments
from transformers import default_data_collator

# Define a data collator (default is sufficient here)
data_collator = default_data_collator
from transformers import Trainer

# Define a custom callback to log metrics to MLflow
class MLflowCallback(TrainerCallback):
    def __init__(self):
        super().__init__()

    def on_epoch_end(self, args, state, control, **kwargs):
        # Log metrics to MLflow
        metrics = kwargs.get('metrics')
        if metrics:
            epoch = state.epoch
            mlflow.log_metric("train_loss", metrics.get("train_loss"), step=epoch)
            mlflow.log_metric("eval_accuracy", metrics.get("eval_accuracy"), step=epoch)
            mlflow.log_metric("eval_loss", metrics.get("eval_loss"), step=epoch)



Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [13]:
!pip install mlflow

  pid, fd = os.forkpty()


Collecting mlflow
  Downloading mlflow-2.17.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.17.0 (from mlflow)
  Downloading mlflow_skinny-2.17.0-py3-none-any.whl.metadata (30 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting cachetools<6,>=5.0.0 (from mlflow-skinny==2.17.0->mlflow)
  Downloading cachetools-5.5.0-py3-none-any.whl.metadata (5.3 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.17.0->mlflow)
  Downloading databricks_sdk-0.35.0-py3-none-any.whl.metadata (38 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.5-py3-none-any.whl.metadata (10 kB)
Collecting graphql-relay<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_relay-3.2.0-py3-none-any.whl.metadata (12 kB)
Downloading mlflow-2.17.0-py3-none-any.whl (26.7 

In [14]:
import mlflow
import mlflow.pytorch

In [15]:
learning_rates=[0.001]
# Initialize MLflow
mlflow.set_tracking_uri("file://" + os.path.abspath("mlruns"))  # Change as needed
mlflow.set_experiment("VideoMAE_Fight_NoFight_Classification")

2024/10/21 14:20:55 INFO mlflow.tracking.fluent: Experiment with name 'VideoMAE_Fight_NoFight_Classification' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///kaggle/working/mlruns/305284197126483020', creation_time=1729520455677, experiment_id='305284197126483020', last_update_time=1729520455677, lifecycle_stage='active', name='VideoMAE_Fight_NoFight_Classification', tags={}>

In [16]:
mlflow.end_run()

In [17]:
for lr in learning_rates:
    with mlflow.start_run(run_name=f"lr_{lr}"):
        print(f"Training with learning rate: {lr}")
        from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification

        # Specify the pretrained model checkpoint
        model_ckpt = "MCG-NJU/videomae-base"

        # Load the image processor
        image_processor = VideoMAEImageProcessor.from_pretrained(model_ckpt)

        # Load the VideoMAE model for video classification with 2 labels
        model = VideoMAEForVideoClassification.from_pretrained(
            model_ckpt,
            num_labels=2,
            label2id=label2id,
            id2label=id2label,
            ignore_mismatched_sizes=True,  # Allows adding new classification head
        ).to(device)

        # Freeze model parameters except for specific layers
        for param in model.parameters():
            param.requires_grad = False
        for name, param in model.named_parameters():
            if name in trainable_params:
                param.requires_grad = True

        import evaluate

        # Define training arguments with GPU optimizations
        training_args = TrainingArguments(
            weight_decay=0.01,
            save_total_limit=2,
            lr_scheduler_type="linear",  # Choose a learning rate scheduler (e.g., 'linear')
            output_dir="./video_mae_fight_noFight",
            num_train_epochs=10,
            per_device_train_batch_size=10,
            per_device_eval_batch_size=10,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=lr,
            warmup_ratio=0.1,
            logging_steps=10,
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            push_to_hub=False,
            remove_unused_columns=False,  # Necessary to keep 'pixel_values'
            report_to="none",
            fp16=True,  # Enable mixed precision training
            dataloader_num_workers=8,  # Increase for faster data loading
        )

        # Instantiate the Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=hf_train_dataset,
            eval_dataset=hf_test_dataset,
            compute_metrics=compute_metrics,
            data_collator=data_collator,
            
            tokenizer=image_processor,  # Necessary for push_to_hub to include image processor
            callbacks=[MLflowCallback(),EarlyStoppingCallback(early_stopping_patience=4)]
        )

        # Log the learning rate parameter
        mlflow.log_param('learning_rate', lr)

        # Start training
        train_results = trainer.train()

        # Log training metrics
        train_metrics = train_results.metrics
        mlflow.log_metric('train_loss', train_metrics['train_loss'])
        if 'train_accuracy' in train_metrics:
            mlflow.log_metric('train_accuracy', train_metrics['train_accuracy'])

        # Evaluate the model
        eval_results = trainer.evaluate()

        # Log evaluation metrics
        # Evaluate the model
        eval_results = trainer.evaluate()
        
        # Log evaluation metrics
        mlflow.log_metric('test_loss', eval_results.get('eval_loss'))
        if 'eval_accuracy' in eval_results:
            mlflow.log_metric('test_accuracy', eval_results['eval_accuracy'])


        print(f"Training results for lr {lr}: {train_metrics}")
        print(f"Evaluation results for lr {lr}: {eval_results}")

        # # Save the model locally
        # trainer.save_model(f"./video_mae_fight_noFight_lr_{lr}")

        # # Log the model to MLflow
        # mlflow.pytorch.log_model(model, f"model_lr_{lr}")


Training with learning rate: 0.001


preprocessor_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/377M [00:00<?, ?B/s]

Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  self.pid = os.fork()


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6927,1.278524,0.0
2,0.5984,0.959033,0.15
3,0.5789,1.136202,0.133333
4,0.523,1.105733,0.2
5,0.5168,1.376453,0.083333
6,0.4914,1.075264,0.3
7,0.485,1.215771,0.2
8,0.4733,1.09515,0.316667
9,0.4796,1.215688,0.216667
10,0.4719,1.201208,0.233333


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


Training results for lr 0.001: {'train_runtime': 2888.1474, 'train_samples_per_second': 0.831, 'train_steps_per_second': 0.083, 'total_flos': 2.9905550147321856e+18, 'train_loss': 0.5304402252038319, 'epoch': 10.0}
Evaluation results for lr 0.001: {'eval_loss': 1.0951497554779053, 'eval_accuracy': 0.31666666666666665, 'eval_runtime': 57.8396, 'eval_samples_per_second': 1.037, 'eval_steps_per_second': 0.104, 'epoch': 10.0}
