In [10]:
import matplotlib.pyplot as plt
import torchvision.models as models
import torch.nn as nn
import numpy as np
import torch
video_path = r'/kaggle/input/rwf2000/RWF-2000/val/Fight/48J5lk4QcpE_3.avi'
root_dir = r'/kaggle/input/rwf2000/RWF-2000'
train_data_root = r'/kaggle/input/rwf2000/RWF-2000/train'
val_data_root = r'/kaggle/input/rwf2000/RWF-2000/val'
# video_path


In [11]:
from torchvision.models import MobileNet_V2_Weights

weights = MobileNet_V2_Weights.DEFAULT
transform = weights.transforms()

# Shows all preprocessing steps including expected shape
print(transform)

ImageClassification(
    crop_size=[224]
    resize_size=[232]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)


In [12]:
import torch
import torch.nn as nn
import torchvision.models as models

class MobileNetLstmModel(nn.Module):
    def __init__(self,hidden_state=512, num_classes=2, lstm_layers=1):
        super(MobileNetLstmModel,self).__init__()
        mobilenet = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.DEFAULT)
        self.feature_extractor = mobilenet.features 
        self.pool = nn.AdaptiveAvgPool2d((1,1,))
        self.lstm = nn.LSTM(input_size=1280,
                            hidden_size = hidden_state,
                            num_layers = lstm_layers,
                            batch_first=True,
                            bidirectional = True                
                            )
        self.classifier = nn.Sequential(
            nn.Linear(hidden_state*2,128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128,num_classes),
        )

    def forward(self,x):
        B,T,C,H,W = x.shape
        x = x.view(B*T,C,H,W)
        x = self.feature_extractor(x)
        # print(self.feature_extractor)
        x = self.pool(x).squeeze(-1).squeeze(-1)
        x = x.view(B,T,-1)
        output,(hn,cn) = self.lstm(x)
        out = output[:,-1,:]
        # print(out.shape)
        final_out = self.classifier(out)
        # print(final_out)
        # print(final_out.shape)
        return final_out

# data = torch.randn([4,16,3,224,224])
# model = MobileNetLstmModel()
# print(model(data))

In [13]:
import os
import cv2
import torch
import random
import numpy as np
from torch.utils.data import Dataset
from torchvision import transforms

class ViolenceDataset(Dataset):
    def __init__(self, root_dir, clip_len=16, transform=None):
        self.clip_len = clip_len
        self.transform = transform

        self.data = []
        self.labels = []

        class_map = {"NonFight": 0, "Fight": 1}
        for label_name, label_val in class_map.items():
            class_dir = os.path.join(root_dir, label_name)
            for fname in os.listdir(class_dir):
                if fname.endswith(".mp4") or fname.endswith(".avi"):
                    self.data.append(os.path.join(class_dir, fname))
                    self.labels.append(label_val)

    def read_video(self, path):
        cap = cv2.VideoCapture(path)
        frames = []

        frame_total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if frame_total == 0:
            cap.release()
            return [np.zeros((224, 224, 3), dtype=np.uint8)] * self.clip_len

        if frame_total >= self.clip_len:
            frame_idx = np.linspace(0, frame_total - 1, self.clip_len, dtype=np.int32)
        else:
            frame_idx = np.linspace(0, frame_total - 1, frame_total, dtype=np.int32)

        for idx in frame_idx:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if ret:
                frame = cv2.resize(frame, (224, 224))
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(frame)
            else:
                frames.append(np.zeros((224, 224, 3), dtype=np.uint8))  # fallback frame

        cap.release()

        # Pad if too short
        if len(frames) < self.clip_len:
            frames += [frames[-1]] * (self.clip_len - len(frames))

        return frames

    def __getitem__(self, index):
        video_path = self.data[index]
        label = self.labels[index]

        frames = self.read_video(video_path)

        if self.transform:
            frames = [self.transform(frame) for frame in frames]

        clip = torch.stack(frames)  # (T, C, H, W)
        # clip = clip.permute(1,0,2,3) # (C,T,H,W)
        return clip, torch.tensor(label, dtype=torch.long)

    def __len__(self):
        return len(self.data)


In [14]:
from torchvision import transforms
import torch
from torchvision.transforms import InterpolationMode

r3d_transform = transforms.Compose([
    transforms.ToPILImage(), 
    transforms.Resize([232, 232], interpolation=InterpolationMode.BILINEAR),
    transforms.CenterCrop([224, 224]),
    transforms.ToTensor(), 
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])


In [15]:
train_dataset = ViolenceDataset(train_data_root,clip_len=16,transform=r3d_transform)
val_dataset = ViolenceDataset(val_data_root,clip_len=16,transform=r3d_transform)


In [16]:
from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset,batch_size=4,shuffle=True)
val_loader = DataLoader(val_dataset,batch_size=4,shuffle=False)
# for i,l in train_loader:
#     print(i.size())

In [17]:
# !pip install mlflow

In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import mlflow
import mlflow.pytorch
import os

# Assuming model, train_loader, val_loader are already defined
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MobileNetLstmModel().to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

# Setup
best_val_acc = 0.0
num_epochs = 20
run_name = "mobilenet_lstm_run"

# Directory to save best model
best_model_path = "best_model.pt"

# MLflow tracking
mlflow.set_experiment("MobileNetLSTM Video Classification")

with mlflow.start_run(run_name=run_name):
    mlflow.log_params({
        "model": "MobileNetLSTM",
        "optimizer": "Adam",
        "lr": 1e-4,
        "loss": "CrossEntropyLoss",
        "epochs": num_epochs,
        "scheduler": "StepLR(5, gamma=0.5)"
    })

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_correct, train_total = 0, 0

        for clips, labels in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{num_epochs}"):
            clips = clips.to(device)
            labels = labels.to(device)
            # print(clips.shape)
            optimizer.zero_grad()
            outputs = model(clips)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # print(outputs)

            train_loss += loss.item() * clips.size(0)
            _, predicted = torch.max(outputs, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()

        train_acc = 100.0 * train_correct / train_total
        train_loss /= train_total

        # ---------------- Validation ---------------- #
        model.eval()
        val_loss = 0.0
        val_correct, val_total = 0, 0

        with torch.no_grad():
            for clips, labels in tqdm(val_loader, desc="Validating"):
                clips = clips.to(device)
                labels = labels.to(device)

                outputs = model(clips)
                loss = criterion(outputs, labels)

                val_loss += loss.item() * clips.size(0)
                _, predicted = torch.max(outputs, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        val_acc = 100.0 * val_correct / val_total
        val_loss /= val_total

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
        print(f"Val   Loss: {val_loss:.4f}, Val   Acc: {val_acc:.2f}%")

        # Log to MLflow
        mlflow.log_metrics({
            "train_loss": train_loss,
            "train_accuracy": train_acc,
            "val_loss": val_loss,
            "val_accuracy": val_acc,
            "lr": optimizer.param_groups[0]['lr']
        }, step=epoch)

        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), best_model_path)
            mlflow.log_artifact(best_model_path)
            print(f"✅ Saved Best Model @ Epoch {epoch+1} with Val Acc: {val_acc:.2f}%")

        scheduler.step()

    # Log final model
    mlflow.pytorch.log_model(model, "final_model")
    mlflow.log_metric("best_val_accuracy", best_val_acc)
    print("🚀 Training complete. Best Val Acc:", best_val_acc)


Downloading: "https://download.pytorch.org/models/mobilenet_v2-7ebf99e0.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2-7ebf99e0.pth
100%|██████████| 13.6M/13.6M [00:00<00:00, 125MB/s]
2025/06/06 16:20:45 INFO mlflow.tracking.fluent: Experiment with name 'MobileNetLSTM Video Classification' does not exist. Creating a new experiment.
Training Epoch 1/20: 100%|██████████| 400/400 [11:38<00:00,  1.75s/it]
Validating: 100%|██████████| 100/100 [02:48<00:00,  1.69s/it]


Epoch 1/20
Train Loss: 0.5870, Train Acc: 69.25%
Val   Loss: 0.5648, Val   Acc: 71.50%
✅ Saved Best Model @ Epoch 1 with Val Acc: 71.50%


Training Epoch 2/20: 100%|██████████| 400/400 [10:37<00:00,  1.59s/it]
Validating: 100%|██████████| 100/100 [02:31<00:00,  1.51s/it]


Epoch 2/20
Train Loss: 0.4641, Train Acc: 78.56%
Val   Loss: 0.4580, Val   Acc: 76.50%
✅ Saved Best Model @ Epoch 2 with Val Acc: 76.50%


Training Epoch 3/20: 100%|██████████| 400/400 [10:34<00:00,  1.59s/it]
Validating: 100%|██████████| 100/100 [02:30<00:00,  1.51s/it]


Epoch 3/20
Train Loss: 0.3826, Train Acc: 82.69%
Val   Loss: 0.4757, Val   Acc: 77.50%
✅ Saved Best Model @ Epoch 3 with Val Acc: 77.50%


Training Epoch 4/20: 100%|██████████| 400/400 [10:32<00:00,  1.58s/it]
Validating: 100%|██████████| 100/100 [02:30<00:00,  1.50s/it]


Epoch 4/20
Train Loss: 0.2991, Train Acc: 87.69%
Val   Loss: 0.7797, Val   Acc: 68.75%


Training Epoch 5/20: 100%|██████████| 400/400 [10:33<00:00,  1.58s/it]
Validating: 100%|██████████| 100/100 [02:30<00:00,  1.51s/it]


Epoch 5/20
Train Loss: 0.2183, Train Acc: 91.06%
Val   Loss: 0.6320, Val   Acc: 74.75%


Training Epoch 6/20: 100%|██████████| 400/400 [10:33<00:00,  1.58s/it]
Validating: 100%|██████████| 100/100 [02:30<00:00,  1.51s/it]


Epoch 6/20
Train Loss: 0.1605, Train Acc: 94.50%
Val   Loss: 0.7535, Val   Acc: 73.75%


Training Epoch 7/20: 100%|██████████| 400/400 [10:33<00:00,  1.58s/it]
Validating: 100%|██████████| 100/100 [02:30<00:00,  1.51s/it]


Epoch 7/20
Train Loss: 0.1059, Train Acc: 96.69%
Val   Loss: 0.8255, Val   Acc: 71.75%


Training Epoch 8/20: 100%|██████████| 400/400 [10:34<00:00,  1.59s/it]
Validating: 100%|██████████| 100/100 [02:30<00:00,  1.51s/it]


Epoch 8/20
Train Loss: 0.0707, Train Acc: 97.56%
Val   Loss: 0.7819, Val   Acc: 77.50%


Training Epoch 9/20: 100%|██████████| 400/400 [10:35<00:00,  1.59s/it]
Validating: 100%|██████████| 100/100 [02:31<00:00,  1.52s/it]


Epoch 9/20
Train Loss: 0.0632, Train Acc: 97.88%
Val   Loss: 0.9835, Val   Acc: 73.75%


Training Epoch 10/20: 100%|██████████| 400/400 [10:36<00:00,  1.59s/it]
Validating: 100%|██████████| 100/100 [02:31<00:00,  1.51s/it]


Epoch 10/20
Train Loss: 0.0597, Train Acc: 97.81%
Val   Loss: 1.0035, Val   Acc: 71.75%


Training Epoch 11/20: 100%|██████████| 400/400 [10:35<00:00,  1.59s/it]
Validating: 100%|██████████| 100/100 [02:31<00:00,  1.51s/it]


Epoch 11/20
Train Loss: 0.0359, Train Acc: 98.69%
Val   Loss: 1.0367, Val   Acc: 73.75%


Training Epoch 12/20: 100%|██████████| 400/400 [10:36<00:00,  1.59s/it]
Validating: 100%|██████████| 100/100 [02:31<00:00,  1.52s/it]


Epoch 12/20
Train Loss: 0.0341, Train Acc: 98.56%
Val   Loss: 1.0930, Val   Acc: 73.25%


Training Epoch 13/20: 100%|██████████| 400/400 [10:35<00:00,  1.59s/it]
Validating: 100%|██████████| 100/100 [02:31<00:00,  1.51s/it]


Epoch 13/20
Train Loss: 0.0344, Train Acc: 98.62%
Val   Loss: 1.0090, Val   Acc: 75.75%


Training Epoch 14/20: 100%|██████████| 400/400 [10:32<00:00,  1.58s/it]
Validating: 100%|██████████| 100/100 [02:30<00:00,  1.51s/it]


Epoch 14/20
Train Loss: 0.0306, Train Acc: 99.12%
Val   Loss: 1.0812, Val   Acc: 74.25%


Training Epoch 15/20: 100%|██████████| 400/400 [10:34<00:00,  1.59s/it]
Validating: 100%|██████████| 100/100 [02:31<00:00,  1.51s/it]


Epoch 15/20
Train Loss: 0.0283, Train Acc: 99.00%
Val   Loss: 1.2694, Val   Acc: 72.75%


Training Epoch 16/20: 100%|██████████| 400/400 [10:32<00:00,  1.58s/it]
Validating: 100%|██████████| 100/100 [02:30<00:00,  1.51s/it]


Epoch 16/20
Train Loss: 0.0210, Train Acc: 99.38%
Val   Loss: 1.0749, Val   Acc: 74.25%


Training Epoch 17/20: 100%|██████████| 400/400 [10:33<00:00,  1.58s/it]
Validating: 100%|██████████| 100/100 [02:30<00:00,  1.51s/it]


Epoch 17/20
Train Loss: 0.0122, Train Acc: 99.50%
Val   Loss: 1.2037, Val   Acc: 74.50%


Training Epoch 18/20: 100%|██████████| 400/400 [10:33<00:00,  1.58s/it]
Validating: 100%|██████████| 100/100 [02:31<00:00,  1.51s/it]


Epoch 18/20
Train Loss: 0.0132, Train Acc: 99.69%
Val   Loss: 1.2142, Val   Acc: 74.75%


Training Epoch 19/20: 100%|██████████| 400/400 [10:33<00:00,  1.58s/it]
Validating: 100%|██████████| 100/100 [02:30<00:00,  1.50s/it]


Epoch 19/20
Train Loss: 0.0143, Train Acc: 99.50%
Val   Loss: 1.2067, Val   Acc: 71.75%


Training Epoch 20/20: 100%|██████████| 400/400 [10:32<00:00,  1.58s/it]
Validating: 100%|██████████| 100/100 [02:30<00:00,  1.50s/it]


Epoch 20/20
Train Loss: 0.0114, Train Acc: 99.56%
Val   Loss: 1.1316, Val   Acc: 75.25%




🚀 Training complete. Best Val Acc: 77.5


In [20]:
import shutil

shutil.make_archive('mlruns_backup', 'zip', 'mlruns')


'/kaggle/working/mlruns_backup.zip'

In [226]:
frames = []
video_path = r'/kaggle/input/rwf2000/RWF-2000/val/Fight/48J5lk4QcpE_3.avi'
cap = cv2.VideoCapture(video_path)
while True:
    ret,frame = cap.read()
    if not ret:
        break
    frame = cv2.resize(frame,(224,224))
    frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB)
    frames.append(frame)
    # print(len(frames))
cap.release()

In [159]:
import random

# if len(frames) < 16:
#     frames = frames + [frames[-1]] * (16 - len(frames))  # pad last frame
start = random.randint(0,len(frames)-16)
# clip = 
clip = frames[start:start+16]
len(clip)

16

In [145]:
len(r'/kaggle/input/rwf2000/RWF-2000/val/NonFight')

43