In [1]:
import glob
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# pytorchvideo
from pytorchvideo.data import LabeledVideoDataset, make_clip_sampler, labeled_video_dataset
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    Normalize,
    RandomShortSideScale,
    UniformTemporalSubsample,
    Permute
)

from torchvision.transforms import (
    Compose,
    Lambda,
    RandomCrop,
    RandomHorizontalFlip,
    Resize
)

from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo
)

from torch.utils.data import DataLoader
import torch.nn as nn
import torch
from pytorch_lightning import LightningModule, seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.metrics import classification_report
import torchmetrics


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load Dataset
non_voi = glob.glob('data/NonViolence/*.mp4')
voi = glob.glob('data/Violence/*.mp4')
labels = [0] * len(non_voi) + [1] * len(voi)

df = pd.DataFrame(zip(non_voi+voi, labels), columns=['file', 'label'])
# print(df.head())

# Train Val split
train_df, val_df = train_test_split(df, test_size=0.2, shuffle=True)

In [3]:
# Augumentation
video_transform = Compose([
    ApplyTransformToKey(key='video',
    transform=Compose([
        UniformTemporalSubsample(20),
        Lambda(lambda x:x/255),
        Normalize((0.45, 0.45, 0.45), (0.225, 0.225, 0.225)),
        RandomShortSideScale(min_size=248, max_size=256),
        CenterCropVideo(224),
        RandomHorizontalFlip(p=0.5),
    ])
    )
])

In [4]:
class OurModel(LightningModule):
    def __init__(self):
        super(OurModel, self).__init__()

        # Model architecute
        self.video_model = torch.hub.load('facebookresearch/pytorchvideo', 'efficient_x3d_xs', pretrained=True)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(400, 1)

        self.lr = 1e-3
        self.batch_size = 4
        self.numworker = 4

        # Evaluation Metric
        self.metric = torchmetrics.Accuracy(task='binary')

        # Loss Function
        self.criterion = nn.BCEWithLogitsLoss()

    def forward(self, x):
        x = self.video_model(x)
        x = self.relu(x)
        x = self.linear(x)
        return x
    
    def configure_optimizers(self):
        opt = torch.optim.AdamW(params=self.parameters(), lr=self.lr)
        scheduler = CosineAnnealingLR(opt, T_max=10, eta_min=1e-6, last_epoch=-1)
        return {'optimizer':opt, 'lr_scheduler': scheduler}

    def train_dataloader(self):
        dataset = labeled_video_dataset(
            train_df, clip_sampler=make_clip_sampler('random', 2),
            transform=video_transform, decode_audio=False
        )
        loader = DataLoader(dataset, batch_size=self.batch_size, num_workers=self.numworker, pin_memory=True)
        return loader

    def training_step(self, batch, batch_idx):
        video, label = batch['video'], batch['label']
        out =  self.forward(video)
        loss = self.criterion(out, label)
        metric = self.metric(out, label.to(torch.int64))
        return {'loss': loss, 'metric': metric.detach()}

    def training_epoch_end(self, outputs):
        loss = torch.stack([x['loss'] for x in outputs]).mean().cpu().numpy().round(2)
        metric = torch.stack([x['metric'] for x in outputs]).mean().cpu().numpy().round(2)
        self.log('training_loss', loss)
        self.log('training_metric', metric)

    # Validation
    def val_dataloader(self):
        dataset = labeled_video_dataset(
            val_df, clip_sampler=make_clip_sampler('random', 2),
            transform=video_transform, decode_audio=False
        )
        loader = DataLoader(dataset, batch_size=self.batch_size, num_workers=self.numworker, pin_memory=True)
        return loader

    def validation_step(self, batch, batch_idx):
        video, label = batch['video'], batch['label']
        out =  self.forward(video)
        loss = self.criterion(out, label)
        metric = self.metric(out, label.to(torch.int64))
        return {'loss': loss, 'metric': metric.detach()}

    def validation_epoch_end(self, outputs):
        loss = torch.stack([x['loss'] for x in outputs]).mean().cpu().numpy().round(2)
        metric = torch.stack([x['metric'] for x in outputs]).mean().cpu().numpy().round(2)
        self.log('val_loss', loss)
        self.log('val_metric', metric)

    # Test data                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 
    def test_dataloader(self):
        dataset = labeled_video_dataset(
            val_df, clip_sampler=make_clip_sampler('random', 2),
            transform=video_transform, decode_audio=False
        )
        loader = DataLoader(dataset, batch_size=self.batch_size, num_workers=self.numworker, pin_memory=True)
        return loader

    def test_step(self, batch, batch_idx):
        video, label = batch['video'], batch['label']
        out =  self.forward(video)
        return {'label': label.detach(), 'pred': out.detach()}

    def test_epoch_end(self, outputs):
        label = torch.cat([x['label'] for x in outputs]).cpu().numpy()
        pred = torch.cat([x['pred'] for x in outputs]).cpu().numpy()
        pred = np.where(pred>0.5, 1, 0)
        print(classification_report(label, pred))


In [6]:
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss', dirpath='checkpoints',
    filename='best', save_last=True
)
lr_monitor = LearningRateMonitor(logging_interval='epoch')

model = OurModel()
seed_everything(0)

trainer = Trainer(
    max_epochs=15, accelerator='gpu', devices=-1,
    precision=16, accumulate_grad_batches=2,
    enable_progress_bar=True,
    num_sanity_val_steps=0,
    callbacks=[lr_monitor, checkpoint_callback]
)

Using cache found in /home/naseem/.cache/torch/hub/facebookresearch_pytorchvideo_main
Global seed set to 0
Using 16bit None Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [8]:
trainer.fit(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type              | Params
--------------------------------------------------
0 | video_model | EfficientX3d      | 3.8 M 
1 | relu        | ReLU              | 0     
2 | linear      | Linear            | 401   
3 | metric      | BinaryAccuracy    | 0     
4 | criterion   | BCEWithLogitsLoss | 0     
--------------------------------------------------
3.8 M     Trainable params
0         Non-trainable params
3.8 M     Total params
7.589     Total estimated model params size (MB)


Epoch 0: : 0it [00:31, ?it/s]
Epoch 2: : 392it [04:47,  1.36it/s, loss=0.16, v_num=6]  

mb_type 104 in P slice too large at 98 31
error while decoding MB 98 31


Epoch 3: : 473it [05:43,  1.38it/s, loss=0.218, v_num=6] 

mb_type 104 in P slice too large at 98 31
error while decoding MB 98 31


Epoch 4: : 413it [04:46,  1.44it/s, loss=0.185, v_num=6] 

mb_type 104 in P slice too large at 98 31
error while decoding MB 98 31


Epoch 14: : 492it [05:59,  1.37it/s, loss=0.056, v_num=6]   

`Trainer.fit` stopped: `max_epochs=15` reached.


Epoch 14: : 492it [06:00,  1.37it/s, loss=0.056, v_num=6]


In [9]:
torch.save(model.state_dict(), 'model.pt')

In [10]:
trainer.validate(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation DataLoader 0: : 100it [01:03,  1.57it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        val_loss                   0.09
       val_metric           0.9700000286102295
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'val_loss': 0.09, 'val_metric': 0.9700000286102295}]

In [11]:
trainer.test(model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: : 47it [00:26,  1.80it/s]

mb_type 104 in P slice too large at 98 31
error while decoding MB 98 31


Testing DataLoader 0: : 100it [01:04,  1.56it/s]              precision    recall  f1-score   support

         0.0       0.99      0.96      0.98       179
         1.0       0.97      1.00      0.98       212

    accuracy                           0.98       391
   macro avg       0.98      0.98      0.98       391
weighted avg       0.98      0.98      0.98       391

Testing DataLoader 0: : 100it [01:04,  1.56it/s]


[{}]

In [11]:
opt = torch.optim.AdamW(params=model.parameters(), lr=1e-3)

In [12]:
torch.save({
            'epoch': 2,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': opt.state_dict(),
            'loss': 0.253,
            }, 'model.pt')

In [13]:
checkpoint = torch.load('model.pt')
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [14]:
epoch = checkpoint['epoch']
loss = checkpoint['loss']

In [15]:
epoch, loss

(2, 0.253)

In [16]:
from pytorchvideo.data.encoded_video import EncodedVideo

In [17]:
video = EncodedVideo.from_path('data/Violence/V_10.mp4')
video_data = video.get_clip(0, 2)
video_data = video_transform(video_data)

In [18]:
model = model.cuda()
inputs = video_data['video'].cuda()
inputs = torch.unsqueeze(inputs, 0)
inputs.shape

torch.Size([1, 3, 20, 224, 224])

In [19]:
preds = model(inputs)
preds = preds.detach().cpu().numpy()
preds

array([[0.3908517]], dtype=float32)

In [20]:
preds = np.where(preds>0.5, 1, 0)

In [21]:
preds

array([[0]])

In [22]:
preds[0][0]

0