# Import things

In [None]:
# Mount Google Drive (Colab only); the dataset archive is later unzipped from /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# pytorchvideo supplies EfficientX3d and UniformTemporalSubsample, both imported further down.
!pip install pytorchvideo

In [None]:
import os
import torch
import cv2
import numpy as np

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import json

# Notebook-wide constants
ROOT_DIR = "combined_data_v3"  # dataset root; labels are read from ROOT_DIR/processed_labels
LOG_INTERVAL = 10              # print the running train loss every LOG_INTERVAL batches
NUM_VEHICLE = 6                # NOTE(review): not referenced in the visible cells

# Set to a JSON file path to load a saved configuration instead of the defaults below.
config_path = None

if not config_path:
  # Default hyper-parameters
  config = {
      # Size key of the X3D video backbone (also selects the entry in model_transform_params)
      'video_encoder': {
          'size': 'XS'
      },
      # Transformer encoder over the per-frame feature sequence
      'path_encoder': {
          'dim_feedforward': 2048,
          'n_hidden': 128,
          'n_head': 4,
          'n_layers': 4,
          'dropout': 0.2,
          'out_dim': 256,
      },
      # Classification head applied to the concatenated encodings
      'cross_net': {
          'n_hidden': 512,
          'dropout': 0.2,
      },
      'batch_size': 4,
      'lr1': 1e-4,      # learning rate of the first training stage
      'lr2': 3e-6,      # learning rate of the second training stage
      'n_epoch_1': 0,   # epochs of the first stage (0 skips it)
      'n_epoch_2': 20,  # epochs of the second stage
      'seq_len': 256,   # fixed length the per-frame sequence is padded to
      'n_frames': 8,    # NOTE(review): not read in the visible cells
      'img_size': 256,  # NOTE(review): not read in the visible cells
      'mean': [0.45, 0.45, 0.45],
      'std': [0.225, 0.225, 0.225],
      'n_samples': [631, 700],  # per-class sample counts used to derive the loss weights
      'aug': 0.15               # strength passed to every colour-jitter component
  }
else:
  assert os.path.exists(config_path), "Config file not found"
  with open(config_path, "r") as f:
    config = json.load(f)

# Model

In [None]:
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

## Path Encoder

In [None]:
class PathEncoder(nn.Module):
  """Encode a fixed-length sequence of per-frame features into one vector.

  Pipeline: per-step linear projection + ReLU -> transformer encoder ->
  flatten over (step, feature) -> dropout -> linear + ReLU.

  Args:
    conf: dict with keys ``n_hidden``, ``n_head``, ``dim_feedforward``,
      ``n_layers``, ``dropout`` and ``out_dim``.
    in_channels: number of features per sequence step.
    seq_len: number of steps in every input sequence. When omitted it is read
      from ``conf['seq_len']`` if present, otherwise from the notebook-level
      ``config['seq_len']`` (the original behaviour).

  Input is ``N * seq_len * in_channels`` (batch first); output is
  ``N * conf['out_dim']``.
  """

  def __init__(self, conf, in_channels=4, seq_len=None):
    super(PathEncoder, self).__init__()
    if seq_len is None:
      # Only touch the global config when the caller did not supply a length.
      seq_len = conf["seq_len"] if "seq_len" in conf else config["seq_len"]

    self.fc1 = nn.Linear(in_channels, conf["n_hidden"])
    encoder_layer = nn.TransformerEncoderLayer(d_model=conf["n_hidden"], nhead=conf["n_head"], dim_feedforward=conf['dim_feedforward'])
    self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=conf["n_layers"])
    # fc2 consumes the flattened (seq_len * n_hidden) encoder output.
    self.fc2 = nn.Linear(conf["n_hidden"] * seq_len, conf["out_dim"])
    self.drop2 = nn.Dropout(p=conf["dropout"])

  def forward(self, x):
    x = F.relu(self.fc1(x))  # N * L * n_hidden
    # The encoder layer is built with the default sequence-first layout, so it
    # expects L * N * E. Feeding the batch-first tensor directly would make
    # attention mix different samples of the batch instead of time steps.
    x = x.transpose(0, 1)  # L * N * n_hidden
    # NOTE(review): no padding mask is passed, so padded steps take part in attention.
    x = self.transformer_encoder(x)
    x = x.transpose(0, 1)  # N * L * n_hidden
    x = torch.flatten(x, start_dim=1, end_dim=-1)  # N * (L * n_hidden)
    x = self.drop2(x)
    x = F.relu(self.fc2(x))
    return x

## Video Encoder

In [None]:
# Per-backbone preprocessing settings, keyed by "x3d_<size>".
# Only "crop_size" (frame resize in RedLightDataset) and "num_frames"
# (temporal subsampling) are read in the visible cells.
# NOTE(review): there is no entry for other sizes (e.g. "x3d_l"); selecting one raises KeyError below.
model_transform_params  = {
    "x3d_xs": {
        "side_size": 182,
        "crop_size": 182,
        "num_frames": 4,
        "sampling_rate": 12,
    },
    "x3d_s": {
        "side_size": 182,
        "crop_size": 182,
        "num_frames": 13,
        "sampling_rate": 6,
    },
    "x3d_m": {
        "side_size": 256,
        "crop_size": 256,
        "num_frames": 16,
        "sampling_rate": 5,
    }
}
# Pick the entry matching the configured backbone size ('XS' -> "x3d_xs").
transform_params = model_transform_params[ 'x3d_{}'.format(config['video_encoder']['size'].lower())]

In [None]:
from pytorchvideo.models.accelerator.mobile_cpu.efficient_x3d import EfficientX3d
from torch.hub import load_state_dict_from_url

class VideoEncoder(nn.Module):
  """EfficientX3d backbone initialised from a downloaded Kinetics checkpoint.

  ``conf['size']`` selects both the network expansion and the checkpoint file.
  """

  def __init__(self, conf):
    super().__init__()
    size = conf['size']
    self.model = EfficientX3d(expansion=size, head_act='identity')

    # The checkpoint file name uses the lower-cased size key.
    url_template = "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/efficient_x3d_{}_original_form.pyth"
    state_dict = load_state_dict_from_url(url_template.format(size.lower()))
    self.model.load_state_dict(state_dict)

  def forward(self, x):
    # Delegate straight to the wrapped backbone.
    return self.model(x)

## CrossNet

In [None]:
class CrossNet(nn.Module):
  """Two-branch classifier: a video encoding and a path encoding are
  concatenated and passed through a two-layer head producing 2 logits.
  """

  def __init__(self, conf):
    super().__init__()
    path_conf = conf['path_encoder']
    head_conf = conf['cross_net']

    self.path_encoder = PathEncoder(path_conf, in_channels=4)
    self.video_encoder = VideoEncoder(conf['video_encoder'])

    # 400 is the width assumed for the video encoding (see forward).
    fused_dim = 400 + path_conf['out_dim']
    self.fc1 = nn.Linear(fused_dim, head_conf['n_hidden'])
    self.fc2 = nn.Linear(head_conf['n_hidden'], 2)
    self.drop = nn.Dropout(p=head_conf["dropout"])

  def forward(self, video, frames):
    fused = torch.cat(
        (
            self.video_encoder(video),   # B * 400
            self.path_encoder(frames),   # B * conf['path_encoder']['out_dim']
        ),
        dim=1,
    )
    hidden = F.relu(self.fc1(fused))
    return self.fc2(self.drop(hidden))

# Preparing data

In [None]:
# Extract the dataset from Drive only when ROOT_DIR is not already present.
# NOTE(review): the archive is named combined_data_v2.zip while ROOT_DIR is
# "combined_data_v3" — confirm the archive really unpacks into ROOT_DIR.
if not os.path.exists(ROOT_DIR):
  !unzip "/content/drive/MyDrive/Viettel DTalent/mini_project_redlight_running/combined_data_v2.zip"

In [None]:
# One label file per sample; sorted so the later random split is reproducible.
files = sorted(os.listdir(os.path.join(ROOT_DIR, "processed_labels")))
print(len(files))

## Train test split

In [None]:
from sklearn.model_selection import train_test_split

import math

# Fractions of the label files assigned to each split.
TRAIN_SIZE = 0.6
VAL_SIZE = 0.2
TEST_SIZE = 0.2

# Compare with a tolerance: sums of binary floats are not guaranteed to hit 1 exactly.
if not math.isclose(TRAIN_SIZE + VAL_SIZE + TEST_SIZE, 1.0):
  raise ValueError("TRAIN_SIZE + VAL_SIZE + TEST_SIZE must sum to 1")

# First carve out the test set, using TEST_SIZE rather than a repeated literal ...
train_files, test_files = train_test_split(files, test_size=TEST_SIZE, random_state=12)
# ... then split the remainder; VAL_SIZE is rescaled to be relative to what is left.
train_files, val_files = train_test_split(train_files, test_size=VAL_SIZE/(1-TEST_SIZE), random_state=123)

## Data loader

In [None]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn.functional as F
import json

class RedLightDataset(Dataset):
  """Dataset yielding one (video, per-frame sequence, label) sample per label file.

  Args:
    root_dir: directory that the ``meta.vid_path`` entries of the label files
      are relative to.
    label_dir: directory containing the JSON label files.
    data: list of label file names inside ``label_dir``.
    vid_transform: optional callable applied to the ``C * T * H * W`` video tensor.

  Each label file is read as JSON and must provide ``meta.vid_path``,
  ``meta.cross`` and ``frames``.
  """

  def __init__(self, root_dir, label_dir, data, vid_transform=None):
    self.label_dir = label_dir
    self.root_dir = root_dir
    self.data = data
    self.vid_transform = vid_transform
    self.img_transform = transforms.ToTensor()

  def __len__(self):
    return len(self.data)

  def get_data(self, idx):
    """Load and return the parsed JSON label file of sample ``idx``."""
    label_path = os.path.join(self.label_dir, self.data[idx])
    # Context manager closes the file even when JSON parsing fails.
    with open(label_path) as f:
      return json.load(f)

  def get_image_from_path(self, img_path):
    """Read one frame, resize it to the model crop size and return a C * H * W tensor.

    Raises:
      FileNotFoundError: if OpenCV cannot read ``img_path``.
    """
    img = cv2.imread(img_path) # H * W * C
    if img is None:
      # cv2.imread signals failure by returning None; fail here with the path
      # instead of with an opaque error inside cv2.resize.
      raise FileNotFoundError("Could not read image: {}".format(img_path))
    dim = (transform_params["crop_size"], transform_params["crop_size"]) # W, H
    img = cv2.resize(img, dim, interpolation = cv2.INTER_AREA)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return self.img_transform(img) # C * H * W

  def read_video(self, vid_path):
    """Stack every image in ``vid_path`` (sorted by name) into a C * T * H * W tensor."""
    paths = sorted(os.listdir(vid_path))
    video = torch.stack([self.get_image_from_path(os.path.join(vid_path, path)) for path in paths]) # T * C * H * W
    video = torch.transpose(video, 0, 1) # C * T * H * W
    return video

  def __getitem__(self, idx):
    """Return a dict with keys ``video``, ``frames_bbox`` and ``label``."""
    data = self.get_data(idx)

    vid_path = os.path.join(self.root_dir, data["meta"]["vid_path"])
    video = self.read_video(vid_path) # C * T * H * W
    frames_bbox = torch.tensor(data["frames"]) # L * 4
    label = torch.tensor(int(data["meta"]["cross"]))

    if self.vid_transform is not None:
      video = self.vid_transform(video)

    return {
        "video": video,
        "frames_bbox": frames_bbox,
        "label": label,
    }

In [None]:
def pad_sequence_fixed_size(sequences, batch_first=False, padding_value=0.0, max_len=256):
    """Pad (or truncate) a list of tensors to exactly ``max_len`` along dim 0.

    Based on ``torch.nn.utils.rnn.pad_sequence``, but the padded length is
    fixed instead of being the longest sequence in the list.

    Args:
        sequences: non-empty list of tensors shaped ``L_i * trailing_dims``;
            all tensors must share the same trailing dims.
        batch_first: if True the result is ``B * max_len * ...``, otherwise
            ``max_len * B * ...``.
        padding_value: fill value for positions past a sequence's end.
        max_len: length of the sequence dimension in the result.

    Returns:
        A new tensor with the dtype/device of ``sequences[0]``. Sequences
        longer than ``max_len`` keep only their first ``max_len`` steps
        (previously such input raised a shape-mismatch error).
    """
    trailing_dims = sequences[0].size()[1:]

    if batch_first:
        out_dims = (len(sequences), max_len) + trailing_dims
    else:
        out_dims = (max_len, len(sequences)) + trailing_dims

    out_tensor = sequences[0].new_full(out_dims, padding_value)
    for i, tensor in enumerate(sequences):
        # Clamp so an over-long sequence is truncated rather than overflowing the slot.
        length = min(tensor.size(0), max_len)
        # use index notation to prevent duplicate references to the tensor
        if batch_first:
            out_tensor[i, :length, ...] = tensor[:length]
        else:
            out_tensor[:length, i, ...] = tensor[:length]

    return out_tensor

class Collate:
  """Batch builder for DataLoader: stacks videos and labels and pads the
  per-frame sequences to ``config['seq_len']``.
  """

  def __call__(self, batch):
    # Stacking adds the leading batch axis in one step.
    videos = torch.stack([sample["video"] for sample in batch], dim=0) # N * C * T * H * W

    frames_pad = pad_sequence_fixed_size(
        [sample["frames_bbox"] for sample in batch],
        batch_first=True,
        max_len=config['seq_len'],
    ) # N * seq_len * 4

    labels = torch.stack([sample["label"] for sample in batch], dim=0) # N

    return videos, frames_pad, labels

## Transform

In [None]:
import torchvision.transforms.functional as TF
from torchvision.transforms import Compose
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.transforms import (
    UniformTemporalSubsample,
)

In [None]:
class VideoRandomColorJitter(object):
  """Colour jitter that applies one random setting to every frame of a video.

  Brightness, contrast and saturation factors are each drawn uniformly from
  ``[max(0, 1 - value), 1 + value]``; the hue shift is drawn uniformly from
  ``[-hue, hue]``. The same four values are used for all frames of a call.

  Randomness comes from the PyTorch RNG rather than NumPy's global RNG:
  DataLoader seeds the PyTorch RNG per worker, whereas the NumPy state can be
  duplicated across workers and yield identical augmentations in each.
  """

  def __init__(self, brightness=0, contrast=0, saturation=0, hue=0):
    self.brightness = brightness
    self.contrast = contrast
    self.saturation = saturation
    self.hue = hue

  @staticmethod
  def _uniform(low, high):
    """Draw one float uniformly between ``low`` and ``high`` using torch's RNG."""
    return low + (high - low) * torch.rand(1).item()

  def apply_color_jitter(self, img, brightness, contrast, saturation, hue):
    """Apply the four adjustments, in this fixed order, to a single frame."""
    img = TF.adjust_brightness(img, brightness)
    img = TF.adjust_contrast(img, contrast)
    img = TF.adjust_saturation(img, saturation)
    img = TF.adjust_hue(img, hue)
    return img

  def __call__(self, video):
    """Jitter a ``C * T * H * W`` video and return a tensor of the same layout."""
    _brightness = self._uniform(max(0, 1 - self.brightness), 1 + self.brightness)
    _contrast = self._uniform(max(0, 1 - self.contrast), 1 + self.contrast)
    _saturation = self._uniform(max(0, 1 - self.saturation), 1 + self.saturation)
    _hue = self._uniform(-self.hue, self.hue)

    # Index dim 1 frame by frame, then re-insert that axis when stacking.
    frames = [
        self.apply_color_jitter(video[:, i], _brightness, _contrast, _saturation, _hue)
        for i in range(video.shape[1])
    ]
    return torch.stack(frames, dim=1)

In [None]:
# Values shared by both pipelines
_num_frames = transform_params["num_frames"]
_jitter = config['aug']

# Training: subsample frames, colour-jitter, then normalise.
train_vid_transform = Compose([
    UniformTemporalSubsample(_num_frames),
    VideoRandomColorJitter(brightness=_jitter, contrast=_jitter, saturation=_jitter, hue=_jitter),
    NormalizeVideo(config['mean'], config['std']),
])

# Validation / test: same pipeline without the random augmentation.
test_vid_transform = Compose([
    UniformTemporalSubsample(_num_frames),
    NormalizeVideo(config['mean'], config['std']),
])

In [None]:
def _build_split(file_list, vid_transform, shuffle):
  """Create the dataset and its DataLoader for one split."""
  dataset = RedLightDataset(root_dir=ROOT_DIR,
                            label_dir=os.path.join(ROOT_DIR, "processed_labels"),
                            data=file_list,
                            vid_transform=vid_transform)
  loader = DataLoader(dataset, batch_size=config['batch_size'], shuffle=shuffle, collate_fn=Collate(), num_workers=2)
  return dataset, loader

# Only the training split is shuffled and augmented.
train_dataset, train_loader = _build_split(train_files, train_vid_transform, shuffle=True)
val_dataset, val_loader = _build_split(val_files, test_vid_transform, shuffle=False)
test_dataset, test_loader = _build_split(test_files, test_vid_transform, shuffle=False)

# Train

In [None]:
import torch.optim as optim
import time 
import copy

def train(model, hist, dataloaders, criterion, optimizer, n_epochs, get_best_model=False):
  """Run a train/validation loop and append per-epoch metrics to ``hist``.

  Args:
    model: module called as ``model(videos, frames)`` returning ``N * 2`` logits.
    hist: dict of lists with keys ``{train,val}_{acc,loss,f1,precision,recall}``;
      one float is appended to each list per epoch.
    dataloaders: dict with ``'train'`` and ``'val'`` loaders yielding
      ``(videos, frames, labels)`` batches.
    criterion: loss called as ``criterion(logits, labels)``.
    optimizer: optimizer stepped once per training batch.
    n_epochs: number of epochs to run (0 leaves model and hist untouched).
    get_best_model: if True, the weights of the epoch with the lowest
      validation loss seen in this call are loaded back before returning.

  Returns:
    ``(model, hist)``.

  Precision, recall and F1 treat class 1 as the positive class and are
  reported as 0.0 when their denominator is zero, which keeps ``hist`` free
  of NaN values.
  """
  since = time.time()

  best_model_wts = copy.deepcopy(model.state_dict())

  best_loss = 1e9
  best_f1 = 0
  best_precision = 0
  best_recall = 0

  best_train_loss = 1e9
  best_train_f1 = 0
  best_train_precision = 0
  best_train_recall = 0

  for epoch in range(n_epochs):
    print('Epoch {}/{}'.format(epoch, n_epochs - 1))
    print('-' * 10)
    for phase in ['train', 'val']:
      is_train = phase == 'train'
      model.train(is_train)  # train(False) is the same as eval()
      loader = dataloaders[phase]

      running_loss = 0.0
      running_corrects = 0
      n_seen = 0
      target_true = 0
      predicted_true = 0
      correct_true = 0

      for batch_idx, (videos, frames, labels) in enumerate(loader):
        videos = videos.to(device)
        frames = frames.to(device)
        labels = labels.to(device)
        n_batch = labels.size(0)

        optimizer.zero_grad()

        with torch.set_grad_enabled(is_train):
          outputs = model(videos, frames)
          # Labels are passed as-is: squeezing a batch of one would drop the
          # batch dimension and no longer match the N * 2 logits.
          loss = criterion(outputs, labels)
          preds = torch.argmax(outputs, dim=1)

          if is_train:
            loss.backward()
            optimizer.step()

        # Weight by the real batch size; the last batch may be smaller.
        running_loss += loss.item() * n_batch
        n_seen += n_batch
        running_corrects += torch.sum(preds == labels).item()

        # Counts for precision / recall / F1 of class 1
        target_true += torch.sum(labels == 1).item()
        predicted_true += torch.sum(preds == 1).item()
        correct_true += torch.sum((labels == 1) & (preds == 1)).item()

        if is_train and batch_idx % LOG_INTERVAL == 0:
          print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, n_seen - n_batch, len(loader.dataset),
            100. * batch_idx / len(loader), running_loss / n_seen))

      n_total = max(n_seen, 1)  # guard against an empty loader
      epoch_loss = running_loss / n_total
      epoch_acc = running_corrects / n_total

      recall = correct_true / target_true if target_true else 0.0
      precision = correct_true / predicted_true if predicted_true else 0.0
      pr_sum = precision + recall
      f1_score = 2 * precision * recall / pr_sum if pr_sum else 0.0
      print('{} Loss: {:.4f} Acc: {:.4f} F1: {:.4f}'.format(phase, epoch_loss, epoch_acc, f1_score))

      if phase == 'val' and best_loss > epoch_loss:
        best_f1 = f1_score
        best_loss = epoch_loss
        best_precision = precision
        best_recall = recall

        # The train phase of this epoch has already been appended to hist.
        best_train_f1 = hist['train_f1'][-1]
        best_train_loss = hist['train_loss'][-1]
        best_train_precision = hist['train_precision'][-1]
        best_train_recall = hist['train_recall'][-1]
        best_model_wts = copy.deepcopy(model.state_dict())

      hist[phase + '_acc'].append(epoch_acc)
      hist[phase + '_loss'].append(epoch_loss)
      hist[phase + '_f1'].append(f1_score)
      hist[phase + '_precision'].append(precision)
      hist[phase + '_recall'].append(recall)

      print()

  time_elapsed = time.time() - since
  print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
  print('Best val loss: {:.4f} Precision {:.4f} Recall {:.4f} F1: {:.4f}'.format(best_loss, best_precision, best_recall, best_f1))
  print('Train loss: {:.4f} Precision {:.4f} Recall {:.4f} F1: {:.4f}'.format(best_train_loss, best_train_precision, best_train_recall, best_train_f1))

  if get_best_model:
    model.load_state_dict(best_model_wts)

  return model, hist

In [None]:
def get_class_weight(n_samples):
  """Turn per-class sample counts into loss weights ``1 - count / total``.

  Returns a float32 tensor on ``device`` with one weight per class, so the
  class with more samples receives the smaller weight.
  """
  total = sum(n_samples)
  weights = []
  for count in n_samples:
    weights.append(1 - (count / total))
  return torch.tensor(weights, dtype=torch.float32).to(device)

In [None]:
# Loaders consumed by train(), keyed by phase name.
dataloaders = dict(train=train_loader, val=val_loader)

# Class-weighted cross-entropy; the weights come from the configured class counts.
weight = get_class_weight(config['n_samples'])
criterion = nn.CrossEntropyLoss(weight=weight)

model = CrossNet(config).to(device)

# Per-epoch metric history, shared by both training stages.
hist = dict(
    train_loss=[],
    val_loss=[],
    train_acc=[],
    val_acc=[],
    train_precision=[],
    val_precision=[],
    train_recall=[],
    val_recall=[],
    train_f1=[],
    val_f1=[],
)

In [None]:
# Stage 1: keep the video encoder fixed and train everything else.
# Parameter names are prefixed with the attribute name ("video_encoder."), not
# the class name, so the filter has to match that prefix; matching on
# "VideoEncoder" never excluded anything.
trainable_parameters = [
    p for name, p in model.named_parameters()
    if not name.startswith("video_encoder.")
]
# NOTE(review): the excluded parameters are only left out of the optimizer;
# they still receive gradients, and train() still puts the whole model in train mode.

optimizer = optim.Adam(trainable_parameters, lr=config['lr1'])
model, hist = train(model, hist, dataloaders, criterion, optimizer, n_epochs=config['n_epoch_1'], get_best_model=True)

In [None]:
# Stage 2: optimise every parameter of the model with the second learning rate (lr2).
optimizer = optim.Adam(model.parameters(), lr=config['lr2'])
model, hist = train(model, hist, dataloaders, criterion, optimizer, n_epochs=config['n_epoch_2'], get_best_model=True)

# Evaluate and save data

In [None]:
from tqdm import tqdm

def eval(model, dataloader, criterion):
  """Evaluate ``model`` on ``dataloader`` and print precision, recall and F1.

  Note: the name shadows the built-in ``eval``; it is kept because the
  notebook calls it under this name.

  Args:
    model: module called as ``model(videos, frames)`` returning ``N * 2`` logits.
    dataloader: loader yielding ``(videos, frames, labels)`` batches.
    criterion: loss called as ``criterion(logits, labels)``.

  Returns:
    ``(loss, f1)`` as floats. ``loss`` is the per-batch loss averaged with
    each batch weighted by its real size; ``f1`` uses class 1 as the positive
    class. Metrics with a zero denominator are reported as 0.0 instead of NaN.
  """
  model.eval()

  running_loss = 0.0
  n_seen = 0
  target_true = 0
  predicted_true = 0
  correct_true = 0

  with torch.no_grad():
    for videos, frames, labels in dataloader:
      videos = videos.to(device)
      frames = frames.to(device)
      labels = labels.to(device)
      n_batch = labels.size(0)

      outputs = model(videos, frames)
      loss = criterion(outputs, labels)
      preds = torch.argmax(outputs, dim=1)

      # The last batch may be smaller than the configured batch size.
      running_loss += loss.item() * n_batch
      n_seen += n_batch

      # Counts for precision / recall / F1 of class 1
      target_true += torch.sum(labels == 1).item()
      predicted_true += torch.sum(preds == 1).item()
      correct_true += torch.sum((labels == 1) & (preds == 1)).item()

  epoch_loss = running_loss / max(n_seen, 1)
  recall = correct_true / target_true if target_true else 0.0
  precision = correct_true / predicted_true if predicted_true else 0.0
  pr_sum = precision + recall
  f1_score = 2 * precision * recall / pr_sum if pr_sum else 0.0

  print()
  print("Precision: {:.4f}".format(precision))
  print("Recall: {:.4f}".format(recall))
  print("F1: {:.4f}".format(f1_score))

  return epoch_loss, f1_score

In [None]:
# Score the held-out test split and keep the results alongside the training history.
hist['test_loss'], hist['test_f1'] = eval(model, test_loader, criterion)

In [None]:
# Report the test metrics stored by the evaluation cell.
print("Test loss: {:.4f}".format(hist['test_loss']))
print("Test F1: {:.4f}".format(hist['test_f1']))

In [None]:
# Output locations: plots, weights, config and metrics are written into
# RESULT_PATH, which is archived at the end of the notebook.
RESULT_PATH = "model"
ZIP_PATH = "model.zip"
if not os.path.exists(RESULT_PATH):
  os.mkdir(RESULT_PATH)
# Remove an archive left over from an earlier run.
if os.path.exists(ZIP_PATH):
  os.remove(ZIP_PATH)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Index of the epoch with the lowest validation loss; hist accumulates the
# epochs of both training stages. Fails on an empty history (no epochs run).
val_idx = np.argmin(hist['val_loss'])

In [None]:
# Train vs. validation loss per epoch; the title reports the lowest validation loss.
plt.title("Loss | Best Val loss: {:.4f}".format(hist['val_loss'][val_idx]))
plt.xlabel("Training Epochs")
plt.ylabel("Loss")

plt.plot(hist['train_loss'],label="Train")
plt.plot(hist['val_loss'],label="Validation")
plt.legend()
# Save before show() so the written file contains the drawn figure.
plt.savefig(os.path.join(RESULT_PATH, "loss.png"))
plt.show()

In [None]:
# Train vs. validation F1 per epoch.
# NOTE(review): the title says "Best Val F1" but shows the F1 at the epoch with
# the lowest validation loss (val_idx), not the maximum F1 — confirm the intent.
plt.title("F1 | Best Val F1: {:.4f}".format(hist['val_f1'][val_idx]))
plt.xlabel("Training Epochs")
plt.ylabel("F1 Score")

plt.plot(hist['train_f1'],label="Train")
plt.plot(hist['val_f1'],label="Validation")
plt.legend()
# Save before show() so the written file contains the drawn figure.
plt.savefig(os.path.join(RESULT_PATH, "f1.png"))
plt.show()

In [None]:
# Persist only the weights (state_dict); loading them requires rebuilding CrossNet with the same config.
torch.save(model.state_dict(), os.path.join(RESULT_PATH, "model.pth"))

In [None]:
# Store the hyper-parameters next to the weights so the run can be reproduced.
with open(os.path.join(RESULT_PATH, 'config.json'), 'w') as f:
  json.dump(config, f, indent=4)

In [None]:
# Store the per-epoch history together with the test_loss / test_f1 entries.
# NOTE(review): json.dump writes NaN values as the non-standard token NaN — check hist if strict JSON is needed.
with open(os.path.join(RESULT_PATH, 'result.json'), 'w') as f:
  json.dump(hist, f, indent=4)

In [None]:
import shutil
from google.colab import files

# Zip the contents of RESULT_PATH into "<RESULT_PATH>.zip" and download it via the Colab helper.
output_zip = shutil.make_archive(RESULT_PATH, 'zip', RESULT_PATH)
files.download(output_zip)