# Baseline Experiments

In [None]:
import copy
import datetime
import time

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Subset

from torchvision import models, transforms

import config
from db_utilities.utilities import MiddleFramesExtractor
from db_utilities.porn_800 import PornographyDatabase
from db_utilities.porn_2k import Pornography2kDatabase

In [None]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

## Data

In [None]:
# Dataset locations, read from the project-level config module.
p800_dir = config.PORN_800_DIR
p2k_dir = config.PORN_2K_DIR

In [None]:
# Database object for the directory configured as PORN_800_DIR.
p800 = PornographyDatabase(data_dir=p800_dir)

# NOTE(review): presumably keeps the 5 middle frames of every video — confirm
# against MiddleFramesExtractor's constructor.
extractor = MiddleFramesExtractor(5)
# Database object for the directory configured as PORN_2K_DIR, using the extractor above.
p2k = Pornography2kDatabase(data_dir=p2k_dir, frame_extractor=extractor)

In [None]:
# Input geometry: images are resized to SCALE, then a CROP x CROP patch is taken.
SCALE = 256
CROP = 224
# Per-channel mean / standard deviation used to normalise the image tensors.
MEAN = [0.485, 0.456, 0.406]
STD = [0.229, 0.224, 0.225]


def _make_pipeline(crop_step):
  '''
    Build a preprocessing pipeline: resize -> crop_step -> tensor -> normalise.
    Only the cropping step differs between the training and validation phases.
  '''
  return transforms.Compose([
    transforms.Resize(SCALE),
    crop_step,
    transforms.ToTensor(),
    transforms.Normalize(MEAN, STD)
  ])


# Training uses a random crop (augmentation); validation uses a deterministic centre crop.
data_transforms = {
  "train": _make_pipeline(transforms.RandomResizedCrop(CROP)),
  "val": _make_pipeline(transforms.CenterCrop(CROP))
}

## Training

In [None]:
# Name of the pretrained weights passed to every torchvision model constructor below.
WEIGHTS = "IMAGENET1K_V1"
# Output size of the replacement classification layer of each model.
N_CLASSES = 2

In [None]:
def format_time(elapsed):
  '''
    Convert a duration in seconds into a human-readable h:mm:ss string.

    The duration is rounded to the nearest whole second before formatting.
  '''
  whole_seconds = int(round(elapsed))
  duration = datetime.timedelta(seconds=whole_seconds)
  return str(duration)

In [None]:
from sklearn.model_selection import StratifiedKFold
from statistics import mean

def run_epochs(
    model,
    dataloaders,
    dataset_sizes,
    criterion,
    optimizer,
    scheduler,
    n_epochs):
  '''
    Train and validate a model for n_epochs epochs, keeping the best weights.

    Parameters:
      model: the network to optimise (modified in place).
      dataloaders: dict with "train" and "val" loaders yielding (inputs, labels).
      dataset_sizes: dict with the number of samples behind each loader.
      criterion: loss function called as criterion(outputs, labels).
      optimizer: optimizer stepped after every training batch.
      scheduler: learning-rate scheduler stepped once per epoch.
      n_epochs: number of epochs to run.

    Returns:
      (best_model, best_acc): an independent copy of the state dict that reached
      the highest validation accuracy over all epochs, and that accuracy as a
      plain float. When no epoch improves on 0.0 (or n_epochs is 0) the copy of
      the initial weights and 0.0 are returned.
  '''
  # Track the best result across ALL epochs, so this lives outside the loop.
  # state_dict() hands back references to the live tensors, hence the deep copy:
  # without it the "best" snapshot would silently follow later updates.
  best_model = copy.deepcopy(model.state_dict())
  best_acc = 0.0

  for epoch_i in range(n_epochs):
    print()
    print('========== Start Epoch {:} / {:} =========='.format(epoch_i + 1, n_epochs))

    # Measure the training time per epoch
    t0 = time.time()

    # Each epoch has a training and validation phase
    for phase in ["train", "val"]:
      if phase == "train":
        print("Training...")
        model.train() # Set model to training mode
      else:
        print("Running Validation...")
        model.eval() # Set model to evaluate mode

      run_loss = 0.0
      run_corrects = 0

      # Iterate over data
      for inputs, labels in dataloaders[phase]:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward
        # Track history if only in train
        with torch.set_grad_enabled(phase == "train"):
          outputs = model(inputs)
          _, preds = torch.max(outputs, 1)
          loss = criterion(outputs, labels)

          # If in training phase, backward + optimize
          if phase == "train":
            loss.backward()
            optimizer.step()

        # Statistics (kept as Python numbers so the results are plain floats)
        run_loss += loss.item() * inputs.size(0)
        run_corrects += torch.sum(preds == labels).item()

      if phase == "train":
        scheduler.step()

      epoch_loss = run_loss / dataset_sizes[phase]
      epoch_acc = run_corrects / dataset_sizes[phase]

      print("{} Loss: {:.4f} | Acc: {:.4f}".format("Training" if phase == "train" else "Validation", epoch_loss, epoch_acc))

      # Keep a frozen copy of the weights whenever validation accuracy improves
      if phase == "val" and epoch_acc > best_acc:
        best_acc = epoch_acc
        best_model = copy.deepcopy(model.state_dict())

    print("Epoch took {:}".format(format_time(time.time() - t0)))
    print('=========== End Epoch {:} / {:} ==========='.format(epoch_i + 1, n_epochs))

  return best_model, best_acc
  
  
def train_model(
    model,
    dataset,
    criterion,
    optimizer,
    scheduler,
    n_epochs=25,
    n_splits=5,
    batch_size=32):
  '''
    Train a model with stratified k-fold cross-validation.

    Every fold starts from the same initial model / optimizer / scheduler state,
    so the validation accuracy of a fold is never influenced by weights that
    were trained on that fold's validation samples in an earlier fold.

    Parameters:
      model: the network to train (modified in place).
      dataset: object exposing frame_names and frame_labels, indexable by Subset.
      criterion: loss function forwarded to run_epochs.
      optimizer: optimizer forwarded to run_epochs.
      scheduler: learning-rate scheduler forwarded to run_epochs.
      n_epochs: epochs per fold.
      n_splits: number of cross-validation folds.
      batch_size: batch size of both data loaders.

    Returns:
      The same model object, loaded with the weights of the fold that reached
      the highest validation accuracy.
  '''
  kfold = StratifiedKFold(n_splits=n_splits)
  phases = ["train", "val"]

  # Measure the total training time for the whole run
  total_t0 = time.time()

  # Snapshots used to reset everything at the start of each fold. Deep copies
  # are required because state_dict() returns references to the live tensors.
  initial_model_state = copy.deepcopy(model.state_dict())
  initial_optimizer_state = copy.deepcopy(optimizer.state_dict())
  initial_scheduler_state = copy.deepcopy(scheduler.state_dict())

  # Per-fold accuracies, plus the single best set of weights seen so far
  # (keeping only one copy avoids holding n_splits full state dicts in memory).
  accs = []
  best_overall_acc, best_overall_model = float("-inf"), None

  for k, (train_i, val_i) in enumerate(kfold.split(dataset.frame_names, dataset.frame_labels)):
    print()
    print('==================== Start Fold {:} / {:} ===================='.format(k + 1, n_splits))

    # Measure the training time per fold
    t0 = time.time()

    # Reset to the initial state so folds are independent of each other
    model.load_state_dict(initial_model_state)
    optimizer.load_state_dict(copy.deepcopy(initial_optimizer_state))
    scheduler.load_state_dict(copy.deepcopy(initial_scheduler_state))

    indices = { "train": train_i, "val": val_i }
    # NOTE(review): SubsetDataset is not defined or imported in the visible part
    # of this file — confirm where it comes from.
    datasets = { x: SubsetDataset(subset=Subset(dataset, indices[x]), transform=data_transforms[x]) for x in phases }
    # Shuffle only the training data; validation order does not matter
    dataloaders = { x: DataLoader(dataset=datasets[x], batch_size=batch_size, shuffle=(x == "train")) for x in phases }
    dataset_sizes = { x: len(datasets[x]) for x in phases }

    best_model, best_acc = run_epochs(model, dataloaders, dataset_sizes, criterion, optimizer, scheduler, n_epochs)
    # Normalise to a plain float so max / mean work whatever run_epochs returns
    best_acc = float(best_acc)
    accs.append(best_acc)

    # Strict ">" keeps the earliest fold on ties
    if best_acc > best_overall_acc:
      best_overall_acc = best_acc
      best_overall_model = copy.deepcopy(best_model)

    print("Fold took {:}".format(format_time(time.time() - t0)))
    print('===================== End Fold {:} / {:} ====================='.format(k + 1, n_splits))
    print()

  mean_acc = mean(accs)

  print("Training complete!")
  print("Total training took {:}".format(format_time(time.time() - total_t0)))
  print("Best Overall Acc: {:.4f} | Average Acc: {:.4f}".format(best_overall_acc, mean_acc))
  print("Loading best model...")

  # Load best model
  model.load_state_dict(best_overall_model)

  return model

In [None]:
# Loss function shared by every model trained in this notebook.
criterion = nn.CrossEntropyLoss()

def get_optimizer(model):
  '''
    Build an SGD optimizer (lr=0.001, momentum=0.9) over the trainable
    parameters of model; parameters with requires_grad=False are left out.
  '''
  trainable = [p for p in model.parameters() if p.requires_grad]
  return optim.SGD(params=trainable, lr=0.001, momentum=0.9)

def get_scheduler(optimizer):
  '''
    Build a step scheduler that multiplies the learning rate of optimizer
    by 0.1 every 7 epochs.
  '''
  decay_every, decay_factor = 7, 0.1
  step_lr = optim.lr_scheduler.StepLR(optimizer, step_size=decay_every, gamma=decay_factor)
  return step_lr

### ResNet50

In [None]:
resnet = models.resnet50(weights=WEIGHTS)

# Freeze the pretrained layers; only the new classification layer is trained
for params in resnet.parameters():
  params.requires_grad = False

# Parameters of newly constructed modules have requires_grad=True by default
n_features = resnet.fc.in_features
resnet.fc = nn.Linear(n_features, N_CLASSES)

# The training loop moves every batch to `device`, so the model has to live
# there too; do it before the optimizer is built from the model's parameters.
resnet = resnet.to(device)

optimizer = get_optimizer(resnet)
scheduler = get_scheduler(optimizer)

In [None]:
# Cross-validated training of the ResNet50 model on the p2k dataset;
# n_epochs is left at train_model's default.
resnet = train_model(
  model=resnet,
  dataset=p2k,
  criterion=criterion,
  optimizer=optimizer,
  scheduler=scheduler
)

### DenseNet121

In [None]:
densenet = models.densenet121(weights=WEIGHTS)

# Freeze the pretrained layers; only the new classification layer is trained
for params in densenet.parameters():
  params.requires_grad = False

# Parameters of newly constructed modules have requires_grad=True by default
n_features = densenet.classifier.in_features
densenet.classifier = nn.Linear(n_features, N_CLASSES)

# The training loop moves every batch to `device`, so the model has to live
# there too; do it before the optimizer is built from the model's parameters.
densenet = densenet.to(device)

optimizer = get_optimizer(densenet)
scheduler = get_scheduler(optimizer)

In [None]:
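# TODO: this call does not pass the required `dataset` argument of train_model
# (compare the ResNet50 call); add it before enabling this cell.
# densenet = train_model(densenet, criterion, optimizer, scheduler)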

### VGG16

In [None]:
vgg = models.vgg16(weights=WEIGHTS)

# Freeze the pretrained VGG layers (this loop must iterate over vgg, not
# another model, or VGG would be trained end to end)
for params in vgg.parameters():
  params.requires_grad = False

# Replace the last classifier layer with a new one sized for N_CLASSES.
# Parameters of newly constructed modules have requires_grad=True by default.
n_features = vgg.classifier[6].in_features
features = list(vgg.classifier.children())[:-1]
features.extend([nn.Linear(n_features, N_CLASSES)])
vgg.classifier = nn.Sequential(*features)

# The training loop moves every batch to `device`, so the model has to live
# there too; do it before the optimizer is built from the model's parameters.
vgg = vgg.to(device)

optimizer = get_optimizer(vgg)
scheduler = get_scheduler(optimizer)

In [None]:
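# TODO: this call does not pass the required `dataset` argument of train_model
# (compare the ResNet50 call); add it before enabling this cell.
# vgg = train_model(vgg, criterion, optimizer, scheduler)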