In [1]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms, models
from torchvision.datasets import ImageFolder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from tensorboardX import SummaryWriter
import time
import numpy as np
from PIL import Image
from typing import List
StringList = List[str]
from glob import glob
import os
import torchvision.datasets.folder

In [2]:
# Configuration Variables
# Every tunable value of the notebook lives in this cell.

# --- Preprocessing ---------------------------------------------------------
# Per-channel statistics passed to transforms.Normalize.
# NOTE(review): presumably the standard ImageNet mean/std used by torchvision's
# pretrained models - confirm.
MEAN_PREPRO = [0.485, 0.456, 0.406]
STD_PREPRO = [0.229, 0.224, 0.225]
# Target size for the model input and for the un-normalised "draw" pipeline.
RESIZE_PREPRO = (256, 256)
RESIZE_DRAW = (256, 256)

# --- Training DataLoader ---------------------------------------------------
TRAIN_BATCH_SIZE = 128
TRAIN_SHUFFLE = True
TRAIN_NUM_WORKERS = 8
TRAIN_PIN_MEMORY = True

# --- Validation DataLoader -------------------------------------------------
VAL_BATCH_SIZE = 512
VAL_SHUFFLE = False
VAL_NUM_WORKERS = 4
VAL_PIN_MEMORY = True

# --- Optimisation / hardware -----------------------------------------------
INITIAL_LR = 1e-4
# Index of the CUDA device the model is moved to (used as f"cuda:{DEVICE_ID}").
DEVICE_ID = 1

# --- Paths -----------------------------------------------------------------
DATA_DIR = '/data/porn/binary/'
TRAINSET_ROOT_NSFW = f'{DATA_DIR}train/nsfw'
TRAINSET_ROOT_SAFE = f'{DATA_DIR}train/safe'
TESTSET_ROOT_NSFW = f'{DATA_DIR}test/nsfw'
TESTSET_ROOT_SAFE = f'{DATA_DIR}test/safe'

# Directory handed to the TensorBoard SummaryWriter.
TENSORBOARD_DIR = '/data/porn/tensorboard/regression-resnnext/'

In [3]:
# Channel-wise normalisation shared by the training and validation pipelines.
normalize = transforms.Normalize(mean=MEAN_PREPRO, std=STD_PREPRO)

# Training pipeline: random crop + horizontal flip augmentation, then tensor
# conversion and normalisation. The crop size comes from RESIZE_PREPRO (instead
# of a hard-coded 256) so training and validation inputs cannot drift apart.
prepro = transforms.Compose(
    [
        transforms.RandomResizedCrop(RESIZE_PREPRO),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ]
)

# Validation pipeline: deterministic resize, no augmentation.
prepro_val = transforms.Compose(
    [
        transforms.Resize(RESIZE_PREPRO),
        transforms.ToTensor(),
        normalize,
    ]
)

# Visualisation pipeline: resize only, without normalisation.
prepro_draw = transforms.Compose(
    [
        transforms.Resize(RESIZE_DRAW),
        transforms.ToTensor(),
    ]
)

In [4]:
class BinaryDataset(Dataset):
    """Two-class image dataset built from folders of ``*.jpg`` files.

    Samples are ordered as all positives (label ``1``) followed by all
    negatives (label ``0``). Within each folder the files are sorted by path,
    so a given dataset index refers to the same file on every run.

    Args:
        positiveFolders: folders whose ``*.jpg`` files get label ``1``.
        negativeFolders: folders whose ``*.jpg`` files get label ``0``.
        transform: callable applied to the loaded image.
        show_index: when true, ``__getitem__`` also returns the dataset index
            of the sample, i.e. a value ``i`` such that ``dataset[i]`` is that
            same sample.
    """

    def __init__(self, positiveFolders: StringList, negativeFolders: StringList, transform: transforms.Compose, show_index=False):
        self.show_index = show_index
        self.transform = transform
        self.positives = self._list_images(positiveFolders)
        self.negatives = self._list_images(negativeFolders)

    @staticmethod
    def _list_images(folders):
        """Return the ``*.jpg`` paths found directly inside each of ``folders``."""
        paths = []
        for folder in folders:
            # glob() yields files in arbitrary filesystem order; sorting keeps
            # the index -> file mapping reproducible.
            paths.extend(sorted(glob(os.path.join(folder, "*.jpg"))))
        return paths

    def __len__(self):
        return len(self.positives) + len(self.negatives)

    def __str__(self):
        return f"""
Binary Dataset:
    Positive Examples:{len(self.positives)}
    Negative Examples:{len(self.negatives)}
    """

    def __repr__(self):
        return str(self)

    def _locate(self, index):
        """Map a (possibly negative) index to ``(path, label, dataset_index)``.

        Raises:
            IndexError: if ``index`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        position = index + size if index < 0 else index
        # Reject out-of-range values explicitly; otherwise an index below
        # -len(self) would silently wrap around inside the positives list.
        if not 0 <= position < size:
            raise IndexError(f"index {index} is out of range for a dataset of size {size}")
        n_positives = len(self.positives)
        if position < n_positives:
            return self.positives[position], 1, position
        return self.negatives[position - n_positives], 0, position

    def __getitem__(self, index):
        """Load and transform one sample.

        Returns:
            ``(image, label)``, or ``(image, label, dataset_index)`` when
            ``show_index`` is set.
        """
        image_path, cl, position = self._locate(index)
        im = torchvision.datasets.folder.default_loader(image_path)
        im = self.transform(im)
        if self.show_index:
            # Return the dataset-level index (not the offset inside the
            # negatives list) so that it identifies the sample unambiguously.
            return im, cl, position
        return im, cl

In [5]:
# NOTE: the first argument of BinaryDataset is the *positive* class, so "safe"
# images get label 1 and "nsfw" images get label 0.
train_dataset = BinaryDataset([TRAINSET_ROOT_SAFE], [TRAINSET_ROOT_NSFW], prepro)
# Validation must use the deterministic pipeline (prepro_val); evaluating with
# the random crop/flip augmentation of `prepro` makes the metrics noisy and
# not comparable between epochs.
val_dataset = BinaryDataset([TESTSET_ROOT_SAFE], [TESTSET_ROOT_NSFW], prepro_val, show_index=True)

In [6]:
# Training loader: incomplete final batches are dropped. pin_memory now comes
# from the TRAIN_PIN_MEMORY setting instead of a literal.
train_loader = DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=TRAIN_SHUFFLE,
    num_workers=TRAIN_NUM_WORKERS,
    pin_memory=TRAIN_PIN_MEMORY,
    drop_last=True,
)
# Validation loader: every sample is kept (drop_last=False).
val_loader = DataLoader(
    val_dataset,
    batch_size=VAL_BATCH_SIZE,
    shuffle=VAL_SHUFFLE,
    num_workers=VAL_NUM_WORKERS,
    pin_memory=VAL_PIN_MEMORY,
    drop_last=False,
)

In [7]:
model = models.resnext50_32x4d(pretrained=True)
# Replace the classification head with a single-output layer. The input width
# is read from the existing head rather than hard-coded.
model.fc = torch.nn.Linear(model.fc.in_features, 1)

# Use the configured GPU when it exists, otherwise fall back to the CPU so the
# notebook still runs on machines with fewer (or no) CUDA devices.
if torch.cuda.is_available() and DEVICE_ID < torch.cuda.device_count():
    DEVICE = f"cuda:{DEVICE_ID}"
else:
    DEVICE = "cpu"
    print(f"CUDA device {DEVICE_ID} is not available, falling back to CPU")
model = model.to(DEVICE)

# Freeze everything, then re-enable gradients for the new head only.
for p in model.parameters():
    p.requires_grad = False
for p in model.fc.parameters():
    p.requires_grad = True

Downloading: "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth" to /root/.cache/torch/checkpoints/resnext50_32x4d-7cdf4587.pth


HBox(children=(IntProgress(value=0, max=100441675), HTML(value='')))




In [8]:
# Adam only receives the parameters that currently require gradients.
optimizer = optim.Adam(
    filter(lambda param: param.requires_grad, model.parameters()),
    lr=INITIAL_LR,
)
# Mean absolute error between the model output and the 0/1 label.
criterion = nn.L1Loss(reduction="mean")

In [9]:
def train(
    train_loader,
    model,
    criterion,
    optimizer,
    epoch,
    on_iteration=None,
):
    """Run one training epoch and return the mean per-sample loss.

    Args:
        train_loader: sized iterable yielding ``(inputs, labels)`` batches.
        model: module to optimise; batches are moved to the device of its
            parameters.
        criterion: loss called as ``criterion(outputs, targets)``.
            Assumes it returns the mean over the batch (the default
            ``reduction`` of torch losses) - TODO confirm if another
            reduction is ever used.
        optimizer: optimizer stepped once per batch.
        epoch: zero-based epoch number, used to compute the global iteration.
        on_iteration: optional callback invoked after every step with keyword
            arguments ``iteration``, ``loss``, ``y_pred`` and ``y_true``.

    Returns:
        Loss averaged over all samples seen during the epoch.

    Raises:
        ValueError: if ``train_loader`` yields no samples.
    """
    model = model.train()
    # Follow the model's own device instead of relying on a notebook global.
    device = next(model.parameters()).device
    n_batches = len(train_loader)
    print("Start Training")
    loss_sum, n_samples = 0.0, 0
    for i, (inputs, labels) in enumerate(train_loader):
        print(f"{i / n_batches * 100 : 2.2f}%", end="\r")
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        # Labels are collated as integers; the loss needs targets with the
        # same dtype and shape (N, 1) as the model output.
        targets = labels.unsqueeze(1).to(outputs.dtype)
        loss = criterion(outputs, targets)
        # `loss` is already a batch mean: weight it by the batch size and
        # divide by the sample count at the end. (Dividing by the batch size
        # here, as before, scaled the reported average by 1/batch_size.)
        batch_size = len(outputs)
        loss_sum += loss.item() * batch_size
        n_samples += batch_size
        loss.backward()
        optimizer.step()
        if on_iteration is not None:
            on_iteration(iteration=i + epoch * n_batches, loss=loss, y_pred=outputs, y_true=labels)
    if n_samples == 0:
        raise ValueError("train_loader yielded no samples")
    return loss_sum / n_samples

In [10]:
def validate(val_loader,
             model,
             criterion,
             print_freq=1000):
    """Evaluate ``model`` on ``val_loader`` without computing gradients.

    Args:
        val_loader: iterable yielding ``(inputs, labels, image_indexes)``
            batches.
        model: module to evaluate; batches are moved to the device of its
            parameters.
        criterion: loss called as ``criterion(outputs, targets)``.
            Assumes it returns the mean over the batch - TODO confirm if
            another reduction is ever used.
        print_freq: currently unused; kept for interface compatibility.

    Returns:
        dict with
        ``"loss"``: loss averaged over all samples,
        ``"ground_truth"``: labels, shape ``(N, 1)``,
        ``"probabilities"``: the raw model outputs, shape ``(N, 1)``
        (they are not squashed into ``[0, 1]`` here),
        ``"images_index"``: the indexes yielded by the loader, as int64.

    Raises:
        ValueError: if ``val_loader`` yields no samples.
    """
    model = model.eval()
    # Follow the model's own device instead of relying on a notebook global.
    device = next(model.parameters()).device
    y_true, proba_pred, indexes = [], [], []
    loss_sum, n_samples = 0.0, 0
    for inputs, labels, image_indexes in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        labels = labels.unsqueeze(1)
        with torch.no_grad():
            outputs = model(inputs)
            # The loss needs float targets; the integer labels are what gets
            # returned as ground truth.
            batch_loss = criterion(outputs, labels.to(outputs.dtype)).item()
        # Weight the batch mean by the batch size so that the result is a true
        # per-sample average, independent of batch size and of a smaller
        # final batch.
        batch_size = len(outputs)
        loss_sum += batch_loss * batch_size
        n_samples += batch_size
        proba_pred.append(outputs.cpu().clone())
        y_true.append(labels.cpu().clone())
        indexes.append(image_indexes.clone())
    if n_samples == 0:
        raise ValueError("val_loader yielded no samples")
    return {"loss": loss_sum / n_samples,
            "ground_truth": torch.cat(y_true),
            "probabilities": torch.cat(proba_pred),
            "images_index": torch.cat(indexes).long()}

In [11]:
def validation_logs(epoch, validation_res):
    """Write the validation loss and a precision/recall curve to TensorBoard.

    Args:
        epoch: global step under which the values are logged.
        validation_res: dict with the keys ``"loss"``, ``"ground_truth"`` and
            ``"probabilities"`` (as returned by ``validate``).
    """
    logger.add_scalar("Loss/Avg_Val", validation_res["loss"], epoch)
    # The model output is an unbounded regression score, while the PR curve
    # expects predictions in [0, 1]; clamp so that no sample falls outside
    # the threshold range.
    scores = validation_res["probabilities"].clamp(0, 1)
    logger.add_pr_curve("Eval/Prec_recall", validation_res["ground_truth"], scores, epoch)

def on_iteration_logs(iteration, loss, y_pred, y_true):
    """Per-step training callback: log and print the loss every 200 iterations.

    Args:
        iteration: global iteration counter (continues across epochs).
        loss: loss tensor of the current step.
        y_pred: model outputs of the current step (unused).
        y_true: labels of the current step (unused).
    """
    if iteration % 200 != 0:
        return
    # Only read the value back from the tensor when it is actually logged.
    l = loss.item()
    logger.add_scalar("Loss/Train", l, iteration)
    # `iteration` is global, so split it into epoch and step-in-epoch; printing
    # it directly over the per-epoch length gave lines such as "1600/1431".
    steps_per_epoch = len(train_loader)
    epoch, step = divmod(iteration, steps_per_epoch)
    print(
            f"Epoch {epoch} {step}/{steps_per_epoch} \t"
            f"Loss {l}"
        )

In [12]:
# TensorBoard writer shared by validation_logs, on_iteration_logs and the
# training loops; it is constructed with TENSORBOARD_DIR as its log location.
logger = SummaryWriter(TENSORBOARD_DIR)

In [13]:
# Reproducibility: deterministic cuDNN kernels alone do not make runs
# repeatable, the random number generator has to be seeded as well.
# NOTE(review): this only affects randomness drawn after this cell runs
# (e.g. shuffling); move it above the model cell to also fix the
# initialisation of the new head.
SEED = 0
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# Autotuning may pick different kernels from run to run; keep it off.
torch.backends.cudnn.benchmark = False

In [None]:
# Epochs 0-9: evaluate first (so epoch 0 records the untrained baseline),
# then train for one epoch and log the average training loss.
for i in range(0, 10):
    val_res = validate(val_loader=val_loader, model=model, criterion=criterion)
    validation_logs(epoch=i, validation_res=val_res)
    loss = train(
        train_loader=train_loader,
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        epoch=i,
        on_iteration=on_iteration_logs,
    )
    logger.add_scalar("Loss/Avg_Train", loss, i)

Start Training
0/1431 	Loss 0.822343647480011
 2.87%

In [None]:
# Re-enable gradients for every top-level child module after the first seven
# (positions 0-6 stay frozen).
for i, child in enumerate(model.children()):
    if i <= 6:
        continue
    for p in child.parameters():
        p.requires_grad_(True)

In [None]:
# Second phase: more parameters are trainable, so halve the batch size and
# lower the learning rate by 10x. pin_memory now comes from the
# TRAIN_PIN_MEMORY setting instead of a literal.
train_loader = DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE // 2,
    shuffle=TRAIN_SHUFFLE,
    num_workers=TRAIN_NUM_WORKERS,
    pin_memory=TRAIN_PIN_MEMORY,
    drop_last=True,
)
# Rebuild the optimizer so that it picks up the newly unfrozen parameters.
optimizer = optim.Adam(
    [p for p in model.parameters() if p.requires_grad],
    lr=INITIAL_LR * 0.1,
)

In [None]:
# Epochs 10-39: same evaluate-then-train cycle as before, continuing the
# epoch numbering of the first loop.
for i in range(10, 40):
    val_res = validate(val_loader=val_loader, model=model, criterion=criterion)
    validation_logs(epoch=i, validation_res=val_res)
    loss = train(
        train_loader=train_loader,
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        epoch=i,
        on_iteration=on_iteration_logs,
    )
    logger.add_scalar("Loss/Avg_Train", loss, i)