In [1]:
import os
import time
import argparse
import numpy as np
from tqdm import tqdm
import seaborn as sns

import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from sklearn.metrics import roc_auc_score, average_precision_score

from model import dataloader
from model import create_dpnet

from utils import *

from sklearn.metrics import make_scorer, roc_curve
from scipy.optimize import brentq
from scipy.interpolate import interp1d

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(0)
np.random.seed(0)

def calculate_eer(y_true, y_score):
    """Return the equal error rate of a binary scorer.

    The ROC curve of ``y_score`` against ``y_true`` (positive label ``1``) is
    interpolated as TPR over FPR, and the root of ``1 - x - TPR(x)`` is
    located on ``[0, 1]`` with Brent's method.  The returned ``x`` is the
    false-positive rate at which it equals the false-negative rate
    ``1 - TPR(x)``.

    Args:
        y_true: ground-truth binary labels.
        y_score: scores for the positive class; higher means more positive.

    Returns:
        The equal error rate as a float.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_score, pos_label=1)
    tpr_at = interp1d(false_pos_rate, true_pos_rate)

    def residual(rate):
        # Zero exactly where FPR == FNR (FNR = 1 - TPR).
        return 1. - rate - tpr_at(rate)

    return brentq(residual, 0., 1.)
    
class DPNet():
    """Evaluation wrapper around the network returned by ``create_dpnet``.

    Builds the model (optionally restoring a checkpoint), runs it over the
    validation or test loader, sums the per-frame outputs of every video and
    reports video-level loss, accuracy, AUC, partial AUC (FPR <= 0.1) and EER.
    """

    # Divisor applied to each video's summed frame outputs before scoring.
    # NOTE(review): presumably the number of frames sampled per video --
    # confirm against the dataloader.
    VIDEO_PRED_DIVISOR = 100

    def __init__(self, device, log_dir, args, train_loader, val_loader, test_loader):
        """Store the run configuration and data loaders, then build the model.

        Args:
            device: torch device the model and inputs are moved to.
            log_dir: directory in which validation statistics are recorded.
            args: configuration object; this class reads ``checkpoint``,
                ``num_epochs``, ``save_predictions``, ``start_task``, ``task``
                and ``stream`` from it.
            train_loader: training loader (stored, not used by this class).
            val_loader: loader iterated by ``validate_1epoch()``.
            test_loader: loader iterated by ``test()``.
        """
        self.device = device
        self.log_dir = log_dir
        self.args = args

        self.train_loader = train_loader
        self.val_loader   = val_loader
        self.test_loader  = test_loader

        self.best_val_auc = 0
        self.counter = 0
        self.patience = 5

        self.build_model()

    def build_model(self):
        """Create the network and loss, restoring ``args.checkpoint`` if set."""
        self.model = create_dpnet(in_channel=3)
        self.model = nn.DataParallel(self.model)
        self.criterion = nn.CrossEntropyLoss()

        # Default so that validate_1epoch() can report an epoch even when no
        # checkpoint is restored (previously an AttributeError).
        self.epoch = 0
        if self.args.checkpoint:
            # Load onto the CPU first so a checkpoint saved from a GPU can be
            # restored on any host; the model is moved to self.device below.
            cp = torch.load(self.args.checkpoint, map_location='cpu')
            self.epoch = cp['epoch']
            self.model.load_state_dict(cp['state_dict'])

        self.model = self.model.to(self.device)

    def test(self):
        """Evaluate on the test loader and print the video-level statistics."""
        self.validate_1epoch(test_mode=True)

    def validate_1epoch(self, test_mode = False):
        """Run one evaluation pass and compute video-level statistics.

        Args:
            test_mode: when True iterate ``test_loader`` and print the
                statistics; otherwise iterate ``val_loader`` and hand the
                statistics to ``record_info``.

        Returns:
            Tuple ``(video_top1, video_auc, video_loss)``.
        """
        if test_mode:
            print('|--> [testing stage]')
        else:
            print('|--> Epoch:[{0}/{1}][validation stage]'.format(self.epoch+1, self.args.num_epochs))

        # Evaluate mode
        self.model.eval()
        self.dic_video_level_preds = {}

        start = time.time()
        with torch.no_grad():
            progress = tqdm(self.test_loader) if test_mode else tqdm(self.val_loader)
            # Labels from the loader are not needed here: video labels are
            # derived from the video names in frame_2_video_level_accuracy().
            for video_names, inputs, _ in progress:
                inputs = inputs.to(self.device)

                # The model returns a pair; only the first element is scored.
                outputs, _ = self.model(inputs)

                # Accumulate video level prediction
                self._accumulate_video_preds(video_names, outputs.detach().cpu().numpy())

        # Calculate video level statistics
        video_top1, video_auc, video_loss, video_pauc_10, video_eer = self.frame_2_video_level_accuracy()

        info = {'Epoch': [self.epoch],
                'Time':  [round(time.time()-start,3)],
                'Loss':  [round(video_loss,5)],
                'Acc':   [round(video_top1,4)],
                'AUC':   [round(video_auc,4)],
                'pAUC_10':    [round(video_pauc_10,4)],
                'EER':   [round(video_eer,4)]}
        if test_mode:
            print(info)
        else:
            # NOTE(review): validation statistics are written to 'test.csv' --
            # confirm the file name is intended.
            record_info(info, os.path.join(self.log_dir, 'test.csv'))
        return video_top1, video_auc, video_loss

    def _accumulate_video_preds(self, video_names, preds):
        """Add each frame's output row to the running sum of its video.

        Args:
            video_names: sequence of video names, one per row of ``preds``.
            preds: 2-D numpy array of per-frame model outputs.
        """
        for video_name, frame_pred in zip(video_names, preds):
            if video_name in self.dic_video_level_preds:
                self.dic_video_level_preds[video_name] += frame_pred
            else:
                # Copy the row: a bare row is a view into ``preds``, so later
                # in-place additions would write into the batch array and keep
                # the whole batch alive for as long as the dictionary exists.
                self.dic_video_level_preds[video_name] = frame_pred.copy()

    def frame_2_video_level_accuracy(self):
        """Turn the accumulated per-video sums into video-level metrics.

        A video is labelled 1 when its name contains ``'FAKE'`` and 0
        otherwise.  When ``args.save_predictions`` is set, the labels and
        scaled predictions are also written under ``predictions/``.

        Returns:
            Tuple ``(top1, auc, loss, pauc_10, eer)``.
        """
        video_names = sorted(self.dic_video_level_preds.keys())
        video_level_preds = np.zeros((len(video_names), 2))
        video_level_labels = np.zeros(len(video_names))

        for row, name in enumerate(video_names):
            video_level_preds[row,:] = self.dic_video_level_preds[name] / self.VIDEO_PRED_DIVISOR
            video_level_labels[row] = 1.0 if 'FAKE' in name else 0.0

        if self.args.save_predictions:
            os.makedirs('predictions', exist_ok=True)
            # Context managers close the files; the bare open() calls used
            # previously leaked both handles.
            with open(f'predictions/{self.args.start_task}_{self.args.task}_labels_{self.args.stream}.npy','wb') as labels_file:
                np.save(labels_file, video_level_labels)
            with open(f'predictions/{self.args.start_task}_{self.args.task}_preds_{self.args.stream}.npy','wb') as preds_file:
                np.save(preds_file, video_level_preds)

        video_level_labels = torch.from_numpy(video_level_labels).long()
        video_level_preds = torch.from_numpy(video_level_preds).float()

        top1 = accuracy(video_level_preds, video_level_labels, topk=(1,))
        loss = self.criterion(video_level_preds, video_level_labels)

        # Softmax probability of class 1 is the ranking score for the ROC metrics.
        fake_probs = nn.functional.softmax(video_level_preds, dim=1)[:, 1].numpy()
        auc = roc_auc_score(video_level_labels, fake_probs)
        pauc_10 = roc_auc_score(video_level_labels, fake_probs, max_fpr=0.1)
        eer = calculate_eer(video_level_labels, fake_probs)

        return top1.item(), auc, loss.item(), pauc_10, eer

In [2]:
# Cross-manipulation evaluation: the checkpoint trained on each manipulation
# type is tested on every manipulation type and on the combined FF++ split.
# NOTE(review): `pd` is not imported explicitly in this notebook; it is
# presumably re-exported by `from utils import *` -- confirm, or add
# `import pandas as pd` to the import cell.
SOURCE_TASKS = ['Deepfakes', 'Face2Face', 'FaceSwap', 'NeuralTextures']
TARGET_TASKS = ['Deepfakes', 'Face2Face', 'FaceSwap', 'NeuralTextures', 'FF++']


class Args:
    """Run configuration; start_task, task and checkpoint are set per run."""
    gpu = '2,3'
    start_task = None
    task = None
    num_workers = 8
    num_epochs = 20
    batch_size = 32
    learning_rate = 2e-4
    stream = 'rgb'
    checkpoint = ''
    save_predictions = False


# Resolve the stream once.  An unknown stream used to fall through both
# if/elif chains, leaving the checkpoint empty and `data_loader` undefined
# (or stale from an earlier notebook run); now it fails loudly.
if Args.stream == 'rgb':
    checkpoint_root = 'record'
    loader_cls = dataloader.SingleImageLoader
elif Args.stream == 'luminance':
    checkpoint_root = 'record/luminance'
    loader_cls = dataloader.LuminanceGradientImageLoader
elif Args.stream == 'sharpened':
    checkpoint_root = 'record/sharpened'
    loader_cls = dataloader.SharpenedImageLoader
else:
    raise ValueError(f'Unknown stream: {Args.stream!r}')

# Loop-invariant setup, done once instead of on each of the 20 iterations.
os.environ["CUDA_VISIBLE_DEVICES"] = Args.gpu
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
log_dir = ''

train_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
test_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

# Machine-specific absolute path; adjust when running elsewhere.
data_dir = '/meladyfs/newyork/loctrinh/DATASETS/'

for source_task in SOURCE_TASKS:
    for target_task in TARGET_TASKS:
        print(f'{source_task} to {target_task}')

        args = Args()
        args.start_task = source_task
        args.task = target_task
        args.checkpoint = f'{checkpoint_root}/{source_task}/seed_1/best_val_checkpoint.pth'

        # Re-read on every run so each loader receives its own fresh frames.
        frame_count = {'FF++': pd.read_csv(os.path.join(data_dir, 'FF++', 'video_stat.csv'), index_col=0)}
        train_df = pd.read_csv(os.path.join(data_dir, '{}/splits/{}_trainlist_01.csv'.format('FF++', args.task)))
        val_df = pd.read_csv(os.path.join(data_dir, '{}/splits/{}_vallist_01.csv'.format('FF++', args.task)))
        test_df = pd.read_csv(os.path.join(data_dir, '{}/splits/{}_testlist_01.csv'.format('FF++', args.task)))

        data_loader = loader_cls(args.batch_size, args.num_workers, data_dir, frame_count,
                                 train_df, val_df, test_df, train_transform, test_transform)
        train_loader, val_loader, test_loader, push_loader = data_loader.run()

        # =================== Evaluation ===================
        detector = DPNet(device=device,
                         log_dir=log_dir,
                         args=args,
                         train_loader=train_loader,
                         val_loader=val_loader,
                         test_loader=test_loader)
        detector.test()

Deepfakes to Deepfakes
==> Training data: 388800 frames
==> Validation data: 28000 frames
==> Testing data: 28000 frames
==> Pushing data: 144000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/875 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 875/875 [02:46<00:00,  4.96it/s]


{'Epoch': [2], 'Time': [166.137], 'Loss': [0.01319], 'Acc': [99.2857], 'AUC': [0.9999], 'pAUC_10': [0.9995], 'EER': [0.014]}
Deepfakes to Face2Face
==> Training data: 388800 frames
==> Validation data: 28000 frames
==> Testing data: 28000 frames
==> Pushing data: 144000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/875 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 875/875 [02:54<00:00,  5.29it/s]


{'Epoch': [2], 'Time': [174.602], 'Loss': [4.50846], 'Acc': [52.1429], 'AUC': [0.8024], 'pAUC_10': [0.598], 'EER': [0.2657]}
Deepfakes to FaceSwap
==> Training data: 388800 frames
==> Validation data: 28000 frames
==> Testing data: 28000 frames
==> Pushing data: 144000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/875 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 875/875 [02:57<00:00,  5.09it/s]


{'Epoch': [2], 'Time': [177.525], 'Loss': [11.47503], 'Acc': [49.6429], 'AUC': [0.279], 'pAUC_10': [0.4753], 'EER': [0.6571]}
Deepfakes to NeuralTextures
==> Training data: 388800 frames
==> Validation data: 28000 frames
==> Testing data: 28000 frames
==> Pushing data: 144000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/875 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 875/875 [02:59<00:00,  5.01it/s]


{'Epoch': [2], 'Time': [179.285], 'Loss': [2.904], 'Acc': [58.2143], 'AUC': [0.8765], 'pAUC_10': [0.7081], 'EER': [0.1643]}
Deepfakes to FF++
==> Training data: 972000 frames
==> Validation data: 70000 frames
==> Testing data: 70000 frames
==> Pushing data: 360000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/2188 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 2188/2188 [07:35<00:00,  5.35it/s]


{'Epoch': [2], 'Time': [455.352], 'Loss': [7.54921], 'Acc': [44.1429], 'AUC': [0.7395], 'pAUC_10': [0.6952], 'EER': [0.3116]}
Face2Face to Deepfakes
==> Training data: 388800 frames
==> Validation data: 28000 frames
==> Testing data: 28000 frames
==> Pushing data: 144000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/875 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 875/875 [02:58<00:00,  5.17it/s]


{'Epoch': [2], 'Time': [178.642], 'Loss': [3.6989], 'Acc': [50.3571], 'AUC': [0.8223], 'pAUC_10': [0.6485], 'EER': [0.2571]}
Face2Face to Face2Face
==> Training data: 388800 frames
==> Validation data: 28000 frames
==> Testing data: 28000 frames
==> Pushing data: 144000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/875 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 875/875 [02:59<00:00,  5.09it/s]


{'Epoch': [2], 'Time': [179.335], 'Loss': [0.06816], 'Acc': [99.2857], 'AUC': [0.9952], 'pAUC_10': [0.9925], 'EER': [0.0143]}
Face2Face to FaceSwap
==> Training data: 388800 frames
==> Validation data: 28000 frames
==> Testing data: 28000 frames
==> Pushing data: 144000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/875 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 875/875 [02:59<00:00,  5.15it/s]


{'Epoch': [2], 'Time': [179.401], 'Loss': [5.01213], 'Acc': [50.0], 'AUC': [0.4766], 'pAUC_10': [0.4946], 'EER': [0.5286]}
Face2Face to NeuralTextures
==> Training data: 388800 frames
==> Validation data: 28000 frames
==> Testing data: 28000 frames
==> Pushing data: 144000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/875 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 875/875 [02:59<00:00,  5.09it/s]


{'Epoch': [2], 'Time': [179.24], 'Loss': [4.16911], 'Acc': [50.0], 'AUC': [0.7223], 'pAUC_10': [0.5897], 'EER': [0.3429]}
Face2Face to FF++
==> Training data: 972000 frames
==> Validation data: 70000 frames
==> Testing data: 70000 frames
==> Pushing data: 360000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/2188 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 2188/2188 [07:36<00:00,  5.43it/s]


{'Epoch': [2], 'Time': [456.205], 'Loss': [5.17916], 'Acc': [39.8571], 'AUC': [0.7541], 'pAUC_10': [0.6813], 'EER': [0.3143]}
FaceSwap to Deepfakes
==> Training data: 388800 frames
==> Validation data: 28000 frames
==> Testing data: 28000 frames
==> Pushing data: 144000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/875 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 875/875 [02:59<00:00,  4.87it/s]


{'Epoch': [1], 'Time': [179.245], 'Loss': [9.28957], 'Acc': [50.0], 'AUC': [0.537], 'pAUC_10': [0.5405], 'EER': [0.4714]}
FaceSwap to Face2Face
==> Training data: 388800 frames
==> Validation data: 28000 frames
==> Testing data: 28000 frames
==> Pushing data: 144000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/875 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 875/875 [02:59<00:00,  4.94it/s]


{'Epoch': [1], 'Time': [179.588], 'Loss': [7.51737], 'Acc': [50.0], 'AUC': [0.6975], 'pAUC_10': [0.5902], 'EER': [0.3554]}
FaceSwap to FaceSwap
==> Training data: 388800 frames
==> Validation data: 28000 frames
==> Testing data: 28000 frames
==> Pushing data: 144000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/875 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 875/875 [02:59<00:00,  5.02it/s]


{'Epoch': [1], 'Time': [179.419], 'Loss': [0.09711], 'Acc': [99.2857], 'AUC': [0.994], 'pAUC_10': [0.9962], 'EER': [0.0071]}
FaceSwap to NeuralTextures
==> Training data: 388800 frames
==> Validation data: 28000 frames
==> Testing data: 28000 frames
==> Pushing data: 144000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/875 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 875/875 [02:59<00:00,  4.94it/s]


{'Epoch': [1], 'Time': [179.548], 'Loss': [10.88944], 'Acc': [50.0], 'AUC': [0.416], 'pAUC_10': [0.4868], 'EER': [0.5786]}
FaceSwap to FF++
==> Training data: 972000 frames
==> Validation data: 70000 frames
==> Testing data: 70000 frames
==> Pushing data: 360000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/2188 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 2188/2188 [07:36<00:00,  4.97it/s]


{'Epoch': [1], 'Time': [456.449], 'Loss': [11.11734], 'Acc': [39.7143], 'AUC': [0.6611], 'pAUC_10': [0.6535], 'EER': [0.4]}
NeuralTextures to Deepfakes
==> Training data: 388800 frames
==> Validation data: 28000 frames
==> Testing data: 28000 frames
==> Pushing data: 144000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/875 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 875/875 [02:59<00:00,  4.79it/s]


{'Epoch': [2], 'Time': [179.279], 'Loss': [1.30951], 'Acc': [67.5], 'AUC': [0.9072], 'pAUC_10': [0.6955], 'EER': [0.1571]}
NeuralTextures to Face2Face
==> Training data: 388800 frames
==> Validation data: 28000 frames
==> Testing data: 28000 frames
==> Pushing data: 144000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/875 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 875/875 [02:59<00:00,  5.12it/s]


{'Epoch': [2], 'Time': [179.794], 'Loss': [3.51625], 'Acc': [53.2143], 'AUC': [0.6924], 'pAUC_10': [0.5454], 'EER': [0.35]}
NeuralTextures to FaceSwap
==> Training data: 388800 frames
==> Validation data: 28000 frames
==> Testing data: 28000 frames
==> Pushing data: 144000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/875 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 875/875 [03:00<00:00,  5.12it/s]


{'Epoch': [2], 'Time': [180.151], 'Loss': [5.50233], 'Acc': [48.5714], 'AUC': [0.4438], 'pAUC_10': [0.4839], 'EER': [0.5286]}
NeuralTextures to NeuralTextures
==> Training data: 388800 frames
==> Validation data: 28000 frames
==> Testing data: 28000 frames
==> Pushing data: 144000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/875 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 875/875 [02:59<00:00,  4.72it/s]


{'Epoch': [2], 'Time': [179.435], 'Loss': [0.23931], 'Acc': [95.7143], 'AUC': [0.9818], 'pAUC_10': [0.9637], 'EER': [0.0429]}
NeuralTextures to FF++
==> Training data: 972000 frames
==> Validation data: 70000 frames
==> Testing data: 70000 frames
==> Pushing data: 360000 frames
==> Loading pretrained model model/pretrained_models/hrnetv2_w48_imagenet_pretrained.pth


  0%|          | 0/2188 [00:00<?, ?it/s]

|--> [testing stage]


100%|██████████| 2188/2188 [07:36<00:00,  5.12it/s]

{'Epoch': [2], 'Time': [456.915], 'Loss': [4.13182], 'Acc': [48.5714], 'AUC': [0.7563], 'pAUC_10': [0.6721], 'EER': [0.2995]}



