In [None]:
# Mount Google Drive so the dataset, network and output paths under
# /content/drive used by the cells below are reachable (Colab-only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import glob
import sys
sys.path.append('/content/drive/My Drive/kaggle/cnn_detection/networks')
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import argparse
import random
import tqdm
import numpy as np
import torchvision.transforms as transforms
import pandas as pd
from PIL import Image
from resnet import resnet50
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models

## Hyper-parameters

In [None]:
# Input resolution handed to transforms.Resize, and the DataLoader batch size.
image_size = 224 #299
batch_size = 64
# NOTE(review): `epoch` and `hidden_dim` are not read by the visible cells,
# which pass literal values to FineTune / the model constructors — confirm.
epoch = 10
# Number of frames read per video; forwarded to the dataset and the models.
n_frames = 10
hidden_dim = 100

# Training data (batch03). The [0] index takes the first JSON found and raises
# IndexError if the folder contains none.
data_folder = '/content/drive/My Drive/kaggle/batch03'
metadata_dir = glob.glob(os.path.join(data_folder, 'dfdc_train_part_47', '*.json'))[0]
# NOTE(review): `split`, `model_path` and the metadata paths are not used by
# the visible cells — confirm whether they are still needed.
split = 0.8
model_path = '/content/drive/My Drive/kaggle/cnn_detection'

# Validation data (batch02); same first-match lookup as above.
data_folder_val = '/content/drive/My Drive/kaggle/batch02'
metadata_dir_val = glob.glob(os.path.join(data_folder_val, 'dfdc_train_part_48', '*.json'))[0]

In [None]:
# Pick the compute device once; the training/evaluation code moves every batch
# to it with `.to(device)`.
use_cuda = torch.cuda.is_available()
# torch.manual_seed(123)
if use_cuda:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


## Prepare Data

Load data from npy files

In [None]:
class VideoDatasetArray(Dataset):
    """In-memory dataset of per-video frame stacks loaded from ``.npy`` files.

    Every ``.npy`` file is read with ``allow_pickle=True`` and is expected to
    unpack (via ``.item()``) to a dict holding a ``'y'`` sequence of labels and
    ``'x0'`` ... ``'x<n_frames-1>'`` entries, where ``dict['x<i>'][k]`` is the
    i-th frame of the k-th video. A label equal to ``"FAKE"`` maps to 1, any
    other label to 0.
    """

    def __init__(self, root, n_frames, transform=None, train=True):
        """Initialize the dataset from npy files and preload it into memory.

        Args:
            - root: root directory of the data
            - n_frames: the number of frames to read for each video
            - transform: a dict with 'train' and 'val' entries (the entry
              matching `train` is used), a single callable applied to every
              PIL frame, or None to return the stored arrays untouched
            - train: if True load every ``*.npy`` directly under `root`;
              otherwise load the second file (in sorted order) found under
              ``root/face10train``

        Raises:
            IndexError: in validation mode when fewer than two ``.npy`` files
                exist under ``root/face10train``.
        """
        self.root = root
        self.n_frames = n_frames

        # Accept the notebook's {'train': ..., 'val': ...} dict as well as a
        # plain callable or None (the declared default).
        if isinstance(transform, dict):
            self.transform = transform['train' if train else 'val']
        else:
            self.transform = transform

        # glob order is filesystem-dependent; sort so the selection (and the
        # sample order) is reproducible between runs.
        if train:
            face_files = sorted(glob.glob(os.path.join(self.root, '*.npy')))
        else:
            face_dir = os.path.join(self.root, 'face10train')
            candidates = sorted(glob.glob(os.path.join(face_dir, '*.npy')))
            if len(candidates) < 2:
                raise IndexError(
                    "expected at least two .npy files in %r, found %d"
                    % (face_dir, len(candidates)))
            face_files = [candidates[1]]

        # Preload dataset to memory
        self.labels = []
        self.images = []
        print("\nPreload dataset to memory...\n")
        for face_batch in tqdm.tqdm(face_files, ncols=80):
            # NOTE: allow_pickle=True unpickles arbitrary objects; only load
            # files from a trusted source.
            batch = np.load(face_batch, allow_pickle=True).item()
            labels = batch['y']
            frame_arrays = [batch['x' + str(i)] for i in range(self.n_frames)]
            for k in range(len(labels)):
                target = 1 if labels[k] == "FAKE" else 0
                # Each frame is stored with its axes reversed (ndarray.transpose())
                # and copied so the loaded file buffer is not kept alive.
                self.images.append(
                    [frames[k].transpose().copy() for frames in frame_arrays])
                self.labels.append(target)

        self.len = len(self.labels)

    def __getitem__(self, index):
        """Return ``(frames, label)`` for the video at `index`.

        `frames` is a list with one entry per stored frame: the transformed
        frame when a transform is configured, otherwise the stored array.
        """
        images = self.images[index]
        label = self.labels[index]
        if self.transform is None:
            return list(images), label

        X = []
        for image in images:
            # Move the first axis last before building the PIL image that the
            # transform pipeline consumes.
            x = Image.fromarray(image.astype(np.uint8).transpose(1, 2, 0))
            X.append(self.transform(x))
        return X, label

    def __len__(self):
        """Number of videos held in memory."""
        return self.len

In [None]:
# Per-split preprocessing applied to each PIL frame by VideoDatasetArray.
# The training pipeline adds random horizontal flips and hue jitter; the
# validation pipeline applies only the deterministic resize/tensor/normalize steps.
# NOTE(review): the Normalize mean/std look like the usual ImageNet channel
# statistics used with torchvision's pretrained models — confirm.
transform = {
        'train': transforms.Compose([
                    # transforms.ToPILImage(),
                    transforms.Resize(image_size),
                    transforms.RandomHorizontalFlip(),
                    transforms.ColorJitter(hue=0.5),
                    transforms.ToTensor(),
                    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
                ]), 
        'val': transforms.Compose([
                    transforms.Resize(image_size),
                    transforms.ToTensor(),
                    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
                ])
        }

In [None]:
# Build the training set and its loader. The whole `transform` dict is passed;
# the dataset picks the 'train' or 'val' entry from the `train` flag.
print('\n----- Load Training Set -------')
trainset = VideoDatasetArray(
    root= data_folder, 
    n_frames = n_frames,
    transform=transform, train=True
)
trainset_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)

# Build the validation set and its loader.
# NOTE(review): FineTune.eval only accumulates sums over this loader, so
# shuffle=True should not change the reported metrics; shuffle=False would be
# the conventional choice — confirm.
print('\n----- Load Val Set-------')
valset = VideoDatasetArray(
    root= data_folder_val, 
    n_frames = n_frames,
    transform=transform, train=False
)
valset_loader = DataLoader(valset, batch_size=batch_size, shuffle=True, num_workers=2)

  0%|                                                    | 0/15 [00:00<?, ?it/s]


----- Load Training Set -------

Preload dataset to memory...



100%|███████████████████████████████████████████| 15/15 [04:29<00:00, 17.98s/it]
  0%|                                                     | 0/1 [00:00<?, ?it/s]


----- Load Val Set-------

Preload dataset to memory...



100%|█████████████████████████████████████████████| 1/1 [00:18<00:00, 18.92s/it]


## Utilities for Training

In [None]:
class FineTune():
    """Train/evaluate a binary video classifier and record its learning curves.

    The class reads the notebook-level globals `trainset_loader`,
    `valset_loader`, `device` and `batch_size`; they must be defined before
    `train()` or `eval()` is called. The model is expected to output one
    probability per sample (BCELoss is applied directly to its output).
    """

    def __init__(self, model, model_name, epoch, optimizer, filename, log_interval=10):
        """Store the training configuration.

        Args:
            model: the network to train; called with a list of frame batches.
            model_name: free-form label (stored, not otherwise used here).
            epoch: number of epochs `train()` runs.
            optimizer: optimizer stepping the model's parameters.
            filename: base name (no extension) for the files written under
                `self.output_folder`.
            log_interval: print the running loss every this many batches.
        """
        self.model = model
        self.model_name = model_name
        self.epoch = epoch
        self.optimizer = optimizer
        self.log_interval = log_interval

        # One entry per finished epoch.
        self.train_loss = []
        self.train_accuracy = []
        self.val_loss = []
        self.val_accuracy = []
        self.output_folder = '/content/drive/My Drive/kaggle/output'
        self.filename = filename

    def train(self):  # set training mode
        """Run the full training loop.

        After every epoch the model is evaluated on both loaders, the history
        is saved with `np.save` and a checkpoint with `torch.save`. When all
        epochs are done the curves are appended to `<filename>.txt`.
        """
        loss_fn = nn.BCELoss()
        # Saving would otherwise fail after a full epoch of work if the
        # folder is missing.
        os.makedirs(self.output_folder, exist_ok=True)

        for ep in range(self.epoch):
            self.model.train()
            for batch_idx, (data, target) in enumerate(trainset_loader):
                # `data` is a list with one batch tensor per frame position.
                data = [_data.to(device) for _data in data]
                target = target.to(device)
                self.optimizer.zero_grad()
                output = self.model(data)
                loss = loss_fn(output.squeeze(dim=1), target.type_as(output))
                loss.backward()
                self.optimizer.step()
                if batch_idx % self.log_interval == 0:
                    print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                        ep, batch_idx * batch_size, len(trainset_loader.dataset),
                        100. * (batch_idx+1) / len(trainset_loader), loss.item()))

            # Evaluation for both the training set and validation set
            self.eval(False)
            self.eval(True)

            # Save
            history = [self.train_loss, self.train_accuracy, self.val_loss, self.val_accuracy]
            np.save(os.path.join(self.output_folder, self.filename), history)
            torch.save({
                        'epoch': ep,
                        'model_state_dict': self.model.state_dict(),
                        'optimizer_state_dict': self.optimizer.state_dict()
                        }, os.path.join(self.output_folder, self.filename+'.pt'))

        # Save loss and accuracy (append mode: earlier runs are kept)
        output_file = os.path.join(self.output_folder, self.filename+'.txt')
        sections = (
            ('train_loss', self.train_loss),
            ('train_accuracy', self.train_accuracy),
            ('val_loss', self.val_loss),
            ('val_accuracy', self.val_accuracy),
        )
        with open(output_file, 'a') as f:
            for title, values in sections:
                f.write(title + '\n')
                for item in values:
                    f.write('%s\n' % item)

    def eval(self, is_val=True):
        """Evaluate the model and append the epoch's loss/accuracy to the history.

        Args:
            is_val: evaluate on `valset_loader` (and also report precision,
                recall and F1) when True, on `trainset_loader` otherwise.

        Precision, recall and F1 are reported as 0 when their denominator is
        zero (for example when the model predicts a single class) instead of
        raising ZeroDivisionError.
        """
        loss_fn = nn.BCELoss(reduction="sum")
        self.model.eval()  # set evaluation mode
        loss = 0
        correct = 0
        TP = 0
        TN = 0
        FP = 0
        FN = 0
        data_loader = valset_loader if is_val else trainset_loader
        with torch.no_grad():  # no gradients are needed for evaluation
            for data, target in data_loader:
                data = [_data.to(device) for _data in data]
                target = target.to(device)
                output = self.model(data)
                loss += loss_fn(output.squeeze(dim=1), target.type_as(output)).item()
                pred = (output > 0.5).int()
                truth = target.view_as(pred)
                correct += pred.eq(truth).sum().item()

                if is_val:
                    # for calculating precision and recall
                    TP += (pred * truth).sum().item()
                    TN += ((1 - pred) * (1 - truth)).sum().item()
                    FP += (pred * (1 - truth)).sum().item()
                    FN += ((1 - pred) * truth).sum().item()

        loss /= len(data_loader.dataset)
        accuracy = 100. * correct / len(data_loader.dataset)

        if is_val:
            # save validation loss and accuracy
            self.val_loss.append(loss)
            self.val_accuracy.append(accuracy)

            # calculate precision, recall, and f1 (guarding empty denominators)
            precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
            recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
            if (precision + recall) > 0:
                f1 = 2 * precision * recall / (precision + recall)
            else:
                f1 = 0.0
            print('Val set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%), Precision: {:.4f}, Recall: {:.4f}, F1: {:.4f}'.format(
                loss, correct, len(data_loader.dataset),
                accuracy, precision, recall, f1))
        else:
            self.train_loss.append(loss)
            self.train_accuracy.append(accuracy)
            print('Train set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
                loss, correct, len(data_loader.dataset), accuracy))

    # NOTE(review): the plotting helpers use `plt`, but no matplotlib import is
    # present in the visible import cell — `import matplotlib.pyplot as plt`
    # must have been run before calling them.
    def plot_loss(self):
        """Plot the per-epoch training and validation loss."""
        plt.plot(self.train_loss, label='Training loss')
        plt.plot(self.val_loss, label='Validation loss')
        plt.legend(frameon=False)
        plt.show()

    def plot_accuracy(self):
        """Plot the per-epoch training and validation accuracy."""
        plt.plot(self.train_accuracy, label='Training accuracy')
        plt.plot(self.val_accuracy, label='Validation accuracy')
        plt.legend(frameon=False)
        plt.show()

In [None]:
def freeze_until(net, param_name):
    """Freeze every parameter listed before `param_name`.

    Parameters are visited in `net.named_parameters()` order. Those preceding
    `param_name` get `requires_grad = False`; `param_name` itself and all
    later ones get `requires_grad = True`. If the name never occurs, every
    parameter ends up frozen. The names left trainable are printed.
    """
    reached = False
    trainable = []
    for name, params in net.named_parameters():
        reached = reached or (name == param_name)
        params.requires_grad = reached
        if reached:
            trainable.append(name)

    print('Layer to fine-tune:', trainable)

In [None]:
class ResNet(nn.Module):
    """Feature extractor made of every child module of `model` except the last.

    The children are reused as-is (not copied), so the wrapper shares its
    weights with the network it was built from.
    """

    def __init__(self, model):
        super().__init__()
        children = list(model.children())
        backbone = children[:-1]  # drop the final child module
        self.resnet_layer = nn.Sequential(*backbone)

    def forward(self, x):
        """Run `x` through the truncated network and return its output."""
        return self.resnet_layer(x)

class LSTM(nn.Module):
    '''
    Classify a video from its frames: every frame goes through the shared
    feature extractor, the per-frame features are fed to an LSTM as a
    sequence, and a linear + sigmoid head turns the result into one
    probability per video.

    Args:
        feature_extracter: module applied to each frame batch; its output is
            assumed to be (batch, conv_features, 1, 1) — TODO confirm for
            extractors other than the notebook's ResNet wrapper.
        n_frames: kernel size of the temporal max-pool (the number of frames
            per video in this notebook).
        hidden_dim: LSTM hidden size.
        maxpool: if True, classify the max over time of the LSTM outputs;
            otherwise classify the final hidden state.
        conv_features: width of the extractor's feature vector (default 2048,
            the value previously hard-coded).
    '''
    def __init__(self, feature_extracter, n_frames, hidden_dim, maxpool=False, conv_features=2048):
        super(LSTM, self).__init__()
        self.feature_extracter = feature_extracter
        self.lstm = nn.LSTM(conv_features, hidden_dim)
        self.maxpool = nn.MaxPool1d(n_frames)
        self.classifier = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

        self.mp = maxpool

    def _frame_features(self, images):
        """Stack per-frame features into a (frames, batch, conv_features) tensor."""
        per_frame = []
        for frame_batch in images:
            feat = torch.squeeze(self.feature_extracter(frame_batch), dim=3)
            per_frame.append(torch.transpose(feat, 1, 2))  # feature axis last
        # Concatenate along the frame axis, then put that axis first for the
        # (sequence-first) LSTM.
        return torch.transpose(torch.cat(per_frame, dim=1), 0, 1)

    def forward(self, images):
        """Return a (batch, 1) tensor of probabilities.

        Args:
            images: list with one batch tensor per frame position.
        """
        batch_size = images[0].shape[0]

        features = self._frame_features(images)
        output, (h, _) = self.lstm(features)  # output: (len, batch, hidden_dim)
        if self.mp:
            # Pool over the time axis, which MaxPool1d expects to be last.
            summary = self.maxpool(output.transpose(0, 2)).transpose(0, 2).view(batch_size, -1)
        else:
            summary = h.view(batch_size, -1)
        return self.sigmoid(self.classifier(summary))

class GRU(nn.Module):
    '''
    Classify a video from its frames with two stacked GRUs: every frame goes
    through the shared feature extractor, the per-frame features pass through
    gru1 -> dropout -> gru2, and the final hidden state of gru2 is fed
    (after dropout) to a linear + sigmoid head.

    Args:
        feature_extracter: module applied to each frame batch; its output is
            assumed to be (batch, conv_features, 1, 1) — TODO confirm for
            extractors other than the notebook's ResNet wrapper.
        n_frames: accepted for signature parity with the other models; not
            used by this class.
        hidden_dim1: hidden size of the first GRU.
        hidden_dim2: hidden size of the second GRU.
        conv_features: width of the extractor's feature vector (default 2048,
            the value previously hard-coded).
    '''
    def __init__(self, feature_extracter, n_frames, hidden_dim1, hidden_dim2, conv_features=2048):
        super(GRU, self).__init__()
        self.feature_extracter = feature_extracter
        self.gru1 = nn.GRU(conv_features, hidden_dim1)
        self.gru2 = nn.GRU(hidden_dim1, hidden_dim2)
        self.drop = nn.Dropout(p=0.3)
        self.classifier = nn.Linear(hidden_dim2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, images):
        """Return a (batch, 1) tensor of probabilities.

        Args:
            images: list with one batch tensor per frame position.
        """
        X = []
        for x in images:
            x = torch.squeeze(self.feature_extracter(x), dim=3)
            x = torch.transpose(x, 1, 2)
            X.append(x)
        # (frames, batch, conv_features): the GRUs are sequence-first.
        features = torch.transpose(torch.cat(X, dim=1), 0, 1)
        output, _ = self.gru1(features)
        output = self.drop(output)
        _, h = self.gru2(output)
        # h keeps a leading layer axis (1, batch, hidden_dim2); drop it so the
        # result is (batch, 1) like the other models and works with a
        # `squeeze(dim=1)` + BCELoss training loop.
        h = self.drop(h[-1])
        x = self.classifier(h)
        x = self.sigmoid(x)
        return x

def cnn_model():
    """Build a frozen feature extractor from torchvision's pretrained ResNet-50.

    The `ResNet` wrapper reuses the backbone's own child modules, so it already
    carries the pretrained weights. No state-dict copy is needed: the wrapper's
    keys are prefixed with ``resnet_layer.`` and never match the backbone's
    keys, so a key-filtered reload would load nothing.

    Returns:
        The wrapped network with `requires_grad` disabled on every parameter.
    """
    # Named `backbone` to avoid shadowing the `resnet50` imported at the top
    # of the notebook.
    backbone = models.resnet50(pretrained=True)
    model = ResNet(backbone)
    for param in model.parameters():
        param.requires_grad = False
    return model


## Train Model

In [None]:
# Build the frozen pretrained backbone once; every model cell below receives
# this same instance as its feature extractor.
conv_model = cnn_model()

Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /root/.cache/torch/checkpoints/resnet50-19c8e357.pth


HBox(children=(IntProgress(value=0, max=102502400), HTML(value='')))




In [None]:
# LSTM head that classifies the max over time of the LSTM outputs.
# NOTE(review): the output filename says "100" while hidden_dim is 50 here —
# confirm which value is intended.
model = LSTM(conv_model, n_frames=n_frames, hidden_dim=50, maxpool=True)
# Follow the device selected earlier instead of hard-coding CUDA, so the cell
# also runs on a CPU-only runtime.
model.to(device)

optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)
finetune = FineTune(model, 'lstm', epoch=150, optimizer=optimizer, filename='lstm_100_0.0001SGD', log_interval=100)
finetune.train()

In [None]:
# LSTM head that classifies the final hidden state (no temporal max-pool).
# The previous model is dropped first so its memory can be released.
del model
model = LSTM(conv_model, n_frames=n_frames, hidden_dim=100, maxpool=False)
# Follow the device selected earlier instead of hard-coding CUDA.
model.to(device)

optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)
finetune = FineTune(model, 'lstm', epoch=150, optimizer=optimizer, filename='lstm_100_0.0001SGD_noMaxPool', log_interval=100)
finetune.train()

In [None]:
# Two-layer GRU head. This cell only builds the model and moves it to the
# selected device; no training run is started here.
del model
model = GRU(conv_model, n_frames=n_frames, hidden_dim1=50, hidden_dim2=10)
model.to(device)

## Attention

In [None]:
class Attention(nn.Module):
    """Single-head scaled dot-product self-attention over per-frame features.

    Every frame goes through the shared feature extractor, sinusoidal
    positional encodings are added, and the attended sequence is reduced to
    one probability per video by a linear + sigmoid head.

    Args:
        feature_extracter: module applied to each frame batch; its output is
            assumed to be (batch, conv_features, 1, 1) — TODO confirm for
            extractors other than the notebook's ResNet wrapper.
        n_frames: number of frames per video; sizes the temporal max-pool and,
            without max-pooling, the classifier input.
        hidden_size: width of the query/key/value projections.
        conv_features: width of the extractor's feature vector.
        maxpool: if True, max-pool the attended sequence over time before the
            classifier; otherwise flatten it.
    """

    def __init__(self, feature_extracter, n_frames, hidden_size, conv_features=2048, maxpool=False):
        super(Attention, self).__init__()

        self.hidden_size = hidden_size
        self.conv_features = conv_features
        self.n_frames = n_frames
        # Registered as a buffer so it follows the module across devices
        # (`.to()`, `.cuda()`, `.cpu()`); non-persistent so it stays out of
        # the state dict, whose keys therefore remain unchanged.
        self.register_buffer('positional_encodings',
                             self.create_positional_encodings(),
                             persistent=False)
        self.mp = maxpool

        self.feature_extracter = feature_extracter
        self.Q = nn.Linear(conv_features, hidden_size)
        self.K = nn.Linear(conv_features, hidden_size)
        self.V = nn.Linear(conv_features, hidden_size)
        self.softmax = nn.Softmax(dim=1)
        self.scaling_factor = torch.rsqrt(torch.tensor(hidden_size, dtype=torch.float))
        self.maxpool = nn.MaxPool1d(n_frames)
        if self.mp:
            self.classifier = nn.Linear(hidden_size, 1)
        else:
            self.classifier = nn.Linear(hidden_size*n_frames, 1)

        self.sigmoid = nn.Sigmoid()

    def forward(self, images):
        """Return a (batch, 1) tensor of probabilities.

        Args:
            images: list with one batch tensor per frame position.
        """
        batch_size = images[0].shape[0]

        X = []
        for x in images: # (batch, 3, 224, 224)
            x = torch.squeeze(self.feature_extracter(x), dim=3)
            x = torch.transpose(x, 1, 2)
            X.append(x)

        features = torch.cat(X, dim=1) # (batch, frames, conv_features)
        # Slice by the actual sequence length so the encodings always line up
        # with the frames that were passed in.
        features = features + self.positional_encodings[:features.shape[1]].unsqueeze(0)
        q = self.Q(features)
        k = self.K(features)
        v = self.V(features)
        # (batch, key position, query position); the softmax over dim=1
        # normalizes across keys for every query.
        unnormalized_attention = torch.bmm(k, q.transpose(2,1)) * self.scaling_factor
        attention_weights = self.softmax(unnormalized_attention)
        context = torch.bmm(attention_weights.transpose(2,1), v)  # (batch, frames, hidden)
        if self.mp:
            context = self.maxpool(context.transpose(1,2)).view(batch_size, -1)
        else:
            context = context.view(batch_size, -1) # (batch, hidden*frames)
        x = self.sigmoid(self.classifier(context))
        return x

    def create_positional_encodings(self, max_seq_len=100):
        """Build a (max_seq_len, conv_features) table of sinusoidal encodings.

        Even columns hold sin(pos / 10000^(2i/conv_features)) and odd columns
        the matching cosine. The tensor is created on the CPU; the module
        moves it together with its other buffers.
        """
        pos_indices = torch.arange(max_seq_len)[..., None]
        dim_indices = torch.arange(self.conv_features//2)[None, ...]
        exponents = (2*dim_indices).float()/(self.conv_features)
        trig_args = pos_indices / (10000**exponents)

        pos_encodings = torch.zeros((max_seq_len, self.conv_features))
        pos_encodings[:, 0::2] = torch.sin(trig_args)
        pos_encodings[:, 1::2] = torch.cos(trig_args)

        return pos_encodings

In [None]:
# Attention head with temporal max-pooling before the classifier.
model = Attention(conv_model, n_frames=n_frames, hidden_size=100, maxpool=True)
# Follow the device selected earlier instead of hard-coding CUDA.
model.to(device)

optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)
finetune = FineTune(model, 'attention', epoch=35, optimizer=optimizer, filename='attention_100_0.0001SGD', log_interval=100)
finetune.train()

In [None]:
# Attention head that flattens the attended sequence (no temporal max-pool).
model = Attention(conv_model, n_frames=n_frames, hidden_size=100, maxpool=False)
# Follow the device selected earlier instead of hard-coding CUDA.
model.to(device)

optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)
finetune = FineTune(model, 'attention', epoch=35, optimizer=optimizer, filename='attention_100_0.0001SGD_nomaxpool', log_interval=100)
finetune.train()

Train set: Average loss: 0.6897, Accuracy: 2154/3836 (56%)
Val set: Average loss: 0.6928, Accuracy: 131/261 (50%), Precision: 0.5389, Recall: 0.7172, F1: 0.6154
Train set: Average loss: 0.6870, Accuracy: 2302/3836 (60%)
Val set: Average loss: 0.6919, Accuracy: 131/261 (50%), Precision: 0.5424, Recall: 0.6621, F1: 0.5963
Train set: Average loss: 0.6830, Accuracy: 2054/3836 (54%)
Val set: Average loss: 0.6860, Accuracy: 145/261 (56%), Precision: 0.5556, Recall: 1.0000, F1: 0.7143
Train set: Average loss: 0.6828, Accuracy: 2054/3836 (54%)
Val set: Average loss: 0.6854, Accuracy: 145/261 (56%), Precision: 0.5556, Recall: 1.0000, F1: 0.7143
Train set: Average loss: 0.6779, Accuracy: 2089/3836 (54%)
Val set: Average loss: 0.6834, Accuracy: 145/261 (56%), Precision: 0.5556, Recall: 1.0000, F1: 0.7143
Train set: Average loss: 0.6747, Accuracy: 2115/3836 (55%)
Val set: Average loss: 0.6816, Accuracy: 145/261 (56%), Precision: 0.5556, Recall: 1.0000, F1: 0.7143
