# Initial setup

## Mount drive
This needs to be done every time you start the colab runtime. If you don't know what that means, don't worry.

In [0]:
# Mount Google Drive at /content/drive (google.colab is only importable on Colab).
# Later cells use paths starting with 'drive/Shared drives/...'; presumably these
# resolve relative to /content -- confirm the working directory.
from google.colab import drive
drive.mount('/content/drive')

# Downloading data

Download the data to the shared drive (modified version of http://kaldir.vc.in.tum.de/faceforensics_download_v4.py)


## Accept The TOS

In [0]:
# Terms-of-service gate: print the notice line by line, then block on input()
# until the user submits a line (CTRL-C / interrupting the cell aborts instead).
for tos_line in (
    'By pressing any key to continue you confirm that you have agreed '
    'to the FaceForensics terms of use as described at:',
    'http://canis.vc.in.tum.de:8100/webpage/FaceForensics_TOS.pdf',
    '***',
    'Press any key to continue, or CTRL-C to exit.',
):
    print(tos_line)
_ = input('')

## Download the DeepFake dataset

In [0]:
import argparse
import os
import urllib
import urllib.request
import tempfile
import time
import sys
import json
import random
from tqdm import tqdm
from os.path import join


# URLs and filenames
# Server-relative paths of the JSON file lists; main() appends them to
# args['base_url'] to build the request URL.
FILELIST_URL = 'misc/filelist.json'
# NOTE(review): the identifier is misspelled ("DEEPFEAKES"). main() refers to it
# under this exact spelling, so any rename has to touch both places.
DEEPFEAKES_DETECTION_URL = 'misc/deepfake_detection_filenames.json'
# Files that main() downloads for every Deepfakes model folder (type 'models').
DEEPFAKES_MODEL_NAMES = ['decoder_A.h5', 'decoder_B.h5', 'encoder.h5',]

# Parameters
# Maps every accepted dataset name to its path relative to the base URL.
# The first two entries are single zip archives; the others are directories.
DATASETS = {
    'original_youtube_videos': 'misc/downloaded_youtube_videos.zip',
    'original_youtube_videos_info': 'misc/downloaded_youtube_videos_info.zip',
    'original': 'original_sequences/youtube',
    'DeepFakeDetection_original': 'original_sequences/actors',
    'Deepfakes': 'manipulated_sequences/Deepfakes',
    'DeepFakeDetection': 'manipulated_sequences/DeepFakeDetection',
    'Face2Face': 'manipulated_sequences/Face2Face',
    'FaceSwap': 'manipulated_sequences/FaceSwap',
    'NeuralTextures': 'manipulated_sequences/NeuralTextures'
    }
# Datasets processed when the dataset option is 'all'; the two zip archives
# above are not part of this list.
ALL_DATASETS = ['original', 'DeepFakeDetection_original', 'Deepfakes',
                'DeepFakeDetection', 'Face2Face', 'FaceSwap',
                'NeuralTextures']
# Accepted values of the compression option (used as a URL/path component).
COMPRESSION = ['raw', 'c23', 'c40']
# Accepted values of the type option.
TYPE = ['videos', 'masks', 'models']
# Accepted values of the server option; parse_args() maps each one to a URL.
SERVERS = ['EU', 'EU2', 'CA']


def parse_args():
    """Parse the command line and attach the server-specific URLs.

    Returns:
        argparse.Namespace carrying ``output_path``, ``dataset``,
        ``compression``, ``type``, ``num_videos`` and ``server`` as parsed,
        plus the derived ``tos_url``, ``base_url`` and ``deepfakes_model_url``.

    Raises:
        Exception: if the selected server has no known URL. argparse already
            restricts ``--server`` to SERVERS, so this is a safety net only.
    """
    cli = argparse.ArgumentParser(
        description='Downloads FaceForensics v2 public data release.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument('output_path', type=str, help='Output directory.')
    cli.add_argument(
        '-d', '--dataset', type=str, default='all',
        choices=list(DATASETS.keys()) + ['all'],
        help='Which dataset to download, either pristine or manipulated '
             'data or the downloaded youtube videos.',
    )
    cli.add_argument(
        '-c', '--compression', type=str, default='raw',
        choices=COMPRESSION,
        help='Which compression degree. All videos have been generated '
             'with h264 with a varying codec. Raw (c0) videos are '
             'lossless compressed.',
    )
    cli.add_argument(
        '-t', '--type', type=str, default='videos',
        choices=TYPE,
        help='Which file type, i.e. videos, masks, for our manipulation '
             'methods, models, for Deepfakes.',
    )
    cli.add_argument(
        '-n', '--num_videos', type=int, default=None,
        help="Select a number of videos number to download if you "
             "don't want to download the full dataset.",
    )
    cli.add_argument(
        '--server', type=str, default='EU',
        choices=SERVERS,
        help='Server to download the data from. If you encounter a '
             'slow download speed, consider changing the server.',
    )
    args = cli.parse_args()

    # Root URL of each mirror, keyed by the --server choice.
    mirror_roots = {
        'EU': 'http://canis.vc.in.tum.de:8100/',
        'EU2': 'http://kaldir.vc.in.tum.de/faceforensics/',
        'CA': 'http://falas.cmpt.sfu.ca:8100/',
    }
    server_url = mirror_roots.get(args.server)
    if server_url is None:
        raise Exception('Wrong server name. Choices: {}'.format(str(SERVERS)))

    # Derived URLs consumed by the download code.
    args.tos_url = server_url + 'webpage/FaceForensics_TOS.pdf'
    args.base_url = server_url + 'v3/'
    args.deepfakes_model_url = (
        server_url + 'v3/manipulated_sequences/Deepfakes/models/'
    )

    return args



def download_files(filenames, base_url, output_path, report_progress=True):
    """Download each name in `filenames` from `base_url` into `output_path`.

    Args:
        filenames: iterable of file names; each is appended to `base_url` to
            form the source URL and joined onto `output_path` for the target.
        base_url: URL prefix the file names are appended to.
        output_path: destination directory, created if it does not exist.
        report_progress: when true, wrap the iteration in a tqdm progress bar.
    """
    os.makedirs(output_path, exist_ok=True)
    names = tqdm(filenames) if report_progress else filenames
    for name in names:
        source_url = base_url + name
        target_file = join(output_path, name)
        download_file(source_url, target_file)


def reporthook(count, block_size, total_size):
    """Progress callback for ``urllib.request.urlretrieve``.

    Rewrites a single status line on stdout showing percentage, downloaded
    megabytes, average speed and elapsed seconds.

    Args:
        count: number of blocks transferred so far; the call with ``count == 0``
            only records the start time in the module-level ``start_time``.
        block_size: size of one block in bytes.
        total_size: total size in bytes; urlretrieve may pass a value <= 0
            when the server does not report a size, in which case the
            percentage is shown as 0.
    """
    global start_time
    if count == 0:
        start_time = time.time()
        return
    # Clamp to a tiny positive value: two calls within the timer resolution
    # would otherwise divide by zero when computing the speed.
    duration = max(time.time() - start_time, 1e-6)
    progress_size = int(count * block_size)
    speed = int(progress_size / (1024 * duration))
    if total_size > 0:
        # The last block is usually partial, so count * block_size can exceed
        # the real size; never report more than 100%.
        percent = min(100, int(progress_size * 100 / total_size))
    else:
        percent = 0
    sys.stdout.write("\rProgress: %d%%, %d MB, %d KB/s, %d seconds passed" %
                     (percent, progress_size / (1024 * 1024), speed, duration))
    sys.stdout.flush()


def download_file(url, out_file, report_progress=False):
    """Download `url` to `out_file` unless that file already exists.

    The data is first written to a temporary file in the destination
    directory and moved into place only after the transfer has finished, so
    `out_file` is never left half-written.

    Args:
        url: source URL.
        out_file: destination path; its parent directory is created if needed.
        report_progress: when true, print transfer progress via `reporthook`.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises (e.g. URLError). The
        temporary file is removed before the exception propagates.
    """
    if os.path.isfile(out_file):
        tqdm.write('WARNING: skipping download of existing file ' + out_file)
        return

    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Keep the temp file next to the target so the final move stays on one
    # filesystem; only the name is needed, so close the descriptor right away.
    fh, out_file_tmp = tempfile.mkstemp(dir=out_dir or '.')
    os.close(fh)
    try:
        if report_progress:
            urllib.request.urlretrieve(url, out_file_tmp,
                                       reporthook=reporthook)
        else:
            urllib.request.urlretrieve(url, out_file_tmp)
        os.replace(out_file_tmp, out_file)
    except BaseException:
        # Also covers KeyboardInterrupt (an interrupted notebook cell):
        # do not leave stray temp files in the dataset directory.
        if os.path.exists(out_file_tmp):
            os.remove(out_file_tmp)
        raise


def _load_json(url):
    """Fetch `url` and parse its UTF-8 body as JSON, closing the connection."""
    with urllib.request.urlopen(url) as response:
        return json.loads(response.read().decode("utf-8"))


def main(args):
    """Download the dataset(s) selected by `args`.

    Args:
        args: mapping with the keys 'dataset', 'type', 'compression',
            'num_videos', 'output_path', 'base_url' and 'deepfakes_model_url'.
            Values are read by subscription, so the argparse.Namespace returned
            by parse_args() has to be converted first (e.g. ``vars(ns)``).

    Behaviour:
        * 'original_youtube_videos*' datasets are a single zip download, after
          which the function returns.
        * For type 'videos' / 'masks' every listed ``<name>.mp4`` is fetched.
        * For type 'models' the Deepfakes model files are fetched per folder.
        * When a single dataset is requested and the type is not available for
          it, the function prints a message and returns; with dataset 'all'
          such datasets are skipped instead.
    """
    # Extract arguments
    c_datasets = [args['dataset']] if args['dataset'] != 'all' else ALL_DATASETS
    c_type = args['type']
    c_compression = args['compression']
    num_videos = args['num_videos']
    output_path = args['output_path']
    os.makedirs(output_path, exist_ok=True)

    for dataset in c_datasets:
        dataset_path = DATASETS[dataset]

        # Special case: the original youtube videos are one zip archive.
        if 'original_youtube_videos' in dataset:
            print('Downloading original youtube videos.')
            if 'info' not in dataset_path:
                print('Please be patient, this may take a while (~40gb)')
                suffix = ''
            else:
                suffix = 'info'
            download_file(args['base_url'] + '/' + dataset_path,
                          out_file=join(output_path,
                                        'downloaded_videos{}.zip'.format(
                                            suffix)),
                          report_progress=True)
            return

        # Else: regular datasets
        print('Downloading {} of dataset "{}"'.format(
            c_type, dataset_path
        ))

        # Get the list of file stems for this dataset from the server
        if 'DeepFakeDetection' in dataset_path or 'actors' in dataset_path:
            filepaths = _load_json(
                args['base_url'] + '/' + DEEPFEAKES_DETECTION_URL)
            if 'actors' in dataset_path:
                filelist = filepaths['actors']
            else:
                filelist = filepaths['DeepFakesDetection']
        elif 'original' in dataset_path:
            # Pristine videos: every id in every pair is its own file.
            file_pairs = _load_json(args['base_url'] + '/' + FILELIST_URL)
            filelist = []
            for pair in file_pairs:
                filelist += pair
        else:
            # Manipulated videos are named after the joined pair; the
            # reversed pair is added as well, except for models.
            file_pairs = _load_json(args['base_url'] + '/' + FILELIST_URL)
            filelist = []
            for pair in file_pairs:
                filelist.append('_'.join(pair))
                if c_type != 'models':
                    filelist.append('_'.join(pair[::-1]))

        # Maybe limit number of videos for download
        if num_videos is not None and num_videos > 0:
            print('Downloading the first {} videos'.format(num_videos))
            filelist = filelist[:num_videos]

        # Server paths
        dataset_videos_url = args['base_url'] + '{}/{}/{}/'.format(
            dataset_path, c_compression, c_type)
        dataset_mask_url = args['base_url'] + '{}/{}/videos/'.format(
            dataset_path, 'masks')

        if c_type == 'videos':
            dataset_output_path = join(output_path, dataset_path, c_compression,
                                       c_type)
            print('Output path: {}'.format(dataset_output_path))
            filelist = [filename + '.mp4' for filename in filelist]
            download_files(filelist, dataset_videos_url, dataset_output_path)
        elif c_type == 'masks':
            dataset_output_path = join(output_path, dataset_path, c_type,
                                       'videos')
            print('Output path: {}'.format(dataset_output_path))
            if 'original' in dataset:
                if args['dataset'] != 'all':
                    print('Only videos available for original data. Aborting.')
                    return
                else:
                    print('Only videos available for original data. '
                          'Skipping original.\n')
                    continue
            filelist = [filename + '.mp4' for filename in filelist]
            download_files(filelist, dataset_mask_url, dataset_output_path)

        # Else: models for deepfakes
        else:
            if dataset != 'Deepfakes' and c_type == 'models':
                if args['dataset'] != 'all':
                    print('Models only available for Deepfakes. Aborting')
                    return
                # With dataset 'all', skip instead of aborting (mirrors the
                # masks branch); otherwise the first dataset in ALL_DATASETS
                # would end the run before Deepfakes is ever reached.
                print('Models only available for Deepfakes. '
                      'Skipping {}.\n'.format(dataset))
                continue
            dataset_output_path = join(output_path, dataset_path, c_type)
            print('Output path: {}'.format(dataset_output_path))

            # Get Deepfakes models
            for folder in tqdm(filelist):
                folder_filelist = DEEPFAKES_MODEL_NAMES

                # Folder paths
                folder_base_url = args['deepfakes_model_url'] + folder + '/'
                folder_dataset_output_path = join(dataset_output_path,
                                                  folder)
                download_files(folder_filelist, folder_base_url,
                               folder_dataset_output_path,
                               report_progress=False)   # already done
if __name__ == "__main__":
    # Hard-coded stand-in for parse_args(): main() reads its options by key,
    # so they are supplied as a plain dict rather than an argparse namespace.
    args = {
        'base_url': 'http://canis.vc.in.tum.de:8100/v3/',
        'compression': 'raw',
        'dataset': 'DeepFakeDetection',
        'deepfakes_model_url': 'http://canis.vc.in.tum.de:8100/v3/manipulated_sequences/Deepfakes/models/',
        'num_videos': None,
        'output_path': 'drive/Shared drives/AI Club/Projects/2019-2020/Deepfake detector/Data',
        'server': 'EU',
        'type': 'videos',
    }
    main(args)


## Download the Deepfake Originals dataset

In [0]:
# Reuses the `args` dict built by the previous download cell (run that cell
# first) and switches the dataset. main() reads 'deepfakes_model_url' only when
# the type is 'models', so with type 'videos' that entry has no effect.
args.update({
    'dataset': 'DeepFakeDetection_original',
    'deepfakes_model_url': 'http://canis.vc.in.tum.de:8100/v3/original_sequences/actors',
})
main(args)

# Processing the data

## Processing video into frames (WIP)
Still need to add a function to save the files.

In [0]:

"""Cut the video"""
import cv2

video_dir = "drive/Shared drives/AI Club/Projects/2019-2020/Deepfake detector/Data/original_sequences/actors/raw/videos/"

vc = cv2.VideoCapture(video_dir+"01__exit_phone_room.mp4")  # read the video
n = 1
 
if vc.isOpened():  
    rval, frame = vc.read()
else:
    rval = False
 
timeF = 10  

# Save the frames
frame_dir = "drive/Shared drives/AI Club/Projects/2019-2020/Deepfake detector/Spliced Frames/" 
num = 0
while rval:  
    rval, frame = vc.read()
    if (n % timeF == 0):  # save the frame
        num += 1
        print(num)
        cv2.imwrite(frame_dir+"{}.jpg".format(num), frame)  
    n += 1
    cv2.waitKey(1)
vc.release()
# install pnslib
!pip install git+git://github.com/PnS2019/pnslib.git
from pnslib import utils

target_size=256

for i in range(num):
  # read image
  img = cv2.imread(frame_dir+str(i+1)+".jpg")

  # load face cascade
  face_cascade = cv2.CascadeClassifier(
      utils.get_haarcascade_path('haarcascade_frontalface_default.xml'))

  # search face
  gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
  faces = face_cascade.detectMultiScale(gray, 1.3, 5)\

  for (x, y, w, h) in faces:
      x-=int((target_size-w)/2)
      w=target_size
      y-=int((target_size-h)/2)
      h=target_size
    face_img = img[y:y+h, x:x+w]

    face_dir = "drive/Shared drives/AI Club/Projects/2019-2020/Deepfake detector/Faces/" 
    cv2.imwrite(face_dir+str(i+1)+".jpg", face_img)
  print(i)

# Crop Original Faces

In [0]:
# For every original video: save every `timeF`-th frame into a folder named
# after the video, then save the detected face of each frame into <video>/Face/.
import os

import cv2
from pnslib import utils  # pnslib must already be installed in this runtime

original_dir = "drive/Shared drives/AI Club/Projects/2019-2020/Deepfake detector/Data/original_sequences/actors/raw/videos/"
original_spliced_dir = "drive/Shared drives/AI Club/Projects/2019-2020/Deepfake detector/Data/original_sequences/Spliced_Frames/"
timeF = 10  # keep one frame out of every `timeF`

# exist_ok: the cell must survive being re-run and processing several videos.
os.makedirs(original_spliced_dir, exist_ok=True)

# Load the cascade once instead of once per frame.
face_cascade = cv2.CascadeClassifier(
    utils.get_haarcascade_path('haarcascade_frontalface_default.xml'))

# List every mp4 file in the original folder (anything else is ignored).
original_sequences_list = sorted(
    name for name in os.listdir(original_dir) if name.lower().endswith('.mp4'))

for filenamemp4 in original_sequences_list:
    filename = os.path.splitext(filenamemp4)[0] + '/'
    frames_out = original_spliced_dir + filename
    faces_out = frames_out + "Face/"
    os.makedirs(faces_out, exist_ok=True)  # creates frames_out as well

    # ---- Split the video into frames ---------------------------------------
    vc = cv2.VideoCapture(original_dir + filenamemp4)  # read the video
    num = 0        # frames written so far (also the output file name)
    frame_idx = 0  # 0-based index of the frame that was just read
    while vc.isOpened():
        rval, frame = vc.read()
        if not rval:
            # End of stream: `frame` is None here and must not reach imwrite.
            break
        # Same selection as before: frames 10, 20, 30, ... (0-based).
        if frame_idx > 0 and frame_idx % timeF == 0:
            num += 1
            cv2.imwrite(frames_out + "{}.jpg".format(num), frame)
        frame_idx += 1
    vc.release()

    # ---- Crop the face from every saved frame ------------------------------
    num_faces = 0
    for i in range(1, num + 1):
        img = cv2.imread(frames_out + str(i) + ".jpg")
        if img is None:
            continue

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.3, 5)
        if len(faces) == 0:
            # No detection: skip instead of re-writing the previous frame's
            # crop (or failing on the very first frame).
            continue

        # One output file per frame, so keep the largest detection.
        x, y, w, h = (int(v) for v in max(faces, key=lambda box: box[2] * box[3]))
        face_img = img[y:y + h, x:x + w]
        cv2.imwrite(faces_out + str(i) + ".jpg", face_img)
        num_faces += 1

    print("{}: {} frames, {} faces".format(filenamemp4, num, num_faces))
    

# Crop Manipulated Faces

In [0]:
# For every manipulated video: save every `timeF`-th frame into a folder named
# after the video, then save the detected face of each frame into <video>/Face/.
import os

import cv2
from pnslib import utils  # pnslib must already be installed in this runtime

manipulated_dir = "drive/Shared drives/AI Club/Projects/2019-2020/Deepfake detector/Data/manipulated_sequences/DeepFakeDetection/raw/videos/"
# NOTE(review): this used to point at ".../original_sequences/Spliced_Frames/",
# which looks like a copy-paste slip that put fake frames into the real-data
# tree. Frames now go under manipulated_sequences -- confirm with the consumers
# of this directory.
manipulated_spliced_dir = "drive/Shared drives/AI Club/Projects/2019-2020/Deepfake detector/Data/manipulated_sequences/Spliced_Frames/"
timeF = 10  # keep one frame out of every `timeF`

# exist_ok: the cell must survive being re-run and processing several videos.
os.makedirs(manipulated_spliced_dir, exist_ok=True)

# Load the cascade once instead of once per frame.
face_cascade = cv2.CascadeClassifier(
    utils.get_haarcascade_path('haarcascade_frontalface_default.xml'))

# List every mp4 file in the manipulated folder (anything else is ignored).
manipulated_sequences_list = sorted(
    name for name in os.listdir(manipulated_dir)
    if name.lower().endswith('.mp4'))

for filenamemp4 in manipulated_sequences_list:
    filename = os.path.splitext(filenamemp4)[0] + '/'
    frames_out = manipulated_spliced_dir + filename
    faces_out = frames_out + "Face/"
    os.makedirs(faces_out, exist_ok=True)  # creates frames_out as well

    # ---- Split the video into frames ---------------------------------------
    vc = cv2.VideoCapture(manipulated_dir + filenamemp4)  # read the video
    num = 0        # frames written so far (also the output file name)
    frame_idx = 0  # 0-based index of the frame that was just read
    while vc.isOpened():
        rval, frame = vc.read()
        if not rval:
            # End of stream: `frame` is None here and must not reach imwrite.
            break
        # Same selection as before: frames 10, 20, 30, ... (0-based).
        if frame_idx > 0 and frame_idx % timeF == 0:
            num += 1
            cv2.imwrite(frames_out + "{}.jpg".format(num), frame)
        frame_idx += 1
    vc.release()

    # ---- Crop the face from every saved frame ------------------------------
    num_faces = 0
    for i in range(1, num + 1):
        img = cv2.imread(frames_out + str(i) + ".jpg")
        if img is None:
            continue

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.3, 5)
        if len(faces) == 0:
            # No detection: skip instead of re-writing the previous frame's
            # crop (or failing on the very first frame).
            continue

        # One output file per frame, so keep the largest detection.
        x, y, w, h = (int(v) for v in max(faces, key=lambda box: box[2] * box[3]))
        face_img = img[y:y + h, x:x + w]
        cv2.imwrite(faces_out + str(i) + ".jpg", face_img)
        num_faces += 1

    print("{}: {} frames, {} faces".format(filenamemp4, num, num_faces))