# Deepfake Detection Test

**Key Resources:**

- [Kaggle 1st place solution](https://www.kaggle.com/competitions/deepfake-detection-challenge/discussion/145721#818975)
- [Kaggle 3rd place solution](https://www.kaggle.com/competitions/deepfake-detection-challenge/discussion/158158#884330)

In [100]:
import cv2
import json
import numpy as np
import pandas as pd
import pickle
import os
import tensorflow as tf

from IPython.display import Video
from pathlib import Path
from typing import List

DATA_PATH = "../datasets/Microsoft-Deepfake-Detection-Challenge/"

TRAIN_PATH = DATA_PATH + "train_sample_videos"
TEST_PATH = DATA_PATH + "test_videos"
LABEL_PATH = DATA_PATH + "train_sample_videos/metadata.json"

In [14]:
!pwd

/Users/michellesun/Dropbox/dev/IN_PROGRESS/deepfake-detection/model


## Load Dataset

### Labels

In [28]:
# Labels
with open(LABEL_PATH, 'r', encoding='utf-8') as f:
  labels_dict = json.load(f)

# Preview first line of labels_dict
print(next(iter(labels_dict.items())))

# Restructure dictionary for pandas dataframe input {'col':[1, 2], 'col2':[3, 4]}
filenames = list(labels_dict.keys())
labels = [ labels_dict[name]['label'] for name in filenames ]
splits = [ labels_dict[name]['split'] for name in filenames ]
originals = [ labels_dict[name]['original'] for name in filenames ]
labels_dict = {
  'filename': filenames,
  'label': labels,
  'split': splits,
  'original': originals
}

# 
labels = pd.DataFrame(labels_dict)
labels.head(10)

('aagfhgtpmv.mp4', {'label': 'FAKE', 'split': 'train', 'original': 'vudstovrck.mp4'})


Unnamed: 0,filename,label,split,original
0,aagfhgtpmv.mp4,FAKE,train,vudstovrck.mp4
1,aapnvogymq.mp4,FAKE,train,jdubbvfswz.mp4
2,abarnvbtwb.mp4,REAL,train,
3,abofeumbvv.mp4,FAKE,train,atvmxvwyns.mp4
4,abqwwspghj.mp4,FAKE,train,qzimuostzz.mp4
5,acifjvzvpm.mp4,FAKE,train,kbvibjhfzo.mp4
6,acqfdwsrhi.mp4,FAKE,train,ccfoszqabv.mp4
7,acxnxvbsxk.mp4,FAKE,train,fjlyaizcwc.mp4
8,acxwigylke.mp4,FAKE,train,ffcwhpnpuw.mp4
9,aczrgyricp.mp4,FAKE,train,slwkmefgde.mp4


In [35]:
labels.shape

(400, 4)

In [39]:
# Validate unique values
print("Unique inputs:", len(labels['filename'].unique()))

Unique inputs: 400


In [41]:
labels['original'].value_counts()

atvmxvwyns.mp4    6
meawmsgiti.mp4    6
qeumxirsme.mp4    5
kgbkktcjxf.mp4    5
fysyrqfguw.mp4    4
                 ..
cizlkenljw.mp4    1
xagsvjctmp.mp4    1
uuxqylnzls.mp4    1
brwrlczjvi.mp4    1
bdnaqemxmr.mp4    1
Name: original, Length: 209, dtype: int64

In [42]:
labels['split'].value_counts() # all train

train    400
Name: split, dtype: int64

In [34]:
labels['label'].value_counts()

FAKE    323
REAL     77
Name: label, dtype: int64

**Observation:** Data imbalance with significantly more FAKE videos than real. Potential bias towards FAKE videos.

### Load Deepfake Dataset

In [87]:
# Helper Functions

def isdir(fullpath: str) -> bool:
    """Returns true if is a file, and false if otherwise (a directory)."""
    try:
        if os.path.exists(fullpath):
            if os.path.isdir(fullpath):
                return True
            return False
    except FileNotFoundError:
        print(f'{fullpath} does not exist.')

def iterate_files(directory: str) -> List:
    """Iterates over the files in the given directory and returns a list of 
    found files."""
    files = []
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        fullpath = os.path.join(directory, filename)
        if (isdir(fullpath)):
            files += iterate_files(fullpath)
        else:
            files.append(fullpath)
    return files
  
def get_filename(source: str):
    """Returns the filename given a full path."""
    return os.path.basename(source)

In [113]:
# dataset currently 
dataset = iterate_files(TRAIN_PATH)

# Removes label (metadata.json) from dataset
if LABEL_PATH in dataset:
  dataset.remove(LABEL_PATH)

# Sorts alphabetically
dataset = sorted(dataset)

In [114]:
# Test video capture

random_video = dataset[np.random.random_integers(1,len(dataset))]
EXPORT_PATH = "extracted_frames"

video_name = os.path.splitext(get_filename(random_video))[0] + "/"
export_loc = os.path.join(EXPORT_PATH, video_name)
Path(export_loc).mkdir(parents=True, exist_ok=True)
vidcap = cv2.VideoCapture(random_video)
success, image = vidcap.read()
count = 0
while success:
  cv2.imwrite(os.path.join(export_loc, "frame%d.jpg" % count), image)
  success, image = vidcap.read()
  count += 1
#   print('Read a new frame: ', success)

print(f"Successfully created {count} frames for {get_filename(random_video)}")
Video(random_video, width=500, height=300)

  


Created 301 frames for bqhtpqmmqp.mp4


In [160]:
def get_frames_evenly(video_source: str, n_frames: int) -> List[str]:
  """
  Exports n_frames number of frames from the video_source distributed
  evenly across the video.
  
  :return: list of paths for the exported frames
  """
  vidcap = cv2.VideoCapture(video_source)
  total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT)) 
  if n_frames > 0 and n_frames <= total_frames:
    step = int(total_frames/n_frames)
    frames = np.arange(1,total_frames+1,step)
  else:
    frames = np.arange(1,total_frames,1)
  
  video_name = os.path.splitext(get_filename(video_source))[0] + "/"
  export_loc = os.path.join(EXPORT_PATH, video_name)
  Path(export_loc).mkdir(parents=True, exist_ok=True)
  exported_files = []
  for i in frames:
    vidcap.set(1,i)
    success, image = vidcap.read()
    exported_path = os.path.join(export_loc, f"frame{i}.jpg")
    cv2.imwrite(exported_path, image)
    exported_files.append(exported_path)
  
  print(f"Successfully extracted {n_frames} frames from {get_filename(video_source)}.")


random_video = dataset[100]
get_frames_evenly(random_video, n_frames=10)

Video(random_video, width=500)

Successfully extracted 10 frames from benmsfzfaz.mp4.


### Split dataset

In [161]:
from deepface import DeepFace

Directory  /Users/michellesun /.deepface created
Directory  /Users/michellesun /.deepface/weights created


In [163]:
# Create test db for deepface
import sqlite3

sqlite3.connect('deepface_db')

<sqlite3.Connection at 0x7fe021b231f0>

In [164]:
df = DeepFace.find(img_path=random_video, db_path='deepface_db')

ValueError: Passed db_path does not exist!