In [1]:
%matplotlib inline
# The star import stays first so that the explicit imports below take precedence on name clashes.
from src.preprocessing_util import *

import os
from shutil import copyfile

import pandas as pd
import torch
from facenet_pytorch import MTCNN
from tqdm import tqdm_notebook as tqdm

from src.util import create_train_test_sets

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## First we need to set up the data folders

Required:

1. Path to raw data
2. Path to store non-augmented data
3. Path to store augmented data

The raw data directory should contain two subfolders, called "real" and "fake", each holding the videos that belong to that category. For example, for the Celeb-DF dataset:

Place the videos from "Celeb-real" and "YouTube-real" into the "real" folder. \
Place the videos from "Celeb-synthesis" into the "fake" folder.

We specify whether the derived dataset should be aimed at training temporal, or non-temporal models. Currently, both model types can only be trained using their respective dataset.


In [2]:
# Root folder of the raw videos; the extraction cells read its "real/" and "fake/" subfolders.
path_to_raw_data = 'raw_data/'

# True -> derive a dataset for temporal models, False -> for non-temporal models.
temporal = False

# Both output folders share a root that is named after the dataset type.
_dataset_root = 'data/' + ('temporal' if temporal else 'nontemp') + '/'
path_to_store_faces = _dataset_root + 'faces/'          # path to store non-augmented data
path_to_store_faces_aug = _dataset_root + 'faces_aug/'  # path to store augmented data


### First load the face detection module

In [3]:
# Build the MTCNN face detector on the selected device and switch it to eval mode.
face_detector = MTCNN(
    image_size=224,
    margin=10,
    keep_all=False,
    device=device,
    post_process=False,
).eval()

### Config
Specify the number of frames extracted per real file. If this value is changed, the number of frames extracted per fake file needs to be changed separately as well.

E.g. for the Celeb-DF dataset:

There are 890 real files: 890 * 65 = 57,850 \
There are 5,639 fake files: 5,639 * 10 = 56,390

So extracting 65 frames per real file and 10 frames per fake file results in a balanced dataset. Note that only multiples of 5 can be selected when extracting temporal data, because the sequence length of the face sequences is set to 5 for the LSTM.

Also, a minimum face detection threshold can be set to discard files for which a large proportion of the frames contain no detected face.

In [4]:
# enable logging plots, if this is true, no face images will be saved, just the plots
log_plots = False

# Extraction settings used for the real files.
n_frames = 5         # handed to FaceDetection as n_frames
min_face_cutoff = 0  # handed to get_CDF_per_folder as min_face_cutoff

# Load facial detection pipeline
face_detection = FaceDetection(face_detector, device, n_frames=n_frames)

# Extract the dataset
Extract one subfolder after another. We keep track of the labels for each datapoint via stored csv files.


In [5]:
# Extract the faces of all videos in the "real" subfolder of the raw data.
label = 'Real'
name_csv = 'real'
path_to_folder = f'{path_to_raw_data}real/'

# Collect the arguments first so the call itself stays short.
real_extraction_kwargs = {
    'path_to_data': path_to_folder,
    'path_to_store_faces': path_to_store_faces,
    'path_to_store_faces_aug': path_to_store_faces_aug,
    'face_detection': face_detection,
    'label': label,
    'csv_file_name': name_csv,
    'min_face_cutoff': min_face_cutoff,
    'temporal': temporal,
    'log_plots': log_plots,
    'verbose': False,
}
labels = get_CDF_per_folder(**real_extraction_kwargs)

Extracting  faces from 100 Real files


HBox(children=(FloatProgress(value=0.0), HTML(value='')))




We have now derived data from all real files. Next we need to derive data from the fake files.
As mentioned, this needs to be done with a different number of frames per file to ensure a balanced dataset.
Make sure both `n_frames` settings produce an equal number of datapoints (see above).

In [6]:
# The fake files get their own FaceDetection instance so that n_frames can differ
# from the value used for the real files (here it is set to the same value, 5).
n_frames, min_face_cutoff = 5, 0
face_detection = FaceDetection(face_detector, device, n_frames=n_frames)

In [7]:
# Extract the faces of all videos in the "fake" (synthesis) subfolder of the raw data.
label = 'Fake'
name_csv = 'fake'
path_to_folder = f'{path_to_raw_data}fake/'

# Collect the arguments first so the call itself stays short.
fake_extraction_kwargs = {
    'path_to_data': path_to_folder,
    'path_to_store_faces': path_to_store_faces,
    'path_to_store_faces_aug': path_to_store_faces_aug,
    'face_detection': face_detection,
    'label': label,
    'csv_file_name': name_csv,
    'min_face_cutoff': min_face_cutoff,
    'temporal': temporal,
    'log_plots': log_plots,
    'verbose': False,
}
labels = get_CDF_per_folder(**fake_extraction_kwargs)

Extracting  faces from 116 Fake files


HBox(children=(FloatProgress(value=0.0, max=116.0), HTML(value='')))




## Label merging
For each folder, we have a respective label file. Those need to be merged.
For each dataset, we handle two different types of labels.

1. Per file labels
2. Per datapoint labels (per face image for non-temporal models, per face-window for temporal models)


First we merge the labels on file-level.

In [8]:
# Merge the per-file label CSVs of the real and fake folders into one table.
path = "Labels/"
file1, file2 = (f"{category}_labels_per_file.csv" for category in ("real", "fake"))

labels_per_file = combine_labels(path, file1, file2)

# Store the merged table next to the per-folder label files.
labels_per_file.to_csv(path + "labels_per_file.csv")
labels_per_file

Unnamed: 0,file,label
0,000.mp4,0
1,003.mp4,0
2,012.mp4,0
3,020.mp4,0
4,025.mp4,0
...,...,...
211,df_95.mp4,1
212,df_96.mp4,1
213,df_97.mp4,1
214,df_98.mp4,1


Next we merge the labels on datapoint-level.

In [9]:
# Merge the per-datapoint label CSVs; the file names depend on the dataset type.
path = "Labels/"
datapoint_kind = 'face_window' if temporal else 'face'
file1 = f"real_labels_per_{datapoint_kind}.csv"
file2 = f"fake_labels_per_{datapoint_kind}.csv"

labels_per_face = combine_labels(path, file1, file2)

# Store the merged table next to the per-folder label files.
labels_per_face.to_csv(f"{path}labels_per_{datapoint_kind}.csv")
labels_per_face

Unnamed: 0,file,label
0,000_000.mp4,0
1,001_000.mp4,0
2,002_000.mp4,0
3,003_000.mp4,0
4,004_000.mp4,0
...,...,...
1075,000_df_99.mp4,1
1076,001_df_99.mp4,1
1077,002_df_99.mp4,1
1078,003_df_99.mp4,1


## Create splits

Now that we have the datasets and labels, we can perform the train/val/test split. We do this on file level.
For this, we can adapt the size of the training set. The remaining proportion of the dataset is split evenly into validation and test sets.

In [10]:
# Proportion of the dataset assigned to the training split
# (passed to create_train_test_sets as train_size).
train_size = 0.8

Do the split. The resulting label files per split will be stored in the root folders where the data is located.

In [11]:
# Paths to the merged label files.
labels_per_file = 'Labels/labels_per_file.csv'
# The datapoint-level file is named after the dataset type: the merged labels are saved as
# "labels_per_face.csv" for non-temporal data and "labels_per_face_window.csv" for temporal
# data, so a fixed "labels_per_face.csv" would be missing (or stale) when temporal is True.
labels_per_face = f"Labels/labels_per_{'face' if not temporal else 'face_window'}.csv"

# Perform the train/val/test split for both the non-augmented and the augmented data folder.
create_train_test_sets(labels_per_file=labels_per_file,
                       labels_per_face=labels_per_face,
                       root_dir=path_to_store_faces,
                       root_dir_aug=path_to_store_faces_aug,
                       train_size=train_size,
                       temporal=temporal)

Deriving the correct face labels for the split...


HBox(children=(FloatProgress(value=0.0, max=172.0), HTML(value='')))



Per face labelling derived for split:
               file  label
0     003_df_11.mp4      1
1     002_df_11.mp4      1
2     000_df_11.mp4      1
3     001_df_11.mp4      1
4     004_df_11.mp4      1
..              ...    ...
855  002_df_101.mp4      1
856  001_df_101.mp4      1
857  004_df_101.mp4      1
858  003_df_101.mp4      1
859  000_df_101.mp4      1

[860 rows x 2 columns]

Per window labelling for split:
           file  label
152   df_11.mp4      1
107   df_27.mp4      1
119   df_41.mp4      1
47      042.mp4      0
135   df_88.mp4      1
..          ...    ...
35      774.mp4      0
164   df_76.mp4      1
172     187.mp4      0
73    df_21.mp4      1
199  df_101.mp4      1

[172 rows x 2 columns]
Finished split train/!
Deriving the correct face labels for the split...


HBox(children=(FloatProgress(value=0.0, max=22.0), HTML(value='')))



Per face labelling derived for split:
            file  label
0    004_053.mp4      0
1    001_053.mp4      0
2    000_053.mp4      0
3    002_053.mp4      0
4    003_053.mp4      0
..           ...    ...
105  000_857.mp4      0
106  001_857.mp4      0
107  004_857.mp4      0
108  002_857.mp4      0
109  003_857.mp4      0

[110 rows x 2 columns]

Per window labelling for split:
          file  label
136    053.mp4      0
116    922.mp4      0
124    003.mp4      0
94   df_87.mp4      1
86   df_75.mp4      1
162  df_51.mp4      1
55     846.mp4      0
151  df_63.mp4      1
182    808.mp4      0
108  df_15.mp4      1
130    940.mp4      0
169  df_22.mp4      1
74     845.mp4      0
146  df_99.mp4      1
3    df_24.mp4      1
178    492.mp4      0
186  df_79.mp4      1
38     025.mp4      0
32   df_83.mp4      1
134  df_98.mp4      1
57   df_69.mp4      1
208    857.mp4      0
Finished split val/!
Deriving the correct face labels for the split...


HBox(children=(FloatProgress(value=0.0, max=22.0), HTML(value='')))



Per face labelling derived for split:
              file  label
0    003_df_72.mp4      1
1    001_df_72.mp4      1
2    004_df_72.mp4      1
3    000_df_72.mp4      1
4    002_df_72.mp4      1
..             ...    ...
105  004_df_62.mp4      1
106  002_df_62.mp4      1
107  003_df_62.mp4      1
108  001_df_62.mp4      1
109  000_df_62.mp4      1

[110 rows x 2 columns]

Per window labelling for split:
           file  label
95    df_72.mp4      1
179   df_47.mp4      1
40      745.mp4      0
5     df_45.mp4      1
45    df_56.mp4      1
69    df_71.mp4      1
166     413.mp4      0
19    df_57.mp4      1
201   df_10.mp4      1
157     418.mp4      0
170     273.mp4      0
188   df_19.mp4      1
138     735.mp4      0
91      391.mp4      0
39      546.mp4      0
175   df_53.mp4      1
137    df_4.mp4      1
8    df_111.mp4      1
6       807.mp4      0
90   df_110.mp4      1
56   df_106.mp4      1
189   df_62.mp4      1
Finished split test/!


Finally, because we ultimately want to predict on video level, we create a subfolder in our data folder holding the videos associated with the test split.

In [12]:
# Per-file labels of the test split; the first CSV column is read back as the index.
testlabels = pd.read_csv(f"data/{'temporal' if temporal else 'nontemp'}/faces/testlabels_per_file.csv", index_col=0)

target_dir_real = f"data/{'temporal' if temporal else 'nontemp'}/testfiles/real/"
target_dir_fake = f"data/{'temporal' if temporal else 'nontemp'}/testfiles/fake/"

# Create the target folders; exist_ok makes re-running the cell safe.
for subpath in [target_dir_real, target_dir_fake]:
    os.makedirs(subpath, exist_ok=True)

raw_dir_real = path_to_raw_data + 'real/'
raw_dir_fake = path_to_raw_data + 'fake/'

# Columns are addressed by position (0 = file name, 1 = label) via .iloc, because integer
# keys on a label-indexed Series (row[1][0]) are deprecated in pandas.
# Passing total= gives the progress bar a known length.
file_label_pairs = zip(testlabels.iloc[:, 0], testlabels.iloc[:, 1])
for file_name, file_label in tqdm(file_label_pairs, total=len(testlabels)):
    is_fake = file_label == 1
    target_dir = target_dir_fake if is_fake else target_dir_real

    # Look in the raw folder matching the label first, so that a file name existing in both
    # raw folders is not copied from the wrong one; the other folder remains a fallback.
    source_dirs = [raw_dir_fake, raw_dir_real] if is_fake else [raw_dir_real, raw_dir_fake]
    for source_dir in source_dirs:
        try:
            copyfile(source_dir + file_name, target_dir + file_name)
            break
        except FileNotFoundError:
            continue
    else:
        # Neither raw folder contains the video.
        print(f'whoops, did not find file {file_name} at all')


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


