In [2]:
%matplotlib inline
# The wildcard import stays first so that the explicit imports below win over any name it re-exports.
from src.preprocessing_util import *
from src.util import create_train_test_sets

import os
from shutil import copyfile

import pandas as pd
import torch
from facenet_pytorch import MTCNN
from tqdm import tqdm_notebook as tqdm

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## First we need to set up the data folders

Required:

1. Path to raw data
2. Path to store non-augmented data
3. Path to store augmented data

The raw data directory should contain two subfolders, called "real" and "fake", each holding the videos belonging to that category. For example, for the Celeb-DF dataset:

Place the videos from "Celeb-real" and "YouTube-real" into the "real" folder. \
Place the videos from "Celeb-synthesis" into the "fake" folder.

We specify whether the derived dataset should be aimed at training temporal, or non-temporal models. Currently, both model types can only be trained using their respective dataset.


In [9]:
# Folder holding the raw videos
path_to_raw_data = 'raw_data/'

# True -> derive a dataset for temporal models, False -> for non-temporal models
temporal = False

# Both output folders share a root that is named after the dataset flavour
_dataset_root = 'data/' + ('temporal' if temporal else 'nontemp')
path_to_store_faces = _dataset_root + '/faces/'            # non-augmented data
path_to_store_faces_aug = _dataset_root + '/faces_aug/'    # augmented data


### First load the face detection module

In [3]:
# Instantiate the MTCNN face detector on the selected device and put it in eval mode
face_detector = MTCNN(
    image_size=224,
    margin=10,
    keep_all=False,
    post_process=False,
    device=device,
).eval()

### Config
Specify the number of frames extracted per real file. If this value is changed, the number of frames extracted per fake file must be adjusted separately as well.

E.g. for the Celeb-DF dataset:

There are 890 real files: 890 * 65 = 57,850 \
There are 5,639 fake files: 5,639 * 10 = 56,390

So the current configuration results in a balanced dataset. Note that only multiples of 5 can be selected when extracting temporal data, because the sequence length for the face sequences is set to 5 for the LSTM.

Also, a minimum face detection threshold can be set to disregard files which result in a large proportion of frames not detecting any faces.

In [4]:
# Frames extracted per real file, and the face-detection cutoff passed to the extraction step
n_frames, min_face_cutoff = 65, 32

# Facial detection pipeline built around the detector for the configured frame count
face_detection = FaceDetection(face_detector, device, n_frames=n_frames)

# Logging plots switch: when True, no face images are saved, only the plots
log_plots = False

# Extract the dataset
Extract one subfolder after another. We keep track of the labels for each datapoint via stored csv files.


In [5]:
# Extract the faces from the videos in the "real" folder
name_csv, label = 'real', 'Real'
path_to_folder = path_to_raw_data + 'real/'

# Gather the arguments first so the extraction call itself stays short
extraction_kwargs = dict(
    path_to_data=path_to_folder,
    path_to_store_faces=path_to_store_faces,
    path_to_store_faces_aug=path_to_store_faces_aug,
    face_detection=face_detection,
    label=label,
    csv_file_name=name_csv,
    min_face_cutoff=min_face_cutoff,
    temporal=temporal,
    log_plots=log_plots,
    verbose=False,
)
labels = get_CDF_per_folder(**extraction_kwargs)

Extracting  faces from 890 Real files


HBox(children=(FloatProgress(value=0.0, max=890.0), HTML(value='')))

File has less than 65 frames. Skipping...



We have now derived data from all real files. Next we need to derive data from the fake files.
As mentioned, this needs to be done with different number of frames per file, to ensure a balanced dataset.
Make sure that the two `n_frames` settings produce a roughly equal number of datapoints for real and fake files (see above).

In [6]:
# Fake files are sampled with fewer frames, so the detection pipeline is rebuilt with the new count
n_frames, min_face_cutoff = 10, 5
face_detection = FaceDetection(face_detector, device, n_frames=n_frames)

In [7]:
# Extract the faces from the synthesized videos in the "fake" folder
name_csv, label = 'fake', 'Fake'
path_to_folder = path_to_raw_data + 'fake/'

# Gather the arguments first so the extraction call itself stays short
extraction_kwargs = dict(
    path_to_data=path_to_folder,
    path_to_store_faces=path_to_store_faces,
    path_to_store_faces_aug=path_to_store_faces_aug,
    face_detection=face_detection,
    label=label,
    csv_file_name=name_csv,
    min_face_cutoff=min_face_cutoff,
    temporal=temporal,
    log_plots=log_plots,
    verbose=False,
)
labels = get_CDF_per_folder(**extraction_kwargs)

Extracting  faces from 5639 Fake files


HBox(children=(FloatProgress(value=0.0, max=5639.0), HTML(value='')))




## Label merging
For each folder, we have a respective label file. Those need to be merged.
For each dataset, we handle two different types of labels.

1. Per file labels
2. Per datapoint labels (per face image for non-temporal models, per face-window for temporal models)


First we merge the labels on file-level.

In [8]:
# Combine the per-file label csvs of the real and fake folders into one table
path = "Labels/"
file1, file2 = "real_labels_per_file.csv", "fake_labels_per_file.csv"

labels_per_file = combine_labels(path, file1, file2)
labels_per_file.to_csv("Labels/labels_per_file.csv")  # store the merged table on disk
labels_per_file

Unnamed: 0,file,label
0,00000.mp4,0
1,00001.mp4,0
2,00002.mp4,0
3,00003.mp4,0
4,00004.mp4,0
...,...,...
6523,id9_id6_0005.mp4,1
6524,id9_id6_0006.mp4,1
6525,id9_id6_0007.mp4,1
6526,id9_id6_0008.mp4,1


Next we merge the labels on datapoint-level.

In [9]:
# Combine the datapoint-level label csvs; the file suffix depends on the dataset flavour
path = "Labels/"
datapoint_kind = 'face_window' if temporal else 'face'
file1 = f"real_labels_per_{datapoint_kind}.csv"
file2 = f"fake_labels_per_{datapoint_kind}.csv"

labels_per_face = combine_labels(path, file1, file2)
labels_per_face.to_csv(f"Labels/labels_per_{datapoint_kind}.csv")  # store the merged table on disk
labels_per_face

Unnamed: 0,file,label
0,000_00000.mp4,0
1,001_00000.mp4,0
2,002_00000.mp4,0
3,003_00000.mp4,0
4,004_00000.mp4,0
...,...,...
114153,005_id9_id6_0009.mp4,1
114154,006_id9_id6_0009.mp4,1
114155,007_id9_id6_0009.mp4,1
114156,008_id9_id6_0009.mp4,1


## Create splits

Now that we have the datasets and labels, we can perform the train/val/test split. We do this on file level.
For this, we can adjust the size of the training set. The remaining proportion of the dataset will be split evenly into validation and test sets.

In [10]:
# Fraction of the dataset used for the training split (passed to create_train_test_sets below)
train_size = 0.8

Do the split. The resulting label files per split will be stored in the root folders where the data is located.

In [11]:
# Paths to the merged label csv files written by the label-merging cells.
# Separate *_csv names are used so the DataFrames `labels_per_file` / `labels_per_face`
# from the merging cells are not silently rebound to strings.
labels_per_file_csv = 'Labels/labels_per_file.csv'
# The datapoint-level file is saved as "labels_per_face_window.csv" for temporal datasets and
# "labels_per_face.csv" otherwise, so the path has to follow the `temporal` flag as well.
labels_per_face_csv = f"Labels/labels_per_{'face_window' if temporal else 'face'}.csv"

# Perform the train/val/test split for both the plain and the augmented data roots
create_train_test_sets(labels_per_file=labels_per_file_csv,
                       labels_per_face=labels_per_face_csv,
                       root_dir=path_to_store_faces,
                       root_dir_aug=path_to_store_faces_aug,
                       train_size=train_size,
                       temporal=temporal)

Deriving the correct face labels for the split...


HBox(children=(FloatProgress(value=0.0, max=5222.0), HTML(value='')))



Per face labelling derived for split:
                         file  label
0      006_id23_id29_0001.mp4      1
1      002_id23_id29_0001.mp4      1
2      003_id23_id29_0001.mp4      1
3      008_id23_id29_0001.mp4      1
4      005_id23_id29_0001.mp4      1
...                       ...    ...
91911    002_id8_id9_0008.mp4      1
91912    008_id8_id9_0008.mp4      1
91913    007_id8_id9_0008.mp4      1
91914    004_id8_id9_0008.mp4      1
91915    000_id8_id9_0008.mp4      1

[91916 rows x 2 columns]

Per window labelling for split:
                    file  label
3858  id23_id29_0001.mp4      1
2441           00052.mp4      0
94    id31_id17_0003.mp4      1
4954  id54_id49_0003.mp4      1
4109       id49_0008.mp4      0
...                  ...    ...
1546           00096.mp4      0
389   id54_id52_0008.mp4      1
4374  id19_id27_0007.mp4      1
5305    id0_id1_0009.mp4      1
1823    id8_id9_0008.mp4      1

[5222 rows x 2 columns]
Finished split train/!
Deriving the correct face

HBox(children=(FloatProgress(value=0.0, max=653.0), HTML(value='')))



Per face labelling derived for split:
                         file  label
0       004_id6_id26_0005.mp4      1
1       007_id6_id26_0005.mp4      1
2       008_id6_id26_0005.mp4      1
3       001_id6_id26_0005.mp4      1
4       006_id6_id26_0005.mp4      1
...                       ...    ...
10705  008_id46_id43_0003.mp4      1
10706  004_id46_id43_0003.mp4      1
10707  009_id46_id43_0003.mp4      1
10708  000_id46_id43_0003.mp4      1
10709  001_id46_id43_0003.mp4      1

[10710 rows x 2 columns]

Per window labelling for split:
                    file  label
3710   id6_id26_0005.mp4      1
241    id4_id37_0009.mp4      1
1993  id20_id21_0009.mp4      1
6012       id25_0002.mp4      0
4790       id17_0005.mp4      0
...                  ...    ...
3673  id57_id50_0005.mp4      1
5297    id8_id7_0008.mp4      1
1668  id23_id20_0009.mp4      1
2667       id48_0001.mp4      0
3530  id46_id43_0003.mp4      1

[653 rows x 2 columns]
Finished split val/!
Deriving the correct face la

HBox(children=(FloatProgress(value=0.0, max=653.0), HTML(value='')))



Per face labelling derived for split:
                        file  label
0      004_id7_id11_0004.mp4      1
1      009_id7_id11_0004.mp4      1
2      000_id7_id11_0004.mp4      1
3      007_id7_id11_0004.mp4      1
4      001_id7_id11_0004.mp4      1
...                      ...    ...
11527   009_id9_id0_0000.mp4      1
11528   007_id9_id0_0000.mp4      1
11529   005_id9_id0_0000.mp4      1
11530   008_id9_id0_0000.mp4      1
11531   000_id9_id0_0000.mp4      1

[11532 rows x 2 columns]

Per window labelling for split:
                    file  label
5705   id7_id11_0004.mp4      1
3368  id16_id21_0012.mp4      1
1207  id52_id56_0002.mp4      1
1852  id44_id46_0000.mp4      1
6409    id8_id6_0008.mp4      1
...                  ...    ...
3815  id21_id29_0007.mp4      1
4110  id30_id17_0000.mp4      1
227         id2_0009.mp4      0
1343   id23_id2_0005.mp4      1
2797    id9_id0_0000.mp4      1

[653 rows x 2 columns]
Finished split test/!


Finally, because we ultimately want to predict on video level, we create a subfolder in our data folder that holds the videos associated with the test split.

In [11]:
# Copy the raw videos of the test split into data/<flavour>/testfiles/{real,fake}/,
# keeping an equal number of real and fake files.

TEST_SAMPLE_SEED = 42  # fixed seed so the sampled test subset is identical on every run

# The split step was given root_dir=path_to_store_faces, so read the test labels from that same root
testlabels = pd.read_csv(os.path.join(path_to_store_faces, 'testlabels_per_file.csv'), index_col=0)

# only select an equal amount of positive and negative files
real_labels = testlabels.loc[testlabels.label == 0]
fake_labels = testlabels.loc[testlabels.label == 1]
# Cap at the smaller class: sample() raises when asked for more rows than are available
n_per_class = min(len(real_labels), len(fake_labels))
testlabels = pd.concat([
    real_labels.sample(n=n_per_class, random_state=TEST_SAMPLE_SEED),
    fake_labels.sample(n=n_per_class, random_state=TEST_SAMPLE_SEED),
])

dataset_kind = 'temporal' if temporal else 'nontemp'
target_dir_real = f"data/{dataset_kind}/testfiles/real/"
target_dir_fake = f"data/{dataset_kind}/testfiles/fake/"

for subpath in [target_dir_real, target_dir_fake]:
    os.makedirs(subpath, exist_ok=True)

source_dir_real = path_to_raw_data + 'real/'
source_dir_fake = path_to_raw_data + 'fake/'

# itertuples gives named access to the `file` / `label` columns instead of positional indexing;
# `total` lets the progress bar show real progress for the generator.
for row in tqdm(testlabels.itertuples(index=False), total=len(testlabels)):
    is_fake = row.label == 1
    target_dir = target_dir_fake if is_fake else target_dir_real
    # Look in the raw folder matching the label first, then fall back to the other one
    candidate_dirs = [source_dir_fake, source_dir_real] if is_fake else [source_dir_real, source_dir_fake]
    for source_dir in candidate_dirs:
        try:
            copyfile(source_dir + row.file, target_dir + row.file)
            break
        except FileNotFoundError:
            continue
    else:
        # Neither raw folder contained the file
        print(f'whoops, did not find file {row.file} at all')


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


