# Deepfake Dataset Preprocessing

### Importing necessary libraries

In [28]:
import os
import zipfile
import glob
from pathlib import Path
import pandas as pd
import tensorflow as tf
import cv2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten

### Unzipping the dataset

In [3]:
# Folder that will receive the extracted dataset
extract_to_dir = '../datasets/140k_real_and_fake_faces'

# The downloaded archive sits next to that folder and shares its name
zip_file_path = f'{extract_to_dir}.zip'


In [4]:

# Make sure the extraction directory exists. `exist_ok=True` replaces the
# separate existence check, so the call is safe to repeat on every run and
# cannot fail if the directory appears between the check and the creation.
os.makedirs(extract_to_dir, exist_ok=True)

# Extract every archive member into the target directory; the context manager
# closes the archive even if extraction raises.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to_dir)

In [5]:
# Root of the extracted dataset; the split folders are joined onto this path
main_path = (
    '../datasets/140k_real_and_fake_faces'
    '/real_vs_fake/real-vs-fake'
)

In [6]:
# One directory per dataset split, all located directly under `main_path`
train_dir, valid_dir, test_dir = (
    os.path.join(main_path, split_name)
    for split_name in ('train', 'valid', 'test')
)

In [7]:
# Sanity check: show which entries each split directory contains
paths = [train_dir, valid_dir, test_dir]
for path in paths:
    split_name, contents = os.path.basename(path), os.listdir(path)
    print(f'{split_name} subdirectories: {contents}')

train subdirectories: ['real', 'fake']
valid subdirectories: ['real', 'fake']
test subdirectories: ['real', 'fake']


In [23]:
def is_image_conforming(image_path, target_size=(256, 256), channels=3):
    """Check that an image file has the expected size and channel count.

    Args:
        image_path: Path of the image file, passed to ``cv2.imread``.
        target_size: Expected ``(height, width)`` in pixels. Any two-item
            sequence is accepted.
        channels: Expected number of channels stored in the file.

    Returns:
        bool: True when the decoded image matches both ``target_size`` and
        ``channels``; False when it does not match or cannot be loaded (a
        message is printed in the latter case).
    """
    # Read the file with its own channel layout. With the default flag,
    # OpenCV converts every image to 3-channel BGR, so a grayscale file would
    # always report 3 channels and the channel check could never fail.
    image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
    if image is None:
        print(f"Unable to load image: {image_path}")
        return False

    # Single-channel images decode to a 2-D array with no channel axis.
    height, width = image.shape[:2]
    n_channels = image.shape[2] if image.ndim == 3 else 1
    return (height, width) == tuple(target_size) and n_channels == channels

In [24]:

# Dataset root that the conformance scan walks
main_path = (
    '../datasets/140k_real_and_fake_faces'
    '/real_vs_fake/real-vs-fake'
)

# Accumulators for the scan: records of accepted images, paths of rejected ones
conforming_images, non_conforming_images = [], []

In [25]:

# Walk <main_path>/<folder>/<label>/*.jpg and sort every image into one of two
# lists depending on whether it passes `is_image_conforming`.
#
# The lists are rebuilt here so that re-running this cell starts from scratch
# instead of appending a second copy of every record.
conforming_images = []
non_conforming_images = []

for folder_path in Path(main_path).iterdir():
    if not folder_path.is_dir():
        continue  # ignore stray files next to the split folders
    folder = folder_path.name
    for label_path in folder_path.iterdir():
        if not label_path.is_dir():
            continue  # ignore stray files next to the label folders
        label = label_path.name
        for img_path in label_path.glob("*.jpg"):
            if is_image_conforming(str(img_path)):
                conforming_images.append({
                    "folder": folder,
                    "image_path": str(img_path),
                    "label": label
                })
            else:
                non_conforming_images.append(str(img_path))

Number of conforming images: 140000
Number of non-conforming images: 0


In [26]:
# Summary of the scan: `conforming_images` holds one record per image that
# passed the check, `non_conforming_images` holds the paths that did not.
print(f"Number of conforming images: {len(conforming_images)}")
print(f"Number of non-conforming images: {len(non_conforming_images)}")

# List each rejected file so it can be inspected individually
for rejected_path in non_conforming_images:
    print(f"Non-conforming image: {rejected_path}")


Number of conforming images: 140000
Number of non-conforming images: 0


In [21]:
# NOTE: disabled cell, kept for reference only. It collected the same
# folder/image_path/label records without the size/channel check; the
# DataFrame below is built from `conforming_images` instead.
# image_records = []

# for folder in os.listdir(main_path):
#     folder_path = Path(main_path) / folder
#     if folder_path.is_dir():
#         for label in os.listdir(folder_path):
#             label_path = folder_path / label
#             if label_path.is_dir():
#                 for img_path in label_path.glob("*.jpg"):
#                     image_records.append({
#                         "folder": folder,
#                         "image_path": str(img_path),
#                         "label": label
#                     })


In [27]:
# One row per conforming image, with the columns `folder`, `image_path`
# and `label` taken from the record dictionaries.
image_df = pd.DataFrame(conforming_images)

# End the cell on the frame itself so Jupyter renders pandas' truncated rich
# preview instead of a plain-text print.
image_df

       folder                                         image_path label
0       valid  ../datasets/140k_real_and_fake_faces/real_vs_f...  real
1       valid  ../datasets/140k_real_and_fake_faces/real_vs_f...  real
2       valid  ../datasets/140k_real_and_fake_faces/real_vs_f...  real
3       valid  ../datasets/140k_real_and_fake_faces/real_vs_f...  real
4       valid  ../datasets/140k_real_and_fake_faces/real_vs_f...  real
...       ...                                                ...   ...
139995  train  ../datasets/140k_real_and_fake_faces/real_vs_f...  fake
139996  train  ../datasets/140k_real_and_fake_faces/real_vs_f...  fake
139997  train  ../datasets/140k_real_and_fake_faces/real_vs_f...  fake
139998  train  ../datasets/140k_real_and_fake_faces/real_vs_f...  fake
139999  train  ../datasets/140k_real_and_fake_faces/real_vs_f...  fake

[140000 rows x 3 columns]


In [10]:
# Summary statistics for the rows that belong to the training split
image_df.loc[image_df['folder'] == 'train'].describe()

Unnamed: 0,folder,image_path,label
count,100000,100000,100000
unique,1,100000,2
top,train,../datasets/140k_real_and_fake_faces/real_vs_f...,real
freq,100000,1,50000


In [11]:
# Summary statistics for the rows that belong to the test split
image_df.loc[image_df['folder'] == 'test'].describe()

Unnamed: 0,folder,image_path,label
count,20000,20000,20000
unique,1,20000,2
top,test,../datasets/140k_real_and_fake_faces/real_vs_f...,real
freq,20000,1,10000


In [12]:
# Summary statistics for the rows that belong to the validation split
image_df.loc[image_df['folder'] == 'valid'].describe()

Unnamed: 0,folder,image_path,label
count,20000,20000,20000
unique,1,20000,2
top,valid,../datasets/140k_real_and_fake_faces/real_vs_f...,real
freq,20000,1,10000


In [13]:
# For each split, select its rows and group them by class label
train_grouped_df, test_grouped_df, valid_grouped_df = (
    image_df.loc[image_df['folder'] == split_name].groupby('label')
    for split_name in ('train', 'test', 'valid')
)

In [14]:
# Number of training rows in each `label` group (class balance check)
train_grouped_df.size()

label
fake    50000
real    50000
dtype: int64

In [15]:
# Number of test rows in each `label` group (class balance check)
test_grouped_df.size()

label
fake    10000
real    10000
dtype: int64

In [16]:
# Number of validation rows in each `label` group (class balance check)
valid_grouped_df.size()

label
fake    10000
real    10000
dtype: int64

In [18]:
# Validation and test images are only multiplied by 1/255; no augmentation
valid_gen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255.)
test_gen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255.)

# Training images get the same rescaling plus light augmentation.
# Currently disabled options, kept for reference:
#   rotation_range=20, width_shift_range=0.2,
#   height_shift_range=0.2, shear_range=0.2
train_gen = tf.keras.preprocessing.image.ImageDataGenerator(
    fill_mode='nearest',
    horizontal_flip=True,
    zoom_range=0.1,
    rescale=1./255.
)

In [19]:
# Settings shared by the three directory iterators: images are resized to
# 256x256, served in batches of 32, and labelled with a single 0/1 value.
flow_options = dict(
    target_size=(256, 256),
    batch_size=32,
    class_mode='binary'
)

# Training batches keep the default shuffling of flow_from_directory.
train_generator = train_gen.flow_from_directory(train_dir, **flow_options)

# Validation and test iterators must not shuffle: with a fixed file order the
# outputs of `predict` line up with the iterator's `.classes` / `.filenames`,
# which is required for confusion matrices and per-file error analysis.
validation_generator = valid_gen.flow_from_directory(
    valid_dir,
    shuffle=False,
    **flow_options
)

test_generator = test_gen.flow_from_directory(
    test_dir,
    shuffle=False,
    **flow_options
)

Found 100000 images belonging to 2 classes.
Found 20000 images belonging to 2 classes.
Found 20000 images belonging to 2 classes.


In [None]:
# Expected image dimensions: height, width, colour channels.
# Change this to match your image size and channels.
input_shape = (256, 256, 3)

# Number of values in one image once it is flattened to a vector
img_height, img_width, img_channels = input_shape
flattened_input_size = img_height * img_width * img_channels

In [29]:

# Images are expected as 256x256 RGB arrays; adjust the tuple if the image
# size or channel count changes.
input_shape = (256, 256, 3)
flattened_input_size = input_shape[0] * input_shape[1] * input_shape[2]

# Fully connected baseline: flatten the image, pass it through three ReLU
# layers of decreasing width, and finish with one sigmoid unit for the
# binary label.
model = Sequential()
model.add(Flatten(input_shape=input_shape))
model.add(Dense(512, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy pairs with the sigmoid output; accuracy is tracked
# during training and evaluation.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)