# Introduction to Deepfake Detection
Deepfakes are synthetic media (usually videos or images) created using deep learning techniques. They convincingly replace a person’s likeness in existing media with someone else’s, often leading to misinformation and potential harm. Detecting deepfakes is crucial for maintaining trust in visual content.

**In this project we will use the dataset** ***140k Real and Fake Faces***

This dataset consists of all 70k REAL faces from the Flickr dataset collected by Nvidia, as well as 70k fake faces sampled from the 1 Million FAKE faces (generated by StyleGAN) that was provided by Bojan.

In this dataset, I conveniently combined both datasets, resized all the images to 256px, and split the data into train, validation and test sets. I also included some CSV files for convenience.

In [None]:
import os
import tensorflow as tf
import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.applications import Xception
from sklearn.metrics import classification_report
from tensorflow.keras import Model


In [None]:
# Load the train / test / validation CSV files of the dataset into DataFrames.
split_csv_files = (
    "/kaggle/input/140k-real-and-fake-faces/train.csv",
    "/kaggle/input/140k-real-and-fake-faces/test.csv",
    "/kaggle/input/140k-real-and-fake-faces/valid.csv",
)
train_data, test_data, validation_data = map(pd.read_csv, split_csv_files)

In [None]:
# Root directory passed to the generators; the `path` column of the CSV files
# is resolved relative to it.
path = "/kaggle/input/140k-real-and-fake-faces/real_vs_fake/real-vs-fake"

# Preprocessing and Augmentation
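The ImageDataGenerator is a powerful tool for data augmentation and preprocessing in deep learning. It can generate batches of augmented image data on-the-fly during training. In this notebook it is used only to preprocess and batch the images; no augmentation transforms are configured.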

In [None]:
train_image_generator = ImageDataGenerator(rescale=1./255.,preprocessing_function=preprocess_input)

train_data_generator = train_image_generator.flow_from_dataframe(
    dataframe=train_data,
    directory=path,
    x_col ='path',
    y_col ='label_str',
    color_mode="rgb",
    target_size=(256, 256),
    class_mode="binary",
    batch_size=32,
    shuffle = True
)


In [None]:
validation_image_generator = ImageDataGenerator(rescale=1./255.,preprocessing_function=preprocess_input)
test_image_generator = ImageDataGenerator(rescale=1./255.,preprocessing_function=preprocess_input)

validation_data_generator = validation_image_generator.flow_from_dataframe(
    dataframe=validation_data,
    directory=path,
    x_col ='path',
    y_col ='label_str',
    color_mode="rgb",
    target_size=(256, 256),
    class_mode="binary",
    batch_size=32,
    shuffle = True
)


test_data_generator = test_image_generator.flow_from_dataframe(
    dataframe=test_data,
    directory=path,
    x_col ='path',
    y_col ='label_str',
    color_mode="rgb",
    target_size=(256, 256),
    class_mode="binary",
    batch_size=32,
    shuffle = False
)


# Visualization

In [None]:
# Mapping from class name to the integer index assigned by the generator.
labels = train_data_generator.class_indices
class_names = list(labels)
print('class names:', class_names)

In [None]:
import matplotlib.pyplot as plt

# Pie chart of how many rows each split (train / validation / test) contains.
split_frames = {"Train": train_data, "Validation": validation_data, "Test": test_data}
labels = list(split_frames)
counts = [len(frame) for frame in split_frames.values()]

fig, ax = plt.subplots()
ax.pie(counts, labels=labels, autopct="%1.1f%%")
ax.set_title("Distribution of Images")
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Bar chart of the number of rows in each split.
train_size, validation_size, test_size = (
    len(frame) for frame in (train_data, validation_data, test_data)
)
x = ['Train', 'Validation', 'Test']
y = [train_size, validation_size, test_size]

fig, ax = plt.subplots()
ax.bar(x, y)
ax.set(
    xlabel='Dataset',
    ylabel='Number of Images',
    title='Distribution of Images in Each Dataset',
)

plt.show()

# Convolutional Neural Networks (CNNs)
CNNs have been widely used for image classification tasks. They excel at learning hierarchical features from raw pixel data. In the context of deepfake detection, CNNs can analyze patterns and features in images to distinguish between real and manipulated content.

In [None]:
# Baseline CNN: three Conv2D + MaxPooling2D stages (32, 64, 64 filters),
# dropout, a 64-unit dense layer and a single sigmoid unit for the
# binary real/fake decision.
model_cnn = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(256, 256, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Dropout(0.2),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Print a summary of the baseline CNN architecture.
model_cnn.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# alongside the loss.
model_cnn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the baseline CNN for 10 epochs, validating on the validation split
# after every epoch.
history = model_cnn.fit(
    train_data_generator,
    validation_data=validation_data_generator,
    epochs=10,
)

In [None]:
# Score the trained CNN on the held-out test split (loss first, then accuracy,
# as configured in compile()).
cnn_test_scores = model_cnn.evaluate(test_data_generator)
test_loss, test_accuracy = cnn_test_scores
print(f'Loss: {test_loss}, Accuracy: {test_accuracy}')

In [None]:
# One figure per metric, each comparing the training and validation curves
# recorded during fit().
for train_key, val_key, train_label, val_label in (
    ('loss', 'val_loss', 'train loss', 'validation loss'),
    ('accuracy', 'val_accuracy', 'train accuracy', 'validation accuracy'),
):
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.legend()
    plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Predicted probabilities above 0.5 are assigned class index 1, the rest 0.
predictions = model_cnn.predict(test_data_generator)
predicted_labels = (predictions > 0.5).astype(int)
# Ground-truth class indices as exposed by the test generator.
true_labels = test_data_generator.classes

conf_matrix = confusion_matrix(true_labels, predicted_labels)

tick_labels = test_data_generator.class_indices
plt.figure(figsize=(8, 6))
heatmap_axes = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
)
heatmap_axes.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title='Confusion Matrix',
)
plt.show()


In [None]:
# Per-class precision / recall / F1 for the CNN predictions computed above.
class_names = list(test_data_generator.class_indices)
report = classification_report(
    true_labels,
    predicted_labels,
    target_names=class_names,
)
print("Classification Report:")
print(report)

# XceptionNet
XceptionNet is a deep learning architecture based on CNNs. It was designed to improve the efficiency of feature extraction by replacing standard convolutional layers with depthwise separable convolutions. XceptionNet has been successfully applied to various computer vision tasks, including deepfake detection.

In [None]:
# Xception convolutional base with ImageNet weights and without its
# classification head, sized for the 256x256 RGB images used here.
base_model = Xception(
    include_top=False,
    weights='imagenet',
    input_shape=(256, 256, 3),
)


In [None]:
# Freeze every layer of the pretrained base so that only the new
# classification head added below is trained.
for frozen_layer in base_model.layers:
    frozen_layer.trainable = False

# Head: flatten the base features, then two dense layers (256 and 128 units)
# with dropout and batch normalisation, ending in a single sigmoid unit.
Xception_model = Sequential([
    base_model,
    Flatten(),
    BatchNormalization(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    BatchNormalization(),
    Dense(1, activation='sigmoid'),
])

Xception_model.summary()

In [None]:
# Same training configuration as the baseline CNN: Adam, binary
# cross-entropy, accuracy metric.
Xception_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the Xception-based model for 10 epochs. Note that this rebinds
# `history`, replacing the baseline CNN's training log.
history = Xception_model.fit(
    train_data_generator,
    validation_data=validation_data_generator,
    epochs=10,
)
print(history.history)

In [None]:
# Score the trained Xception-based model on the held-out test split.
xception_test_scores = Xception_model.evaluate(test_data_generator)
test_loss, test_accuracy = xception_test_scores
print(f'Loss: {test_loss}, Accuracy: {test_accuracy}')

In [None]:
def _show_curves(log, train_key, val_key, train_label, val_label):
    """Plot one training curve against its validation counterpart and show it."""
    plt.plot(log[train_key], label=train_label)
    plt.plot(log[val_key], label=val_label)
    plt.legend()
    plt.show()


# Loss first, then accuracy, each in its own figure.
_show_curves(history.history, 'loss', 'val_loss', 'train loss', 'validation loss')
_show_curves(history.history, 'accuracy', 'val_accuracy', 'train accuracy', 'validation accuracy')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# numpy and confusion_matrix are imported here as well so this cell does not
# depend on the CNN confusion-matrix cell having been executed first.
import numpy as np
from sklearn.metrics import confusion_matrix

# Predicted probabilities above 0.5 are assigned class index 1, the rest 0.
predictions = Xception_model.predict(test_data_generator)
predicted_labels = np.where(predictions > 0.5, 1, 0)
# Ground-truth class indices as exposed by the test generator.
true_labels = test_data_generator.classes
conf_matrix = confusion_matrix(true_labels, predicted_labels)

# Heatmap of the confusion matrix, with the class names as tick labels.
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=test_data_generator.class_indices,
            yticklabels=test_data_generator.class_indices)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()


In [None]:
# Per-class precision / recall / F1 for the Xception predictions computed above.
class_names = [name for name in test_data_generator.class_indices]
report = classification_report(
    true_labels,
    predicted_labels,
    target_names=class_names,
)

print("Classification Report:")
print(report)
