In [2]:
import os
import numpy as np
import cv2
from moviepy.editor import VideoFileClip
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import json
from datetime import datetime
import shutil

In [4]:
# Create necessary directories
def create_directories():
    directories = [
        '../processed_dataset/real',
        '../processed_dataset/fake',
        '../models',
        '../models/checkpoints',
        '../logs',
        '../logs/training_history',
        '../evaluation'
    ]
    for dir in directories:
        os.makedirs(dir, exist_ok=True)

In [5]:
create_directories()

In [6]:
class VideoFrameExtractor:
    def __init__(self, sample_rate=30):
        self.sample_rate = sample_rate
        self.face_cascade = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        )

    def extract_faces_from_frame(self, frame):
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = self.face_cascade.detectMultiScale(
            gray,
            scaleFactor=1.1,
            minNeighbors=5,
            minSize=(30, 30)
        )
        if len(faces) == 0:
            return None

        processed_faces = []
        for (x, y, w, h) in faces:
            face = frame[y:y + h, x:x + w]
            face = cv2.resize(face, (128, 128))
            processed_faces.append(face)
        return processed_faces

    def process_video(self, video_path, output_dir):
        """Extract frames from a video and save them as images"""
        try:
            clip = VideoFileClip(video_path)
            for i, frame in enumerate(clip.iter_frames()):
                if i % self.sample_rate == 0:
                    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                    faces = self.extract_faces_from_frame(frame)
                    if faces:
                        for j, face in enumerate(faces):
                            face_path = os.path.join(output_dir, f"{os.path.basename(video_path)}_{i}_{j}.jpg")
                            cv2.imwrite(face_path, face)
            clip.close()
        except Exception as e:
            print(f"Error processing video {video_path}: {e}")

In [7]:
def prepare_dataset(real_dir='../dataset/real_videos', fake_dir='../dataset/fake_videos'):
    """Prepare dataset and save frames to disk"""
    extractor = VideoFrameExtractor(sample_rate=30)
    # Process real videos
    print("Processing real videos...")
    for video in os.listdir(real_dir):
        video_path = os.path.join(real_dir, video)
        output_dir = '../processed_dataset/real'
        extractor.process_video(video_path, output_dir)

    # Process fake videos
    print("\nProcessing fake videos...")
    for video in os.listdir(fake_dir):
        video_path = os.path.join(fake_dir, video)
        output_dir = '../processed_dataset/fake'
        extractor.process_video(video_path, output_dir)

In [9]:
# Prepare dataset
print("Starting dataset preparation...")
prepare_dataset()

Starting dataset preparation...
Processing real videos...

Processing fake videos...


In [8]:
# Corrected data generator
def data_generator(batch_size, file_list, label):
    """Generate data batches from specific file lists."""
    num_samples = len(file_list)
    while True:
        np.random.shuffle(file_list)
        for i in range(0, num_samples, batch_size):
            batch_files = file_list[i:i + batch_size]
            batch_images = []
            batch_labels = []
            for file in batch_files:
                img = cv2.imread(file)
                if img is not None:
                    img = cv2.resize(img, (128, 128))  # Resize images to 128x128
                    img = img / 255.0  # Normalize pixel values
                    batch_images.append(img)
                    batch_labels.append(label)
            yield np.array(batch_images), np.array(batch_labels)

In [9]:
# Define data generators
# Wrap Python generators with tf.data.Dataset
batch_size = 32

# Split dataset into training and validation
all_real_files = [os.path.join('../processed_dataset/real', f) for f in os.listdir('../processed_dataset/real')]
all_fake_files = [os.path.join('../processed_dataset/fake', f) for f in os.listdir('../processed_dataset/fake')]
real_train, real_val = train_test_split(all_real_files, test_size=0.2, random_state=42)
fake_train, fake_val = train_test_split(all_fake_files, test_size=0.2, random_state=42)

In [10]:
# Training datasets
def combined_data_generator(batch_size, real_files, fake_files):
    """Combines real and fake datasets into a single generator."""
    real_gen = data_generator(batch_size, real_files, 0)
    fake_gen = data_generator(batch_size, fake_files, 1)
    while True:
        real_batch = next(real_gen)
        fake_batch = next(fake_gen)
        combined_images = np.concatenate((real_batch[0], fake_batch[0]), axis=0)
        combined_labels = np.concatenate((real_batch[1], fake_batch[1]), axis=0)
        indices = np.arange(len(combined_labels))
        np.random.shuffle(indices)
        yield combined_images[indices], combined_labels[indices]

# Training dataset
train_dataset = tf.data.Dataset.from_generator(
    lambda: combined_data_generator(batch_size, real_train, fake_train),
    output_signature=(
        tf.TensorSpec(shape=(None, 128, 128, 3), dtype=tf.float32),  # Fixed to 128x128
        tf.TensorSpec(shape=(None,), dtype=tf.int32)
    )
).shuffle(1000)

In [17]:
# Validation datasets
def combined_val_data_generator(batch_size, real_val_files, fake_val_files):
    """Combines real and fake validation datasets."""
    real_gen = data_generator(batch_size, real_val_files, 0)
    fake_gen = data_generator(batch_size, fake_val_files, 1)
    while True:
        real_batch = next(real_gen)
        fake_batch = next(fake_gen)
        combined_images = np.concatenate((real_batch[0], fake_batch[0]), axis=0)
        combined_labels = np.concatenate((real_batch[1], fake_batch[1]), axis=0)
        indices = np.arange(len(combined_labels))
        np.random.shuffle(indices)
        yield combined_images[indices], combined_labels[indices]

# Validation dataset
val_dataset = tf.data.Dataset.from_generator(
    lambda: combined_data_generator(batch_size, real_val, fake_val),
    output_signature=(
        tf.TensorSpec(shape=(None, 128, 128, 3), dtype=tf.float32),  # Fixed to 128x128
        tf.TensorSpec(shape=(None,), dtype=tf.int32)
    )
)

In [12]:
def create_model():
    """Create the model architecture"""
    base_model = tf.keras.applications.EfficientNetB0(
        include_top=False,
        weights='imagenet',
        input_shape=(128, 128, 3)
    )

    # Freeze the base model
    base_model.trainable = False

    model = models.Sequential([
        base_model,
        layers.GlobalAveragePooling2D(),
        layers.Dense(512, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')
    ])

    return model

In [13]:
# Create and compile model
model = create_model()
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC()]
)

In [6]:
# Setup callbacks
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    f'../models/checkpoints/model_{timestamp}_{{epoch:02d}}-{{val_accuracy:.4f}}.keras',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
    verbose=1
)

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
    verbose=1
)

In [14]:
# Train model
print("\nStarting model training...")
history = model.fit(
    train_dataset ,
    validation_data=val_dataset,
    epochs=20,  # Increased epochs for better training
    steps_per_epoch=len(real_train) // batch_size,
    validation_steps=len(real_val) // batch_size,
    callbacks=[model_checkpoint, early_stopping]
)


Starting model training...
Epoch 1/20
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.4982 - auc_1: 0.4955 - loss: 0.7079
Epoch 1: val_accuracy improved from -inf to 0.50000, saving model to ../models/checkpoints/model_20241217_173339_01-0.5000.keras
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1280s[0m 2s/step - accuracy: 0.4982 - auc_1: 0.4955 - loss: 0.7078 - val_accuracy: 0.5000 - val_auc_1: 0.5000 - val_loss: 0.6933
Epoch 2/20
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.5086 - auc_1: 0.5051 - loss: 0.6937
Epoch 2: val_accuracy did not improve from 0.50000
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m490s[0m 2s/step - accuracy: 0.5085 - auc_1: 0.5051 - loss: 0.6937 - val_accuracy: 0.5000 - val_auc_1: 0.5000 - val_loss: 0.6931
Epoch 3/20
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.5068 - auc_1: 0.4994 - loss: 0.6931
Epoch 3: v

In [15]:
# Save model and training history
model.save(f'../models/deepfake_detector_{timestamp}.keras')
with open(f'../logs/training_history/history_{timestamp}.json', 'w') as f:
    json.dump(history.history, f)


In [18]:
# Evaluate model
y_true = np.concatenate([
    np.zeros(len(real_val)),
    np.ones(len(fake_val))
])
y_pred = model.predict(val_dataset).flatten()

from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve

precision = precision_score(y_true, np.round(y_pred))
recall = recall_score(y_true, np.round(y_pred))
f1 = f1_score(y_true, np.round(y_pred))
auc = roc_auc_score(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, np.round(y_pred))

  17100/Unknown [1m18171s[0m 1s/step

KeyboardInterrupt: 

In [19]:
# Save evaluation results
evaluation_results = {
    'precision': precision,
    'recall': recall,
    'f1_score': f1,
    'auc': auc,
    'confusion_matrix': conf_matrix.tolist()  # Convert to list for JSON serialization
}


NameError: name 'precision' is not defined

In [20]:
# Save confusion matrix image
plt.figure(figsize=(10, 7))
plt.matshow(conf_matrix, fignum=1)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.colorbar()
plt.savefig(f'../evaluation/confusion_matrix_{timestamp}.png')

with open(f'../evaluation/results_{timestamp}.json', 'w') as f:
    json.dump(evaluation_results, f)

print("Training and evaluation completed.")

NameError: name 'conf_matrix' is not defined

<Figure size 1000x700 with 0 Axes>