## Import Libraries

In [1]:
import os
import pandas as pd
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import hashlib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
import numpy as np
import cv2
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from flask import Flask, render_template, request, jsonify, send_from_directory
from keras.models import load_model
from keras.models import save_model
import io
from tensorflow.keras.models import load_model
from werkzeug.utils import secure_filename
from threading import Thread

## Define paths

In [2]:
# Training split: one folder per class.
real_images_path, fake_images_path = 'dataset/train/REAL', 'dataset/train/FAKE'
# Held-out test split, same class layout.
real_test_path, fake_test_path = 'dataset/test/REAL', 'dataset/test/FAKE'

## Load Images from Folders

In [3]:
def load_images_from_folder(folder, label, size=(128, 128)):
    """Load every readable image in ``folder`` as a resized NumPy array.

    Parameters
    ----------
    folder : str
        Directory whose entries are each opened as an image.
    label : int
        Class label attached to every image loaded from this folder.
    size : tuple of int, optional
        Target size handed to ``Image.resize``. Defaults to ``(128, 128)``.

    Returns
    -------
    list
        One ``[image_array, label]`` pair per entry that could be loaded.
        Entries that fail to open or resize are reported on stdout and skipped.
    """
    images = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the resulting DataFrame stable across runs and machines.
    for filename in sorted(os.listdir(folder)):
        img_path = os.path.join(folder, filename)
        try:
            # The context manager closes the underlying file as soon as the
            # pixels are copied out, instead of leaving the handle to the GC.
            with Image.open(img_path) as img:
                pixels = np.array(img.resize(size))
            images.append([pixels, label])
        except Exception as e:
            # Best-effort load: report the unreadable entry and keep going.
            print(f"Error loading image {filename}: {e}")
    return images

### Convert to DataFrame

####  1. Train data

In [None]:
# Read both training classes from disk (label 0 = real, label 1 = fake)
real_images = load_images_from_folder(folder=real_images_path, label=0)
fake_images = load_images_from_folder(folder=fake_images_path, label=1)

# Real rows first, fake rows after
all_images = [*real_images, *fake_images]

# One row per image: the pixel array and its class label
df = pd.DataFrame(data=all_images, columns=['image', 'label'])

print(df.head())

In [None]:
# Last ten training rows (all from the fake class, which was appended last)
df.tail(n=10)

#### 2. Test data

In [None]:
# Read both test classes from disk (label 0 = real, label 1 = fake)
real_test_images = load_images_from_folder(folder=real_test_path, label=0)
fake_test_images = load_images_from_folder(folder=fake_test_path, label=1)

# Real rows first, fake rows after
all_test_images = [*real_test_images, *fake_test_images]

# One row per image: the pixel array and its class label
df_test = pd.DataFrame(data=all_test_images, columns=['image', 'label'])

print(df_test.head())

In [None]:
# Last ten test rows (all from the fake class, which was appended last)
df_test.tail(n=10)

### Define and Use Image Display Functions

Below we preview the images stored in the DataFrames we created. Because every image was resized to 128×128 on load, the previews may look stretched or low in quality.

#### 1. Train data

In [None]:
def display_image(image_array):
    """Draw ``image_array`` with matplotlib, hiding the axes, and show the figure."""
    rendered = plt.imshow(image_array)
    # Ticks and frame add nothing to an image preview
    rendered.axes.axis('off')
    plt.show()

In [None]:
# Preview the first training image
display_image(df['image'].iloc[0])

#### 2. Test data

In [None]:
# Preview the first test-set image
display_image(df_test['image'].iloc[0])

### Data Cleaning: Handling Duplicates, Missing Data, and Corrupted Images

#### 1. Train data

In [None]:
def clean_image_data(df):
    """Return a cleaned copy of an image DataFrame.

    Three passes run in order:

    1. rows whose pixel bytes duplicate an earlier row are dropped
       (the first occurrence is kept);
    2. rows with a missing ``label`` are dropped;
    3. rows whose array cannot be turned into a PIL image are dropped.

    The result gets a fresh 0..n-1 index. The input frame is never modified,
    so the calling cell can be re-run and the raw frame stays available.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``image`` column of NumPy arrays and a ``label`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``df``.
    """
    print("Removing duplicate images...")
    # Fingerprint each image by the MD5 of its raw bytes. The hashes live in a
    # separate Series so no helper column is ever written to the caller's frame.
    img_hashes = df['image'].apply(lambda img: hashlib.md5(img.tobytes()).hexdigest())
    cleaned = df.loc[~img_hashes.duplicated()]

    print("Checking for missing labels...")
    missing_labels = cleaned['label'].isnull().sum()
    if missing_labels > 0:
        print(f"Found {missing_labels} missing labels. Dropping rows with missing labels...")
        cleaned = cleaned.dropna(subset=['label'])

    print("Verifying image integrity...")
    is_valid = []
    for i, image in cleaned['image'].items():
        try:
            # NOTE(review): for an image built from an in-memory array,
            # verify() likely has no file data to check; the effective guard
            # is fromarray() rejecting unsupported shapes/dtypes. Confirm.
            Image.fromarray(image).verify()
            is_valid.append(True)
        except Exception as e:
            print(f"Corrupted image detected at index {i}: {e}")
            is_valid.append(False)

    # A boolean ndarray is always treated as a row mask; a plain (possibly
    # empty) list could be read as a column selection instead.
    keep = np.array(is_valid, dtype=bool)
    return cleaned.loc[keep].reset_index(drop=True)


In [None]:
# Deduplicate and validate the training frame
df_cleaned = clean_image_data(df)

# Show the first rows of the cleaned training frame under a heading
print("Cleaned DataFrame:", df_cleaned.head(), sep="\n")


#### 2. Test data

In [None]:
# Deduplicate and validate the test frame
df_test_cleaned = clean_image_data(df_test)

# Show the first rows of the cleaned test frame under a heading
print("Cleaned DataFrame:", df_test_cleaned.head(), sep="\n")

### EDA

#### 1. Resize and Display Images

In [None]:
# Get sizes of all images
def get_image_sizes(image_list):
    """Return the first two entries of each image's ``shape``, one per item.

    ``image_list`` holds two-element ``[image, label]`` items; the label is
    ignored.
    """
    dimensions = []
    for entry in image_list:
        pixels, _label = entry
        dimensions.append(pixels.shape[:2])
    return dimensions

In [None]:
# Images were already resized when loaded, so these are post-resize dimensions
sizes = get_image_sizes(all_images)
sizes_df = pd.DataFrame(data=sizes, columns=['Height', 'Width'])