In [None]:
# Core numeric, data-frame, deep-learning, image and plotting libraries used throughout the notebook.
import numpy as np
import pandas as pd
import tensorflow as tf
import cv2
import matplotlib.pyplot as plt
import os
import kagglehub

# Download the CelebA dataset through kagglehub. The returned value is kept in
# `jessicali9530_celeba_dataset_path` and is used by a later cell as the dataset
# root directory (BASIC_PATH).
jessicali9530_celeba_dataset_path = kagglehub.dataset_download('jessicali9530/celeba-dataset')
print('Data source import complete.')

# scikit-learn helpers for splitting the data and for confusion-matrix evaluation.
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Keras image generator, model container and layer classes.
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, Flatten, Dropout, GlobalAveragePooling2D

# Confirmation message once every import above has succeeded.
print('Libraries imported successfully.')


Downloading from https://www.kaggle.com/api/v1/datasets/download/jessicali9530/celeba-dataset?dataset_version_number=2...


100%|██████████| 1.33G/1.33G [00:17<00:00, 83.5MB/s]

Extracting files...





In [None]:
# Dataset locations for a Kaggle kernel, where the dataset is mounted under
# /kaggle/input. The image directory is nested twice inside the archive.
# NOTE: the following cell rebinds these three names to the kagglehub download.
BASIC_PATH = "/kaggle/input/celeba-dataset"
FEATURE_PATH = os.path.join(BASIC_PATH, 'list_attr_celeba.csv')
IMG_PATH = os.path.join(BASIC_PATH, 'img_align_celeba/img_align_celeba')

In [None]:
# Point every dataset path at the copy downloaded by kagglehub in the first cell.
BASIC_PATH = jessicali9530_celeba_dataset_path
FEATURE_PATH = os.path.join(BASIC_PATH, 'list_attr_celeba.csv')
IMG_PATH = os.path.join(BASIC_PATH, 'img_align_celeba/img_align_celeba')


In [None]:
# Load the per-image attribute table.
# The CSV path is built once and reused for the read, instead of being
# re-assembled a second time with string concatenation.
LIST_ATTR_PATH = os.path.join(BASIC_PATH, 'list_attr_celeba.csv')
DF_ATTR = pd.read_csv(LIST_ATTR_PATH)  # comma is already read_csv's default delimiter

# Show the locations in use, then preview the first rows of the table.
print(IMG_PATH, LIST_ATTR_PATH)
DF_ATTR.head()

In [None]:
# Load the evaluation-partition table that ships alongside the attribute file.
partition_csv = BASIC_PATH + '/list_eval_partition.csv'
DF_PARTITION = pd.read_csv(partition_csv)

In [None]:
# Index the attribute table by image file name and recode -1 to 0.
# Written without `inplace=True` and guarded on the column's presence so the
# cell can be re-run safely: after the first run 'image_id' is already the
# index, and an unguarded set_index would raise KeyError.
if 'image_id' in DF_ATTR.columns:
    DF_ATTR = DF_ATTR.set_index('image_id')
DF_ATTR = DF_ATTR.replace(to_replace=-1, value=0)
DF_ATTR.shape

In [None]:
# Number of rows per partition value, ordered by the partition value itself.
partition_counts = DF_PARTITION['partition'].value_counts()
partition_counts.sort_index()

In [None]:
# Count and report the total number of rows in the attribute table.
total_dataset = len(DF_ATTR)
print(f"Total dataset: {total_dataset}")

In [None]:
# Detect byte-identical image files, count them, and preview a few examples.
import hashlib
from collections import defaultdict

# One shared limit for the printed list and for the 2x4 preview grid below.
MAX_DUPLICATE_EXAMPLES = 8

# All image files in the folder; sorted so the report is stable between runs
# (os.listdir returns entries in arbitrary order).
image_files = sorted(
    file for file in os.listdir(IMG_PATH)
    if file.lower().endswith(('.jpg', '.jpeg', '.png'))
)

# Map content hash -> names of every file with that exact content.
hash_groups = defaultdict(list)
for file_name in image_files:
    with open(os.path.join(IMG_PATH, file_name), 'rb') as f:
        hash_groups[hashlib.md5(f.read()).hexdigest()].append(file_name)
duplicate_files = dict(hash_groups)

# Every file beyond the first in a group counts as one duplicate.
total_duplicate_files = sum(len(files) - 1 for files in duplicate_files.values())

# Collect the examples once, then reuse the same list for printing and plotting.
duplicate_examples = []
for file_list in duplicate_files.values():
    if len(file_list) > 1:
        duplicate_examples.extend(file_list[:MAX_DUPLICATE_EXAMPLES])
    if len(duplicate_examples) >= MAX_DUPLICATE_EXAMPLES:
        break
duplicate_examples = duplicate_examples[:MAX_DUPLICATE_EXAMPLES]

print("Contoh file duplikat:")
for file_name in duplicate_examples:
    print(file_name)

print(f"Total file duplikat: {total_duplicate_files}")

# Only draw the preview grid when there is something to show.
if duplicate_examples:
    fig, axes = plt.subplots(2, 4, figsize=(10, 6))
    fig.suptitle('Contoh Foto Duplikat')
    for i, ax in enumerate(axes.flatten()):
        if i < len(duplicate_examples):
            file_name = duplicate_examples[i]
            ax.imshow(plt.imread(os.path.join(IMG_PATH, file_name)))
            ax.set_title(file_name)
        ax.axis('off')  # also hides the unused panels

    plt.tight_layout()
    plt.show()

In [None]:
# Build a DataFrame of image file paths, to be joined later with the file
# names listed in the attribute CSV.
import glob

# Glob pattern for the image files (change it if the images use another extension).
file_ekstensi = '*.jpg'

# Directory that holds the image files.
direktori_foto = IMG_PATH

# Every path in that directory matching the pattern.
nama_file = glob.glob(f"{direktori_foto}/{file_ekstensi}")

# One row per matched path, stored in a single 'image_id' column.
data = pd.DataFrame({'image_id': nama_file})

# Preview the frame.
data.head()

In [None]:
# Reduce each path to its bare file name so it matches 'image_id' in the
# attribute table.
# os.path.basename uses the separator of the platform the notebook runs on.
# The previous split on '\\' only worked on Windows: on Linux (Kaggle/Colab)
# it left the full path in place, so the later inner join matched no rows.
data['image_id'] = data['image_id'].map(os.path.basename)
data.head()

In [None]:
# Read the attribute CSV again into a separate frame (`df_attr`), reporting
# which file is being used, then preview its first five rows.
print(f"File yang digunakan: {LIST_ATTR_PATH}")

df_attr = pd.read_csv(LIST_ATTR_PATH)

df_attr.head(5)

In [None]:
# Diagnostic overview of the attribute and partition tables.

# Structure of DF_ATTR. DataFrame.info() writes its report itself and returns
# None, so it is called directly; wrapping it in print() added a stray "None".
print("Informasi DataFrame DF_ATTR:")
DF_ATTR.info()
print("\n")

# Structure of DF_PARTITION (same reasoning as above).
print("Informasi DataFrame DF_PARTITION:")
DF_PARTITION.info()
print("\n")

# Descriptive statistics for DF_ATTR.
print("Deskripsi Statistik DF_ATTR:")
print(DF_ATTR.describe())
print("\n")

# Missing values per column in DF_ATTR.
print("Jumlah Data Missing per Kolom DF_ATTR:")
print(DF_ATTR.isnull().sum())
print("\n")

# Missing values per column in DF_PARTITION.
print("Jumlah Data Missing per Kolom DF_PARTITION:")
print(DF_PARTITION.isnull().sum())
print("\n")

# Pairwise correlation between the DF_ATTR columns.
print("Korelasi Antar Kolom DF_ATTR:")
print(DF_ATTR.corr())
print("\n")

# Distinct values found in the 'partition' column.
print("Nilai Unik pada Kolom 'partition' di DF_PARTITION:")
print(DF_PARTITION['partition'].unique())
print("\n")

In [None]:
# Inner join between the file list (`data`) and the attribute table.
# `data` carries the file name in its 'image_id' column, while DF_ATTR is
# matched on its index, so only names present on both sides survive.
merged_data = data.merge(
    DF_ATTR,
    how='inner',
    left_on='image_id',
    right_index=True,
)

# Announce and preview the joined result.
print("Hasil Inner Join:")
merged_data.head()

In [None]:
# Report the (rows, columns) shape of the joined frame.
merged_shape = merged_data.shape
print("Jumlah baris dan kolom setelah inner join:")
print(merged_shape)

In [None]:
# Attach the partition table to the merged attributes, matching on the
# 'image_id' column that both frames share.
final_data = merged_data.merge(DF_PARTITION, on='image_id', how='inner')

# Announce the joined result.
print("Hasil Inner Join antara merged_data dan DF_PARTITION:")
# NOTE(review): a notebook cell renders only its last expression, so this
# preview is evaluated but not shown.
final_data.head()

# Report the (rows, columns) shape after the join.
print("Jumlah baris dan kolom setelah inner join:")
print(final_data.shape)

# The cell's rendered output is the head of the partition table.
DF_PARTITION.head()

In [None]:
# Read only the file name and the 'Male' attribute, then draw a reproducible
# random subset of 22,000 rows.
df = (
    pd.read_csv(FEATURE_PATH, usecols=['image_id', 'Male'])
    .sample(n=22000, random_state=42)
    .reset_index(drop=True)
)

# Recode the numeric flag into class labels in one step. Writing strings into
# the integer column through .loc depended on silent dtype upcasting, which
# newer pandas versions warn about.
df['Male'] = df['Male'].map({-1: 'Female', 1: 'Male'})

# Rename by column name rather than by position, so the result does not depend
# on the column order in the CSV.
df = df.rename(columns={'Male': 'Gender'})

# Any value other than -1/1 would have become NaN in the mapping above.
assert df['Gender'].notna().all(), "unexpected value in the 'Male' column"

df.head(10)

In [None]:
# Preview the first six images of `df` in a 2x3 grid, each titled with its
# array shape.
for i in range(6):
    plt.subplot(2, 3, i + 1)

    # OpenCV loads images in BGR channel order; convert to RGB for matplotlib.
    image_name = df.loc[i, "image_id"]
    bgr_image = cv2.imread(f"{IMG_PATH}/{image_name}")
    img = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

    # Draw the image without axis ticks.
    plt.imshow(img)
    plt.title(img.shape)
    plt.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Count how many images fall into each gender class.
gender_distribution = df['Gender'].value_counts()
print(gender_distribution)

# Bar chart of the class counts.
plt.figure(figsize=(8, 6))
plt.bar(x=gender_distribution.index, height=gender_distribution.values)
plt.xlabel('Gender')
plt.ylabel('Number of Images')
plt.title('Gender Distribution in the Dataset')
plt.show()

In [None]:
# Balance the gender classes by down-sampling each one to the size of the
# smallest class.

# Class counts before balancing.
gender_distribution = df['Gender'].value_counts()
print(gender_distribution)

# Size of the smallest class; every class is cut down to this many rows.
min_samples = gender_distribution.min()

# Sample each class once and concatenate in a single call. Growing a frame
# with pd.concat inside a loop copies the accumulated data on every pass and
# starts from an empty DataFrame, which recent pandas versions warn about.
balanced_df = pd.concat(
    [
        df[df['Gender'] == gender].sample(n=min_samples, random_state=42)
        for gender in gender_distribution.index
    ],
    ignore_index=True,  # same effect as reset_index(drop=True)
)

# Class counts after balancing.
new_gender_distribution = balanced_df['Gender'].value_counts()
print(new_gender_distribution)

# Bar chart of the balanced class counts.
plt.figure(figsize=(8, 6))
plt.bar(new_gender_distribution.index, new_gender_distribution.values)
plt.title('Gender Distribution in the Balanced Dataset')
plt.xlabel('Gender')
plt.ylabel('Number of Images')
plt.show()

# From here on `df` refers to the balanced frame.
df = balanced_df

In [None]:
# Quick pandas bar plot of the class counts in the current `df`.
gender_counts = df["Gender"].value_counts()
gender_counts.plot.bar()

In [None]:
# Split into train (70%), then divide the remaining 30% into test (67% of it)
# and validation (33% of it).
# A fixed seed makes the split reproducible across kernel restarts, and
# stratifying on the label keeps the class balance built above in every subset.
SPLIT_SEED = 42

train_df, holdout_df = train_test_split(
    df,
    test_size=0.3,
    random_state=SPLIT_SEED,
    stratify=df['Gender'],
)
test_df, validation_df = train_test_split(
    holdout_df,
    test_size=0.33,
    random_state=SPLIT_SEED,
    stratify=holdout_df['Gender'],
)

In [None]:
# Report how many rows ended up in each subset.
for subset_label, subset_frame in (
    ("Train", train_df),
    ("Test", test_df),
    ("Validation", validation_df),
):
    print(f"Total {subset_label} Sample Images : ", len(subset_frame))

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input

# Input resolution fed to the network and number of images per batch.
IMAGE_SIZE = (224, 224)
BATCH_SIZE = 32

# Random augmentations applied to training images only.
augmentation_options = dict(
    rotation_range=15,
    shear_range=0.1,
    zoom_range=0.2,
    horizontal_flip=True,
    width_shift_range=0.1,
    height_shift_range=0.1,
)

# Training image generator: augmentation plus VGG16's own preprocessing function.
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    **augmentation_options,
)

# Stream training batches from the image directory. File names come from the
# 'image_id' column and the labels from 'Gender'.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=IMG_PATH + "/",
    x_col='image_id',
    y_col='Gender',
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary',
)


In [None]:
# Validation images receive only the VGG16 preprocessing, with no augmentation.
validation_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Stream validation batches with the same columns, image size and batch size
# as the training generator.
validation_generator = validation_datagen.flow_from_dataframe(
    dataframe=validation_df,
    directory=IMG_PATH + "/",
    x_col='image_id',
    y_col='Gender',
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary',
)

In [None]:
# Test images receive only the VGG16 preprocessing.
test_gen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Stream test batches as images only: no label column and no class mode.
# Shuffling is disabled so batches follow the row order of `test_df`.
test_generator = test_gen.flow_from_dataframe(
    dataframe=test_df,
    directory=IMG_PATH + "/",
    x_col='image_id',
    y_col=None,
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode=None,
    shuffle=False,
)

In [None]:
import tensorflow as tf

# Convolutional VGG16 base with ImageNet weights and no classification head,
# built for IMAGE_SIZE RGB inputs.
base_vgg16_model = tf.keras.applications.VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=IMAGE_SIZE + (3,),
)

# Fine-tune only the last five layers of the base and keep the earlier ones frozen.
first_trainable = len(base_vgg16_model.layers) - 5
for layer_position, layer in enumerate(base_vgg16_model.layers):
    layer.trainable = layer_position >= first_trainable


In [None]:
from keras import Sequential
from tensorflow import keras
from keras.layers import Dense, Flatten, BatchNormalization, Dropout, GlobalAveragePooling2D
import tensorflow as tf

# Full classifier: the VGG16 base followed by a small dense head.
model_layers = [
    base_vgg16_model,                 # VGG16 base defined in the previous cell
    GlobalAveragePooling2D(),         # pooling over the base's feature maps
    BatchNormalization(),
    Dense(256, activation='relu'),    # hidden fully connected layer
    BatchNormalization(),
    Dense(2, activation='softmax'),   # two-unit softmax output, one unit per class
]
vgg16_model = Sequential(model_layers)

# Compile with Adam at a small learning rate.
# The loss is sparse categorical cross-entropy, paired with the two-unit
# softmax output above.
# NOTE(review): the generators use class_mode='binary'; this presumably relies
# on those labels being accepted as class indices by the loss - confirm.
base_learning_rate = 1e-5
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=base_learning_rate)
sparse_ce_loss = tf.keras.losses.SparseCategoricalCrossentropy()

vgg16_model.compile(
    optimizer=adam_optimizer,
    loss=sparse_ce_loss,
    metrics=['accuracy'],
)


In [None]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Stop training when validation loss has not improved for `patience` epochs
# and roll back to the best weights seen.
# NOTE(review): the training cell runs 10 epochs, so patience=10 cannot stop
# the run early - lower it if early stopping is actually wanted.
earlystop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Halve the learning rate when validation accuracy plateaus for 4 epochs.
# min_lr must lie BELOW the optimizer's starting rate (1e-5 in the compile
# cell): the callback only lowers the rate while it is above min_lr, so the
# previous floor of 1e-4 meant no reduction could ever happen.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_accuracy',
    patience=4,
    verbose=1,      # log each reduction
    factor=0.5,
    min_lr=1e-7,
)

# Callbacks passed to model.fit.
callbacks = [earlystop, learning_rate_reduction]


In [None]:
# Train the VGG16 classifier.
epochs = 10  # number of passes over the training generator; adjust as needed

# Validation data and callbacks are grouped so the fit call stays short.
fit_options = dict(
    epochs=epochs,
    validation_data=validation_generator,
    callbacks=callbacks,
)

# `history` keeps the object returned by fit().
history = vgg16_model.fit(train_generator, **fit_options)