In [None]:
# Import library yang diperlukan
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import random
from datetime import datetime

# Seed NumPy, TensorFlow and Python's `random` module so that runs are repeatable
np.random.seed(42)
tf.random.set_seed(42)
random.seed(42)

def _list_jpg_files(directory):
    """Return the sorted full paths of the ``.jpg`` files directly inside *directory*."""
    # os.listdir() returns entries in arbitrary (filesystem-dependent) order.
    # Sorting makes the seeded sampling pick the same files on every machine.
    return sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith('.jpg')
    )


def _shuffled_files_and_labels(real_files, fake_files):
    """Label real files 0 and fake files 1, then shuffle the (file, label) pairs together."""
    pairs = [(path, 0) for path in real_files] + [(path, 1) for path in fake_files]
    random.shuffle(pairs)
    files = [path for path, _ in pairs]
    labels = [label for _, label in pairs]
    return files, labels


def _load_images(file_paths, target_size=(64, 64)):
    """Load every image resized to *target_size* and return one array with pixel values divided by 255."""
    images = [
        tf.keras.preprocessing.image.img_to_array(
            tf.keras.preprocessing.image.load_img(path, target_size=target_size)
        )
        for path in file_paths
    ]
    return np.array(images) / 255.0


def load_cifake_dataset(base_dir, sample_size=5000, val_split=0.15, random_state=42):
    """
    Load a random subset of the CIFAKE dataset from its folder structure.

    The images are read from ``<base_dir>/train/{REAL,FAKE}`` and
    ``<base_dir>/test/{REAL,FAKE}``; only files ending in ``.jpg`` are used.

    Args:
        base_dir: root directory of the dataset (e.g. ``'dataset'``)
        sample_size: total number of images to draw; each of the four
            (split, class) groups receives ``sample_size // 4`` images, or
            all available images when a folder holds fewer than that
        val_split: fraction of the sampled training images held out as the
            validation set (stratified by class)
        random_state: seed for sampling, shuffling and the train/validation
            split

    Returns:
        X_train, y_train, X_val, y_val, X_test, y_test. The image arrays hold
        64x64 images with pixel values divided by 255; the label arrays are
        one-hot encoded with two columns (index 0 = real, index 1 = fake).

    Raises:
        ValueError: if no training image could be sampled.
    """
    # Re-seed the global RNGs so that sampling and shuffling are repeatable
    np.random.seed(random_state)
    random.seed(random_state)

    # Paths to the train and test folders
    train_real_path = os.path.join(base_dir, 'train', 'REAL')
    train_fake_path = os.path.join(base_dir, 'train', 'FAKE')
    test_real_path = os.path.join(base_dir, 'test', 'REAL')
    test_fake_path = os.path.join(base_dir, 'test', 'FAKE')

    # Image files per (split, class) group, in a deterministic order
    train_real_files = _list_jpg_files(train_real_path)
    train_fake_files = _list_jpg_files(train_fake_path)
    test_real_files = _list_jpg_files(test_real_path)
    test_fake_files = _list_jpg_files(test_fake_path)

    print(f"Total data asli: {len(train_real_files)} train real, {len(train_fake_files)} train fake")
    print(f"Total data test: {len(test_real_files)} test real, {len(test_fake_files)} test fake")

    # The budget is split four ways: train/test x real/fake
    samples_per_class = sample_size // 4

    train_real_sampled = random.sample(train_real_files, min(samples_per_class, len(train_real_files)))
    train_fake_sampled = random.sample(train_fake_files, min(samples_per_class, len(train_fake_files)))
    test_real_sampled = random.sample(test_real_files, min(samples_per_class, len(test_real_files)))
    test_fake_sampled = random.sample(test_fake_files, min(samples_per_class, len(test_fake_files)))

    # Combine both classes per split and shuffle files together with labels
    train_files, train_labels = _shuffled_files_and_labels(train_real_sampled, train_fake_sampled)
    test_files, test_labels = _shuffled_files_and_labels(test_real_sampled, test_fake_sampled)

    if not train_files:
        # Fail early with a readable message instead of an obscure error later on
        raise ValueError(
            f"Tidak ada gambar .jpg untuk training di {train_real_path} atau {train_fake_path}"
        )

    # Load, resize and scale the images
    X_train = _load_images(train_files)
    y_train = np.array(train_labels)
    X_test = _load_images(test_files)
    y_test = np.array(test_labels)

    # Carve the validation set out of the training data, keeping class balance
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=val_split, random_state=random_state, stratify=y_train
    )

    # One-hot encode the labels
    y_train = tf.keras.utils.to_categorical(y_train, 2)
    y_val = tf.keras.utils.to_categorical(y_val, 2)
    y_test = tf.keras.utils.to_categorical(y_test, 2)

    print(f"Data yang digunakan: {len(X_train)} training, {len(X_val)} validation, {len(X_test)} testing samples")

    return X_train, y_train, X_val, y_val, X_test, y_test

# Convolutional stack of each configuration, in layer order. An integer is the
# filter count of a 3x3, same-padded, ReLU Conv2D layer; 'pool' is a 2x2
# MaxPooling2D layer.
_CNN_LAYOUTS = {
    1: (32, 'pool'),                        # 1 Conv, 1 MaxPool
    2: (32, 64, 'pool'),                    # 2 Conv, 1 MaxPool
    3: (32, 'pool', 64, 64, 'pool'),        # 3 Conv, 2 MaxPool
    4: (32, 32, 'pool', 64, 64, 'pool'),    # 4 Conv, 2 MaxPool
}


def create_cnn_model(config_num, input_shape=(64, 64, 3), optimizer_name='adam'):
    """
    Build and compile a CNN for the requested configuration.

    Every configuration shares the same head: Flatten, Dense(128, relu),
    Dropout(0.5) and a 2-unit softmax output. The model is compiled with
    categorical cross-entropy loss and the accuracy metric.

    Args:
        config_num: configuration number (1-4) selecting the convolutional stack
        input_shape: shape of one input image
        optimizer_name: optimizer name, case-insensitive
            ('sgd', 'adam', 'adagrad', 'adadelta')

    Returns:
        The compiled Keras model.

    Raises:
        ValueError: if ``config_num`` or ``optimizer_name`` is not supported.
    """
    # Validate both arguments before building anything. An unknown
    # configuration previously fell through every branch and silently
    # produced a model without any convolutional layer.
    if config_num not in _CNN_LAYOUTS:
        raise ValueError(f"Konfigurasi {config_num} tidak tersedia")

    # Factories are lazy so that only the selected optimizer is instantiated
    optimizer_factories = {
        'sgd': lambda: optimizers.SGD(learning_rate=0.01, momentum=0.9),
        'adam': lambda: optimizers.Adam(),
        'adagrad': lambda: optimizers.Adagrad(),
        'adadelta': lambda: optimizers.Adadelta(),
    }
    make_optimizer = optimizer_factories.get(optimizer_name.lower())
    if make_optimizer is None:
        raise ValueError(f"Optimizer {optimizer_name} tidak tersedia")

    model = models.Sequential()

    # Input layer
    model.add(layers.Input(shape=input_shape))

    # Convolutional stack for the chosen configuration
    for spec in _CNN_LAYOUTS[config_num]:
        if spec == 'pool':
            model.add(layers.MaxPooling2D((2, 2)))
        else:
            model.add(layers.Conv2D(spec, (3, 3), activation='relu', padding='same'))

    # Flatten and fully connected layers
    model.add(layers.Flatten())
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(2, activation='softmax'))

    # Compile the model
    model.compile(optimizer=make_optimizer(),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    return model

def k_fold_cross_validation(X, y, config_num, optimizer_name, n_splits=5, epochs=20, batch_size=32):
    """
    Run k-fold cross-validation for one CNN configuration and optimizer.

    A fresh model is built and trained for every fold. Training uses early
    stopping on the fold's validation loss (patience 5, best weights
    restored), and the metrics are then computed on that same validation fold.

    NOTE: because early stopping is tuned on the fold that is also used for
    scoring, the reported numbers are likely somewhat optimistic.

    Args:
        X: image data, indexable by an integer index array
        y: one-hot encoded labels, aligned with ``X``
        config_num: CNN configuration number (1-4)
        optimizer_name: optimizer name
        n_splits: number of folds
        epochs: maximum number of epochs per fold
        batch_size: batch size

    Returns:
        A tuple ``(fold_metrics, avg_metrics, std_metrics)``:
        ``fold_metrics`` maps each metric name ('accuracy', 'precision',
        'recall', 'f1_score', 'loss') to the list of per-fold values, while
        ``avg_metrics`` and ``std_metrics`` map the same names to the mean and
        standard deviation over the folds.
    """
    # Initialise K-Fold with a fixed seed so the folds are repeatable
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Per-fold metric values
    fold_metrics = {
        'accuracy': [],
        'precision': [],
        'recall': [],
        'f1_score': [],
        'loss': []
    }

    for fold_num, (train_idx, val_idx) in enumerate(kf.split(X), start=1):
        print(f"\nTraining fold {fold_num}/{n_splits}")

        # KFold yields integer index arrays that index X and y directly
        X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]

        # Build a fresh model for this fold
        model = create_cnn_model(config_num, input_shape=X.shape[1:], optimizer_name=optimizer_name)

        # Early stopping to limit overfitting
        early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

        model.fit(
            X_train_fold, y_train_fold,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(X_val_fold, y_val_fold),
            callbacks=[early_stop],
            verbose=1
        )

        # Loss and accuracy on the validation fold
        loss, accuracy = model.evaluate(X_val_fold, y_val_fold, verbose=0)

        # Class predictions for the additional metrics
        y_pred_classes = np.argmax(model.predict(X_val_fold, verbose=0), axis=1)
        y_val_classes = np.argmax(y_val_fold, axis=1)

        precision, recall, f1, _ = precision_recall_fscore_support(
            y_val_classes, y_pred_classes, average='weighted')

        fold_metrics['accuracy'].append(accuracy)
        fold_metrics['precision'].append(precision)
        fold_metrics['recall'].append(recall)
        fold_metrics['f1_score'].append(f1)
        fold_metrics['loss'].append(loss)

        # Many models are created in a loop; drop this one and reset Keras'
        # global state so memory use does not keep growing across folds.
        del model
        tf.keras.backend.clear_session()

    # Mean and standard deviation of each metric over all folds
    avg_metrics = {key: np.mean(values) for key, values in fold_metrics.items()}
    std_metrics = {key: np.std(values) for key, values in fold_metrics.items()}

    print("\nK-Fold Cross-Validation Results:")
    print(f"Average Accuracy: {avg_metrics['accuracy']:.4f} ± {std_metrics['accuracy']:.4f}")
    print(f"Average Precision: {avg_metrics['precision']:.4f} ± {std_metrics['precision']:.4f}")
    print(f"Average Recall: {avg_metrics['recall']:.4f} ± {std_metrics['recall']:.4f}")
    print(f"Average F1-Score: {avg_metrics['f1_score']:.4f} ± {std_metrics['f1_score']:.4f}")

    return fold_metrics, avg_metrics, std_metrics

def _metric_mean(cell):
    """Return the mean part of a ``"mean ± std"`` cell, or of a plain number, as a float."""
    if isinstance(cell, str):
        # Without a '±' the split returns the whole string, so both forms work
        return float(cell.split('±')[0].strip())
    return float(cell)


def visualize_results(results_df, results=None):
    """
    Save comparison plots of the experiment results into the 'results' folder.

    Three images are written: a grouped bar chart of accuracy
    (accuracy_comparison.png), an accuracy heatmap (accuracy_heatmap.png) and
    an F1-score heatmap (f1_heatmap.png). All open matplotlib figures are
    closed afterwards.

    Side effect: two numeric columns, 'Accuracy_Value' and 'F1_Value', are
    added to ``results_df`` in place; callers may read them afterwards.

    Args:
        results_df: DataFrame with the columns 'Configuration', 'Optimizer',
            'Accuracy' and 'F1-Score'; the metric cells are either
            ``"mean ± std"`` strings or plain numbers
        results: dictionary with the full result data (optional, currently
            unused)
    """
    # Make sure the output folder exists
    os.makedirs('results', exist_ok=True)

    # Groups in order of first appearance (the order used by the bar chart)
    configs = results_df['Configuration'].unique()
    optimizer_names = results_df['Optimizer'].unique()

    # Numeric accuracy taken from the formatted cells
    results_df['Accuracy_Value'] = [_metric_mean(cell) for cell in results_df['Accuracy']]

    accuracy_pivot = results_df.pivot_table(
        index='Configuration',
        columns='Optimizer',
        values='Accuracy_Value'
    )

    # Align the bars through the pivot table rather than by row position, so a
    # missing or reordered (configuration, optimizer) pair leaves a gap instead
    # of raising a shape error or attaching a bar to the wrong group.
    bar_heights = accuracy_pivot.reindex(index=configs, columns=optimizer_names)

    # Each group spans 0.8 of a unit: 0.2 per bar for four optimizers, and the
    # bars of neighbouring groups never overlap when there are more.
    bar_width = 0.8 / max(len(optimizer_names), 1)
    positions = np.arange(len(configs))

    # Grouped bar chart: accuracy per configuration and optimizer
    plt.figure(figsize=(12, 8))
    for offset, name in enumerate(optimizer_names):
        plt.bar(positions + offset * bar_width, bar_heights[name].to_numpy(), bar_width,
                label=name)

    plt.xlabel('CNN Configuration')
    plt.ylabel('Validation Accuracy')
    plt.title('Performance Comparison of CNN Configurations and Optimizers')
    plt.xticks(positions + bar_width * (len(optimizer_names) - 1) / 2, configs)
    plt.legend()
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.savefig('results/accuracy_comparison.png', dpi=300, bbox_inches='tight')

    # Accuracy heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(accuracy_pivot, annot=True, cmap='viridis', fmt='.4f')
    plt.title('Accuracy Heatmap: Configuration vs Optimizer')
    plt.savefig('results/accuracy_heatmap.png', dpi=300, bbox_inches='tight')

    # Numeric F1-score taken from the formatted cells
    results_df['F1_Value'] = [_metric_mean(cell) for cell in results_df['F1-Score']]

    f1_pivot = results_df.pivot_table(
        index='Configuration',
        columns='Optimizer',
        values='F1_Value'
    )

    # F1-score heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(f1_pivot, annot=True, cmap='plasma', fmt='.4f')
    plt.title('F1-Score Heatmap: Configuration vs Optimizer')
    plt.savefig('results/f1_heatmap.png', dpi=300, bbox_inches='tight')

    plt.close('all')
    print("Visualisasi disimpan di folder 'results/'")

def _plot_confusion_matrix(cm, path):
    """Draw the Real/Fake confusion-matrix heatmap and save it to *path*."""
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Real', 'Fake'],
                yticklabels=['Real', 'Fake'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.savefig(path, dpi=300, bbox_inches='tight')


def _plot_learning_curves(history, path):
    """Draw training/validation accuracy and loss per epoch and save the figure to *path*."""
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.tight_layout()
    plt.savefig(path, dpi=300, bbox_inches='tight')


def run_k_fold_experiments(dataset_dir, sample_size=5000):
    """
    Run the full experiment grid with k-fold cross-validation.

    Every CNN configuration (1-4) is cross-validated with every optimizer
    (SGD, Adam, Adagrad, Adadelta) on the combined training and validation
    data. The combination with the highest mean accuracy is then retrained on
    the training split, validated on the validation split and evaluated on the
    test split. The summary CSV, the best model checkpoint and all plots are
    written to the 'results' folder.

    Args:
        dataset_dir: dataset directory
        sample_size: number of samples to use

    Returns:
        A tuple ``(results, results_df, (best_config, best_optimizer))`` where
        ``results`` maps ``"config_<n>"`` -> optimizer name -> the fold,
        average and standard-deviation metrics, and ``results_df`` holds one
        formatted summary row per combination.
    """
    # Create the output folder up front: to_csv() below does not create it, and
    # failing there would throw away the whole cross-validation run.
    os.makedirs('results', exist_ok=True)

    # Load the dataset
    x_train, y_train, x_val, y_val, x_test, y_test = load_cifake_dataset(
        dataset_dir, sample_size=sample_size)

    # Merge train and validation data for k-fold CV
    X_combined = np.concatenate([x_train, x_val])
    y_combined = np.concatenate([y_train, y_val])

    # Grid to evaluate
    configs = [1, 2, 3, 4]
    optimizer_names = ['SGD', 'Adam', 'Adagrad', 'Adadelta']

    results = {}
    summary_rows = []
    # (mean accuracy, config, optimizer name, summary row) of the best combination
    best = None

    for config in configs:
        results[f"config_{config}"] = {}
        for optimizer_name in optimizer_names:
            print(f"\n\n======= Running Config {config} with {optimizer_name} Optimizer (K-Fold) =======")

            fold_metrics, avg_metrics, std_metrics = k_fold_cross_validation(
                X_combined, y_combined, config, optimizer_name, n_splits=5, epochs=20)

            results[f"config_{config}"][optimizer_name] = {
                'fold_metrics': fold_metrics,
                'avg_metrics': avg_metrics,
                'std_metrics': std_metrics
            }

            row = {
                'Configuration': f"Config {config}",
                'Optimizer': optimizer_name,
                'Accuracy': f"{avg_metrics['accuracy']:.4f} ± {std_metrics['accuracy']:.4f}",
                'Precision': f"{avg_metrics['precision']:.4f} ± {std_metrics['precision']:.4f}",
                'Recall': f"{avg_metrics['recall']:.4f} ± {std_metrics['recall']:.4f}",
                'F1-Score': f"{avg_metrics['f1_score']:.4f} ± {std_metrics['f1_score']:.4f}",
            }
            summary_rows.append(row)

            # Track the winner from the numeric mean; the strict '>' keeps the
            # first combination on ties.
            if best is None or avg_metrics['accuracy'] > best[0]:
                best = (avg_metrics['accuracy'], config, optimizer_name, row)

    results_df = pd.DataFrame(
        summary_rows,
        columns=['Configuration', 'Optimizer', 'Accuracy', 'Precision', 'Recall', 'F1-Score'])

    # Save the summary to CSV
    results_df.to_csv('results/k_fold_experiment_results.csv', index=False)

    # Plot the summary
    visualize_results(results_df, results)

    # Best combination
    _, best_config, best_optimizer, best_row = best

    print(f"\nKonfigurasi terbaik: Config {best_config} dengan {best_optimizer}")
    print(f"Accuracy: {best_row['Accuracy']}")
    print(f"F1-Score: {best_row['F1-Score']}")

    # Train the final model with the best combination
    print("\n\n======= Training Final Model dengan Konfigurasi Terbaik =======")
    final_model = create_cnn_model(best_config, input_shape=x_train.shape[1:], optimizer_name=best_optimizer)

    # NOTE: the checkpoint file keeps the epoch with the best val_accuracy,
    # whereas early stopping restores the weights with the best val_loss into
    # `final_model`. The saved file may therefore differ from the in-memory
    # model that is evaluated below.
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    checkpoint = ModelCheckpoint('results/best_model.h5', save_best_only=True, monitor='val_accuracy')

    history = final_model.fit(
        x_train, y_train,
        epochs=30,
        batch_size=32,
        validation_data=(x_val, y_val),
        callbacks=[early_stop, checkpoint],
        verbose=1
    )

    # Evaluate on the held-out test data
    _test_loss, test_acc = final_model.evaluate(x_test, y_test)
    print(f"\nTest accuracy: {test_acc:.4f}")

    # Class predictions for the remaining metrics and the confusion matrix
    y_pred_classes = np.argmax(final_model.predict(x_test), axis=1)
    y_test_classes = np.argmax(y_test, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test_classes, y_pred_classes, average='weighted')

    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1-Score: {f1:.4f}")

    # The two figures are intentionally left open after saving
    _plot_confusion_matrix(
        confusion_matrix(y_test_classes, y_pred_classes),
        'results/confusion_matrix.png')
    _plot_learning_curves(history, 'results/learning_curves.png')

    return results, results_df, (best_config, best_optimizer)

def main():
    """
    Run the complete experiment and report its wall-clock timing.

    Prints the start time, makes sure the 'results' folder exists, runs the
    k-fold experiments on the dataset in the 'dataset' folder with 5000
    samples, then prints the end time, the total duration and the best
    configuration.

    Returns:
        The ``(results, results_df, best_config)`` tuple produced by
        ``run_k_fold_experiments``; ``best_config`` is itself a
        ``(config number, optimizer name)`` pair.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'

    # Report the start time
    started_at = datetime.now()
    print(f"Eksperimen dimulai pada: {started_at.strftime(timestamp_format)}")

    # Location of the CIFAKE dataset; adjust to where your copy lives
    dataset_dir = "dataset"

    # Folder that receives every output file
    os.makedirs('results', exist_ok=True)

    # A smaller sample_size is useful for a first trial run; 5000 samples are
    # expected to be manageable on the target machine.
    outcome = run_k_fold_experiments(dataset_dir, sample_size=5000)
    results, results_df, best_config = outcome

    # Report the end time and the elapsed duration
    finished_at = datetime.now()
    elapsed = finished_at - started_at
    print(f"\nEksperimen selesai pada: {finished_at.strftime(timestamp_format)}")
    print(f"Total durasi: {elapsed}")
    print(f"Konfigurasi terbaik: Config {best_config[0]} dengan optimizer {best_config[1]}")
    print("\nHasil eksperimen disimpan di folder 'results/'")

    return results, results_df, best_config

# Run the experiment only when this file is executed directly, not when it is imported
if __name__ == "__main__":
    main()