# Exploratory Data Analysis - Data Leakage Check

Notebook ini menyusun proses Exploratory Data Analysis (EDA) sederhana untuk memeriksa kemungkinan kebocoran data antara set latih dan set uji pada dataset citra suku bangsa.

## 1. Import dan Konfigurasi Awal

Bagian ini menyiapkan seluruh dependensi yang diperlukan beserta path direktori data.

In [None]:
import os
from collections import Counter
from pathlib import Path

from PIL import Image
import imagehash
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

%matplotlib inline
plt.style.use("seaborn-v0_8")

In [None]:
# Change these paths if your folder layout is different.
TRAIN_DIR = Path("./Train/Train")
TEST_DIR = Path("./Test/Test")

# Bare file names (no directory part) of training images to leave out of the
# hash index; calculate_hashes() compares each file's name against this set.
# "minangkabau_train_000017.jpg" and "minangkabau_train_000021.jpg" are each
# listed twice below, which is harmless because this is a set literal.
# NOTE(review): "balinese_train_00061.jpg" has a five-digit index while every
# other entry has six digits - possibly a typo; verify against the dataset.
FILES_TO_EXCLUDE = {
    "balinese_train_000771.jpg", "dayak_train_000052.jpg", "balinese_train_000767.jpg",
    "balinese_train_000762.jpg", "balinese_train_000760.jpg", "minangkabau_train_000015.jpg",
    "javanese_train_000012.jpg", "balinese_train_000768.jpg", "minangkabau_train_000459.jpg",
    "balinese_train_000764.jpg", "batak_train_000067.jpg", "javanese_train_000248.jpg",
    "balinese_train_000776.jpg", "javanese_train_000246.jpg", "javanese_train_000213.jpg",
    "balinese_train_000772.jpg", "minangkabau_train_000012.jpg", "dayak_train_000004.jpg",
    "balinese_train_000766.jpg", "minangkabau_train_000017.jpg", "balinese_train_000773.jpg",
    "balinese_train_000765.jpg", "minangkabau_train_000416.jpg", "balinese_train_000769.jpg",
    "minangkabau_train_000021.jpg", "batak_train_000092.jpg", "batak_train_000010.jpg",
    "balinese_train_00061.jpg", "minangkabau_train_000411.jpg", "batak_train_000056.jpg",
    "minangkabau_train_000170.jpg", "balinese_train_000774.jpg", "balinese_train_000770.jpg",
    "javanese_train_000249.jpg", "batak_train_000086.jpg", "batak_train_000095.jpg",
    "batak_train_000071.jpg", "batak_train_000002.jpg", "batak_train_000040.jpg",
    "batak_train_000059.jpg", "dayak_train_000019.jpg", "javanese_train_000048.jpg",
    "javanese_train_000050.jpg", "javanese_train_000032.jpg", "batak_train_000007.jpg",
    "batak_train_000093.jpg", "javanese_train_000202.jpg", "minangkabau_train_000005.jpg",
    "minangkabau_train_000008.jpg", "balinese_train_000761.jpg", "minangkabau_train_000021.jpg",
    "minangkabau_train_000017.jpg", "minangkabau_train_000079.jpg", "minangkabau_train_000200.jpg",
    "minangkabau_train_000180.jpg", "javanese_train_000245.jpg", "balinese_train_000721.jpg", "batak_train_000029.jpg",
    "batak_train_000072.jpg", "batak_train_000074.jpg", "batak_train_000083.jpg", "batak_train_000084.jpg", "batak_train_000088.jpg",
    "dayak_train_000030.jpg", "dayak_train_000059.jpg", "javanese_train_000062.jpg", "javanese_train_000129.jpg", "javanese_train_000151.jpg",
    "javanese_train_000154.jpg", "javanese_train_000219.jpg", "javanese_train_000231.jpg",
    "minangkabau_train_000061.jpg", "minangkabau_train_000218.jpg", "minangkabau_train_000245.jpg",
    "minangkabau_train_000250.jpg", "minangkabau_train_000289.jpg", "minangkabau_train_000302.jpg",
    "minangkabau_train_000346.jpg", "minangkabau_train_000374.jpg", "minangkabau_train_000533.jpg",
    "minangkabau_train_000543.jpg"
}

## 2. Fungsi Helper

Kumpulan fungsi berikut menangani proses hashing gambar dan pengecekan kebocoran data. Fungsi tambahan untuk visualisasi juga disertakan.

In [None]:
def calculate_hashes(directory: Path, files_to_exclude, description="Calculating Hashes"):
    """Build a hash -> file path index for the images below ``directory``.

    ``directory`` is scanned one level deep: every sub-folder is visited and
    each entry directly inside it whose suffix is ``.png``, ``.jpg`` or
    ``.jpeg`` (case-insensitive) is hashed with ``imagehash.phash``.

    Args:
        directory: Root folder containing one sub-folder per label.
        files_to_exclude: Container of bare file names to skip.
        description: Text shown next to the progress bar.

    Returns:
        A dict mapping each hash to the path (as ``str``) of a file that
        produced it. When several files share a hash, the one that sorts
        last wins. An empty dict is returned when ``directory`` is not an
        existing directory.
    """
    image_hashes = {}
    # is_dir() (rather than exists()) also rejects a path that points at a
    # regular file, which would otherwise make iterdir() raise.
    if not directory.is_dir():
        print(f"Error: Direktori tidak ditemukan -> {directory}")
        return {}

    # Sort folders and files so the result does not depend on the arbitrary
    # order in which the file system lists directory entries.
    all_files = []
    for label_folder in sorted(directory.iterdir()):
        if not label_folder.is_dir():
            continue
        for filename in sorted(label_folder.iterdir()):
            if filename.suffix.lower() in {'.png', '.jpg', '.jpeg'}:
                all_files.append(filename)

    for filepath in tqdm(all_files, desc=description):
        if filepath.name in files_to_exclude:
            continue
        try:
            with Image.open(filepath) as img:
                h = imagehash.phash(img)
                image_hashes[h] = str(filepath)
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Tidak bisa memproses file {filepath}: {e}")
    return image_hashes


def check_for_leaks(train_hashes, test_directory: Path):
    """Find test images whose hash also appears in ``train_hashes``.

    Only entries directly inside ``test_directory`` with a ``.png``,
    ``.jpg`` or ``.jpeg`` suffix (case-insensitive) are examined; each is
    hashed with ``imagehash.phash`` and looked up by exact hash equality.

    Args:
        train_hashes: Mapping of hash -> train file path, as produced by
            ``calculate_hashes``.
        test_directory: Flat folder containing the test images.

    Returns:
        A list of ``(train_path, test_path)`` string tuples, one per test
        image with a matching hash, ordered by test file path. An empty list
        is returned when ``test_directory`` is not an existing directory.
    """
    leaked_files = []
    # is_dir() (rather than exists()) also rejects a path that points at a
    # regular file, which would otherwise make iterdir() raise.
    if not test_directory.is_dir():
        print(f"Error: Direktori tidak ditemukan -> {test_directory}")
        return []

    # Sorted so the report order is reproducible between runs.
    test_files_path = sorted(
        p for p in test_directory.iterdir()
        if p.suffix.lower() in {'.png', '.jpg', '.jpeg'}
    )

    for test_filepath in tqdm(test_files_path, desc="Checking for Leaks in Test Set"):
        try:
            with Image.open(test_filepath) as img:
                test_hash = imagehash.phash(img)
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Tidak bisa memproses file {test_filepath}: {e}")
            continue
        if test_hash in train_hashes:
            leaked_files.append((train_hashes[test_hash], str(test_filepath)))
    return leaked_files


def visualize_leaks(leaked_files):
    """Show each leaked train/test image pair side by side.

    Args:
        leaked_files: Iterable of ``(train_path, test_path)`` pairs, as
            returned by ``check_for_leaks``. Nothing happens when it is
            empty or ``None``.

    Returns:
        None. One matplotlib figure is shown per pair; a pair that cannot be
        opened or drawn is reported on stdout and skipped.
    """
    if not leaked_files:
        return

    print("\nMenampilkan visualisasi gambar yang bocor...")
    print("(Tutup window gambar untuk melanjutkan ke gambar berikutnya)")

    for train_filepath, test_filepath in leaked_files:
        fig = None
        try:
            # Context managers make sure both image files are closed again,
            # even when drawing fails part-way through.
            with Image.open(train_filepath) as train_img, Image.open(test_filepath) as test_img:
                fig, ax = plt.subplots(1, 2, figsize=(12, 6))

                ax[0].imshow(train_img)
                ax[0].set_title(f"Train Image\n{Path(train_filepath).name}", fontsize=10)
                ax[0].axis('off')

                ax[1].imshow(test_img)
                ax[1].set_title(f"Test Image\n{Path(test_filepath).name}", fontsize=10)
                ax[1].axis('off')

                fig.suptitle("POTENSI KEBOCORAN DATA TERDETEKSI", fontsize=16, weight='bold')
                # Leave the top 4% of the canvas free for the suptitle.
                plt.tight_layout(rect=[0, 0, 1, 0.96])
                plt.show()

        except Exception as e:
            print(f"Gagal menampilkan gambar: {e}")
        finally:
            # Release the figure so failed or already shown figures do not
            # pile up in memory over many leaked pairs.
            if fig is not None:
                plt.close(fig)

## 3. Distribusi Jumlah Citra per Label (Train Set)

Langkah ini membantu memahami keseimbangan kelas pada data latih dengan menampilkan jumlah citra di setiap label.

In [None]:
def count_images_per_label(directory: Path):
    """Count the image files in each label sub-folder of ``directory``.

    An entry counts as an image when its suffix is ``.png``, ``.jpg`` or
    ``.jpeg`` (case-insensitive). Only entries directly inside a sub-folder
    are counted; entries directly inside ``directory`` are ignored.

    Args:
        directory: Root folder containing one sub-folder per label.

    Returns:
        A dict mapping sub-folder name -> number of images, with keys
        inserted in sorted path order. An empty dict is returned when
        ``directory`` is not an existing directory.
    """
    counts = {}
    # is_dir() (rather than exists()) also rejects a path that points at a
    # regular file, which would otherwise make iterdir() raise.
    if not directory.is_dir():
        print(f"Error: Direktori tidak ditemukan -> {directory}")
        return counts

    for label_folder in sorted(directory.iterdir()):
        if label_folder.is_dir():
            counts[label_folder.name] = sum(
                1 for file in label_folder.iterdir()
                if file.suffix.lower() in {'.png', '.jpg', '.jpeg'}
            )
    return counts

# Count the images in each label folder of TRAIN_DIR. The bare name on the
# last line is there so the notebook cell shows the resulting dict as output.
train_label_counts = count_images_per_label(TRAIN_DIR)
train_label_counts

In [None]:
def plot_label_counts(counts, title):
    """Draw a bar chart of image counts per label.

    Args:
        counts: Mapping of label name -> number of images. When it is empty
            a notice is printed and no figure is created.
        title: Title placed above the chart.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    if not counts:
        print("Tidak ada data untuk divisualisasikan.")
        return

    names = [name for name in counts]
    totals = [counts[name] for name in names]

    plt.figure(figsize=(10, 5))
    rects = plt.bar(names, totals, color="#1f77b4")
    plt.title(title)
    plt.xlabel("Label")
    plt.ylabel("Jumlah Citra")
    plt.xticks(rotation=45, ha='right')

    # Write each count just above the centre of its bar.
    for rect, total in zip(rects, totals):
        x_center = rect.get_x() + rect.get_width() / 2
        plt.text(x_center, rect.get_height(), str(total), ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

# Plot the per-label image counts stored in train_label_counts.
plot_label_counts(train_label_counts, "Distribusi Jumlah Citra per Label (Train Set)")

## 4. Pengecekan Kebocoran Data

Eksekusi bagian ini untuk menghitung hash gambar pada data latih, kemudian memeriksa apakah ada gambar di data uji yang identik dengan data latih.

In [None]:
# Step 1: hash every training image except the names in FILES_TO_EXCLUDE.
print(f"Mulai menghitung hash untuk direktori TRAIN: {TRAIN_DIR}")
train_image_hashes = calculate_hashes(TRAIN_DIR, FILES_TO_EXCLUDE, "Hashing Train Images")

if train_image_hashes:
    print(f"Selesai. Ditemukan {len(train_image_hashes)} hash unik dari gambar latih.\n")

    # Step 2: hash the test images and look each one up in the train index.
    print(f"Mulai memeriksa kebocoran data pada direktori TEST: {TEST_DIR}")
    found_leaks = check_for_leaks(train_image_hashes, TEST_DIR)

    # Step 3: textual report.
    print("\n" + "=" * 50)
    print("=== HASIL PENGECEKAN KEBOCORAN DATA ===")
    print("=" * 50)

    if not found_leaks:
        print("✅ Tidak ditemukan adanya kebocoran data antara set Train dan Test.")
    else:
        print(f"🔥 PERINGATAN: Ditemukan {len(found_leaks)} potensi kebocoran data!")
        for train_file, test_file in found_leaks:
            print(f"\n -> File Test: {Path(test_file).name}")
            print(f"    Sama dengan File Train: {train_file}")
    print("=" * 50)

    # Step 4: show the matching pairs side by side, after the report.
    if found_leaks:
        visualize_leaks(found_leaks)
else:
    # Without any train hashes there is nothing to compare against; say so
    # instead of finishing silently.
    print("Tidak ada hash gambar latih yang berhasil dihitung; pengecekan kebocoran dilewati.")