In [None]:
# ============================================================
# 📌 Image Dataset EDA (Jupyter + VS Code compatible version)
# ============================================================

import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from collections import Counter
import hashlib
import seaborn as sns

# Display plots inline in the Jupyter Notebook and apply seaborn's "whitegrid" style
%matplotlib inline
sns.set(style="whitegrid")

# ========================
# 1. Set the dataset path
# ========================
DATASET_DIR = "path/to/dataset"  # <-- set your dataset path here

# ========================
# 2. Containers for the collected data
# ========================
# Three independent lists: per-image records, paths flagged as duplicates,
# and paths that could not be read.
image_info, duplicate_files, corrupt_files = [], [], []
# Content hashes of every distinct image seen so far.
hashes = set()

# ========================
# 3. Scan the dataset
# ========================
# Walks DATASET_DIR recursively and, for every .png/.jpg/.jpeg file:
#   * records it in corrupt_files if it cannot be verified or decoded;
#   * records it in duplicate_files if an identical image was already seen;
#   * appends its size and per-channel colour statistics to image_info.

# os.walk() silently yields nothing for a missing directory, so fail fast
# with a clear message instead of scanning nothing.
if not os.path.isdir(DATASET_DIR):
    raise FileNotFoundError(f"Dataset directory not found: {DATASET_DIR!r}")

for root, dirs, files in os.walk(DATASET_DIR):
    # Sort in place so the traversal order -- and therefore which copy of a
    # duplicated image is treated as the original -- is reproducible.
    dirs.sort()

    # The class label is assumed to be the name of the containing folder.
    label = os.path.basename(root)

    for file in sorted(files):
        if not file.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        file_path = os.path.join(root, file)
        try:
            # Integrity check. An image object cannot be used after
            # verify(), so the file is reopened below for decoding.
            with Image.open(file_path) as probe:
                probe.verify()

            # convert() returns a fully loaded copy, so the underlying file
            # handle can be released immediately.
            with Image.open(file_path) as raw:
                img = raw.convert("RGB")

            width, height = img.size
            img_np = np.array(img)

            # Duplicate check by content hash. The dimensions are mixed in
            # because raw pixel bytes alone are ambiguous: e.g. a solid
            # 10x20 image and a solid 20x10 image have identical bytes.
            hasher = hashlib.md5(f"{width}x{height}:".encode("ascii"))
            hasher.update(img.tobytes())
            file_hash = hasher.hexdigest()
            if file_hash in hashes:
                duplicate_files.append(file_path)
            else:
                hashes.add(file_hash)

            # Per-channel colour statistics over all pixels (R, G, B order).
            mean_colors = np.mean(img_np, axis=(0, 1))
            std_colors = np.std(img_np, axis=(0, 1))

            image_info.append({
                "path": file_path,
                "label": label,
                "width": width,
                "height": height,
                "mean_R": mean_colors[0],
                "mean_G": mean_colors[1],
                "mean_B": mean_colors[2],
                "std_R": std_colors[0],
                "std_G": std_colors[1],
                "std_B": std_colors[2],
            })

        except Exception:
            # Deliberately broad: this is a best-effort scan, and any file
            # that fails to open, verify, or decode is reported as corrupt
            # rather than aborting the whole run.
            corrupt_files.append(file_path)

# ========================
# 4. Build the DataFrame
# ========================
# The columns are listed explicitly so that an empty scan still produces a
# frame with the expected schema; without them, df['label'] below raises
# KeyError when no images were collected. The order matches the keys of the
# records in image_info.
df = pd.DataFrame(
    image_info,
    columns=[
        "path", "label", "width", "height",
        "mean_R", "mean_G", "mean_B",
        "std_R", "std_G", "std_B",
    ],
)

print("\n--- Dataset Summary ---")
print(f"Total images: {len(df)}")
print(f"Unique classes: {df['label'].nunique()}")
print("\nClass distribution:\n", df['label'].value_counts())

print(f"\nCorrupt files: {len(corrupt_files)}")
print(f"Duplicate files: {len(duplicate_files)}")

# ========================
# 5. Class distribution plot
# ========================
# Bars follow the order returned by value_counts() on the label column.
label_order = df['label'].value_counts().index
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x="label", order=label_order, ax=ax)
ax.set_title("Class Distribution")
plt.xticks(rotation=45)
plt.show()

# ========================
# 6. Image size distribution
# ========================
# One point per image: width on x, height on y, coloured by class label.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x="width", y="height", hue="label", ax=ax)
ax.set_title("Image Resolution Distribution")
plt.show()

# ========================
# 7. Color channel mean distribution
# ========================
# Overlaid histograms (with KDE curves) of the per-image channel means.
channel_mean_cols = ['mean_R', 'mean_G', 'mean_B']
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df[channel_mean_cols], kde=True, ax=ax)
ax.set_title("Color Mean Distribution")
plt.show()

# ========================
# 8. Example images preview
# ========================
# Show up to 9 randomly chosen images (fixed seed) in a 3x3 grid, each
# titled with its class label.
sample_df = df.sample(min(9, len(df)), random_state=42)  # 9 images or fewer
plt.figure(figsize=(8, 8))
for i, row in enumerate(sample_df.itertuples(), 1):
    plt.subplot(3, 3, i)
    # Use a context manager so the file handle is released once the image
    # has been handed to imshow(), instead of leaking one per preview.
    with Image.open(row.path) as img:
        plt.imshow(img)
    plt.axis("off")
    plt.title(row.label)
plt.show()

# ========================
# 9. Save to CSV
# ========================
# Write each list of flagged paths to its own CSV file, without an index column.
for csv_name, flagged_paths in (
    ("duplicate_images.csv", duplicate_files),
    ("corrupt_images.csv", corrupt_files),
):
    pd.Series(flagged_paths).to_csv(csv_name, index=False)

print("\n‚úÖ EDA Completed! Duplicate & Corrupt files saved as CSV.")


ModuleNotFoundError: No module named 'cv2'