# SpaceNet Dataset Analysis

Source: [SpaceNet - Kaggle](https://www.kaggle.com/datasets/razaimam45/spacenet-an-optimally-distributed-astronomy-data)

Checking out the data distribution and sample images to understand what we're working with.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob
from PIL import Image

# Usual Kaggle input location for this dataset; adjust when running elsewhere.
DATA_DIR = '../input/spacenet-an-optimally-distributed-astronomy-data'

# Warn (without raising) when nothing exists at the configured path.
dataset_missing = not os.path.exists(DATA_DIR)
if dataset_missing:
    print("Dataset not found at", DATA_DIR)
    print("Please check the path.")

### Class Distribution

In [None]:
# Every immediate subdirectory of DATA_DIR is treated as one class.
classes = []
for entry in os.listdir(DATA_DIR):
    if os.path.isdir(os.path.join(DATA_DIR, entry)):
        classes.append(entry)
print("Found classes:", classes)

# Number of directory entries in each class folder (no file-type filtering).
class_counts = {
    name: len(os.listdir(os.path.join(DATA_DIR, name)))
    for name in classes
}

# One-column table of the counts, largest class first.
df = pd.DataFrame(
    data=class_counts.values(),
    index=class_counts.keys(),
    columns=['count'],
)
df = df.sort_values(by='count', ascending=False)

print(df)

# Bar chart of per-class counts; tick labels are tilted so names don't overlap.
plt.figure(figsize=(10, 5))
plt.bar(df.index, df['count'])
plt.xticks(rotation=45)
plt.title('Distribution of Classes')
plt.show()

### Sample Images

In [None]:
def show_samples(num_samples=3):
    """Plot a grid of randomly chosen images, one row per class.

    Reads the module-level ``classes`` list and ``DATA_DIR`` path. Each row
    shows up to ``num_samples`` entries drawn without replacement from that
    class's folder. A folder with fewer entries shows all of them, and an
    empty folder leaves its row blank.

    Args:
        num_samples: Number of grid columns, i.e. the maximum number of
            images displayed per class.
    """
    plt.figure(figsize=(15, 15))

    for row, c in enumerate(classes):
        class_path = os.path.join(DATA_DIR, c)
        images = os.listdir(class_path)

        # Nothing to draw for an empty class folder.
        if not images:
            continue

        # Sampling without replacement raises ValueError when more items are
        # requested than exist, so cap the request at the folder size.
        n_pick = min(num_samples, len(images))
        choices = np.random.choice(images, n_pick, replace=False)

        for col, img_name in enumerate(choices):
            img_path = os.path.join(class_path, img_name)

            # The grid keeps num_samples columns even when a row is short.
            plt.subplot(len(classes), num_samples, row * num_samples + col + 1)
            # Close the file handle as soon as the image has been drawn, so
            # large grids don't accumulate open files.
            with Image.open(img_path) as img:
                plt.imshow(img)
            plt.title(c)
            plt.axis('off')

    plt.tight_layout()
    plt.show()

# Draw the sample grid with the default number of images per class.
show_samples()

### Image Sizes

Checking whether all images have the same dimensions or whether we need to resize them.

In [None]:
# Inspect a sample of image sizes: the first MAX_PER_DIR image files (in
# os.walk order, not random) from every directory under DATA_DIR.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')
MAX_PER_DIR = 100

sizes = []
unreadable = []  # (path, exception) for every file that could not be opened
for root, _dirs, files in os.walk(DATA_DIR):
    # Filter by extension before slicing so non-image files don't use up
    # the per-directory sample budget.
    image_files = [name for name in files if name.lower().endswith(IMAGE_EXTENSIONS)]
    for f in image_files[:MAX_PER_DIR]:
        p = os.path.join(root, f)
        try:
            with Image.open(p) as img:
                sizes.append(img.size)
        except Exception as exc:
            # Best effort: keep scanning past a bad file, but remember it.
            # Unlike a bare `except`, this lets KeyboardInterrupt through.
            unreadable.append((p, exc))

unique_sizes = set(sizes)
print("Unique sizes in sample:", unique_sizes)
if unreadable:
    print("Skipped", len(unreadable), "unreadable image(s); first:", unreadable[0][0])