In [1]:
import os
import json
import h5py
import numpy as np
from datetime import datetime


# ============================================================
# Synthetic Dataset Generator & Saver
# ============================================================

class SyntheticDatasetGenerator:
    """Generate small synthetic grayscale image datasets and save them as HDF5.

    Two procedurally generated datasets are supported: "galaxy" images
    (four morphology classes) and "crystal" images (four lattice-like
    stripe/dot patterns). Every image is a square ``image_size`` x
    ``image_size`` float array with values clipped to ``[0, 1]``.

    Constructing an instance creates ``base_dir`` (if missing) and writes a
    ``README.md`` into it.

    Randomness comes from a private ``np.random.RandomState(seed)`` rather
    than NumPy's global RNG, so instances do not disturb (and are not
    disturbed by) other code that uses ``np.random``. ``RandomState`` is the
    legacy generator that backs ``np.random.seed``/``rand``/``randn``, so the
    produced images match what a freshly seeded global RNG would have given.

    Args:
        base_dir: Directory that receives the README and saved datasets.
        image_size: Edge length, in pixels, of each generated image.
        seed: Seed for the instance's random number generator.
    """

    # README body; ``{size}`` is replaced by the configured image size.
    _README_TEMPLATE = """
# Synthetic Scientific Datasets

## Description
This directory contains fully synthetic scientific image datasets.
All data is generated procedurally using mathematical functions.

## Datasets
1. Synthetic Galaxy Morphology Dataset
2. Synthetic Crystal Structures Dataset

## Properties
- Image size: {size}×{size}
- Grayscale images
- Automatic labeling
- No real-world data
- No copyright restrictions

## Intended Use
- Machine learning experiments
- Clustering and representation learning
- Educational and benchmarking purposes
"""

    def __init__(self, base_dir="./synthetic_datasets", image_size=64, seed=42):
        self.base_dir = base_dir
        self.image_size = image_size
        # Instance-local RNG: same stream as np.random.seed(seed) followed by
        # np.random.rand/randn, but without mutating global state.
        self._rng = np.random.RandomState(seed)

        os.makedirs(self.base_dir, exist_ok=True)
        self._create_readme()

    # --------------------------------------------------------
    # README
    # --------------------------------------------------------
    def _create_readme(self):
        """Write ``README.md`` into ``base_dir``, overwriting any existing one."""
        readme = self._README_TEMPLATE.format(size=self.image_size).strip()
        # Explicit UTF-8: the text contains a non-ASCII multiplication sign and
        # must not depend on the platform's default encoding.
        readme_path = os.path.join(self.base_dir, "README.md")
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(readme)

    # --------------------------------------------------------
    # Shared generation loop
    # --------------------------------------------------------
    def _generate(self, classes, simulate, samples_per_class):
        """Build ``samples_per_class`` images for every label in *classes*.

        Images are produced class by class in the iteration order of
        *classes*, which also fixes the order in which random numbers are
        consumed.

        Returns:
            ``(images, labels, classes)`` where ``images`` has shape
            ``(N, image_size, image_size)`` and ``labels`` has shape ``(N,)``.
            With zero samples the arrays are empty but keep those ranks.
        """
        images = []
        labels = []
        for label in classes:
            for _ in range(samples_per_class):
                images.append(simulate(label))
                labels.append(label)

        size = self.image_size
        # The reshape is a no-op for non-empty input; for an empty list it
        # turns the shapeless (0,) array into (0, size, size).
        image_array = np.array(images, dtype=float).reshape(-1, size, size)
        label_array = np.array(labels, dtype=int)
        return image_array, label_array, classes

    # --------------------------------------------------------
    # Galaxy Simulation
    # --------------------------------------------------------
    def generate_galaxies(self, samples_per_class=125):
        """Generate the synthetic galaxy dataset.

        Args:
            samples_per_class: Number of images to create for each class.

        Returns:
            ``(images, labels, classes)``: the image stack, the integer label
            of each image, and a dict mapping label -> class name.
        """
        classes = {
            0: "Elliptical",
            1: "Spiral",
            2: "Lenticular",
            3: "Irregular",
        }
        return self._generate(classes, self._simulate_galaxy, samples_per_class)

    def _simulate_galaxy(self, galaxy_type):
        """Return one noisy galaxy image for the given class label.

        Labels 0, 1 and 2 select the elliptical, spiral and lenticular
        profiles; any other value produces the "irregular" (uniform noise)
        image. Gaussian noise (sigma 0.05) is added and the result is
        clipped to ``[0, 1]``.
        """
        size = self.image_size
        axis = np.linspace(-1, 1, size)
        x, y = np.meshgrid(axis, axis)
        r = np.sqrt(x**2 + y**2)  # distance from the image centre

        if galaxy_type == 0:  # Elliptical: smooth Gaussian blob
            img = np.exp(-4 * r**2)

        elif galaxy_type == 1:  # Spiral: Gaussian modulated by angular arms
            theta = np.arctan2(y, x)
            img = np.exp(-3 * r**2) * (1 + 0.5 * np.sin(6 * theta + 10 * r))

        elif galaxy_type == 2:  # Lenticular: tight core, dimmed outskirts
            img = np.exp(-6 * r**2)
            img[r > 0.6] *= 0.2

        else:  # Irregular: uniform random field
            img = self._rng.rand(size, size)

        img += 0.05 * self._rng.randn(size, size)
        return np.clip(img, 0, 1)

    # --------------------------------------------------------
    # Crystal Simulation
    # --------------------------------------------------------
    def generate_crystals(self, samples_per_class=100):
        """Generate the synthetic crystal dataset.

        Args:
            samples_per_class: Number of images to create for each class.

        Returns:
            ``(images, labels, classes)``: the image stack, the integer label
            of each image, and a dict mapping label -> class name.
        """
        classes = {
            0: "Cubic",
            1: "Hexagonal",
            2: "Tetragonal",
            3: "Orthorhombic",
        }
        return self._generate(classes, self._simulate_crystal, samples_per_class)

    def _simulate_crystal(self, crystal_type):
        """Return one noisy crystal-pattern image for the given class label.

        Labels 0, 1 and 2 select the grid, dot-lattice and horizontal-line
        patterns; any other value produces vertical stripes. Gaussian noise
        (sigma 0.05) is added and the result is clipped to ``[0, 1]``.
        """
        size = self.image_size
        img = np.zeros((size, size))

        if crystal_type == 0:  # Cubic: 2-px grid lines every 8 px
            for i in range(0, size, 8):
                img[i:i + 2, :] = 1
                img[:, i:i + 2] = 1

        elif crystal_type == 1:  # Hexagonal: 2x2 dots on a 10-px lattice
            for i in range(0, size, 10):
                for j in range(0, size, 10):
                    img[i:i + 2, j:j + 2] = 1

        elif crystal_type == 2:  # Tetragonal: 1-px horizontal lines every 6 px
            for i in range(0, size, 6):
                img[i:i + 1, :] = 1

        else:  # Orthorhombic: 2-px vertical stripes every 12 px
            for i in range(0, size, 12):
                img[:, i:i + 2] = 1

        img += 0.05 * self._rng.randn(size, size)
        return np.clip(img, 0, 1)

    # --------------------------------------------------------
    # Saving
    # --------------------------------------------------------
    @staticmethod
    def _metadata_path(h5_path):
        """Return the sidecar JSON path that belongs to *h5_path*.

        Only the final extension is replaced, so a ".h5" substring elsewhere
        in the path (directory or dataset name) is left untouched.
        """
        root, _ext = os.path.splitext(h5_path)
        return f"{root}_metadata.json"

    def save_dataset(self, name, images, labels, class_names):
        """Save a dataset to ``<base_dir>/<name>_<timestamp>.h5`` plus a JSON sidecar.

        The HDF5 file holds gzip-compressed ``images`` and ``labels``
        datasets; each metadata entry is stored as a JSON-encoded file
        attribute. The same metadata is written to
        ``<name>_<timestamp>_metadata.json`` next to it.

        The timestamp has one-second resolution and the file is opened in
        write mode, so saving the same ``name`` twice within one second
        overwrites the earlier file.

        Args:
            name: Dataset name used in the file names and metadata.
            images: Array of images; the first axis indexes samples.
            labels: Array of labels, one per image.
            class_names: Mapping of label -> human-readable class name.

        Returns:
            Path of the written HDF5 file.

        Raises:
            ValueError: If ``images`` and ``labels`` differ in length.
        """
        images = np.asarray(images)
        labels = np.asarray(labels)
        if len(images) != len(labels):
            raise ValueError(
                f"images and labels must have the same length "
                f"(got {len(images)} and {len(labels)})"
            )

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{name}_{timestamp}.h5"
        path = os.path.join(self.base_dir, filename)

        metadata = {
            "dataset": name,
            "created": timestamp,
            "num_samples": len(images),
            # JSON has no tuple type; store the shape as a list explicitly.
            "image_shape": list(images.shape[1:]),
            "classes": class_names,
            "synthetic": True,
        }

        with h5py.File(path, "w") as f:
            f.create_dataset("images", data=images, compression="gzip")
            f.create_dataset("labels", data=labels, compression="gzip")

            # Attributes are JSON-encoded so nested values (shape, class
            # mapping) round-trip through json.loads on the reading side.
            for k, v in metadata.items():
                f.attrs[k] = json.dumps(v)

        meta_path = self._metadata_path(path)
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)

        print(f"✓ Saved {name}")
        print(f"  → {path}")
        print(f"  → {meta_path}")

        return path


# ============================================================
# Main entry point
# ============================================================

if __name__ == "__main__":
    # One generator instance drives both datasets; the galaxy set is built
    # first, then the crystal set, each saved right after it is generated.
    generator = SyntheticDatasetGenerator()

    # --- Galaxy morphology ---
    print("\nGenerating Galaxy Dataset...")
    g_images, g_labels, g_classes = generator.generate_galaxies()
    generator.save_dataset(
        name="galaxy_morphology_synthetic",
        images=g_images,
        labels=g_labels,
        class_names=g_classes,
    )

    # --- Crystal structures ---
    print("\nGenerating Crystal Dataset...")
    c_images, c_labels, c_classes = generator.generate_crystals()
    generator.save_dataset(
        name="crystal_structures_synthetic",
        images=c_images,
        labels=c_labels,
        class_names=c_classes,
    )

    print("\n✓ All synthetic datasets generated and saved locally.")



Generating Galaxy Dataset...
✓ Saved galaxy_morphology_synthetic
  → ./synthetic_datasets/galaxy_morphology_synthetic_20260127_102632.h5
  → ./synthetic_datasets/galaxy_morphology_synthetic_20260127_102632_metadata.json

Generating Crystal Dataset...
✓ Saved crystal_structures_synthetic
  → ./synthetic_datasets/crystal_structures_synthetic_20260127_102632.h5
  → ./synthetic_datasets/crystal_structures_synthetic_20260127_102632_metadata.json

✓ All synthetic datasets generated and saved locally.
