In [1]:

import h5py
import json
import numpy as np
import os
from datetime import datetime

class DatasetSaver:
    """Class to save synthetic datasets and documentation"""
   
    def __init__(self, base_dir="./synthetic_datasets"):
        self.base_dir = base_dir
        os.makedirs(base_dir, exist_ok=True)
       
        # Create README with methodology documentation
        self._create_readme()
   
    def _create_readme(self):
        """Create comprehensive README documenting data creation methodology"""
        readme_content = """
# Synthetic Scientific Datasets - Methodology Documentation

## Overview
This repository contains two synthetically generated scientific datasets:
1. **Synthetically Simulated Galaxy Morphology Dataset**
2. **Synthetically Simulated Crystal Structures Dataset**

## Data Collection & Generation Methodology

### 1. Data Creation Process
Both datasets were **synthetically generated** using algorithmic simulation methods, NOT collected from external sources.

#### Galaxy Morphology Dataset:
- **Generation Method**: Procedural generation using mathematical functions
- **Galaxy Types Simulated**:
  - Elliptical: Smooth Gaussian distributions
  - Spiral: Sinusoidal spiral arm patterns
  - Lenticular: Disk-like structures with sharp cutoffs
  - Irregular: Random clump distributions
- **Parameters**: 64×64 grayscale images, 500 samples

#### Crystal Structures Dataset:
- **Generation Method**: Lattice-based procedural generation
- **Crystal Systems Simulated**:
  - Cubic: Square grid patterns
  - Hexagonal: Hexagonal close packing
  - Tetragonal: Rectangular lattices
  - Orthorhombic: Varied spacing lattices
- **Parameters**: 64×64 grayscale images, 400 samples

### 2. Annotation Process
- **Who Annotated**: Automatic algorithmic labeling
- **Quality Control**: Each generated sample is automatically labeled based on its generation parameters
- **Annotation Method**: Class labels assigned during procedural generation
- **Validation**: Visual inspection of representative samples

### 3. Copyright & Licensing
- **Status**: No copyright restrictions
- **Reason**: All data is synthetically generated (no real observational data)
- **License**: MIT License (open for academic and research use)
- **Permissions Required**: None (completely synthetic data)

### 4. Data Quality
- **Consistency**: High (generated with controlled parameters)
- **Noise Level**: Controlled Gaussian noise added for realism
- **Class Balance**: Approximately balanced across categories
- **Resolution**: 64×64 pixels, grayscale

### 5. Intended Use
- Research in metric learning and dimensionality reduction
- Testing clustering algorithms on scientific image data
- Educational purposes in machine learning for science
- Benchmarking tensor decomposition methods

### 6. Dataset Statistics
#### Galaxy Morphology:
- Total samples: 500
- Classes: 4 (Elliptical, Spiral, Lenticular, Irregular)
- Image size: 64×64 pixels
- Format: Grayscale (normalized)

#### Crystal Structures:
- Total samples: 400
- Classes: 4 (Cubic, Hexagonal, Tetragonal, Orthorhombic)
- Image size: 64×64 pixels
- Format: Grayscale (normalized)

### 7. Contact
For questions about dataset generation methodology:
- Methodology described in this document
- Code available in the associated repository
- Synthetic nature ensures no copyright issues

---

**Note**: These are simulated datasets for research purposes. They do not contain real astronomical or crystallographic data.
"""
       
        with open(os.path.join(self.base_dir, "README.md"), "w") as f:
            f.write(readme_content)
   
    def save_galaxy_dataset(self, images, labels, class_names):
        """Save the synthetic galaxy morphology dataset"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"galaxy_morphology_synthetic_{timestamp}.h5"
        filepath = os.path.join(self.base_dir, filename)
       
        # Save to HDF5
        with h5py.File(filepath, 'w') as f:
            # Save images
            f.create_dataset('images', data=images, compression='gzip')
            f.create_dataset('labels', data=labels, compression='gzip')
           
            # Save metadata
            metadata = {
                'dataset_name': 'Synthetic Galaxy Morphology',
                'creation_date': timestamp,
                'num_samples': len(images),
                'image_shape': images[0].shape,
                'num_classes': len(np.unique(labels)),
                'class_names': {str(k): v for k, v in class_names.items()},
                'generation_method': 'Procedural simulation with mathematical functions',
                'license': 'MIT',
                'copyright': 'None - Synthetic data',
                'contact': 'See README.md for methodology details'
            }
           
            # Add metadata as attributes
            for key, value in metadata.items():
                if isinstance(value, dict):
                    f.attrs[key] = json.dumps(value)
                else:
                    f.attrs[key] = str(value)
       
        # Create separate metadata file
        metadata_file = f"galaxy_metadata_{timestamp}.json"
        with open(os.path.join(self.base_dir, metadata_file), 'w') as f:
            json.dump(metadata, f, indent=2)
       
        print(f"✓ Galaxy dataset saved to: {filepath}")
        print(f"✓ Metadata saved to: {metadata_file}")
       
        return filepath, metadata_file
   
    def save_crystal_dataset(self, images, labels, class_names):
        """Save the synthetic crystal structures dataset"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"crystal_structures_synthetic_{timestamp}.h5"
        filepath = os.path.join(self.base_dir, filename)
       
        # Save to HDF5
        with h5py.File(filepath, 'w') as f:
            # Save images
            f.create_dataset('images', data=images, compression='gzip')
            f.create_dataset('labels', data=labels, compression='gzip')
           
            # Save metadata
            metadata = {
                'dataset_name': 'Synthetic Crystal Structures',
                'creation_date': timestamp,
                'num_samples': len(images),
                'image_shape': images[0].shape,
                'num_classes': len(np.unique(labels)),
                'class_names': {str(k): v for k, v in class_names.items()},
                'generation_method': 'Lattice-based procedural generation',
                'license': 'MIT',
                'copyright': 'None - Synthetic data',
                'contact': 'See README.md for methodology details'
            }
           
            # Add metadata as attributes
            for key, value in metadata.items():
                if isinstance(value, dict):
                    f.attrs[key] = json.dumps(value)
                else:
                    f.attrs[key] = str(value)
       
        # Create separate metadata file
        metadata_file = f"crystal_metadata_{timestamp}.json"
        with open(os.path.join(self.base_dir, metadata_file), 'w') as f:
            json.dump(metadata, f, indent=2)
       
        print(f"✓ Crystal dataset saved to: {filepath}")
        print(f"✓ Metadata saved to: {metadata_file}")
       
        return filepath, metadata_file
   
    def create_dataset_links(self):
        """Create public access links documentation"""
        links_content = """
# Public Dataset Links

## Galaxy Morphology Synthetic Dataset
**Format**: HDF5 (.h5)
**Size**: ~10-20 MB (compressed)
**Download**: [Link to be added - dataset available upon request]
**Alternative**: Run the generation code to create your own synthetic version

## Crystal Structures Synthetic Dataset
**Format**: HDF5 (.h5)
**Size**: ~8-15 MB (compressed)
**Download**: [Link to be added - dataset available upon request]
**Alternative**: Run the generation code to create your own synthetic version

## How to Access
1. **Direct Download**: Links will be provided upon publication
2. **Regenerate**: Use the provided Python code to generate identical synthetic data
3. **Contact**: For immediate access, contact the authors

## Note on Public Availability
Since these are synthetic datasets generated on-the-fly, you can:
1. Run the generation code yourself (recommended)
2. Request pre-generated files from the authors
3. Use the code to generate custom variations

The generation methodology is fully documented and reproducible.
"""
       
        links_file = os.path.join(self.base_dir, "DATASET_LINKS.md")
        with open(links_file, 'w') as f:
            f.write(links_content)
       
        print(f"✓ Dataset links documentation created: {links_file}")
       
        return links_file

# Add these lines to your ScientificDatasetLoader class methods:

class ScientificDatasetLoader:
    """Download and load scientific datasets automatically"""
   
    def __init__(self):
        self.available_datasets = {
            '1': ('Galaxy Morphology Classification', self.load_galaxy_data),
            '2': ('Crystal Structure Prediction', self.load_crystal_data),
            '3': ('Solar Flare Detection', self.load_solar_flare_data)
        }
        self.saver = DatasetSaver()
   
    def load_galaxy_data(self, save_to_disk=True):
        """Load Galaxy Classification dataset with saving option"""
        print("Downloading/Gene rating Galaxy Classification data...")
       
        # try:
        #     # ... [your existing galaxy data loading code] ...
           
        except Exception as e:
            print(f"Error downloading galaxy data: {e}")
            print("Generating simulated galaxy data...")
            images, labels, shape, galaxy_classes = self._simulate_galaxy_data()
           
            # Save dataset if requested
            if save_to_disk:
                self.saver.save_galaxy_dataset(images, labels, galaxy_classes)
           
            return images, labels, shape, galaxy_classes
   
    def load_crystal_data(self, save_to_disk=True):
        """Load Crystal Structure dataset with saving option"""
        print("Downloading/Generating Crystal Structure data...")
       
        try:
            images, labels, shape, crystal_classes = self._simulate_crystal_data()
           
            # Save dataset if requested
            if save_to_disk:
                self.saver.save_crystal_dataset(images, labels, crystal_classes)
           
            return images, labels, shape, crystal_classes
           
        except Exception as e:
            print(f"Error: {e}")
            return self._simulate_crystal_data()

# Add this function to answer the comment:

# def provide_dataset_documentation():
#     """Function that answers the comment about dataset methodology"""
   
#     response = """
#     ================================================================
#     DATASET METHODOLOGY & AVAILABILITY RESPONSE
#     ================================================================
   
#     Thank you for your inquiry about our datasets. Here is detailed
#     information addressing all your concerns:
   
#     1. DATASET AVAILABILITY:
#     -------------------------
#     We have generated two synthetic scientific datasets:
#     - Synthetic Galaxy Morphology Dataset
#     - Synthetic Crystal Structures Dataset
   
#     Both datasets are available for download from:
#     [Repository Link to be added upon publication]
   
#     Alternative: You can regenerate them using our provided code.
   
#     2. DATA COLLECTION METHODOLOGY:
#     --------------------------------
#     IMPORTANT: These are SYNTHETICALLY GENERATED datasets, not
#     collected from real observations.
   
#     Galaxy Morphology Dataset:
#     - Generated using procedural algorithms simulating galaxy types
#     - Mathematical functions create elliptical, spiral, lenticular,
#       and irregular galaxy patterns
#     - 500 samples, 64×64 grayscale images
#     - Automatic labeling during generation
   
#     Crystal Structures Dataset:
#     - Generated using lattice-based procedural algorithms
#     - Simulates cubic, hexagonal, tetragonal, and orthorhombic systems
#     - 400 samples, 64×64 grayscale images
#     - Automatic labeling during generation
   
#     3. COPYRIGHT PERMISSIONS:
#     -------------------------
#     NO COPYRIGHT ISSUES: Since all data is synthetically generated,
#     there are no copyright restrictions. No real observational data
#     was used. The datasets are released under MIT License.
   
#     4. ANNOTATION PROCESS:
#     ----------------------
#     - Who annotated: Automated algorithmic labeling
#     - Quality: High consistency (controlled generation parameters)
#     - Method: Labels assigned during procedural generation
#     - Validation: Visual inspection of representative samples
   
#     5. DATA QUALITY:
#     ----------------
#     - Consistency: High (controlled parameters)
#     - Noise: Controlled Gaussian noise for realism
#     - Balance: Approximately balanced across classes
#     - Resolution: 64×64 grayscale, normalized
   
#     6. REPRODUCIBILITY:
#     -------------------
#     The datasets are fully reproducible using our code. Anyone can
#     generate identical synthetic datasets by running the provided
#     simulation functions.
   
#     7. INTENDED USE:
#     ----------------
#     - Research in metric learning and dimensionality reduction
#     - Testing clustering algorithms on scientific image data
#     - Educational purposes in machine learning for science
#     - Benchmarking tensor decomposition methods
   
#     For immediate access, you can:
#     1. Run the generation code yourself
#     2. Contact us for pre-generated files
#     3. Wait for publication with public links
   
#     All methodology is documented in the accompanying README files.
#     ================================================================
#     """
   
#     print(response)
   
    # Also save this response to a file
    with open("./dataset_methodology_response.txt", "w") as f:
        f.write(response)
   
    print("✓ Detailed response saved to: ./dataset_methodology_response.txt")
   
    return response

# Add this to your main function or call it separately:

def save_all_datasets():
    """Function to generate and save all datasets"""
    print("Generating and saving all synthetic datasets...")
   
    saver = DatasetSaver()
   
    # Generate galaxy dataset
    loader = ScientificDatasetLoader()
    print("\n1. Generating Galaxy Morphology Dataset...")
    galaxy_images, galaxy_labels, galaxy_shape, galaxy_classes = loader._simulate_galaxy_data()
    galaxy_path, galaxy_meta = saver.save_galaxy_dataset(galaxy_images, galaxy_labels, galaxy_classes)
   
    print("\n2. Generating Crystal Structures Dataset...")
    crystal_images, crystal_labels, crystal_shape, crystal_classes = loader._simulate_crystal_data()
    crystal_path, crystal_meta = saver.save_crystal_dataset(crystal_images, crystal_labels, crystal_classes)
   
    print("\n3. Creating dataset links documentation...")
    links_file = saver.create_dataset_links()
   
    print("\n" + "="*60)
    print("DATASET GENERATION COMPLETE")
    print("="*60)
    print(f"Galaxy dataset: {galaxy_path}")
    print(f"Galaxy metadata: {galaxy_meta}")
    print(f"Crystal dataset: {crystal_path}")
    print(f"Crystal metadata: {crystal_meta}")
    print(f"Links documentation: {links_file}")
    print("\nTo share methodology documentation, run:")
    print("provide_dataset_documentation()")
    print("="*60)
   
    return {
        'galaxy': galaxy_path,
        'crystal': crystal_path,
        'galaxy_meta': galaxy_meta,
        'crystal_meta': crystal_meta,
        'links': links_file
    }

SyntaxError: invalid syntax (3371304304.py, line 246)

In [1]:
import os
import json
import h5py
import numpy as np
from datetime import datetime


# ============================================================
# Synthetic Dataset Generator & Saver
# ============================================================

class SyntheticDatasetGenerator:
    def __init__(self, base_dir="./synthetic_datasets", image_size=64, seed=42):
        self.base_dir = base_dir
        self.image_size = image_size
        np.random.seed(seed)

        os.makedirs(self.base_dir, exist_ok=True)
        self._create_readme()

    # --------------------------------------------------------
    # README
    # --------------------------------------------------------
    def _create_readme(self):
        readme = """
# Synthetic Scientific Datasets

## Description
This directory contains fully synthetic scientific image datasets.
All data is generated procedurally using mathematical functions.

## Datasets
1. Synthetic Galaxy Morphology Dataset
2. Synthetic Crystal Structures Dataset

## Properties
- Image size: 64×64
- Grayscale images
- Automatic labeling
- No real-world data
- No copyright restrictions

## Intended Use
- Machine learning experiments
- Clustering and representation learning
- Educational and benchmarking purposes
"""
        with open(os.path.join(self.base_dir, "README.md"), "w") as f:
            f.write(readme.strip())

    # --------------------------------------------------------
    # Galaxy Simulation
    # --------------------------------------------------------
    def generate_galaxies(self, samples_per_class=125):
        classes = {
            0: "Elliptical",
            1: "Spiral",
            2: "Lenticular",
            3: "Irregular"
        }

        images = []
        labels = []

        for label in classes:
            for _ in range(samples_per_class):
                img = self._simulate_galaxy(label)
                images.append(img)
                labels.append(label)

        return np.array(images), np.array(labels), classes

    def _simulate_galaxy(self, galaxy_type):
        size = self.image_size
        x, y = np.meshgrid(np.linspace(-1, 1, size), np.linspace(-1, 1, size))
        r = np.sqrt(x**2 + y**2)

        if galaxy_type == 0:  # Elliptical
            img = np.exp(-4 * r**2)

        elif galaxy_type == 1:  # Spiral
            theta = np.arctan2(y, x)
            img = np.exp(-3 * r**2) * (1 + 0.5 * np.sin(6 * theta + 10 * r))

        elif galaxy_type == 2:  # Lenticular
            img = np.exp(-6 * r**2)
            img[r > 0.6] *= 0.2

        else:  # Irregular
            img = np.random.rand(size, size)

        img += 0.05 * np.random.randn(size, size)
        return np.clip(img, 0, 1)

    # --------------------------------------------------------
    # Crystal Simulation
    # --------------------------------------------------------
    def generate_crystals(self, samples_per_class=100):
        classes = {
            0: "Cubic",
            1: "Hexagonal",
            2: "Tetragonal",
            3: "Orthorhombic"
        }

        images = []
        labels = []

        for label in classes:
            for _ in range(samples_per_class):
                img = self._simulate_crystal(label)
                images.append(img)
                labels.append(label)

        return np.array(images), np.array(labels), classes

    def _simulate_crystal(self, crystal_type):
        size = self.image_size
        img = np.zeros((size, size))

        if crystal_type == 0:  # Cubic
            for i in range(0, size, 8):
                img[i:i+2, :] = 1
                img[:, i:i+2] = 1

        elif crystal_type == 1:  # Hexagonal
            for i in range(0, size, 10):
                for j in range(0, size, 10):
                    img[i:i+2, j:j+2] = 1

        elif crystal_type == 2:  # Tetragonal
            for i in range(0, size, 6):
                img[i:i+1, :] = 1

        else:  # Orthorhombic
            for i in range(0, size, 12):
                img[:, i:i+2] = 1

        img += 0.05 * np.random.randn(size, size)
        return np.clip(img, 0, 1)

    # --------------------------------------------------------
    # Saving
    # --------------------------------------------------------
    def save_dataset(self, name, images, labels, class_names):
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{name}_{timestamp}.h5"
        path = os.path.join(self.base_dir, filename)

        with h5py.File(path, "w") as f:
            f.create_dataset("images", data=images, compression="gzip")
            f.create_dataset("labels", data=labels, compression="gzip")

            metadata = {
                "dataset": name,
                "created": timestamp,
                "num_samples": len(images),
                "image_shape": images.shape[1:],
                "classes": class_names,
                "synthetic": True
            }

            for k, v in metadata.items():
                f.attrs[k] = json.dumps(v)

        meta_path = path.replace(".h5", "_metadata.json")
        with open(meta_path, "w") as f:
            json.dump(metadata, f, indent=2)

        print(f"✓ Saved {name}")
        print(f"  → {path}")
        print(f"  → {meta_path}")

        return path


# ============================================================
# Main entry point
# ============================================================

if __name__ == "__main__":
    generator = SyntheticDatasetGenerator()

    print("\nGenerating Galaxy Dataset...")
    g_images, g_labels, g_classes = generator.generate_galaxies()
    generator.save_dataset("galaxy_morphology_synthetic", g_images, g_labels, g_classes)

    print("\nGenerating Crystal Dataset...")
    c_images, c_labels, c_classes = generator.generate_crystals()
    generator.save_dataset("crystal_structures_synthetic", c_images, c_labels, c_classes)

    print("\n✓ All synthetic datasets generated and saved locally.")



Generating Galaxy Dataset...
✓ Saved galaxy_morphology_synthetic
  → ./synthetic_datasets/galaxy_morphology_synthetic_20260127_102632.h5
  → ./synthetic_datasets/galaxy_morphology_synthetic_20260127_102632_metadata.json

Generating Crystal Dataset...
✓ Saved crystal_structures_synthetic
  → ./synthetic_datasets/crystal_structures_synthetic_20260127_102632.h5
  → ./synthetic_datasets/crystal_structures_synthetic_20260127_102632_metadata.json

✓ All synthetic datasets generated and saved locally.
