In [19]:
# =========================================================================
# === NOTEBOOK CONFIGURATION ===
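# Set this to False to quickly load the best parameters for the autoencoder.
# Set this to True to run a full Optuna study to find new best parameters.
# NOTE(review): the flag below is commented out and does not appear to be read
# by any cell shown in this notebook, so the Optuna study cell always runs.
# RUN_FULL_OPTUNA_STUDY = True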
# =========================================================================

In [20]:
!pip install -U optuna
!pip install -U plotly
!pip install -U kaleido
!plotly_get_chrome
import pandas as pd
import numpy as np
import requests
import gzip
import json
import io
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import optuna
from plotly.io import show
from datetime import datetime
from optuna.trial import TrialState
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset


Plotly will install a copy of Google Chrome to be used for generating static images of plots.
Chrome will be installed at: None
Do you want to proceed? [y/n] y
Installing Chrome for Plotly...
Chrome installed successfully.
The Chrome executable is now located at: /usr/local/lib/python3.12/dist-packages/choreographer/cli/browser_exe/chrome-linux64/chrome


In [21]:
# --- Dataset Download  ---
def download_hyg_dataset():
    """
    Download the HYG star catalogue, trying each mirror URL in turn.

    Every URL is fetched with ``requests`` (30 s timeout) and the response body
    is parsed as a gzip-compressed CSV. A failure for one URL -- a network/HTTP
    error or a body that cannot be parsed -- is reported and the next URL is
    tried, so the backup mirror is still used when the first one answers with
    unusable content.

    Returns:
        pd.DataFrame: A pandas DataFrame of the HYG data if successful, None otherwise.
    """
    # URLs to official public HYG data set repository and backup copy hosted on Google Drive.
    HYG_URLS = ['https://codeberg.org/astronexus/hyg/media/branch/main/data/hyg/CURRENT/hyg_v42.csv.gz',
                'https://drive.google.com/uc?export=download&id=1U2apsUPjQR_DllzF74y-pV3KjVTK3FJW']

    print("\nStarting data pipeline: Attempting to download HYG star data...")

    for url in HYG_URLS:
        try:
            print(f"Trying URL: {url}")
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            hyg_df = pd.read_csv(io.BytesIO(response.content), compression='gzip')
        except requests.exceptions.RequestException as e:
            print(f"Error downloading from {url}: {e}")
            print("Trying next URL...")
        except Exception as e:
            # The request succeeded but the body could not be read as a gzip CSV.
            # Keep going instead of returning, so the remaining mirrors get a chance.
            print(f"An unexpected error occurred during data processing: {e}")
            print("Trying next URL...")
        else:
            print("Download successful.")
            return hyg_df

    print("\nAll download attempts failed. Please check your internet connection or the URLs.")
    return None

# Download the data
df = download_hyg_dataset()


Starting data pipeline: Attempting to download HYG star data...
Trying URL: https://codeberg.org/astronexus/hyg/media/branch/main/data/hyg/CURRENT/hyg_v42.csv.gz
Download successful.


In [22]:
# --- Dataset Inspection ---
# Lift the column cap so the wide HYG frame prints every column.
pd.set_option('display.max_columns', None)

print("\n--- HYG Dataset Head (First 5 Rows) ---")
print(df.head())

print("\n--- Dataset Info ---")
df.info()

# Frequency of every spectral-type string (value_counts sorts most common first).
spect_counts = df['spect'].value_counts()

unique_spect_count = df['spect'].nunique(dropna=True)
print(f"\nThere are {unique_spect_count} unique spectral types in the dataset.")

# The tail of the sorted counts holds the least frequent types.
rarest_spect_types = spect_counts.tail(10)
print("\n--- Rarest Spectral Types Sample ---")
print(rarest_spect_types)

# How many types occur exactly once / exactly twice.
types_with_one_occurrence, types_with_two_occurrences = (
    (spect_counts == occurrences).sum() for occurrences in (1, 2)
)

print(f"\nNumber of spectral types that appear only once: {types_with_one_occurrence}")
print(f"Number of spectral types that appear only twice: {types_with_two_occurrences}")


--- HYG Dataset Head (First 5 Rows) ---
   id  hip        hd  hr   gl   bf proper        ra        dec      dist  \
0   0  NaN       NaN NaN  NaN  NaN    Sol  0.000000   0.000000    0.0000   
1   1  1.0  224700.0 NaN  NaN  NaN    NaN  0.000060   1.089009  219.7802   
2   2  2.0  224690.0 NaN  NaN  NaN    NaN  0.000283 -19.498840   47.9616   
3   3  3.0  224699.0 NaN  NaN  NaN    NaN  0.000335  38.859279  442.4779   
4   4  4.0  224707.0 NaN  NaN  NaN    NaN  0.000569 -51.893546  134.2282   

     pmra  pmdec   rv    mag  absmag spect     ci           x         y  \
0    0.00   0.00  0.0 -26.70   4.850   G2V  0.656    0.000005  0.000000   
1   -5.20  -1.88  0.0   9.10   2.390    F5  0.482  219.740502  0.003449   
2  181.21  -0.93  0.0   9.27   5.866   K3V  0.999   45.210918  0.003365   
3    5.24  -2.91  0.0   6.61  -1.619    B9 -0.019  344.552785  0.030213   
4   62.85   0.16  0.0   8.06   2.421   F0V  0.370   82.835513  0.012476   

            z            vx        vy            vz

In [23]:
# --- 2. Data Preprocessing and Feature Selection ---

print(f"Original dataset size: {len(df)}")

# Keep only the stars for which every model input is present.
features = ['absmag', 'ci', 'spect']
df_filtered = df.dropna(subset=features)
print(f"Filtered dataset size after dropping rows with missing features: {len(df_filtered)}")

# Two continuous inputs and one categorical input.
numerical_features = df_filtered[['absmag', 'ci']]
categorical_features = df_filtered['spect']

# Map every distinct spectral-type string to an integer index; the embedding
# layer of the model is indexed by these integers.
spect_encoded, unique_spects = pd.factorize(categorical_features)
NUM_SPECT_TYPES = len(unique_spects)
print(f"\nThere are {NUM_SPECT_TYPES} unique spectral types after filtering.")

# Identifier columns kept alongside the features for the final export.
star_info = df_filtered[['id', 'proper', 'gl', 'hip']]

# Standardise the numerical inputs (the scaler is fitted on all filtered rows).
scaler = StandardScaler()
scaled_numerical_features = scaler.fit_transform(numerical_features)

# One call splits the three aligned collections with the same 80/20 partition.
(X_num_train, X_num_val,
 X_cat_train, X_cat_val,
 star_info_train, star_info_val) = train_test_split(
    scaled_numerical_features,
    spect_encoded,
    star_info,
    test_size=0.2,
    random_state=42,
)

# NumPy -> torch: float32 for the numerical inputs, long for the embedding indices.
X_num_train, X_num_val = (
    torch.tensor(part, dtype=torch.float32) for part in (X_num_train, X_num_val)
)
X_cat_train, X_cat_val = (
    torch.tensor(part, dtype=torch.long) for part in (X_cat_train, X_cat_val)
)

# Each dataset item is a (numerical, categorical) pair.
train_dataset = TensorDataset(X_num_train, X_cat_train)
val_dataset = TensorDataset(X_num_val, X_cat_val)

# Only the training loader shuffles.
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

print(f"\nTraining set size: {len(X_num_train)} samples")
print(f"Validation set size: {len(X_num_val)} samples")

Original dataset size: 119626
Filtered dataset size after dropping rows with missing features: 115368

There are 4222 unique spectral types after filtering.

Training set size: 92294 samples
Validation set size: 23074 samples


In [24]:
def print_model_parameters(model):
    """
    Print one line per trainable parameter of a PyTorch model.

    Each line shows the parameter's name and its shape as a list; parameters
    whose ``requires_grad`` is False are skipped. The listing is framed by a
    header line and a footer line.
    """
    trainable = (
        (param_name, tensor)
        for param_name, tensor in model.named_parameters()
        if tensor.requires_grad
    )

    print("\n--- Model Parameters ---")
    for param_name, tensor in trainable:
        print(f"Layer: {param_name:<25} | Shape: {list(tensor.shape)}")
    print("------------------------\n")

def save_checkpoint(trial, model, best_val_loss, file_path):
    """
    Write a checkpoint for the given trial to ``file_path`` with ``torch.save``.

    The saved object is a dict with the keys ``trial_number``,
    ``hyperparameters``, ``model_state_dict`` and ``val_loss``. A confirmation
    line is printed afterwards.

    Args:
        trial (optuna.trial.Trial): The current Optuna trial object; its
            ``number`` and ``params`` attributes are stored.
        model (torch.nn.Module): The model whose ``state_dict()`` is stored.
        best_val_loss (float): The best validation loss achieved so far.
        file_path (str): The full path for the checkpoint file.
    """
    payload = dict(
        trial_number=trial.number,
        hyperparameters=trial.params,
        model_state_dict=model.state_dict(),
        val_loss=best_val_loss,
    )
    torch.save(payload, file_path)

    print(f"Checkpoint saved for Trial {trial.number} with a new best Val Loss of {best_val_loss:.4f} to {file_path}")

def save_study_metadata(study: optuna.study.Study, file_path: str):
    """
    Saves the metadata of an Optuna study to a JSON file.

    The file records the study name, a timestamp, the best trial's value,
    parameters, state and user attributes, the study direction, the number of
    trials that finished in the COMPLETE state (``trials_completed``) and the
    number of trials of any state (``trials_total``).

    Any exception raised while gathering or writing the metadata (for example
    when the study has no best trial yet) is caught and reported with
    ``print``; nothing is re-raised.

    Args:
        study (optuna.study.Study): The Optuna study object to save.
        file_path (str): The full path to the output JSON file in Google Drive.
    """
    try:
        # Get the current time for the log
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        # Look the best trial up once and reuse it for every field below.
        best_trial = study.best_trial

        # Pruned and failed trials are part of the study's trial list too, so
        # count the COMPLETE ones separately instead of reporting the list length.
        all_trials = study.get_trials(deepcopy=False)
        n_completed = sum(1 for t in all_trials if t.state == TrialState.COMPLETE)

        # Create a dictionary to hold the metadata
        metadata = {
            "study_name": study.study_name,
            "timestamp": timestamp,
            "best_value": best_trial.value,
            "best_params": best_trial.params,
            "state": str(best_trial.state),
            "user_attributes": best_trial.user_attrs,
            "study_direction": str(study.direction),
            "trials_completed": n_completed,
            "trials_total": len(all_trials)
        }

        # Serialise before opening the file so a serialisation error cannot
        # leave a truncated file behind.
        json_metadata = json.dumps(metadata, indent=4, sort_keys=True)

        # Write the JSON string to the specified file
        with open(file_path, 'w') as f:
            f.write(json_metadata)

        print(f"Study metadata successfully saved to: {file_path}")

    except Exception as e:
        print(f"An error occurred while saving study metadata: {e}")

In [25]:
class Autoencoder(nn.Module):
    """
    Autoencoder over two kinds of input: numerical features and one
    categorical feature (spectral type) that is first passed through an
    embedding layer.

    The encoder is a stack of Linear -> ReLU -> Dropout groups whose widths are
    ``n_hidden_neurons // 2**i`` for every layer except the last, which has
    width ``latent_dim``. Because the last encoder group also ends in ReLU and
    Dropout, the returned latent coordinates are never negative. The decoder
    mirrors the encoder widths back up to the combined input width (ReLU
    between layers, none after the last), and two linear heads turn the decoder
    output into the numerical reconstruction and the spectral-type logits.
    """

    def __init__(self,
                 num_numerical_features: int,
                 num_spect_types: int,
                 spect_embedding_dim: int,
                 n_encoder_layers: int,
                 n_hidden_neurons: int,
                 dropout_rate: float,
                 latent_dim: int=3):

        super().__init__()

        # Kept on the instance for later inspection.
        self.num_numerical_features = num_numerical_features
        self.spect_embedding_dim = spect_embedding_dim
        self.num_spect_types = num_spect_types

        # Lookup table: spectral-type index -> dense vector.
        self.spect_embedding = nn.Embedding(num_embeddings=num_spect_types, embedding_dim=spect_embedding_dim)

        # Width of the concatenated (numerical + embedded) input.
        total_input_dim = num_numerical_features + spect_embedding_dim

        # Output width of each encoder layer: halving hidden widths, then the bottleneck.
        bottleneck_index = n_encoder_layers - 1
        encoder_widths = [
            latent_dim if depth == bottleneck_index else n_hidden_neurons // (2 ** depth)
            for depth in range(n_encoder_layers)
        ]
        layer_dims = [total_input_dim, *encoder_widths]

        # Encoder: every Linear is followed by ReLU and Dropout.
        encoder_modules = []
        for fan_in, fan_out in zip(layer_dims, layer_dims[1:]):
            encoder_modules += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout_rate)]
        self.encoder = nn.Sequential(*encoder_modules)

        # Decoder: same widths walked backwards; no activation after the final Linear.
        mirrored_dims = layer_dims[::-1]
        final_decoder_index = len(mirrored_dims) - 2
        decoder_modules = []
        for position, (fan_in, fan_out) in enumerate(zip(mirrored_dims, mirrored_dims[1:])):
            decoder_modules.append(nn.Linear(fan_in, fan_out))
            if position != final_decoder_index:
                decoder_modules.append(nn.ReLU())
        self.decoder = nn.Sequential(*decoder_modules)

        # Output heads fed by the decoder output (width `total_input_dim`).
        self.reconstruct_numerical = nn.Linear(total_input_dim, num_numerical_features)
        self.reconstruct_categorical = nn.Linear(total_input_dim, num_spect_types)

    def forward(self, x_num, x_cat):
        """
        Encode a batch and reconstruct it.

        Args:
            x_num: numerical features; concatenated with the embedding along dim 1.
            x_cat: spectral-type indices, looked up in the embedding layer.

        Returns:
            tuple: (latent_coords, reconstructed_num, reconstructed_cat_logits).
        """
        embedded_spect = self.spect_embedding(x_cat)
        encoder_input = torch.cat((x_num, embedded_spect), dim=1)

        latent_coords = self.encoder(encoder_input)
        decoded = self.decoder(latent_coords)

        return (
            latent_coords,
            self.reconstruct_numerical(decoded),
            self.reconstruct_categorical(decoded),
        )

In [26]:
# --- 5. Main Execution and Export ---
def _reconstruction_loss(model, x_num, x_cat, categorical_loss_weight):
    """Return the combined loss for one batch: MSE on the numerical features plus the weighted cross-entropy on the spectral-type logits."""
    _, reconstructed_num, reconstructed_cat_logits = model(x_num, x_cat)

    # Cross-entropy expects logits as input and integer labels as target.
    numerical_loss = F.mse_loss(reconstructed_num, x_num)
    categorical_loss = F.cross_entropy(reconstructed_cat_logits, x_cat)

    return numerical_loss + categorical_loss_weight * categorical_loss


def objective(trial, device, study_name):
    """
    Optuna objective: train one Autoencoder configuration and return its best validation loss.

    Hyperparameters are sampled from ``trial``; the model is trained on the
    module-level ``train_dataloader`` and evaluated on ``val_dataloader`` after
    every epoch. Whenever the validation loss improves, a checkpoint is written
    for this trial. Training stops early after 25 epochs without improvement.

    Args:
        trial (optuna.trial.Trial): Trial used to sample hyperparameters, report
            intermediate values and decide on pruning.
        device (torch.device): Device the model and batches are moved to.
        study_name (str): Used as prefix of the checkpoint file name.

    Returns:
        float: The lowest average validation loss seen during the trial.

    Raises:
        optuna.exceptions.TrialPruned: When ``trial.should_prune()`` is true
            after an epoch's value has been reported.
    """
    # Hyperparameters to be tuned by Optuna
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True)
    dropout_rate = trial.suggest_float("dropout_rate", 0.0, 0.15)
    spect_embedding_dim = trial.suggest_int("spect_embedding_dim", 48, 96)
    n_encoder_layers = trial.suggest_int("n_encoder_layers", 4, 5)
    n_hidden_neurons = trial.suggest_int("n_hidden_neurons", 128, 320, step=16)
    categorical_loss_weight = trial.suggest_float("categorical_loss_weight", 0.1, 0.4, log=True)

    # Define the model with the suggested hyperparameters
    model = Autoencoder(
            num_numerical_features=2,  # 'absmag' and 'ci'
            num_spect_types=NUM_SPECT_TYPES, # From the preprocessing step
            spect_embedding_dim=spect_embedding_dim,
            n_encoder_layers=n_encoder_layers,
            n_hidden_neurons=n_hidden_neurons,
            dropout_rate=dropout_rate
            ).to(device)

    # Define the optimizer and scheduler
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=0.5,
        patience=20)

    print_model_parameters(model)

    # Training Loop with Early Stopping
    patience = 25
    epochs_no_improve = 0
    best_val_loss = float('inf')
    epochs = 100 # Set a reasonable max number of epochs per trial

    # The checkpoint location does not change between epochs; resolve it once.
    checkpoint_path = f'/content/drive/MyDrive/Checkpoints/{study_name}_trial_{trial.number}_best.pth'
    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

    for epoch in range(epochs):
        # Training
        model.train()
        for x_num, x_cat in train_dataloader:
            x_num = x_num.to(device)
            x_cat = x_cat.to(device)
            optimizer.zero_grad()

            loss = _reconstruction_loss(model, x_num, x_cat, categorical_loss_weight)

            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for x_num, x_cat in val_dataloader:
                x_num = x_num.to(device)
                x_cat = x_cat.to(device)

                loss = _reconstruction_loss(model, x_num, x_cat, categorical_loss_weight)
                total_val_loss += loss.item()

        # Mean of the per-batch losses.
        avg_val_loss = total_val_loss / len(val_dataloader)

        # ReduceLROnPlateau only lowers the learning rate when it is fed the
        # monitored metric; without this call the scheduler has no effect.
        scheduler.step(avg_val_loss)

        # Early Stopping and Pruning
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_no_improve = 0
            save_checkpoint(trial, model, best_val_loss, checkpoint_path)

        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print(f"Trial {trial.number}: Early stopping triggered.")
                break

        # Report the loss to Optuna
        trial.report(avg_val_loss, epoch)

        # Prune the trial if it's not performing well
        if trial.should_prune():
            print(f"Trial {trial.number}: Pruning trial at epoch {epoch + 1}.")
            raise optuna.exceptions.TrialPruned()

        print(f"Trial {trial.number}, Epoch [{epoch + 1}/{epochs}], Val Loss: {avg_val_loss:.4f}")

    return best_val_loss

if __name__ == '__main__':
    study_name = 'AE_StellarParams_Latent_Wide_r3'

    # 1. Set up the device (CPU or GPU)
    if torch.cuda.is_available():
        device = torch.device('cuda')
        print('GPU is available and will be used.')
    else:
        device = torch.device('cpu')
        print('GPU not available, using CPU.')

    # 2. Hyperparameter Tuning with Optuna
    # The study is persisted in an SQLite file; with load_if_exists=True an
    # existing study of the same name is resumed instead of recreated.
    # The 'objective' function is defined separately and contains the training loop.
    storage_path = f'/content/drive/MyDrive/Data/{study_name}.db'
    # Make sure the folder for the database file exists, as is done for the
    # checkpoint and log folders.
    os.makedirs(os.path.dirname(storage_path), exist_ok=True)
    study = optuna.create_study(
            direction='minimize',
            storage=f'sqlite:///{storage_path}',
            study_name=study_name,
            load_if_exists=True)
    study.optimize(lambda trial: objective(trial, device, study_name), n_trials=5)

    # 3. Save a JSON summary of the study
    output_path = f'/content/drive/MyDrive/Logs/{study_name}.json'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    save_study_metadata(study, output_path)

GPU not available, using CPU.


[I 2025-09-22 00:10:39,418] Using an existing study with name 'AE_StellarParams_Latent_Wide_r3' instead of creating a new one.



--- Model Parameters ---
Layer: spect_embedding.weight    | Shape: [4222, 54]
Layer: encoder.0.weight          | Shape: [256, 56]
Layer: encoder.0.bias            | Shape: [256]
Layer: encoder.3.weight          | Shape: [128, 256]
Layer: encoder.3.bias            | Shape: [128]
Layer: encoder.6.weight          | Shape: [64, 128]
Layer: encoder.6.bias            | Shape: [64]
Layer: encoder.9.weight          | Shape: [3, 64]
Layer: encoder.9.bias            | Shape: [3]
Layer: decoder.0.weight          | Shape: [64, 3]
Layer: decoder.0.bias            | Shape: [64]
Layer: decoder.2.weight          | Shape: [128, 64]
Layer: decoder.2.bias            | Shape: [128]
Layer: decoder.4.weight          | Shape: [256, 128]
Layer: decoder.4.bias            | Shape: [256]
Layer: decoder.6.weight          | Shape: [56, 256]
Layer: decoder.6.bias            | Shape: [56]
Layer: reconstruct_numerical.weight | Shape: [2, 56]
Layer: reconstruct_numerical.bias | Shape: [2]
Layer: reconstruct_categoric

[I 2025-09-22 01:00:22,734] Trial 96 finished with value: 0.10702821409582597 and parameters: {'learning_rate': 0.0010068565288337352, 'dropout_rate': 0.018291674591159473, 'spect_embedding_dim': 54, 'n_encoder_layers': 4, 'n_hidden_neurons': 256, 'categorical_loss_weight': 0.10681071248767215}. Best is trial 80 with value: 0.0973087496787227.


Trial 96, Epoch [100/100], Val Loss: 0.1105

--- Model Parameters ---
Layer: spect_embedding.weight    | Shape: [4222, 49]
Layer: encoder.0.weight          | Shape: [272, 51]
Layer: encoder.0.bias            | Shape: [272]
Layer: encoder.3.weight          | Shape: [136, 272]
Layer: encoder.3.bias            | Shape: [136]
Layer: encoder.6.weight          | Shape: [68, 136]
Layer: encoder.6.bias            | Shape: [68]
Layer: encoder.9.weight          | Shape: [3, 68]
Layer: encoder.9.bias            | Shape: [3]
Layer: decoder.0.weight          | Shape: [68, 3]
Layer: decoder.0.bias            | Shape: [68]
Layer: decoder.2.weight          | Shape: [136, 68]
Layer: decoder.2.bias            | Shape: [136]
Layer: decoder.4.weight          | Shape: [272, 136]
Layer: decoder.4.bias            | Shape: [272]
Layer: decoder.6.weight          | Shape: [51, 272]
Layer: decoder.6.bias            | Shape: [51]
Layer: reconstruct_numerical.weight | Shape: [2, 51]
Layer: reconstruct_numerical.bi

[I 2025-09-22 01:44:50,380] Trial 97 finished with value: 0.1084718197181697 and parameters: {'learning_rate': 0.001107246650319292, 'dropout_rate': 0.026066678520000136, 'spect_embedding_dim': 49, 'n_encoder_layers': 4, 'n_hidden_neurons': 272, 'categorical_loss_weight': 0.10698590563430858}. Best is trial 80 with value: 0.0973087496787227.


Trial 97, Epoch [100/100], Val Loss: 0.1144

--- Model Parameters ---
Layer: spect_embedding.weight    | Shape: [4222, 50]
Layer: encoder.0.weight          | Shape: [272, 52]
Layer: encoder.0.bias            | Shape: [272]
Layer: encoder.3.weight          | Shape: [136, 272]
Layer: encoder.3.bias            | Shape: [136]
Layer: encoder.6.weight          | Shape: [68, 136]
Layer: encoder.6.bias            | Shape: [68]
Layer: encoder.9.weight          | Shape: [3, 68]
Layer: encoder.9.bias            | Shape: [3]
Layer: decoder.0.weight          | Shape: [68, 3]
Layer: decoder.0.bias            | Shape: [68]
Layer: decoder.2.weight          | Shape: [136, 68]
Layer: decoder.2.bias            | Shape: [136]
Layer: decoder.4.weight          | Shape: [272, 136]
Layer: decoder.4.bias            | Shape: [272]
Layer: decoder.6.weight          | Shape: [52, 272]
Layer: decoder.6.bias            | Shape: [52]
Layer: reconstruct_numerical.weight | Shape: [2, 52]
Layer: reconstruct_numerical.bi

[I 2025-09-22 01:49:53,529] Trial 98 pruned. 


Trial 98: Pruning trial at epoch 9.

--- Model Parameters ---
Layer: spect_embedding.weight    | Shape: [4222, 49]
Layer: encoder.0.weight          | Shape: [256, 51]
Layer: encoder.0.bias            | Shape: [256]
Layer: encoder.3.weight          | Shape: [128, 256]
Layer: encoder.3.bias            | Shape: [128]
Layer: encoder.6.weight          | Shape: [64, 128]
Layer: encoder.6.bias            | Shape: [64]
Layer: encoder.9.weight          | Shape: [3, 64]
Layer: encoder.9.bias            | Shape: [3]
Layer: decoder.0.weight          | Shape: [64, 3]
Layer: decoder.0.bias            | Shape: [64]
Layer: decoder.2.weight          | Shape: [128, 64]
Layer: decoder.2.bias            | Shape: [128]
Layer: decoder.4.weight          | Shape: [256, 128]
Layer: decoder.4.bias            | Shape: [256]
Layer: decoder.6.weight          | Shape: [51, 256]
Layer: decoder.6.bias            | Shape: [51]
Layer: reconstruct_numerical.weight | Shape: [2, 51]
Layer: reconstruct_numerical.bias | Sha

[I 2025-09-22 01:50:22,048] Trial 99 pruned. 


Checkpoint saved for Trial 99 with a new best Val Loss of 0.5389 to /content/drive/MyDrive/Checkpoints/AE_StellarParams_Latent_Wide_r3_trial_99_best.pth
Trial 99: Pruning trial at epoch 1.

--- Model Parameters ---
Layer: spect_embedding.weight    | Shape: [4222, 54]
Layer: encoder.0.weight          | Shape: [304, 56]
Layer: encoder.0.bias            | Shape: [304]
Layer: encoder.3.weight          | Shape: [152, 304]
Layer: encoder.3.bias            | Shape: [152]
Layer: encoder.6.weight          | Shape: [76, 152]
Layer: encoder.6.bias            | Shape: [76]
Layer: encoder.9.weight          | Shape: [3, 76]
Layer: encoder.9.bias            | Shape: [3]
Layer: decoder.0.weight          | Shape: [76, 3]
Layer: decoder.0.bias            | Shape: [76]
Layer: decoder.2.weight          | Shape: [152, 76]
Layer: decoder.2.bias            | Shape: [152]
Layer: decoder.4.weight          | Shape: [304, 152]
Layer: decoder.4.bias            | Shape: [304]
Layer: decoder.6.weight          | Sha

[I 2025-09-22 02:04:36,784] Trial 100 pruned. 


Trial 100: Pruning trial at epoch 34.
Study metadata successfully saved to: /content/drive/MyDrive/Logs/AE_StellarParams_Latent_Wide_r3.json


In [27]:
# Render the Optuna diagnostic plots; each one is saved as a PNG and shown inline.
plots_dir = '/content/drive/MyDrive/Plots'
# The target folder must exist before the images are written.
os.makedirs(plots_dir, exist_ok=True)

# Objective value per trial
fig_1 = optuna.visualization.plot_optimization_history(study)
fig_1.write_image(f'{plots_dir}/{study_name}_history.png')
show(fig_1)

# Hyperparameter importances
fig_2 = optuna.visualization.plot_param_importances(study)
fig_2.write_image(f'{plots_dir}/{study_name}_importances.png')
show(fig_2)

# Parallel-coordinate view of the sampled hyperparameters
fig_3 = optuna.visualization.plot_parallel_coordinate(study)
fig_3.write_image(f'{plots_dir}/{study_name}_parallel.png')
show(fig_3)

In [28]:
# Get the best trial's parameters and value
best_params = study.best_trial.params
best_loss = study.best_trial.value
best_trial = study.best_trial.number
best_weights_url = f'/content/drive/MyDrive/Checkpoints/{study_name}_trial_{best_trial}_best.pth'
best_weights = torch.load(best_weights_url, map_location=torch.device('cpu'))['model_state_dict']
print(f'\nBest study trial:', best_trial)
print(f"Best validation loss: {best_loss}")
print(f'Best hyperparameters found by Optuna:', best_params)

# 2. Re-instantiate and Train the Best Model
# We create a new model with the best parameters found by Optuna
best_model = Autoencoder(
             num_numerical_features=2,
             num_spect_types=NUM_SPECT_TYPES,
             spect_embedding_dim=best_params['spect_embedding_dim'],
             n_encoder_layers=best_params['n_encoder_layers'],
             n_hidden_neurons=best_params['n_hidden_neurons'],
             dropout_rate=best_params['dropout_rate'])

# Load the best model weights that were saved during the tuning process
best_model.load_state_dict(best_weights)

# The best model is already trained. No need to re-train.
print(f"Best model loaded from '{best_weights_url}'")

# 3. Generate 3D Coordinates for ALL Stars and Export to JSON
# This is a crucial step to get the full star map, not just the validation set.
print("\nGenerating 3D coordinates for all stars.")
best_model.eval() # Set model to evaluation mode

# First, get the combined full dataset (numerical + categorical)
# from the original pre-processing step before the train/val split.
# Re-create the full tensor datasets for a clean, deterministic output.
scaled_numerical_features = scaler.fit_transform(df_filtered[['absmag', 'ci']])
spect_encoded, _ = pd.factorize(df_filtered['spect'])

full_dataset = TensorDataset(
    torch.tensor(scaled_numerical_features, dtype=torch.float32),
    torch.tensor(spect_encoded, dtype=torch.long))
full_dataloader = DataLoader(full_dataset, batch_size=64, shuffle=False)

all_coords = []
with torch.no_grad():
    for x_num, x_cat in full_dataloader:
        coords, _, _ = best_model(x_num, x_cat)
        all_coords.extend(coords.numpy().tolist())


Best study trial: 80
Best validation loss: 0.0973087496787227
Best hyperparameters found by Optuna: {'learning_rate': 0.0015468527826189955, 'dropout_rate': 0.005803352612374085, 'spect_embedding_dim': 56, 'n_encoder_layers': 4, 'n_hidden_neurons': 272, 'categorical_loss_weight': 0.1029695659605807}
Best model loaded from '/content/drive/MyDrive/Checkpoints/AE_StellarParams_Latent_Wide_r3_trial_80_best.pth'

Generating 3D coordinates for all stars.


In [29]:
def process_spectral_type(spect_string):
    """
    Cleans and simplifies the spectral type string for visualization.

    The string is upper-cased and stripped, only the part before the first '/'
    is kept, and the result is cut to its first two characters. When the
    second character is missing or is not a digit, the first character is
    combined with a default digit of '5'.

    Args:
        spect_string (str): The raw spectral type string.

    Returns:
        str: The processed spectral type string (e.g., 'G2'), or 'UNKNOWN'
        when the value is missing or nothing usable is left after cleaning
        (empty or whitespace-only input, or input starting with '/').
    """
    if pd.isna(spect_string):
        return 'UNKNOWN'

    spect_string = spect_string.upper().strip()

    # Handle stars with multiple values (e.g., 'F3/F5V') by taking the first one
    spect_string = spect_string.split('/')[0].strip()

    # Nothing left to classify; indexing the first character below would fail.
    if not spect_string:
        return 'UNKNOWN'

    # Take the first two characters.
    simplified = spect_string[:2]

    # Check if the second character is not a digit.
    if len(simplified) < 2 or not simplified[1].isdigit():
        # Assign a default of '5' for stars with no number (e.g., 'M', 'K', 'O')
        simplified = simplified[0] + '5'

    return simplified

In [30]:
def export_star_data_to_csv_gz(df_filtered, all_coords, output_path):
    """
    Combines filtered star data with latent space coordinates and exports it
    to a compressed CSV file.

    Only the columns needed by the front-end visualization are written
    ('id', the three latent coordinates, 'x', 'y', 'z', 'absmag' and the
    simplified 'spect'), with the floating-point columns rounded to four
    decimals.

    Args:
        df_filtered (pd.DataFrame): DataFrame containing filtered star data
                                     (e.g., from the HYG database).
        all_coords (list of lists): The latent space coordinates for each star,
                                     one [x, y, z] entry per row of `df_filtered`
                                     and in the same order.
        output_path (str): The full path to save the gzipped CSV file.

    Raises:
        ValueError: If `all_coords` and `df_filtered` differ in length.
    """
    # The two inputs are joined by position; a length mismatch would otherwise
    # be padded with NaN rows by the concat below and written out unnoticed.
    if len(all_coords) != len(df_filtered):
        raise ValueError(
            f"all_coords has {len(all_coords)} rows but df_filtered has "
            f"{len(df_filtered)}; they must describe the same stars."
        )

    # 1. Create a DataFrame for the latent space coordinates
    latent_df = pd.DataFrame(all_coords, columns=['latent_x', 'latent_y', 'latent_z'])

    # Reset index of the filtered DataFrame so both frames share a 0..n-1 index
    df_filtered = df_filtered.reset_index(drop=True)

    # 2. Combine the original star data with the new latent space data
    combined_df = pd.concat([df_filtered, latent_df], axis=1)

    # 3. Simplify the 'spect' column for visualization
    combined_df['spect'] = combined_df['spect'].apply(process_spectral_type)

    # 4. Select and reorder only the essential columns
    essential_df = combined_df[[
        'id',
        'latent_x',
        'latent_y',
        'latent_z',
        'x',
        'y',
        'z',
        'absmag',
        'spect'
    ]]

    # 5. Round the floating-point numbers to reduce file size
    rounded_columns = ['latent_x', 'latent_y', 'latent_z', 'x', 'y', 'z', 'absmag']
    essential_df = essential_df.round({column: 4 for column in rounded_columns})

    # 6. Save the DataFrame directly to a gzipped CSV file
    essential_df.to_csv(output_path, index=False, compression='gzip')

    print(f"Essential star data saved to '{output_path}'")

export_star_data_to_csv_gz(df_filtered, all_coords, f'/content/drive/MyDrive/Data/{study_name}_front.csv.gz')

Essential star data saved to '/content/drive/MyDrive/Data/AE_StellarParams_Latent_Wide_r3_front.csv.gz'
