## Train and Generate the Autoencoder Features

In [None]:
import torch
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import StandardScaler

In [None]:
# X_train = pd.read_csv('X_train_bal_roc.csv')
# y_train = pd.read_csv('y_train_bal_roc.csv')

# # encoding target class
# y, clas = pd.factorize(y_train['class_labels']) #getting the class 0 = agn, 1 =notagn, 2 = no class
# y_target = pd.DataFrame(y, columns = ['labels'])

In [None]:
# Load the original data: the feature table (X) and the target table (y).
X = pd.read_csv(filepath_or_buffer='normalised/all_color_X.csv')
y = pd.read_csv(filepath_or_buffer='normalised/all_color_y.csv')

# Quick sanity check: report (rows, columns) for both tables.
print('Shape of X: ', X.shape)
print('Shape of y: ', y.shape)


In [None]:
# Columns of X that are fed to the autoencoder, in the order they are used.
important_feat = [
    'class_star',
    'Mstar',
    'log(S8/S45)',
    'log(S58/S36)',
    'log(i/z)',
    'log(r/z)',
    'log(g/z)',
    'log(Y/H)',
    'log(S45/S36)',
]

# Keep only the selected columns (all rows), preserving the order above.
X_top9 = X.loc[:, important_feat]

In [None]:
# Dataset dimensions: rows in the selected table and number of selected columns.
# The column count sets the autoencoder's input width.
num_samples = X_top9.shape[0]
num_input_features = len(important_feat)

# Standardize every column with StandardScaler before training, then hand the
# result to PyTorch as a float32 tensor.
scaler = StandardScaler()
X_scaled = scaler.fit(X_top9).transform(X_top9)
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)

# Define a flexible autoencoder
class Autoencoder(nn.Module):
    """Small fully connected autoencoder with a low-dimensional bottleneck.

    The encoder maps ``input_dim`` features to ``hidden_dim`` units (ReLU)
    and then to ``latent_dim`` latent features. The decoder mirrors that
    path back to ``input_dim`` outputs.

    Args:
        input_dim: Number of input features per sample. Must be >= 1.
        latent_dim: Width of the bottleneck. Defaults to 2, the value that
            used to be hard-coded.
        hidden_dim: Width of the intermediate layer. When ``None`` it is
            ``input_dim // 2``, clamped to at least 1 so that a very small
            ``input_dim`` does not produce a zero-width layer.

    Raises:
        ValueError: If any of the three dimensions is smaller than 1.
    """

    def __init__(self, input_dim, latent_dim=2, hidden_dim=None):
        super().__init__()
        if hidden_dim is None:
            # Compress by half first, but never down to an empty layer.
            hidden_dim = max(input_dim // 2, 1)
        for label, value in (
            ("input_dim", input_dim),
            ("latent_dim", latent_dim),
            ("hidden_dim", hidden_dim),
        ):
            if value < 1:
                raise ValueError(f"{label} must be >= 1, got {value}")

        # Layers are created encoder-first so parameter initialisation
        # consumes the RNG in the same order as before.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),  # Final latent space
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),  # Reconstruct original input
        )

    def forward(self, x):
        """Encode ``x`` and decode it again.

        Args:
            x: Tensor whose last dimension has size ``input_dim``.

        Returns:
            A ``(latent, reconstructed)`` tuple: the encoder output and the
            decoder output computed from it.
        """
        latent = self.encoder(x)
        reconstructed = self.decoder(latent)
        return latent, reconstructed

# Initialize and train
# Seed PyTorch's RNG before the model is built: the layer weights are drawn at
# construction time, so without a fixed seed the learned latent features
# change from run to run. (Exact results may still vary across PyTorch
# versions or hardware.)
torch.manual_seed(42)
model = Autoencoder(input_dim=num_input_features)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Full-batch training: every epoch is one optimizer step over all of X_tensor,
# minimising the reconstruction error against the input itself.
epochs = 10000
for epoch in range(epochs):
    latent, reconstructed = model(X_tensor)
    loss = criterion(reconstructed, X_tensor)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Log every 100 epochs, plus the last epoch so the final loss is reported.
    if epoch % 100 == 0 or epoch == epochs - 1:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')

# Extract the 2D latent features: run only the trained encoder, with autograd
# disabled so the output can be converted straight to a NumPy array.
with torch.no_grad():
    latent_features = model.encoder(X_tensor).numpy()
print("Latent features shape:", latent_features.shape)  # (num_samples, 2)

In [None]:
# Bare expression: shows the latent feature array as this cell's output.
latent_features

In [None]:
# Put the two latent dimensions into a DataFrame and attach the `qir` column
# from X (matched on the row index).
auto_data = pd.DataFrame(
    data=latent_features,
    columns=['auto1', 'auto2'],
).assign(qir=X['qir'])

In [None]:
# Carry a handful of columns over from X (as loaded, not the StandardScaler
# output) so they sit next to the latent features. Columns are added in the
# order listed here.
auto_data = auto_data.assign(
    **{
        name: X[name]
        for name in (
            'Mstar',
            'class_star',
            'log(S8/S45)',
            'log(S58/S36)',
            'log(S45/S36)',
        )
    }
)


In [None]:
# Write the assembled feature table to disk; index=False leaves the row index
# out of the CSV.
auto_data.to_csv(path_or_buf='X_auto_feats.csv', index=False)