In [19]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
import keras.ops as ops


In [10]:
df = pd.read_csv('data_wrt_stress.csv')

df = df.dropna()

In [None]:
df['smoke_status_num']= df['smoke_status'].apply(lambda x: 0 if x=='never' else 1)

In [15]:
df['smoke_status_num'].value_counts()

smoke_status_num
0    1658
1    1160
Name: count, dtype: int64

In [16]:
X = df[["dpq_total", "hscrp_mg_l_raw", "smoke_status_num",
        "pir", "bmx_bmi"]].values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [26]:

from keras.optimizers import Adam
from keras import Model

# --- Configuration ---
# Assuming X_scaled and df are defined (e.g., from a previous data loading step)
latent_dim = 2 
input_dim = X_scaled.shape[1]

# --- Sampling Layer Function (Remains the same) ---
def sampling(args):
    z_mean, z_log_var = args
    eps = tf.random.normal(shape=tf.shape(z_mean))
    return z_mean + ops.exp(0.5 * z_log_var) * eps

# --- 1. Define Encoder (Functional Model) ---
encoder_inputs = keras.Input(shape=(input_dim,))
h_enc = layers.Dense(32, activation="relu")(encoder_inputs)
h_enc = layers.Dense(16, activation="relu")(h_enc)
z_mean = layers.Dense(latent_dim, name="z_mean")(h_enc)
z_log_var = layers.Dense(latent_dim, name="z_log_var")(h_enc)
z = layers.Lambda(sampling, output_shape=(latent_dim,), name="z_sample")([z_mean, z_log_var])
encoder = Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")


# --- 2. Define Decoder (Functional Model) ---
decoder_input = keras.Input(shape=(latent_dim,), name="decoder_input")
d_dec = layers.Dense(16, activation="relu")(decoder_input)
d_dec = layers.Dense(32, activation="relu")(d_dec)
outputs = layers.Dense(input_dim, name="decoder_output")(d_dec)
decoder = Model(decoder_input, outputs, name="decoder")


# --- 3. Define Custom VAE Model (Subclassing) ---
class VAE(Model):
    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        # self.mse_loss_fn = keras.losses.MeanSquaredError(reduction='none') # REMOVED

    def call(self, inputs):
        # Forward pass: Encode, Sample, Decode
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstruction = self.decoder(z)
        return reconstruction

    def train_step(self, data):
        # Unpack the data (since X_scaled is both input and target)
        x = data[0]

        with tf.GradientTape() as tape:
            # Forward pass
            z_mean, z_log_var, z = self.encoder(x)
            reconstruction = self.decoder(z)

            # --- VAE Loss Calculation (FIXED) ---
            
            # 1. Reconstruction Loss (FIX: Use manual ops calculation)
            # This explicitly calculates the squared difference element-wise,
            # ensuring the shape is (batch_size, input_dim).
            reconstruction_error = ops.square(x - reconstruction)

            # Sum across the feature dimension (axis=1), then mean across the batch
            reconstruction_loss = ops.mean(ops.sum(reconstruction_error, axis=1))

            # 2. KL Divergence Loss
            kl_loss = -0.5 * ops.mean(
                ops.sum(1 + z_log_var - ops.square(z_mean) - ops.exp(z_log_var), axis=1)
            )

            # Total VAE Loss
            total_loss = reconstruction_loss + kl_loss

        # Compute gradients and update weights
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(total_loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        # Return a dict mapping metric names to current values
        return {
            "loss": total_loss,
            "reconstruction_loss": reconstruction_loss,
            "kl_loss": kl_loss,
        }

# --- Initialization and Training (Unchanged) ---
vae = VAE(encoder, decoder)
vae.compile(optimizer=Adam())

print("Starting VAE training...")
vae.fit(X_scaled, X_scaled, epochs=50, batch_size=32, verbose=1)
print("VAE training complete.")

# --- Encoding and Saving (using the original encoder for simplicity) ---
# NOTE: The custom VAE class uses an internal encoder, which is the same model defined above.
# We can use the separate encoder model for prediction.
latent = encoder.predict(X_scaled)[0] # Get z_mean for embedding

df["latent1"] = latent[:, 0]
df["latent2"] = latent[:, 1]

df.to_csv("vae_latent_scores.csv", index=False)

print("Saved latent scores to vae_latent_scores.csv")

Starting VAE training...
Epoch 1/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - kl_loss: 0.0839 - loss: 3.3907 - reconstruction_loss: 3.3068
Epoch 2/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - kl_loss: 0.5249 - loss: 4.3889 - reconstruction_loss: 3.8640
Epoch 3/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - kl_loss: 1.1213 - loss: 5.3731 - reconstruction_loss: 4.2519
Epoch 4/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - kl_loss: 1.3361 - loss: 5.5080 - reconstruction_loss: 4.1719
Epoch 5/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - kl_loss: 0.6209 - loss: 2.8371 - reconstruction_loss: 2.2161
Epoch 6/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - kl_loss: 0.8466 - loss: 4.1976 - reconstruction_loss: 3.3510
Epoch 7/50
[1m89/89[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - kl_loss