In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
class TimeGAN(keras.Model):
    """Simplified TimeGAN-style model trained through a custom ``train_step``.

    Five sub-networks are created: embedder, recovery, generator and
    supervisor (all built by ``_build_network``) plus a discriminator. Every
    sub-network is two stacked GRU layers followed by a per-timestep Dense
    layer. The non-discriminator networks all project to ``n_features``
    channels, so the embedded ("hidden") representation has ``n_features``
    channels as well; ``hidden_dim`` sets the GRU width and the channel count
    of the noise fed to the generator.

    Args:
        seq_len: Number of time steps in the noise (and hence in generated
            sequences).
        n_features: Number of output channels per time step.
        hidden_dim: GRU units per layer and number of noise channels.
    """

    def __init__(self, seq_len, n_features, hidden_dim):
        super().__init__()
        self.seq_len = seq_len
        self.n_features = n_features
        self.hidden_dim = hidden_dim

        # Autoencoder pair (embedder/recovery) and the sequence generator.
        self.embedder = self._build_network(name="embedder")
        self.recovery = self._build_network(name="recovery")
        self.generator = self._build_network(name="generator")

        # Discriminator
        self.discriminator = self._build_discriminator()

        # Supervisor
        self.supervisor = self._build_network(name="supervisor")

    def _build_network(self, name):
        """Return a 2-layer GRU stack with a per-timestep Dense(n_features) head."""
        return keras.Sequential([
            keras.layers.GRU(units=self.hidden_dim, return_sequences=True),
            keras.layers.GRU(units=self.hidden_dim, return_sequences=True),
            keras.layers.TimeDistributed(keras.layers.Dense(units=self.n_features))
        ], name=name)

    def _build_discriminator(self):
        """Return a 2-layer GRU stack emitting one sigmoid score per time step."""
        return keras.Sequential([
            keras.layers.GRU(units=self.hidden_dim, return_sequences=True),
            keras.layers.GRU(units=self.hidden_dim, return_sequences=True),
            keras.layers.TimeDistributed(keras.layers.Dense(units=1, activation='sigmoid'))
        ], name="discriminator")

    def embed(self, x):
        """Run ``x`` through the embedder network."""
        return self.embedder(x)

    def supervise(self, h):
        """Run the hidden sequence ``h`` through the supervisor network."""
        return self.supervisor(h)

    def generate(self, z):
        """Run the noise sequence ``z`` through the generator network."""
        return self.generator(z)

    def reconstruct(self, h):
        """Run the hidden sequence ``h`` through the recovery network."""
        return self.recovery(h)

    def discriminate(self, x):
        """Return the discriminator's per-timestep sigmoid scores for ``x``."""
        return self.discriminator(x)

    @tf.function
    def train_step(self, real_data):
        """Run one joint update of the autoencoder, generator and discriminator.

        Args:
            real_data: Batch of real sequences, or a tuple/list whose first
                element is that batch (the form ``fit`` may pass when only
                inputs are supplied).

        Returns:
            Dict with the scalar losses ``e_loss``, ``g_loss`` and ``d_loss``.
        """
        # `fit` can wrap an inputs-only batch as `(x,)`; unwrap it so the
        # shape lookup and the networks see the tensor itself.
        if isinstance(real_data, (tuple, list)):
            real_data = real_data[0]

        batch_size = tf.shape(real_data)[0]
        random_noise = tf.random.normal(shape=(batch_size, self.seq_len, self.hidden_dim))

        with tf.GradientTape() as tape_g, tf.GradientTape() as tape_e, tf.GradientTape() as tape_d:
            # The embedding must be computed while the tapes are recording;
            # otherwise the embedder's variables receive no gradient from
            # e_loss and the embedder is never trained.
            hidden = self.embed(real_data)

            # Generator
            fake_data = self.generate(random_noise)

            # Supervisor output for the real embedding.
            supervised_hidden = self.supervise(hidden)

            # Discriminator
            real_output = self.discriminate(real_data)
            fake_output = self.discriminate(fake_data)

            # Reconstruction loss (scaled RMSE between input and its reconstruction).
            e_loss_t0 = tf.reduce_mean((real_data - self.reconstruct(hidden))**2)
            e_loss_0 = 10 * tf.sqrt(e_loss_t0)

            # Supervised loss: supervisor output at step t is compared with
            # the embedding at step t + 1.
            g_loss_s = tf.reduce_mean((hidden[:, 1:, :] - supervised_hidden[:, :-1, :])**2)

            # Unsupervised (adversarial) loss: generator wants fakes scored as 1.
            g_loss_u = tf.reduce_mean(tf.keras.losses.binary_crossentropy(tf.ones_like(fake_output), fake_output))

            # Discriminator loss: real scored as 1, fake scored as 0.
            d_loss_real = tf.reduce_mean(tf.keras.losses.binary_crossentropy(tf.ones_like(real_output), real_output))
            d_loss_fake = tf.reduce_mean(tf.keras.losses.binary_crossentropy(tf.zeros_like(fake_output), fake_output))
            d_loss = d_loss_real + d_loss_fake

            # Generator loss
            g_loss = g_loss_u + 100 * g_loss_s

            # Embedding network loss
            e_loss = e_loss_0 + 0.1 * g_loss_s

        # Variable lists are collected after the forward pass so that the
        # lazily-built Sequential networks have created their weights.
        e_variables = self.embedder.trainable_variables + self.recovery.trainable_variables
        g_variables = self.generator.trainable_variables + self.supervisor.trainable_variables
        d_variables = self.discriminator.trainable_variables

        # Compute gradients
        e_gradients = tape_e.gradient(e_loss, e_variables)
        g_gradients = tape_g.gradient(g_loss, g_variables)
        d_gradients = tape_d.gradient(d_loss, d_variables)

        # Apply gradients
        self.optimizer.apply_gradients(zip(e_gradients, e_variables))
        self.optimizer.apply_gradients(zip(g_gradients, g_variables))
        self.optimizer.apply_gradients(zip(d_gradients, d_variables))

        return {"e_loss": e_loss, "g_loss": g_loss, "d_loss": d_loss}

    def generate_samples(self, n_samples):
        """Return generator output for ``n_samples`` fresh Gaussian noise sequences."""
        random_noise = tf.random.normal(shape=(n_samples, self.seq_len, self.hidden_dim))
        return self.generate(random_noise)

In [None]:
def prepare_data(data, seq_len, train_split=0.8):
    """Standardize ``data``, cut it into sliding windows and split them in order.

    Args:
        data: 2-D array-like accepted by ``StandardScaler.fit_transform``;
            windows are taken over consecutive rows.
        seq_len: Number of consecutive rows per window. Must be at least 1
            and no larger than the number of rows in ``data``.
        train_split: Passed to ``train_test_split`` as ``train_size``.

    Returns:
        Tuple ``(train_data, test_data, scaler)`` where the first two are
        arrays of windows (earlier windows go to ``train_data`` because the
        split is not shuffled) and ``scaler`` is the fitted ``StandardScaler``.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1 or larger than the
            number of rows in ``data``.
    """
    if seq_len < 1:
        raise ValueError(f"seq_len must be at least 1, got {seq_len}")

    n_rows = len(data)
    if n_rows < seq_len:
        raise ValueError(
            f"data has {n_rows} rows, which is fewer than seq_len={seq_len}; "
            "no sequences can be created"
        )

    # Normalize the data
    # NOTE(review): the scaler is fitted on the whole series before the split,
    # so statistics of the test period influence the training windows. Fit on
    # the training portion only if test_data is used for evaluation.
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data)

    # Create one window per valid start index (stride 1, windows overlap).
    n_windows = n_rows - seq_len + 1
    sequences = np.stack(
        [scaled_data[start:start + seq_len] for start in range(n_windows)]
    )

    # Split into train and test sets, preserving order.
    train_data, test_data = train_test_split(sequences, train_size=train_split, shuffle=False)

    return train_data, test_data, scaler

# 2. Initialize and train the model
def train_timegan(train_data, epochs=100, batch_size=32, hidden_dim=24, learning_rate=0.001):
    """Build a ``TimeGAN`` sized for ``train_data`` and train it.

    Args:
        train_data: 3-D array whose last two dimensions are taken as
            ``(seq_len, n_features)``.
        epochs: Number of passes over ``train_data``.
        batch_size: Batch size passed to ``fit``.
        hidden_dim: ``hidden_dim`` passed to ``TimeGAN``.
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        The trained ``TimeGAN`` model.

    Raises:
        ValueError: If ``train_data`` is not 3-dimensional.
    """
    if len(train_data.shape) != 3:
        raise ValueError(
            "train_data must be 3-D (n_sequences, seq_len, n_features), "
            f"got shape {tuple(train_data.shape)}"
        )
    seq_len, n_features = train_data.shape[1:]

    model = TimeGAN(seq_len, n_features, hidden_dim)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate))

    # A single fit call runs all epochs; Keras reports "Epoch i/N" itself, so
    # no manual per-epoch loop or print is needed.
    model.fit(train_data, epochs=epochs, batch_size=batch_size)

    return model

# 3. Generate synthetic samples
def generate_synthetic_samples(model, n_samples):
    """Ask ``model`` for ``n_samples`` synthetic sequences.

    Thin wrapper that delegates to ``model.generate_samples`` and returns
    whatever that method returns.
    """
    synthetic = model.generate_samples(n_samples)
    return synthetic

# 4. Combine original and synthetic data
def augment_dataset(original_data, synthetic_data):
    """Stack the synthetic samples after the original ones along axis 0.

    Returns a new array holding ``original_data`` followed by
    ``synthetic_data``.
    """
    combined = np.concatenate((original_data, synthetic_data), axis=0)
    return combined

# 5. Validate the augmented dataset
def validate_augmented_data(original_data, augmented_data):
    """Print shapes and per-channel mean/std of both datasets for comparison.

    Statistics are reduced over the first two axes, leaving one value per
    entry of the last axis. Nothing is returned.
    """
    reduce_axes = (0, 1)

    def _print_stats(values):
        # Mean first, then standard deviation, each on its own line.
        print(np.mean(values, axis=reduce_axes))
        print(np.std(values, axis=reduce_axes))

    # Compare basic statistics
    print("Original data shape:", original_data.shape)
    print("Augmented data shape:", augmented_data.shape)

    print("\nOriginal data statistics:")
    _print_stats(original_data)

    print("\nAugmented data statistics:")
    _print_stats(augmented_data)

In [None]:
# 1. Prepare the data
# NOTE(review): `data` is not defined in any cell visible here; it has to be
# loaded before this cell runs, otherwise Restart & Run All fails with a
# NameError. It is passed to StandardScaler.fit_transform inside
# prepare_data, so presumably a 2-D (time steps x features) array — confirm.
seq_len = 24  # Adjust based on your needs
train_data, test_data, scaler = prepare_data(data, seq_len)

# 2. Train the model
model = train_timegan(train_data, epochs=100)

# 3. Generate synthetic samples
# The model is trained on the standardized windows, so the samples are in
# that scaled space as well.
n_synthetic = len(train_data)  # Generate as many synthetic samples as original
synthetic_data = generate_synthetic_samples(model, n_synthetic)

# 4. Combine original and synthetic data
augmented_data = augment_dataset(train_data, synthetic_data)

# 5. Validate the augmented dataset
validate_augmented_data(train_data, augmented_data)