In [13]:
import anndata
import scanpy as sc
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, optimizers
import pandas as pd
import umap
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

adata = sc.read('/tmp/work/RCproject_code/sce_export.h5ad')



In [14]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Get domain labels from the 'batch' column in your AnnData object (as a pandas Series)
domain_labels = adata.obs['batch'].values  # Convert Series to NumPy array

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)

# Fit and transform the domain labels to get a one-hot encoded matrix
domain_one_hot = encoder.fit_transform(domain_labels.reshape(-1, 1))

# Print the one-hot encoded matrix
print(domain_one_hot)

[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [15]:
#normalizaiton

gene_expression_data = adata.layers['logcounts']
from sklearn.preprocessing import MinMaxScaler

# Min-max normalization
scaler = MinMaxScaler()
gene_expression_data = scaler.fit_transform(gene_expression_data)

number_samples = adata.shape[0]
number_genes = adata.shape[1]

In [4]:
# define the encooder

input_dim = number_genes
encoding_dim = 32  # Dimensionality of the encoding space

# Encoder Model
input_layer = layers.Input(shape=(input_dim,))
encoded = layers.Dense(128, activation='relu')(input_layer)
encoded = layers.BatchNormalization()(encoded)
encoded = layers.Dropout(0.2)(encoded)
encoded = layers.Dense(64, activation='relu')(encoded)
encoded = layers.BatchNormalization()(encoded)
encoded_output = layers.Dense(encoding_dim, activation='relu')(encoded)

# Build the encoder model
encoder = models.Model(inputs=input_layer, outputs=encoded_output)

In [5]:
# define the decoder

# Decoder Model
decoded = layers.Dense(64, activation='relu')(encoded_output)
decoded = layers.BatchNormalization()(decoded)
decoded = layers.Dense(128, activation='relu')(decoded)
decoded_output = layers.Dense(input_dim, activation='sigmoid')(decoded)

# Build the autoencoder model (encoder + decoder)
autoencoder = models.Model(inputs=input_layer, outputs=decoded_output)

In [6]:
# Define the domain discriminator

# Get the number of unique domains (batches)
num_domains = len(domain_labels)

# Domain Discriminator Model
discriminator_input = layers.Input(shape=(encoding_dim,))
discriminator_hidden = layers.Dense(64, activation='relu')(discriminator_input)
discriminator_hidden = layers.BatchNormalization()(discriminator_hidden)
discriminator_hidden = layers.Dropout(0.2)(discriminator_hidden)

# Use 'num_domains' to specify the number of output units (one for each domain)
discriminator_output = layers.Dense(num_domains, activation='softmax')(discriminator_hidden)

# Build the discriminator model
discriminator = models.Model(inputs=discriminator_input, outputs=discriminator_output)

In [7]:
# Compile the autoencoder (reconstruction task)
autoencoder.compile(optimizer='adam', loss='mse')

# Compile the domain discriminator (domain classification task)
discriminator.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [8]:
# Freeze the discriminator while training the encoder
discriminator.trainable = False

# Adversarial Model (Encoder + Discriminator)
encoded_repr = encoder(input_layer)  # Shared encoder
domain_pred = discriminator(encoded_repr)  # Domain prediction from the encoder

# Build the adversarial model (encoder tries to fool the discriminator)
adversarial_model = models.Model(inputs=input_layer, outputs=[decoded_output, domain_pred])
adversarial_model.compile(optimizer='adam', loss=['mse', 'categorical_crossentropy'], 
                          loss_weights=[1, 0.1])  # Weighted losses

In [9]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from keras.callbacks import EarlyStopping

# OneHotEncode domain labels (not the neural network encoder)
one_hot_encoder = OneHotEncoder(sparse_output=False)

# Split data into training and testing
X_train, X_test, y_train_domains, y_test_domains = train_test_split(
    gene_expression_data, domain_labels, test_size=0.2
)

# Convert y_train_domains and y_test_domains to NumPy arrays (if needed)
y_train_domains_np = np.array(y_train_domains)
y_test_domains_np = np.array(y_test_domains)

# Apply OneHotEncoding to the domain labels
y_train_domains_np_onehot = one_hot_encoder.fit_transform(y_train_domains_np.reshape(-1, 1))
y_test_domains_np_onehot = one_hot_encoder.transform(y_test_domains_np.reshape(-1, 1))

# Define the neural network encoder (this is separate from the OneHotEncoder)
encoding_dim = 64  # Example latent space dimension
input_dim = X_train.shape[1]  # Number of features in your input data

encoder_input = layers.Input(shape=(input_dim,))
encoder_hidden = layers.Dense(128, activation='relu')(encoder_input)
encoder_output = layers.Dense(encoding_dim, activation='relu')(encoder_hidden)
encoder = models.Model(inputs=encoder_input, outputs=encoder_output)

# Early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

epochs = 100
batch_size = 32

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    
    # Train the autoencoder
    autoencoder.fit(X_train, X_train, epochs=1, batch_size=batch_size, shuffle=True, validation_split=0.2, callbacks=[early_stopping])
    
    # Get encoded data from the neural network encoder, not the OneHotEncoder
    encoded_data = encoder.predict(X_train)  # The neural network encoder model
    
    # Train the domain discriminator using the encoded data and one-hot encoded domain labels
    discriminator.fit(encoded_data, y_train_domains_np_onehot, epochs=1, batch_size=batch_size, shuffle=True, validation_split=0.2)
    
    # Train the adversarial model (encoder trying to fool the discriminator)
    adversarial_model.fit(X_train, [X_train, y_train_domains_np_onehot], epochs=1, batch_size=batch_size, validation_split=0.2, shuffle=True)

# After training, get the encoded representations
encoded_representations = encoder.predict(gene_expression_data)

Epoch 1/100


ValueError: in user code:

    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1023, in train_step
        y_pred = self(x, training=True)
    File "/opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "model_2" is incompatible with the layer: expected shape=(None, 32), found shape=(32, 64)


In [10]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from keras.callbacks import EarlyStopping
from keras import layers, models

# OneHotEncode domain labels (not the neural network encoder)
one_hot_encoder = OneHotEncoder(sparse_output=False)

# Split data into training and testing
X_train, X_test, y_train_domains, y_test_domains = train_test_split(
    gene_expression_data, domain_labels, test_size=0.2
)

# Convert y_train_domains and y_test_domains to NumPy arrays (if needed)
y_train_domains_np = np.array(y_train_domains)
y_test_domains_np = np.array(y_test_domains)

# Apply OneHotEncoding to the domain labels
y_train_domains_np_onehot = one_hot_encoder.fit_transform(y_train_domains_np.reshape(-1, 1))
y_test_domains_np_onehot = one_hot_encoder.transform(y_test_domains_np.reshape(-1, 1))

# Define the neural network encoder (this is separate from the OneHotEncoder)
encoding_dim = 32  # Adjusted to match the discriminator input
input_dim = X_train.shape[1]  # Number of features in your input data

encoder_input = layers.Input(shape=(input_dim,))
encoder_hidden = layers.Dense(128, activation='relu')(encoder_input)
encoder_output = layers.Dense(encoding_dim, activation='relu')(encoder_hidden)
encoder = models.Model(inputs=encoder_input, outputs=encoder_output)

# Define the domain discriminator
discriminator_input = layers.Input(shape=(encoding_dim,))
discriminator_hidden = layers.Dense(64, activation='relu')(discriminator_input)
discriminator_output = layers.Dense(y_train_domains_np_onehot.shape[1], activation='softmax')(discriminator_hidden)
discriminator = models.Model(inputs=discriminator_input, outputs=discriminator_output)

# Compile models
encoder.compile(optimizer='adam', loss='mse')
discriminator.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

epochs = 100
batch_size = 32

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    
    # Train the autoencoder
    autoencoder.fit(X_train, X_train, epochs=1, batch_size=batch_size, shuffle=True, validation_split=0.2, callbacks=[early_stopping])
    
    # Get encoded data from the neural network encoder, not the OneHotEncoder
    encoded_data = encoder.predict(X_train)  # The neural network encoder model
    
    # Train the domain discriminator using the encoded data and one-hot encoded domain labels
    discriminator.fit(encoded_data, y_train_domains_np_onehot, epochs=1, batch_size=batch_size, shuffle=True, validation_split=0.2)
    
    # Train the adversarial model (encoder trying to fool the discriminator)
    adversarial_model.fit(X_train, [X_train, y_train_domains_np_onehot], epochs=1, batch_size=batch_size, validation_split=0.2, shuffle=True)

# After training, get the encoded representations
encoded_representations = encoder.predict(gene_expression_data)

Epoch 1/100


ValueError: in user code:

    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1024, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1082, in compute_loss
        return self.compiled_loss(
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/opt/conda/lib/python3.10/site-packages/keras/losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "/opt/conda/lib/python3.10/site-packages/keras/losses.py", line 284, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/opt/conda/lib/python3.10/site-packages/keras/losses.py", line 2004, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "/opt/conda/lib/python3.10/site-packages/keras/backend.py", line 5532, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (32, 9) and (32, 450) are incompatible


In [16]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from keras.callbacks import EarlyStopping
from keras import layers, models

# OneHotEncode domain labels
one_hot_encoder = OneHotEncoder(sparse_output=False)

# Split data into training and testing
X_train, X_test, y_train_domains, y_test_domains = train_test_split(
    gene_expression_data, domain_labels, test_size=0.2
)

# Convert y_train_domains and y_test_domains to NumPy arrays (if needed)
y_train_domains_np = np.array(y_train_domains)
y_test_domains_np = np.array(y_test_domains)

# Apply OneHotEncoding to the domain labels
y_train_domains_np_onehot = one_hot_encoder.fit_transform(y_train_domains_np.reshape(-1, 1))
y_test_domains_np_onehot = one_hot_encoder.transform(y_test_domains_np.reshape(-1, 1))

# Define the neural network encoder
encoding_dim = 32  # Set to match the expected input of the discriminator
input_dim = X_train.shape[1]  # Number of features in your input data

# Encoder Model
encoder_input = layers.Input(shape=(input_dim,))
encoder_hidden = layers.Dense(128, activation='relu')(encoder_input)
encoder_output = layers.Dense(encoding_dim, activation='relu')(encoder_hidden)
encoder = models.Model(inputs=encoder_input, outputs=encoder_output)

# Define the domain discriminator
discriminator_input = layers.Input(shape=(encoding_dim,))
discriminator_hidden = layers.Dense(64, activation='relu')(discriminator_input)
discriminator_output = layers.Dense(y_train_domains_np_onehot.shape[1], activation='softmax')(discriminator_hidden)
discriminator = models.Model(inputs=discriminator_input, outputs=discriminator_output)

# Compile models
encoder.compile(optimizer='adam', loss='mse')
discriminator.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Define the adversarial model
# This model takes the original input and the domain label and outputs the original input
adversarial_input = layers.Input(shape=(input_dim,))
adversarial_encoded = encoder(adversarial_input)
adversarial_output = discriminator(adversarial_encoded)  # Use the discriminator's output
adversarial_model = models.Model(inputs=adversarial_input, outputs=[adversarial_encoded, adversarial_output])

# Compile the adversarial model
adversarial_model.compile(optimizer='adam', 
                          loss={'encoder': 'mse', 'discriminator': 'categorical_crossentropy'},
                          metrics=['accuracy'])

# Early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

epochs = 100
batch_size = 32

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    
    # Train the autoencoder
    autoencoder.fit(X_train, X_train, epochs=1, batch_size=batch_size, shuffle=True, validation_split=0.2, callbacks=[early_stopping])
    
    # Get encoded data from the neural network encoder
    encoded_data = encoder.predict(X_train)  # The neural network encoder model
    
    # Train the domain discriminator using the encoded data and one-hot encoded domain labels
    discriminator.fit(encoded_data, y_train_domains_np_onehot, epochs=1, batch_size=batch_size, shuffle=True, validation_split=0.2)
    
    # Train the adversarial model
    adversarial_model.fit(X_train, [X_train, y_train_domains_np_onehot], epochs=1, batch_size=batch_size, validation_split=0.2, shuffle=True)

    # Assuming the model is defined with appropriate outputs
    adversarial_model = models.Model(inputs=input_layer, outputs=[encoder_output, discriminator_output])
    
    # Compile the model with correct loss functions
    adversarial_model.compile(optimizer='adam', loss={'model_7': 'mse', 'model_8': 'categorical_crossentropy'})
    
    # Fit the model with appropriate data
    adversarial_model.fit(X_train, [X_train, y_train_domains_np_onehot], epochs=1, batch_size=batch_size, validation_split=0.2, shuffle=True)

# After training, get the encoded representations
encoded_representations = encoder.predict(gene_expression_data)

Epoch 1/100


ValueError: in user code:

    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1024, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1082, in compute_loss
        return self.compiled_loss(
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 240, in __call__
        self.build(y_pred)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 181, in build
        self._losses = self._conform_to_outputs(y_pred, self._losses)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 60, in _conform_to_outputs
        struct = map_to_output_names(outputs, self._output_names, struct)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 805, in map_to_output_names
        raise ValueError(

    ValueError: Found unexpected losses or metrics that do not correspond to any Model output: dict_keys(['encoder', 'discriminator']). Valid mode output names: ['model_10', 'model_11']. Received struct is: {'encoder': 'mse', 'discriminator': 'categorical_crossentropy'}.


In [18]:
X_train.shape

(360, 12165)

In [19]:
y_train_domains_np_onehot.shape

(360, 9)