In [1]:
import anndata as ad
import scanpy as sc
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
import pandas as pd
import umap
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from keras.callbacks import EarlyStopping
import seaborn as sb
from scipy.spatial.distance import pdist
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import pearsonr
import time
from keras import layers, models

adata = sc.read('/tmp/work/RCproject_code/sce_export.h5ad')
epoch_count = 0

  from pandas.core.computation.check import NUMEXPR_INSTALLED
2024-10-22 14:32:48.431320: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#params
simplifly_categories = True
dropout_rate = 0.1  #dropout rate for regulraization
low_dim_rep_number = 128
current_lambda_value = 10

In [3]:
def quantile_normalization(adata):
    # Extract the counts matrix
    counts = adata.X.toarray() if hasattr(adata.X, 'toarray') else np.array(adata.X)

    # Sort each column (gene expression values) and store sorted values and original indices
    ordered_results = []
    for i in range(counts.shape[1]):
        ordered_indices = np.argsort(-counts[:, i])  # Sort in decreasing order
        ordered_values = counts[ordered_indices, i]
        ordered_results.append({'ordered_values': ordered_values, 'original_indices': ordered_indices})
    
    # Create matrix of ordered values
    ordered_matrix = np.column_stack([result['ordered_values'] for result in ordered_results])

    # Compute row means of the ordered matrix
    row_averages = np.mean(ordered_matrix, axis=1)

    # Replace each row's values with the row's average
    average_matrix = np.tile(row_averages[:, np.newaxis], counts.shape[1])

    # Reconstruct the original matrix based on the saved indices
    reconstructed_matrix = np.zeros_like(counts)
    for i in range(counts.shape[1]):
        original_indices = ordered_results[i]['original_indices']
        reconstructed_matrix[original_indices, i] = average_matrix[:, i]

    # Add quantile normalized counts as a new layer in AnnData (log-transformed)
    adata.layers['quantile_norm'] = np.log1p(reconstructed_matrix)

    return adata

#added quantile normlaization
adata = quantile_normalization(adata)

In [4]:
# generate numerical values for each batch category
# set up categories variable
categories = adata.obs['batch']

#collpse the categories to microARRAY vs sequencing

if simplifly_categories:
    category_map = {'GSE133057': 'micro', 'GSE145037': 'micro', 'GSE150082': 'micro','GSE190826':'seq','GSE209746':'seq',
                    'GSE45404_GPL1': 'micro', 'GSE45404_GPL2': 'micro', 'GSE93375': 'micro','GSE94104': 'micro'}
    categories = np.vectorize(category_map.get)(categories)
    
# Create a LabelEncoder instance
label_encoder = LabelEncoder()
# Fit and transform the categories to integers
numerical_categories = label_encoder.fit_transform(categories)
numerical_categories.shape

(450,)

In [5]:
#normalizaiton
gene_expression_data = adata.layers['scalelogcounts']

# Min-max normalization
scaler = MinMaxScaler()
gene_expression_data = scaler.fit_transform(gene_expression_data)
number_samples = adata.shape[0]
number_genes = adata.shape[1]
input_dim = number_genes

In [6]:
cat_count = len(np.unique(numerical_categories))

In [7]:
# current correlations betweent the groups

current = gene_expression_data
if len(np.unique(numerical_categories)) == 3:
    #caculate the average expression array for the categories of interest
    seq1_mean = np.mean(current[np.where(categories == 'seq1')],axis = 0, keepdims = True)
    seq2_mean = np.mean(current[np.where(categories == 'seq2')],axis = 0, keepdims = True)
    micro_mean = np.mean(current[np.where(categories == 'micro')],axis = 0, keepdims = True)
    #calculate the pearson correlations
    seq1_v_micro, pval = pearsonr(seq1_mean.ravel(), micro_mean.ravel())
    seq1_v_seq2, pval = pearsonr(seq1_mean.ravel(), seq2_mean.ravel())
    seq2_v_micro, pval = pearsonr(seq2_mean.ravel(), micro_mean.ravel())
    
    print(seq1_v_micro)
    print(seq1_v_seq2)
    print(seq2_v_micro)

elif len(np.unique(numerical_categories)) == 2:
    #caculate the average expression array for the categories of interest
    seq_mean = np.mean(current[np.where(categories == 'seq')],axis = 0, keepdims = True)
    micro_mean = np.mean(current[np.where(categories == 'micro')],axis = 0, keepdims = True)
    #calculate the pearson correlations
    seq_v_micro, pval = pearsonr(seq_mean.ravel(), micro_mean.ravel())
    
    print(seq_v_micro)


0.35582272048628216


In [8]:
# define the network architecture with functions for the encoder, decoder and discriminator

# Define the input shape
input_shape = (gene_expression_data.shape[1],)[0]  # Number of genes

# Define the encoder function
def build_encoder():
    model = models.Sequential()
    model.add(layers.Input(shape=(input_shape,)))  # Input shape matches your data

    # First layer with dropout
    model.add(layers.Dense((low_dim_rep_number*8), activation='linear'))  # Increased units
    model.add(layers.LeakyReLU())
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(dropout_rate))  # Add dropout for regularization

    # Second layer
    model.add(layers.Dense((low_dim_rep_number*4), activation='linear'))  # Increased units
    model.add(layers.LeakyReLU())
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(dropout_rate))  # Add dropout

    # Third layer
    model.add(layers.Dense((low_dim_rep_number*2), activation='linear'))  # Increased units
    model.add(layers.LeakyReLU())
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(dropout_rate))  # Add dropout

    # Encoded representation layer
    model.add(layers.Dense(low_dim_rep_number, activation='linear'))  # Output encoded representation
    return model

# Define the decoder function (adjusted as discussed)
def build_decoder():    
    model = models.Sequential()
    model.add(layers.Input(shape=(low_dim_rep_number,)))  # Input shape should match the output of the encoder
    model.add(layers.Dense((low_dim_rep_number*2), activation='linear'))  
    model.add(layers.LeakyReLU())    
    model.add(layers.BatchNormalization())
    model.add(layers.Dense((low_dim_rep_number*4), activation='linear'))  
    model.add(layers.LeakyReLU())    
    model.add(layers.BatchNormalization())
    model.add(layers.Dense(input_shape, activation='sigmoid'))  # Output layer should match the input shape of the original data
    return model

## defines is the discriminator function
def build_domain_classifier(num_domains):
    model = keras.Sequential()
    model.add(layers.Input(shape=low_dim_rep_number))
    
    #Increased capacity with more units and layers
    model.add(layers.Dense((low_dim_rep_number*2), activation='relu'))  # Increased units and changed activation
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(dropout_rate))  # Added dropout for regularization
    
    model.add(layers.Dense((low_dim_rep_number/2), activation='relu'))  # Increased units
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(dropout_rate))  # Added dropout for regularization

    model.add(layers.Dense((low_dim_rep_number/4), activation='relu'))  # Increased units
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(dropout_rate))  # Added dropout for regularization
    
    model.add(layers.Dense((low_dim_rep_number/8), activation='relu'))  # Maintain units
    model.add(layers.BatchNormalization())
    
    # Output layer with softmax activation
    model.add(layers.Dense(num_domains, activation='softmax'))  # num_domains is the number of classes
    
    return model

#gradient reversal takes lambda as an argument, will ensure that the encoder is trained to work against the discimatinator when the lambda is positive
class GradientReversalLayer(tf.keras.layers.Layer):
    def __init__(self, lambda_value=current_lambda_value, **kwargs):
        self.lambda_value = lambda_value
        super(GradientReversalLayer, self).__init__(**kwargs)

    def call(self, inputs):
        # Define custom gradient behavior
        @tf.custom_gradient
        def reverse_gradients(x):
            # Forward pass: output is just the input
            def grad(dy):
                # Gradient computation: reversed and scaled by lambda_value
                return -self.lambda_value * dy
            return x, grad
        
        return reverse_gradients(inputs)

    def get_config(self):
        config = super().get_config()
        config.update({"lambda_value": self.lambda_value})
        return config

In [9]:
# Define the encoder, decoder, and discriminator (assuming these functions are defined as in previous examples)
encoder = build_encoder()  # Assuming build_encoder() is defined
decoder = build_decoder()  # Assuming build_decoder() is defined
discriminator = build_domain_classifier(len(np.unique(categories)))  # Assuming build_domain_classifier() is defined

# Optimizers for each model
encoder_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
decoder_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Assuming you have 1000 training samples as an example
num_samples = adata.shape[0]  # Replace this with your actual number of samples
batch_size = 128  #determines how many samples are processed per batch, each epoch will process multiple batches

# Calculate the number of steps per epoch
num_steps_per_epoch = num_samples // batch_size

# Compile the discriminator
discriminator.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Combine the encoder and decoder into a single model (autoencoder)
autoencoder = tf.keras.Model(inputs=encoder.input, outputs=decoder(encoder.output))

# Compile the autoencoder with the appropriate loss function (e.g., mean squared error)
autoencoder.compile(optimizer=encoder_optimizer, loss='mean_squared_error')

num_domains = len(np.unique(categories))
X_data = gene_expression_data
y_domains = numerical_categories

X_train = X_data
y_train = y_domains

In [None]:
num_epochs = 4000

# Define the training step
def train_step(data, labels):
    with tf.GradientTape(persistent=True) as tape:
        # Forward pass through the encoder and decoder
        encoded_output = encoder(data)
        reconstructed_output = decoder(encoded_output)

        # Compute the autoencoder loss
        ae_loss = tf.keras.losses.mean_squared_error(data, reconstructed_output)
        ae_loss = tf.reduce_mean(ae_loss)  # Average over the batch
        
        # Forward pass through the GRL and discriminator
        grl_output = GradientReversalLayer()(encoded_output)
        domain_predictions = discriminator(grl_output)

        # Compute the discriminator loss
        disc_loss = tf.keras.losses.sparse_categorical_crossentropy(labels, domain_predictions)
        disc_loss = tf.reduce_mean(disc_loss)  # Average over the batch

    # Compute gradients for the autoencoder (only for ae_loss)
    encoder_grads_ae = tape.gradient(ae_loss, encoder.trainable_variables)
    decoder_grads = tape.gradient(ae_loss, decoder.trainable_variables)

    # Compute gradients for the discriminator (from disc_loss)
    discriminator_grads = tape.gradient(disc_loss, discriminator.trainable_variables)

    # Compute gradients for the encoder from the discriminator loss
    encoder_grads_disc = tape.gradient(disc_loss, encoder.trainable_variables)

    # Update weights
    encoder_optimizer.apply_gradients(zip(encoder_grads_ae + encoder_grads_disc, encoder.trainable_variables))
    decoder_optimizer.apply_gradients(zip(decoder_grads, decoder.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(discriminator_grads, discriminator.trainable_variables))

    return ae_loss, disc_loss

# Training loop
for epoch in range(num_epochs):
    epoch_count += 1
    total_disc_loss = 0.0  # To accumulate discriminator loss
    
    for step in range(num_steps_per_epoch):
        # Get your batch data
        data = X_train[step * batch_size:(step + 1) * batch_size]
        labels = y_train[step * batch_size:(step + 1) * batch_size]

        ae_loss, disc_loss = train_step(data, labels)
        total_disc_loss += disc_loss.numpy()  # Accumulate the discriminator loss
        print(f'Epoch {epoch}, Step {step}, AE Loss: {ae_loss.numpy()}, Disc Loss: {disc_loss.numpy()}')


    # Calculate average discriminator loss
    avg_disc_loss = total_disc_loss / num_steps_per_epoch

    # Check the condition for discriminator loss
    if avg_disc_loss > np.log(cat_count) and epoch_count > 500:
        print('The discriminator does not have a clue!')
        break

Epoch 0, Step 0, AE Loss: 0.02782638743519783, Disc Loss: 0.7084677815437317
Epoch 0, Step 1, AE Loss: 0.03032797947525978, Disc Loss: 0.2228688895702362
Epoch 0, Step 2, AE Loss: 0.03563915565609932, Disc Loss: 4.297031402587891
Epoch 1, Step 0, AE Loss: 0.028111644089221954, Disc Loss: 0.36423593759536743
Epoch 1, Step 1, AE Loss: 0.02642027847468853, Disc Loss: 0.2351483404636383
Epoch 1, Step 2, AE Loss: 0.028538906946778297, Disc Loss: 2.767383098602295
Epoch 2, Step 0, AE Loss: 0.03232237696647644, Disc Loss: 0.15738537907600403
Epoch 2, Step 1, AE Loss: 0.02444988302886486, Disc Loss: 0.2999853491783142
Epoch 2, Step 2, AE Loss: 0.025213012471795082, Disc Loss: 2.1169919967651367
Epoch 3, Step 0, AE Loss: 0.02806149236857891, Disc Loss: 0.08937004953622818
Epoch 3, Step 1, AE Loss: 0.02324926294386387, Disc Loss: 0.2900339663028717
Epoch 3, Step 2, AE Loss: 0.021164868026971817, Disc Loss: 1.6256892681121826
Epoch 4, Step 0, AE Loss: 0.02780407853424549, Disc Loss: 0.56711524724

In [None]:
print(epoch_count)

In [None]:
#grab the represetations from the trained model
low_dimensional_representation = encoder.predict(gene_expression_data)

#verify the shape
print(low_dimensional_representation.shape)

#set up the umap

umap_model = umap.UMAP(n_neighbors=15,   # Controls local vs global structure
                        min_dist=0.1,    # Controls how tightly UMAP packs points together
                        metric='euclidean')  # Distance metric to use

umap_result = umap_model.fit_transform(low_dimensional_representation)

#checking batch separation
current_label =  adata.obs['batch']
umap_data = {'UMAP1': umap_result[:, 0], 'UMAP2': umap_result[:, 1],'batch':current_label}
umap_df = pd.DataFrame(data=umap_data)
ax = sb.scatterplot(data=umap_df, x='UMAP1', y='UMAP2', hue='batch')
sb.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

In [None]:
# use the embeddings for random forest classification
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [None]:
## random forest with original data
indices = [i for i, s in enumerate(adata.obs['Response']) if s in ["yes", "no"]]
y = adata.obs['Response'][indices]
y = [1 if x == "yes" else 0 for x in y]
y = np.array(y)
X = X_data[indices]

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y, test_size=0.3, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_r, y_train_r)

y_pred_prob = clf.predict_proba(X_test_r)[:, 1]  # Probabilities for the positive class (class 1)

fpr, tpr, thresholds = roc_curve(y_test_r, y_pred_prob)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # Diagonal line for random guessing
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
## random forest with transformed data
indices = [i for i, s in enumerate(adata.obs['Response']) if s in ["yes", "no"]]
y = adata.obs['Response'][indices]
y = [1 if x == "yes" else 0 for x in y]
y = np.array(y)
X = low_dimensional_representation[indices]

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y, test_size=0.3, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_r, y_train_r)

y_pred_prob = clf.predict_proba(X_test_r)[:, 1]  # Probabilities for the positive class (class 1)

fpr, tpr, thresholds = roc_curve(y_test_r, y_pred_prob)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # Diagonal line for random guessing
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()