In [1]:
import anndata as ad
import scanpy as sc
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
import pandas as pd
import umap
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from keras.callbacks import EarlyStopping
import seaborn as sb
from scipy.spatial.distance import pdist
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import pearsonr
import time
from keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.decomposition import PCA

adata = sc.read('/tmp/work/RCproject_code/sce_export.h5ad')
epoch_count = 0

#exclude partial
adata = adata[~adata.obs['Response'].isin(['partial']), :]

  from pandas.core.computation.check import NUMEXPR_INSTALLED
2024-10-29 13:40:24.689263: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
test_set_size = 0.2
dropout_rate = 0.3
balance = True

In [3]:
# Create a LabelEncoder instance
label_encoder = LabelEncoder()

#set up the response variable
categories_outcome = adata.obs['Response']
numerical_categories_outcome = label_encoder.fit_transform(categories_outcome)
print(numerical_categories_outcome.shape)

(410,)


In [4]:
#normalizaiton
gene_expression_data = adata.layers['scalelogcounts']

# Min-max normalization
scaler = MinMaxScaler()
gene_expression_data = scaler.fit_transform(gene_expression_data)
number_genes = adata.shape[1]
input_dim = number_genes

# Initialize PCA and fit it to X_train
n_components = 16  # You can adjust this based on your data
pca = PCA(n_components=n_components)
gene_expression_data_pca = pca.fit_transform(gene_expression_data)

gene_expression_data_pca.shape

(410, 16)

In [5]:
# setup the test and train datasets
X_train, X_test, y_train, y_test = train_test_split(gene_expression_data_pca, numerical_categories_outcome, test_size=test_set_size, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(gene_expression_data, numerical_categories_outcome, test_size=test_set_size, random_state=42)

In [6]:
# Define the input shape
# input_shape = (gene_expression_data.shape[1],)[0]  # Number of genes

input_shape = (X_train.shape[1],)[0]  # Number of genes


def build_outcome_classifier():
    model = keras.Sequential()
    model.add(layers.Input(shape=(input_shape,)))  # Input shape matches your data
    
    # Add more hidden layers and increase units with Leaky ReLU
    model.add(layers.Dense((64), activation='relu',kernel_regularizer=tf.keras.regularizers.l2(0.1),kernel_initializer='he_normal'))
    model.add(layers.LeakyReLU(alpha=0.1))  # Leaky ReLU helps with vanishing gradients
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(dropout_rate))
    
    model.add(layers.Dense((32),activation='relu',kernel_regularizer=tf.keras.regularizers.l2(0.1),kernel_initializer='he_normal'))
    model.add(layers.LeakyReLU(alpha=0.1))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(dropout_rate))
    
    # Reduce units in subsequent layers with Leaky ReLU
    model.add(layers.Dense((16),activation='relu',kernel_regularizer=tf.keras.regularizers.l2(0.1),kernel_initializer='he_normal'))
    model.add(layers.LeakyReLU(alpha=0.1))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(dropout_rate))
    
    # Output layer for binary classification with sigmoid activation
    model.add(layers.Dense(1, activation='sigmoid'))
    
    return model

In [7]:
# Define the mode
outcome_classifier = build_outcome_classifier()

# set up the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

# determine the sample and batch size
num_samples = adata.shape[0]  # Replace this with your actual number of samples

batch_size = 16  #determines how many samples are processed per batch, each epoch will process multiple batches

# batch_size = adata.shape[0]

# Calculate the number of steps per epoch
num_steps_per_epoch = num_samples // batch_size

# Compile the outcome discriminator
outcome_classifier.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
test_accuracy_list = []
train_accuracy_list = []

num_epochs = 1000000
report_frequency = 1000
accuracy_threshold = 0.8
num_outcomes = len(np.unique(y_test))


import tensorflow as tf


# Define the training step for only the outcome classifier
def train_step(data, outcome_labels):
    with tf.GradientTape() as tape:
        # Forward pass through the outcome classifier
        outcome_predictions = outcome_classifier(data)

        # Compute the biological discriminator loss
        outcome_loss = tf.keras.losses.binary_crossentropy(outcome_labels, outcome_predictions)
        outcome_loss = tf.reduce_mean(outcome_loss)  # Average over the batch

    # Compute gradients for the outcome classifier
    classifier_grads = tape.gradient(outcome_loss, outcome_classifier.trainable_variables)

    # Update weights
    optimizer.apply_gradients(zip(classifier_grads, outcome_classifier.trainable_variables))

    # Calculate accuracy for the outcome classifier
    predicted_outcome_labels = tf.cast(outcome_predictions > 0.5, tf.float32)  # Threshold at 0.5
    outcome_labels_float = tf.cast(outcome_labels, tf.float32)

    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted_outcome_labels, outcome_labels_float), tf.float32))

    return outcome_loss, accuracy


outcome_labels = y_train

# Training loop
for epoch in range(num_epochs):
    total_loss = 0.0  # To accumulate losses
    total_accuracy = 0.0  # To accumulate accuracy

    for step in range(num_steps_per_epoch):

        batch_indices = []
        if balance:
            for outcome in range(num_outcomes):
                outcome_indices = np.where(outcome_labels == outcome)[0]
                outcome_batch_indices = np.random.choice(outcome_indices, size=batch_size // num_outcomes, replace=False)
                batch_indices.append(outcome_batch_indices)
        else:
            # Correct all_indices to span the dataset, not outcomes
            all_indices = np.arange(len(X_train))
            random_indices = np.random.choice(all_indices, size=batch_size, replace=True)
            batch_indices.append(random_indices)
        
        X_batch = X_train[np.concatenate(batch_indices)]
        y_batch = y_train[np.concatenate(batch_indices)]
        y_batch = tf.expand_dims(y_batch, axis=-1)  # Adjust labels shape for binary_crossentropy
        
        # Perform the training step
        outcome_loss, accuracy = train_step(X_batch, y_batch)

        # Accumulate losses and accuracy
        total_loss += outcome_loss.numpy()
        total_accuracy += accuracy.numpy()

        # # Print the losses and accuracy
        # if epoch_count % report_frequency == 0:
        #     print(f'Epoch {epoch}, Step {step}, Outcome Loss: {outcome_loss.numpy()}, Accuracy: {accuracy.numpy()}')

    # Calculate average loss and accuracy
    avg_loss = total_loss / num_steps_per_epoch
    avg_accuracy = total_accuracy / num_steps_per_epoch

    # Print average accuracy at the end of each epoch and calculate the accuracy for the test set
    if epoch  % report_frequency == 0:
        # Forward pass of the test data
        outcome_predictions = outcome_classifier(X_test)
        predicted_outcome_labels = tf.cast(outcome_predictions > 0.5, tf.float32)  # Threshold at 0.5
        outcome_labels = tf.expand_dims(y_test, axis=-1)  # Reshape to match the logits shape
        outcome_labels_float = tf.cast(outcome_labels, tf.float32)

        # Calculate accuracy
        test_accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted_outcome_labels, outcome_labels_float), tf.float32))
        
        train_accuracy_list.append(avg_accuracy)
        test_accuracy_list.append(test_accuracy)
        
        print(f'Epoch {epoch}, Average Outcome Loss: {avg_loss}, Average Accuracy: {avg_accuracy}, Test Accuracy: {test_accuracy}')

        # Early stopping condition for accuracy
        if test_accuracy > accuracy_threshold:
            print('Early stopping: test set performance high enough')
            break

Epoch 0, Average Outcome Loss: 2.9833252358436586, Average Accuracy: 0.5, Test Accuracy: 0.6829268336296082
Epoch 1000, Average Outcome Loss: 0.6712268733978272, Average Accuracy: 0.61, Test Accuracy: 0.6829268336296082
Epoch 2000, Average Outcome Loss: 0.661772015094757, Average Accuracy: 0.625, Test Accuracy: 0.6829268336296082
Epoch 3000, Average Outcome Loss: 0.6382047057151794, Average Accuracy: 0.6675, Test Accuracy: 0.6829268336296082
Epoch 4000, Average Outcome Loss: 0.6509076285362244, Average Accuracy: 0.645, Test Accuracy: 0.6829268336296082
Epoch 5000, Average Outcome Loss: 0.6667433381080627, Average Accuracy: 0.6175, Test Accuracy: 0.6829268336296082
Epoch 6000, Average Outcome Loss: 0.6375273442268372, Average Accuracy: 0.67, Test Accuracy: 0.6829268336296082
Epoch 7000, Average Outcome Loss: 0.6662913656234741, Average Accuracy: 0.6175, Test Accuracy: 0.6829268336296082
Epoch 8000, Average Outcome Loss: 0.6481714773178101, Average Accuracy: 0.65, Test Accuracy: 0.682926

In [None]:
# After training, plot the metrics

fig, axs = plt.subplots(2, 1, figsize=(12, 6))

# Plot train accuracy
axs[0].plot(range(1, len(train_accuracy_list)+1), train_accuracy_list, label='Training Accuracy', color='blue')
axs[0].set_title('Training accuracy over epochs')
axs[0].set_xlabel('Epoch')
axs[0].set_ylabel('Training Accuracy')
axs[0].grid()
axs[0].legend()

# Plot test accuracay
axs[1].plot(range(1, len(test_accuracy_list)+1), test_accuracy_list, label='Test Accuracy', color='orange')
axs[1].set_title('Test Accuracy over epochs')
axs[1].set_xlabel('Epoch')
axs[1].set_ylabel('Test Accuracy')
axs[1].grid()
axs[1].legend()

plt.tight_layout()
plt.show()

In [None]:
count = np.sum(y_test == 0)
count/len(y_test)

In [None]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

y_pred_prob = clf.predict_proba(X_test)[:, 1]  # Probabilities for the positive class (class 1)

fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # Diagonal line for random guessing
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

In [None]:
X_test.shape