In [1]:
import anndata as ad
import scanpy as sc
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
import pandas as pd
import umap
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from keras.callbacks import EarlyStopping
import seaborn as sb
from scipy.spatial.distance import pdist
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import pearsonr
import time
from keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.decomposition import PCA
import math
import random

adata = sc.read('/tmp/work/RCproject_code/sce_export.h5ad')
epoch_count = 0

#exclude partial
adata = adata[~adata.obs['Response'].isin(['partial']), :]

  from pandas.core.computation.check import NUMEXPR_INSTALLED
2024-10-31 13:47:09.109123: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
test_set_size = 0.2
dropout_rate = 0.1
balance = True
l2_reg = 0.01
batch_size = 32  #determines how many samples are processed per batch, each epoch will process multiple batches
learning_rate = 0.1
num_epochs = 1000000
report_frequency = 100
accuracy_threshold = 0.8
clipnorm = 1.0
simplifly_categories = True
holdout_size = 0.5

PCA_reduce = False
n_comp_PCA = 16

In [3]:
# generate numerical values for each batch category
# set up categories variable
categories_technology = adata.obs['batch']

#collpse the categories to microARRAY vs sequencing

if simplifly_categories:
    category_map = {'GSE133057': 'micro', 'GSE145037': 'micro', 'GSE150082': 'micro','GSE190826':'seq','GSE209746':'seq',
                    'GSE45404_GPL1': 'micro', 'GSE45404_GPL2': 'micro', 'GSE93375': 'micro','GSE94104': 'micro'}
    categories_technology = np.vectorize(category_map.get)(categories_technology)
    
# Create a LabelEncoder instance
label_encoder = LabelEncoder()
# Fit and transform the categories to integers
numerical_categories_technology = label_encoder.fit_transform(categories_technology)
print(numerical_categories_technology.shape)

#do the same for the response variable
categories_outcome = adata.obs['Response']
numerical_categories_outcome = label_encoder.fit_transform(categories_outcome)
print(numerical_categories_technology.shape)

(410,)
(410,)


In [4]:
frequency_counts = pd.Series(numerical_categories_technology).value_counts()
print('Technology frequencies')
print(frequency_counts)

print('')

print('Outcome frequencies')
frequency_counts = pd.Series(numerical_categories_outcome).value_counts()
print(frequency_counts)

unique_combinations_array = (numerical_categories_outcome + (numerical_categories_technology+1)*2)-2

np.unique(unique_combinations_array)

Technology frequencies
0    209
1    201
Name: count, dtype: int64

Outcome frequencies
0    267
1    143
Name: count, dtype: int64


array([0, 1, 2, 3])

In [5]:
adata.obs['numerical_categories_technology'] = numerical_categories_technology
adata.obs['numerical_categories_outcome'] = numerical_categories_outcome
adata.obs['combination_tech_outcome'] = unique_combinations_array

  adata.obs['numerical_categories_technology'] = numerical_categories_technology


In [6]:
#normalizaiton
gene_expression_data = adata.layers['scalelogcounts']

# Min-max normalization
scaler = MinMaxScaler()
gene_expression_data = scaler.fit_transform(gene_expression_data)
number_genes = adata.shape[1]
input_dim = number_genes

if PCA_reduce:
# Initialize PCA and fit it to X_train
    n_components = n_comp_PCA  # You can adjust this based on your data
    pca = PCA(n_components=n_components)
    gene_expression_data = pca.fit_transform(gene_expression_data)

In [7]:
# setup the test and train datasets
X_train, X_test, y_train, y_test = train_test_split(gene_expression_data, adata.obs, test_size=test_set_size, random_state=1)

y_train_outcome = y_train['numerical_categories_outcome']
y_test_outcome = y_test['numerical_categories_outcome']

y_train_tech = y_train['numerical_categories_technology']
y_test_tech = y_test['numerical_categories_technology']

y_train_comb = y_train['combination_tech_outcome']
y_test_comb = y_test['combination_tech_outcome']

In [8]:
# Define the input shape
# input_shape = (gene_expression_data.shape[1],)[0]  # Number of genes

input_shape = (X_train.shape[1],)[0]  # Number of genes


def build_outcome_classifier():
    model = keras.Sequential()
    model.add(layers.Input(shape=(input_shape,)))  # Input shape matches your data
    
    model.add(layers.Dense((512), activation='relu',kernel_regularizer=tf.keras.regularizers.l2(l2_reg),kernel_initializer='he_normal'))
    model.add(layers.LeakyReLU(alpha=0.1))  # Leaky ReLU helps with vanishing gradients
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(dropout_rate))
    
    model.add(layers.Dense((256),activation='relu',kernel_regularizer=tf.keras.regularizers.l2(l2_reg),kernel_initializer='he_normal'))
    model.add(layers.LeakyReLU(alpha=0.1))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(dropout_rate))

    model.add(layers.Dense((256),activation='relu',kernel_regularizer=tf.keras.regularizers.l2(l2_reg),kernel_initializer='he_normal'))
    model.add(layers.LeakyReLU(alpha=0.1))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(dropout_rate))
    
    model.add(layers.Dense((16),activation='relu',kernel_regularizer=tf.keras.regularizers.l2(l2_reg),kernel_initializer='he_normal'))
    model.add(layers.LeakyReLU(alpha=0.1))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(dropout_rate))
    
    model.add(layers.Dense((8),activation='relu',kernel_regularizer=tf.keras.regularizers.l2(l2_reg),kernel_initializer='he_normal'))
    model.add(layers.LeakyReLU(alpha=0.1))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(dropout_rate))
    
    # Output layer for binary classification with sigmoid activation
    model.add(layers.Dense(1, activation='sigmoid'))
    
    return model

In [9]:
# Define the mode
outcome_classifier = build_outcome_classifier()

# set up the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, clipnorm = clipnorm)

# determine the sample and batch size
num_samples = math.floor(X_train.shape[0]* (1-holdout_size))  # number of samples used in each training epoch

# batch_size = adata.shape[0]

# Calculate the number of steps per epoch
num_steps_per_epoch = num_samples // batch_size

# Compile the outcome discriminator
outcome_classifier.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [10]:
test_accuracy_list = []
train_accuracy_list = []

num_outcomes = len(np.unique(y_test_outcome))
num_conditions = len(np.unique(unique_combinations_array))

# Define the training step for only the outcome classifier
def train_step(data, outcome_labels):
    with tf.GradientTape() as tape:
        # Forward pass through the outcome classifier
        outcome_predictions = outcome_classifier(data)

        # Compute the biological discriminator loss
        outcome_loss = tf.keras.losses.binary_crossentropy(outcome_labels, outcome_predictions)
        outcome_loss = tf.reduce_mean(outcome_loss)  # Average over the batch

    # Compute gradients for the outcome classifier
    classifier_grads = tape.gradient(outcome_loss, outcome_classifier.trainable_variables)
    
    # Calculate accuracy for the outcome classifier
    predicted_outcome_labels = tf.cast(outcome_predictions > 0.5, tf.float32)  # Threshold at 0.5
    outcome_labels_float = tf.cast(outcome_labels, tf.float32)

    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted_outcome_labels, outcome_labels_float), tf.float32))

    return outcome_loss, accuracy, classifier_grads

# Training loop
for epoch in range(num_epochs):
    total_loss = 0.0  # To accumulate losses
    total_accuracy = 0.0  # To accumulate accuracy
    accumulated_grads = [tf.zeros_like(var) for var in outcome_classifier.trainable_variables]  # Initialize gradient accumulator

    # Split train data randomly, holding out a portion for generalization
    X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X_train, y_train, test_size=holdout_size, random_state=None)
    y_train_comb_temp = y_train_temp['combination_tech_outcome']
    y_train_temp = y_train_temp['numerical_categories_outcome']
    
    # Mini-batch training loop
    for step in range(num_steps_per_epoch):
        # Balance batches if necessary
        batch_indices = []
        if balance:
            for condition in range(num_conditions):
                condition_indices = np.where(y_train_comb_temp == condition)[0]
                condition_batch_indices = np.random.choice(condition_indices, size=batch_size // num_conditions, replace=True)
                batch_indices.append(condition_batch_indices)
        else:
            all_indices = np.arange(len(X_train_temp))
            random_indices = np.random.choice(all_indices, size=batch_size, replace=True)
            batch_indices.append(random_indices)
        X_batch = X_train_temp[np.concatenate(batch_indices)]
        y_batch = y_train_temp[np.concatenate(batch_indices)]
        y_batch = tf.expand_dims(y_batch, axis=-1)  # Adjust labels shape for binary_crossentropy
                
        # Perform the training step and collect gradients
        outcome_loss, accuracy, classifier_grads = train_step(X_batch, y_batch)
        
        # Accumulate gradients and losses
        total_loss += outcome_loss.numpy()
        total_accuracy += accuracy.numpy()
        accumulated_grads = [acc_grad + grad for acc_grad, grad in zip(accumulated_grads, classifier_grads)]

    # Average the accumulated gradients
    averaged_grads = [grad / num_steps_per_epoch for grad in accumulated_grads]

    # Apply averaged gradients to update model weights
    optimizer.apply_gradients(zip(averaged_grads, outcome_classifier.trainable_variables))

    # Calculate average loss and accuracy for the epoch
    avg_loss = total_loss / num_steps_per_epoch
    avg_accuracy = total_accuracy / num_steps_per_epoch

    # Print average accuracy at the end of each epoch and calculate the accuracy for the test set
    if epoch % report_frequency == 0:
        # Evaluate on test data
        outcome_predictions = outcome_classifier(X_test)
        predicted_outcome_labels = tf.cast(outcome_predictions > 0.5, tf.float32)  # Threshold at 0.5
        outcome_labels = tf.expand_dims(y_test_outcome, axis=-1)  # Reshape to match logits shape
        outcome_labels_float = tf.cast(outcome_labels, tf.float32)

        # Calculate accuracy
        test_accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted_outcome_labels, outcome_labels_float), tf.float32))
        
        # Store and print metrics
        train_accuracy_list.append(avg_accuracy)
        test_accuracy_list.append(test_accuracy)
        print(f'Epoch {epoch}, Average Outcome Loss: {avg_loss}, Average Accuracy: {avg_accuracy}, Test Accuracy: {test_accuracy}')

        # Early stopping condition for accuracy
        if test_accuracy > accuracy_threshold:
            print('Early stopping: test set performance high enough')
            break

  y_batch = y_train_temp[np.concatenate(batch_indices)]


Epoch 0, Average Outcome Loss: 0.6930513262748719, Average Accuracy: 0.5, Test Accuracy: 0.6463414430618286


KeyboardInterrupt: 

In [None]:
# After training, plot the metrics

frequency_counts = pd.Series(y_test_outcome).value_counts()
test_chance_level = frequency_counts[0]/len(y_test_outcome)

frequency_counts = pd.Series(y_train_outcome).value_counts()
train_chance_level = frequency_counts[0]/len(y_train_outcome)

fig, axs = plt.subplots(2, 1, figsize=(12, 6))

# Plot train accuracy
axs[0].plot(range(1, len(train_accuracy_list)+1), train_accuracy_list, label='Training Accuracy', color='blue')
axs[0].axhline(train_chance_level, color='black',linestyle ='--')
axs[0].set_title('Training set accuracy over epochs')
axs[0].set_xlabel('Epoch')
axs[0].set_ylabel('Training Accuracy')
axs[0].grid()
axs[0].legend()

# Plot test accuracay
axs[1].plot(range(1, len(test_accuracy_list)+1), test_accuracy_list, label='Test Accuracy', color='orange')
axs[1].axhline(test_chance_level, color='black',linestyle ='--')
axs[1].set_title('Test set accuracy over epochs')
axs[1].set_xlabel('Epoch')
axs[1].set_ylabel('Test Accuracy')
axs[1].grid()
axs[1].legend()

plt.tight_layout()
plt.show()

In [None]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train_outcome)

y_pred_prob = clf.predict_proba(X_test)[:, 1]  # Probabilities for the positive class (class 1)

fpr, tpr, thresholds = roc_curve(y_test_outcome, y_pred_prob)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # Diagonal line for random guessing
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()