In [8]:
# Import Libraries
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Bioinformatics Libraries
import scanpy as sc

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [9]:
# Load scanpy's built-in pbmc3k example dataset into `adata`
adata = sc.datasets.pbmc3k()

# Print the AnnData summary (matrix dimensions and annotation fields)
print(adata)

AnnData object with n_obs × n_vars = 2700 × 32738
    var: 'gene_ids'


In [10]:
# Quality-control thresholds for the basic filtering step
MIN_GENES_PER_CELL = 200   # minimum number of genes a cell must have to be kept
MIN_CELLS_PER_GENE = 3     # minimum number of cells a gene must appear in to be kept

# Both calls filter `adata` in place
sc.pp.filter_cells(adata, min_genes=MIN_GENES_PER_CELL)
sc.pp.filter_genes(adata, min_cells=MIN_CELLS_PER_GENE)

# Summary after filtering
print(adata)


AnnData object with n_obs × n_vars = 2700 × 13714
    obs: 'n_genes'
    var: 'gene_ids', 'n_cells'


In [11]:
# Total-count normalization target
TARGET_SUM = 1e4

# Normalize totals, then apply log(1 + x); both calls modify `adata` in place
sc.pp.normalize_total(adata, target_sum=TARGET_SUM)
sc.pp.log1p(adata)

# Snapshot the current state in `.raw`. Because this runs after the two steps
# above, `.raw` holds the normalized, log-transformed values (all genes),
# not the untouched counts.
adata.raw = adata



In [12]:
# Thresholds used to flag highly variable genes (HVGs)
hvg_params = dict(min_mean=0.0125, max_mean=3, min_disp=0.5)

# Annotate `adata.var` with the HVG flag and its supporting statistics
sc.pp.highly_variable_genes(adata, **hvg_params)

# Keep only the flagged genes.
# NOTE(review): slicing leaves `adata` as a view of the full object; consider
# `.copy()` if it will be modified downstream.
hvg_mask = adata.var["highly_variable"]
adata = adata[:, hvg_mask]

# Number of highly variable genes retained
print(adata.var["highly_variable"].sum())


1872


In [13]:
# Display the AnnData summary after highly-variable-gene selection
adata

View of AnnData object with n_obs × n_vars = 2700 × 1872
    obs: 'n_genes'
    var: 'gene_ids', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'log1p', 'hvg'

## Dimension Reduction

### scVAE (TensorFlow / Keras)

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda, Layer
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.losses import binary_crossentropy

# Seed TensorFlow's global RNG so weight initialisation and the sampling noise
# are repeatable across runs.
tf.random.set_seed(42)

# ---------------------------------------------------------------------------
# Training matrix
# ---------------------------------------------------------------------------
# `X_counts` was not defined anywhere above, so build it here from the
# HVG-filtered `adata`. The decoder ends in a sigmoid and is trained with
# binary cross-entropy, so every feature must lie in [0, 1]: the normalized,
# log1p-transformed values are non-negative, hence dividing each gene by its
# maximum is sufficient.
X_matrix = adata.X
X_dense = X_matrix.toarray() if hasattr(X_matrix, "toarray") else np.asarray(X_matrix)
X_dense = np.asarray(X_dense, dtype="float32")
gene_max = X_dense.max(axis=0, keepdims=True)
gene_max[gene_max == 0] = 1.0  # guard against division by zero for all-zero genes
X_counts = X_dense / gene_max
assert X_counts.min() >= 0.0 and X_counts.max() <= 1.0, "inputs must be in [0, 1] for BCE"

n_features = X_counts.shape[1]

# Set the dimension of the latent space
latent_dim = 10


# Define the sampling function
def sampling(args):
    """Draw z ~ N(z_mean, exp(z_log_var)) with the reparameterization trick.

    Parameters
    ----------
    args : sequence of two tensors
        ``(z_mean, z_log_var)``, both shaped ``(batch, latent_dim)``.

    Returns
    -------
    Tensor
        ``z_mean + exp(0.5 * z_log_var) * epsilon`` with ``epsilon ~ N(0, I)``.
    """
    z_mean, z_log_var = args
    epsilon = tf.random.normal(shape=tf.shape(z_mean))
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon


class Sampling(Layer):
    """Sample the latent vector and register the KL divergence as a model loss.

    The KL term depends on the encoder's intermediate tensors, which a compiled
    loss function (that only receives ``y_true`` and ``y_pred``) cannot access.
    Adding it with ``add_loss`` inside ``call`` keeps it attached to whichever
    model contains this layer.
    """

    def call(self, inputs):
        z_mean, z_log_var = inputs
        # KL(q(z|x) || N(0, I)), summed over latent dimensions -> one value per sample
        kl_per_sample = -0.5 * tf.reduce_sum(
            1.0 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1
        )
        self.add_loss(tf.reduce_mean(kl_per_sample))
        return sampling((z_mean, z_log_var))


# Define the encoder
inputs = Input(shape=(n_features,))
h = Dense(128, activation='relu')(inputs)
z_mean = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)

# Sample z (this also adds the KL regulariser to the model's losses)
z = Sampling()([z_mean, z_log_var])

# Define the encoder model; outputs are [z_mean, z_log_var, z]
encoder = Model(inputs, [z_mean, z_log_var, z])

# Define the decoder
latent_inputs = Input(shape=(latent_dim,))
x = Dense(128, activation='relu')(latent_inputs)
outputs = Dense(n_features, activation='sigmoid')(x)

# Define the decoder model
decoder = Model(latent_inputs, outputs)

# Define the VAE model: decode the sampled z (third encoder output)
outputs = decoder(encoder(inputs)[2])
vae = Model(inputs, outputs)


# Define the VAE loss
def vae_loss(inputs, outputs):
    """Reconstruction part of the VAE objective.

    ``binary_crossentropy`` averages over the feature axis, so multiplying by
    the number of features gives the summed per-sample cross-entropy. The KL
    term is not computed here; it is contributed by the ``Sampling`` layer via
    ``add_loss`` and Keras adds it to this value during training.

    Parameters
    ----------
    inputs : Tensor
        Ground-truth batch (``y_true``), values in [0, 1].
    outputs : Tensor
        Decoder reconstruction (``y_pred``), values in (0, 1).

    Returns
    -------
    Tensor
        Per-sample reconstruction loss, shape ``(batch,)``.
    """
    return binary_crossentropy(inputs, outputs) * n_features


# Compile the model (total loss = mean reconstruction loss + mean KL term)
vae.compile(optimizer='adam', loss=vae_loss)


In [None]:
# Fit the autoencoder: the input matrix is also the reconstruction target
fit_kwargs = dict(epochs=50, batch_size=256, shuffle=True)
vae.fit(X_counts, X_counts, **fit_kwargs)


In [None]:
# Run the encoder over the training matrix; it yields three outputs, of which
# only the first (the latent means) is kept as the embedding.
latent_outputs = encoder.predict(X_counts, batch_size=256)
z_mean, _, _ = latent_outputs


In [None]:
# Scatter the first two columns of `z_mean` against each other
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(z_mean[:, 0], z_mean[:, 1], alpha=0.7)
ax.set_title('scVAE - Latent Space')
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
plt.show()
