In [None]:
# Install the sciPENN package; %pip targets the environment of the running kernel.
# The kernel may need a restart afterwards before the package can be imported.
%pip install sciPENN

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Enable IPython's autoreload extension so edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2

import numpy as np
from matplotlib import pyplot
import os
from copy import deepcopy

from time import time

from math import ceil
from scipy.stats import spearmanr, gamma, poisson

from anndata import AnnData, read_h5ad
import scanpy as sc
from scanpy import read
import pandas as pd

from torch.utils.data import DataLoader, TensorDataset
from torch import tensor
from torch.cuda import is_available

from sciPENN.sciPENN_API import sciPENN_API
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from skimage.metrics import structural_similarity as ssim

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# End-to-end run: for every tissue, load the RNA/protein tables, split into train/test,
# fit sciPENN on the training pair, predict proteins for the test set, and write one
# metrics CSV per tissue into `results_dir`.
data_dir = "data/GSE213264_RAW/"
results_dir = "results_sciPENN"
tissues = ['humanGBM', 'humanskin', 'humanthymus', 'humanspleen', 'humantonsil', 'mousekidney', 'mouseintestine', 'mousecolon', 'mousespleen']

# The per-tissue CSVs are written here; create the directory up front so to_csv cannot
# fail on a fresh checkout.
os.makedirs(results_dir, exist_ok=True)

for tissue in tissues:
    rna_data = None
    protein_data = None

    # Pick this tissue's RNA and protein tables. sorted() makes the choice deterministic
    # if more than one file name matches (the last match wins).
    for filename in sorted(os.listdir(data_dir)):
        file_path = os.path.join(data_dir, filename)
        if tissue in filename and filename.endswith("RNA.tsv.gz"):
            rna_data = pd.read_csv(file_path, compression="gzip", header=0, sep="\t")
        elif tissue in filename and filename.endswith("protein.tsv.gz"):
            protein_data = pd.read_csv(file_path, compression="gzip", header=0, sep="\t")

    # Fail with a clear message instead of an AttributeError on None further down.
    if rna_data is None or protein_data is None:
        raise FileNotFoundError(
            f"Missing RNA and/or protein table for tissue '{tissue}' in {data_dir}"
        )

    print("Extracted data")


    # Creating training and testing sets

    rna_data.index = rna_data.index.astype(str)
    rna_data.columns = rna_data.columns.astype(str)

    # 'X' is text of the form "<a>x<b>" (presumably spot coordinates - confirm); split it
    # into two numeric columns. Unparseable values become NaN (errors='coerce').
    # NOTE(review): X and Y stay in rna_data and are therefore passed to the model as
    # ordinary feature columns - confirm this is intended.
    rna_data[['X', 'Y']] = rna_data['X'].str.split('x', expand=True)
    rna_data['X'] = pd.to_numeric(rna_data['X'], errors='coerce')
    rna_data['Y'] = pd.to_numeric(rna_data['Y'], errors='coerce')

    protein_data.index = protein_data.index.astype(str)
    protein_data.columns = protein_data.columns.astype(str)

    # NOTE(review): RNA and protein rows are paired by their positional row label, which
    # assumes both files list the spots in the same order - verify.
    rna_train, rna_test = train_test_split(rna_data, test_size=0.2, random_state=42)
    # .iloc[:, 1:] drops the first protein column (presumably the 'X' label column).
    protein_train = protein_data.loc[rna_train.index].iloc[:, 1:]
    protein_test = protein_data.loc[rna_test.index].iloc[:, 1:]

    adata_rna_train = sc.AnnData(rna_train)
    adata_protein_train = sc.AnnData(protein_train)

    adata_rna_test = sc.AnnData(rna_test)
    adata_protein_test = sc.AnnData(protein_test)

    # The DataFrames are not used again once the AnnData objects exist; release them
    # before building the model to reduce the memory held during training.
    del rna_data, protein_data, rna_train, rna_test, protein_train, protein_test

    print("Created train and test datasets")


    # Run sciPENN model

    sciPENN = sciPENN_API(gene_trainsets = [adata_rna_train], protein_trainsets = [adata_protein_train],
                          gene_test = adata_rna_test)

    sciPENN.train(quantiles = [0.1, 0.25, 0.75, 0.9], n_epochs = 100, ES_max = 12, decay_max = 6,
                  decay_step = 0.1, lr = 10**(-3), load = False)

    predicted_test = sciPENN.predict()

    pred_protein = predicted_test.X
    # log1p keeps zero counts at 0 without producing -inf, and (unlike log followed by
    # resetting -inf to 0) does not map counts of 0 and 1 to the same value.
    true_protein_norm = np.log1p(adata_protein_test.X)
    # Keep as many rows as the prediction matrix has, rather than a row count that is
    # only valid for one tissue.
    # NOTE(review): this assumes the predicted rows are the first rows of the test set in
    # the same order; if sciPENN's QC filtering drops other cells, align on cell names
    # instead - confirm.
    true_protein_cropped = true_protein_norm[:pred_protein.shape[0], :]


    # Evaluate performance

    rmse = np.sqrt(mean_squared_error(true_protein_cropped, pred_protein))
    # Row-wise (axis=1) Pearson correlation, averaged over rows.
    pcc = pd.DataFrame(true_protein_cropped).corrwith(pd.DataFrame(pred_protein), axis=1, method='pearson')
    avg_corr_pearson = pcc.mean()
    ssim_val = ssim(true_protein_cropped, pred_protein, data_range=pred_protein.max() - pred_protein.min())

    results_df = pd.DataFrame({
        'RMSE': [rmse],
        'Pearson Correlation': [avg_corr_pearson],
        'SSIM': [ssim_val],
    })

    # Saving and reporting happen inside the loop so every tissue gets its own CSV,
    # not just the last one processed.
    results_file_path = os.path.join(results_dir, f"{tissue}_results.csv")
    results_df.to_csv(results_file_path, index=False)

    print(results_df)
    print(f"Processed {tissue} successfully.\n")

    # Drop the model and AnnData objects so they are not kept alive while the next
    # tissue is being loaded and fitted.
    del sciPENN, predicted_test
    del adata_rna_train, adata_protein_train, adata_rna_test, adata_protein_test

Extracted data




Created train and test datasets
Searching for GPU
GPU not detected, falling back to CPU

QC Filtering Training Cells
QC Filtering Testing Cells

QC Filtering Training Genes
QC Filtering Testing Genes

Normalizing Training Cells
Normalizing Testing Cells

Log-Normalizing Training Data
Log-Normalizing Testing Data

Finding HVGs


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


MemoryError: Unable to allocate 117. MiB for an array with shape (1284, 23901) and data type float32


The cells below run the same pipeline one step at a time. It works when each step is run in its own cell, but fails with a `MemoryError` when everything is run in a single cell.

The results below are only for humanGBM.

In [None]:
# Input/output locations and the tissue(s) handled by this step-by-step run.
data_dir = "data/GSE213264_RAW/"
results_dir = "results_sciPENN"
# Only humanGBM is enabled here. The other tissues are:
# 'humanskin', 'humanthymus', 'humanspleen', 'humantonsil',
# 'mousekidney', 'mouseintestine', 'mousecolon', 'mousespleen'
tissues = ['humanGBM']

for tissue in tissues:
    # Both tables start as None and are filled in when a matching file is found.
    rna_data, protein_data = None, None

    for filename in os.listdir(data_dir):
        file_path = os.path.join(data_dir, filename)
        if tissue not in filename:
            continue
        # Both tables are gzip-compressed, tab-separated files with a header row.
        if filename.endswith("RNA.tsv.gz"):
            rna_data = pd.read_csv(file_path, sep="\t", header=0, compression="gzip")
        elif filename.endswith("protein.tsv.gz"):
            protein_data = pd.read_csv(file_path, sep="\t", header=0, compression="gzip")

In [None]:
# Creating training and testing sets

rna_data.index = rna_data.index.astype(str)
rna_data.columns = rna_data.columns.astype(str)

# 'X' initially holds text of the form "<a>x<b>" (presumably spot coordinates - confirm).
# Split it only while it is still text, so re-running this cell does not fail on the
# already-numeric column. Unparseable values become NaN (errors='coerce').
if not pd.api.types.is_numeric_dtype(rna_data['X']):
    rna_data[['X', 'Y']] = rna_data['X'].str.split('x', expand=True)
    rna_data['X'] = pd.to_numeric(rna_data['X'], errors='coerce')
    rna_data['Y'] = pd.to_numeric(rna_data['Y'], errors='coerce')
# NOTE(review): X and Y stay in rna_data and are therefore passed to the model as
# ordinary feature columns - confirm this is intended.

protein_data.index = protein_data.index.astype(str)
protein_data.columns = protein_data.columns.astype(str)

# 80/20 split with a fixed seed. The protein rows are selected with the RNA row labels.
# NOTE(review): this pairs RNA and protein rows by position in their files, which assumes
# both files list the spots in the same order - verify.
rna_train, rna_test = train_test_split(rna_data, test_size=0.2, random_state=42)
# .iloc[:, 1:] drops the first protein column (presumably the 'X' label column).
protein_train = protein_data.loc[rna_train.index].iloc[:, 1:]
protein_test = protein_data.loc[rna_test.index].iloc[:, 1:]

adata_rna_train = sc.AnnData(rna_train)
adata_protein_train = sc.AnnData(protein_train)

  app.launch_new_instance()


In [None]:
# Wrap the held-out RNA and protein tables as AnnData objects, in that order.
adata_rna_test, adata_protein_test = (
    sc.AnnData(table) for table in (rna_test, protein_test)
)

  """Entry point for launching an IPython kernel.
  


In [None]:
# Build the sciPENN wrapper from the single RNA/protein training pair and the RNA test
# set. The constructor runs the preprocessing steps logged in the output below.
sciPENN = sciPENN_API(
    gene_trainsets=[adata_rna_train],
    protein_trainsets=[adata_protein_train],
    gene_test=adata_rna_test,
)

Searching for GPU
GPU not detected, falling back to CPU

QC Filtering Training Cells
QC Filtering Testing Cells

QC Filtering Training Genes
QC Filtering Testing Genes

Normalizing Training Cells
Normalizing Testing Cells

Log-Normalizing Training Data
Log-Normalizing Testing Data

Finding HVGs


  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],



Normalizing Gene Training Data by Batch


100%|██████████| 1/1 [00:00<00:00, 12.30it/s]



Normalizing Protein Training Data by Batch


100%|██████████| 1/1 [00:00<00:00,  8.74it/s]



Normalizing Gene Testing Data by Batch


100%|██████████| 1/1 [00:00<00:00, 90.89it/s]


In [None]:
# Fit the model. Training stopped after epoch 13 in the logged run, well before n_epochs.
# NOTE(review): ES_max / decay_max look like early-stopping and learning-rate-decay
# patience settings - confirm against the sciPENN documentation.
sciPENN.train(
    quantiles=[0.1, 0.25, 0.75, 0.9],
    n_epochs=100,
    ES_max=12,
    decay_max=6,
    decay_step=0.1,
    lr=10**(-3),
    weights_dir="weights",
    load=False,
)

Epoch 0 prediction loss = 1.143
Epoch 1 prediction loss = 1.062
Epoch 2 prediction loss = 1.070
Epoch 3 prediction loss = 1.076
Epoch 4 prediction loss = 1.085
Epoch 5 prediction loss = 1.095
Epoch 6 prediction loss = 1.106
Decaying loss to 0.0001
Epoch 7 prediction loss = 1.101
Epoch 8 prediction loss = 1.098
Epoch 9 prediction loss = 1.099
Epoch 10 prediction loss = 1.097
Epoch 11 prediction loss = 1.097
Epoch 12 prediction loss = 1.095
Decaying loss to 1e-05
Epoch 13 prediction loss = 1.096


In [None]:
# Call predict() on the sciPENN_API object built above; the returned object exposes the
# prediction matrix as `.X`, which the following cells use.
predicted_test = sciPENN.predict()

  imputed_test = AnnData(zeros(shape = (len(cells), len(proteins.var))))


In [None]:
# Inspect the predicted protein matrix (the printed array is truncated by NumPy).
predicted_test.X

array([[ 0.00147817, -0.4314466 ,  0.5979845 , ..., -0.00633523,
        -0.12941387,  0.28273675],
       [ 0.01139018, -0.18516386,  0.721429  , ..., -0.07820442,
        -0.07764927,  0.19643545],
       [-0.204282  , -0.7054657 ,  0.22438289, ..., -0.00374638,
         0.07973555,  0.03868665],
       ...,
       [-0.16044489, -0.46656895,  0.32845762, ...,  0.06550854,
        -0.04580078,  0.18342288],
       [-0.12856817, -0.49194428,  0.4411185 , ...,  0.02471163,
        -0.06149138, -0.03642452],
       [-0.27054924, -0.46203113,  0.3522766 , ..., -0.02863576,
        -0.06167051,  0.12905958]], dtype=float32)

In [None]:
# Inspect the held-out protein matrix for comparison with the predictions.
adata_protein_test.X

array([[  2.,  16.,  20., ...,   0.,   0., 104.],
       [  0.,   7.,  10., ...,   0.,   0.,  36.],
       [  1.,   5.,   1., ...,   0.,   0.,  29.],
       ...,
       [  1.,  15.,  12., ...,   0.,   0.,  66.],
       [  2.,  15.,  18., ...,   0.,   0., 156.],
       [  0.,   0.,   0., ...,   0.,   0.,   5.]], dtype=float32)

In [None]:
pred_protein = predicted_test.X
# log1p keeps zero counts at 0 without producing -inf (and the divide-by-zero warning),
# and, unlike log followed by resetting -inf to 0, does not map counts of 0 and 1 to the
# same value.
# NOTE(review): the predictions printed above contain negative values, so they appear to
# be on a different (normalized) scale than log counts - confirm that the two matrices
# are comparable before interpreting RMSE/SSIM.
true_protein_norm = np.log1p(adata_protein_test.X)

  


In [None]:
# Inspect the log-transformed held-out protein matrix.
true_protein_norm

array([[0.6931472, 2.7725887, 2.9957323, ..., 0.       , 0.       ,
        4.644391 ],
       [0.       , 1.9459102, 2.3025851, ..., 0.       , 0.       ,
        3.583519 ],
       [0.       , 1.609438 , 0.       , ..., 0.       , 0.       ,
        3.3672957],
       ...,
       [0.       , 2.7080503, 2.4849067, ..., 0.       , 0.       ,
        4.189655 ],
       [0.6931472, 2.7080503, 2.8903718, ..., 0.       , 0.       ,
        5.049856 ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        1.609438 ]], dtype=float32)

In [None]:
# Keep as many rows as the prediction matrix has, instead of a hard-coded row count that
# is only valid for one dataset.
# NOTE(review): this assumes the predicted rows are the first rows of the test set in the
# same order; if sciPENN's QC filtering drops other cells, align on cell names instead.
true_protein_cropped = true_protein_norm[:pred_protein.shape[0], :]

In [None]:
from sklearn.metrics import mean_squared_error
from skimage.metrics import structural_similarity as ssim

In [None]:
# Root-mean-squared error between the true and predicted matrices.
rmse = np.sqrt(mean_squared_error(true_protein_cropped, pred_protein))

# Pearson correlation computed row by row (axis=1), then averaged over rows.
true_frame = pd.DataFrame(true_protein_cropped)
pred_frame = pd.DataFrame(pred_protein)
pcc = true_frame.corrwith(pred_frame, axis=1, method='pearson')
avg_corr_pearson = pcc.mean()

# SSIM, with data_range taken from the spread of the predicted values only.
pred_range = pred_protein.max() - pred_protein.min()
ssim_val = ssim(true_protein_cropped, pred_protein, data_range=pred_range)

In [None]:
# Collect the three metrics into a one-row table and print it.
metric_columns = {
    'RMSE': [rmse],
    'Pearson Correlation': [avg_corr_pearson],
    'SSIM': ssim_val,  # scalar; pandas broadcasts it to the single row
}
results_df = pd.DataFrame(metric_columns)

print(results_df)

       RMSE  Pearson Correlation      SSIM
0  1.403809            -0.135941  0.032689
