In [None]:
import numpy as np
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

from matplotlib import rcParams
rcParams['pdf.fonttype'] = 42 # enables correct plotting of text
rcParams['figure.figsize'] = (7,7)

import random
import tensorflow as tf

seed = 42


random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)


# Import deepscore model # Add deepscore folder to path
from deepscore import deepscore 
import pickle



In [None]:

# LOAD the PROCESS the REFERENCE

# ## Preprocessing


cell_type = 'subclass.l1'

ref_py = sc.read('../HORIZONTAL_RNA/objects/local.h5ad')  # LOAD the HCA Atlas
ref_py = ref_py.raw.to_adata()
ref_py.layers['counts'] = ref_py.X.copy()

ref_py.var['ENSG'] = ref_py.var.index.copy()
ref_py.var.index = ref_py.var['feature_name'].copy()

# SUBSET THE ATLAS TO MATCH OUR SAMPLE BIOLOGY

cortex_celltypes_l1= ['DCT',
 'PEC',
 'CNT',
 'POD',
 'PT',
 'IC',
 'IMM',
 'NEU',
 'VSM/P',
 'TAL',
 'EC',
 'FIB',
 'PC']

cortex_celltypes_l3= ['DCT1',
 'DCT2',
 'IC-B',
 'B',
 'MD',
 'CNT-IC-A',
 'MC',
 'CNT',
 'PT-S1/2',
 'CCD-IC-A',
 'PEC',
 'pDC',
 'EC-GC',
 'POD',
 'VSMC',
 'aPT',
 'CNT-PC',
 'FIB',
 'cDC',
 'REN',
 'EC-PTC',
 'T',
 'PT-S3',
 'ncMON',
 'NKC/T',
 'aTAL1',
 'C-TAL',
 'PL',
 'CCD-PC',
 'SC/NEU',
 'EC-LYM',
 'MDC',
 'N',
 'MAST',
 'aTAL2',
 'aFIB',
 'MYOF',
 'MAC-M2',
 'EC-AEA',
 'VSMC/P']


ref_py = ref_py[ref_py.obs['subclass.l1'].isin(cortex_celltypes_l1)]

ref_py = ref_py[ref_py.obs['subclass.l3'].isin(cortex_celltypes_l3)]

ref_py = ref_py[ref_py.obs['condition.long'].isin(['Normal Reference'])]


ref_py = ref_py[ref_py.obs['state.l2'].isin(['reference','adaptive - epi','adaptive - str'])].copy()


cell_type = 'subclass.l1'
overlapping = False
compute = True


ref_py.X = ref_py.X.copy()
sc.pp.normalize_total(ref_py, target_sum=1e4)
sc.pp.log1p(ref_py)


markers_filename= f'HCA_l1'

# Identify HCA ATLAS differentially expressed genes between cell types 

if compute == True:
    sc.tl.rank_genes_groups(ref_py, cell_type, method='wilcoxon', use_raw=False)
    ranked_genes_populations = ref_py.uns['rank_genes_groups'].copy()
    with open(f'markers_ds/{markers_filename}.pickle', 'wb') as handle:
        pickle.dump(ranked_genes_populations, handle, protocol=pickle.HIGHEST_PROTOCOL)
else:
    with open(f'markers_ds/{markers_filename}.pickle', 'rb') as handle:
        ranked_genes_populations = pickle.load(handle) 

In [4]:
ref_py

AnnData object with n_obs × n_vars = 63638 × 33920
    obs: 'nCount_RNA', 'nFeature_RNA', 'library', 'percent.er', 'percent.mt', 'degen.score', 'aEpi.score', 'aStr.score', 'cyc.score', 'matrisome.score', 'collagen.score', 'glycoprotein.score', 'proteoglycan.score', 'S.Score', 'G2M.Score', 'experiment', 'specimen', 'condition.long', 'condition.l1', 'condition.l2', 'donor_id', 'region.l1', 'region.l2', 'percent.cortex', 'percent.medulla', 'tissue_type', 'id', 'pagoda_k100_infomap_coembed', 'subclass.full', 'subclass.l3', 'subclass.l2', 'subclass.l1', 'state.l2', 'state', 'class', 'structure', 'disease_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'eGFR', 'BMI', 'diabetes_history', 'hypertension', 'tissue_ontology_term_id', 'organism_ontology_term_id', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'is_primary_data', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self

In [None]:
## SET PARAMETERS
n_markers = 500 # Max number of markers to use per cell-type
overlapping = False # Parameter to control overlapping marker genes between cell types on the prediction.


ref_py_save = ref_py.copy()

for mod in ['scRNA','snRNA','scRNA5p']:
    if os.path.exists(f'csv/Deepscore_HCA_l1_{mod}_CLEAN.csv'):
        print(f'{mod} already exists!')

    adata = sc.read(f'../HORIZONTAL_RNA/objects/{mod}_raw.h5ad', compression='gzip')
    adata.X = adata.layers['counts'].copy()
    adata = adata[adata.obs['batch'].isin([mod])].copy()


    with open(f'markers_ds/{markers_filename}.pickle', 'rb') as handle:
        ranked_genes_populations = pickle.load(handle) 

    if overlapping:
        selected_markers =[]
        for cell_type_ in ref_py.obs[cell_type].unique():
            cell_type_markers = []
            for marker in ranked_genes_populations['names'][cell_type_][:n_markers]:
                if marker in adata.var.index: 
                    selected_markers.append(marker)
        selected_markers = set(selected_markers)

    else:
        # Step 2: Store markers for each subset
        subset_markers_dict ={}
        for subset in ref_py.obs[cell_type].unique():
            subset_markers = ranked_genes_populations['names'][subset]
            subset_markers = [gene for gene in subset_markers if gene in adata.var.index]
            subset_markers_dict[subset] = set(subset_markers[:n_markers+100])

        # Step 3: Identify overlapping markers
        overlapping_markers = set()
        for subset, markers in subset_markers_dict.items():
            for other_subset, other_markers in subset_markers_dict.items():
                if subset != other_subset:
                    overlapping_markers.update(markers.intersection(other_markers))

        # Step 4: Select markers for each subset, excluding overlapping markers
        marker_dict = {}
        for subset, markers in subset_markers_dict.items():
            unique_markers = [marker for marker in markers if marker not in overlapping_markers]
            marker_dict[subset] = unique_markers[:n_markers]  # Select up to TOP n_markers
        selected_markers = [marker for subset in marker_dict for marker in marker_dict[subset]]



    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)


    # Subset the data to the selected markers

    ref_py = ref_py_save[:, list(selected_markers)].copy()
    adata = adata[:, list(selected_markers)].copy()

    len(selected_markers)

    sc.pp.scale(ref_py)
    sc.pp.scale(adata)

    ref_py.obs[cell_type] = ref_py.obs[cell_type].tolist()
    len(ref_py.obs[cell_type].unique())


    def scheduler(epoch, lr):
        if epoch < 10:
            return lr
        else:
            return lr * tf.math.exp(-0.1)


    n_feat = ref_py.shape[1]
    n_labs = len(ref_py.obs[cell_type].unique())

    ds = deepscore.DeepScore(hidden_nodes=[1024, 256],
                   n_features=n_feat, 
                   n_labels=n_labs,
                   epochs=30,
                   batch_size=128, 
                   activation="relu", 
                   dropout=True, 
                   dropout_rate=0.1,
                   batchnorm=True, 
                   lr=0.001,
                   weight_reg=True)




    import os
    os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"

    ds.set_reference(ref_py, label_by=cell_type, test_prop=0.1)

    ds.train(earlystopping=True, patience=10, lr_scheduler=scheduler,)
    # ds.model.save(f'models/deepscore') # In case you want to save the DS model

    prob_df, adata = ds.annotate(adata, pred_key='Deepscore_HCA',Unclassified = False,return_pred_matrix=True)

    # SAVE the RESULTS on csv
    adata.obs[['Deepscore_HCA','Deepscore_HCA_score']].to_csv(f'csv/Deepscore_HCA_l1_{mod}_CLEAN.csv')

    prob_df.to_csv(f'csv/prob_matrix/Deepscore_HCA_l1_{mod}_CLEAN.csv')


scRNA already exists!


2024-01-29 11:48:59.300151: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2024-01-29 11:48:59.300194: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: IJC20724
2024-01-29 11:48:59.300204: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: IJC20724
2024-01-29 11:48:59.300386: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: Not found: was unable to find libcuda.so DSO loaded into this program
2024-01-29 11:48:59.300433: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 525.125.6
2024-01-29 11:48:59.300915: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild Tenso

Model: "deepscore"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization (BatchNo (None, 2565)              10260     
_________________________________________________________________
dense1024 (Dense)            (None, 1024)              2627584   
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 1024)              4096      
_________________________________________________________________
dense256 (Dense)             (None, 256)               262400    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
batch_normalization_2 (Batch (None, 256)               10

2024-01-29 11:48:59.631055: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 587631240 exceeds 10% of free system memory.
2024-01-29 11:48:59.894869: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2024-01-29 11:48:59.894896: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2024-01-29 11:48:59.916028: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2024-01-29 11:48:59.917813: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 528861960 exceeds 10% of free system memory.
2024-01-29 11:49:00.217774: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/30
  9/403 [..............................] - ETA: 9s - loss: 1.1736 - categorical_accuracy: 0.6736 

2024-01-29 11:49:01.230029: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2024-01-29 11:49:01.230051: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2024-01-29 11:49:01.249695: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2024-01-29 11:49:01.251707: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2024-01-29 11:49:01.256013: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: ./deepscore_logs/train/plugins/profile/2024_01_29_11_49_01

2024-01-29 11:49:01.257296: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for trace.json.gz to ./deepscore_logs/train/plugins/profile/2024_01_29_11_49_01/IJC20724.trace.json.gz
2024-01-29 11:49:01.261654: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: ./deepscore_logs/train/plugins/profile/2024_01_29_11_49_0

Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30

Evaluating model performance on unseen data (test data):


test loss: 0.20803, test accuracy:'               '0.95805


2024-01-29 11:50:41.081102: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 364363380 exceeds 10% of free system memory.
2024-01-29 11:50:41.237652: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 364363380 exceeds 10% of free system memory.


snRNA already exists!
Model: "deepscore"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_3 (Batch (None, 2565)              10260     
_________________________________________________________________
dense1024 (Dense)            (None, 1024)              2627584   
_________________________________________________________________
dropout_2 (Dropout)          (None, 1024)              0         
_________________________________________________________________
batch_normalization_4 (Batch (None, 1024)              4096      
_________________________________________________________________
dense256 (Dense)             (None, 256)               262400    
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
batch_normalization_5 (Batch (None,

2024-01-29 11:50:57.109355: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 587631240 exceeds 10% of free system memory.
2024-01-29 11:50:57.360644: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2024-01-29 11:50:57.360663: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2024-01-29 11:50:57.360691: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.


Epoch 1/30
  6/403 [..............................] - ETA: 13s - loss: 1.4632 - categorical_accuracy: 0.5911

2024-01-29 11:50:58.139110: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2024-01-29 11:50:58.139129: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2024-01-29 11:50:58.200394: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2024-01-29 11:50:58.201619: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2024-01-29 11:50:58.203792: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: ./deepscore_logs/train/plugins/profile/2024_01_29_11_50_58

2024-01-29 11:50:58.205042: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for trace.json.gz to ./deepscore_logs/train/plugins/profile/2024_01_29_11_50_58/IJC20724.trace.json.gz
2024-01-29 11:50:58.206975: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: ./deepscore_logs/train/plugins/profile/2024_01_29_11_50_5

Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30

Evaluating model performance on unseen data (test data):


test loss: 0.23064, test accuracy:'               '0.95475
scRNA5p already exists!
Model: "deepscore"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_6 (Batch (None, 2565)              10260     
_________________________________________________________________
dense1024 (Dense)            (None, 1024)              2627584   
_________________________________________________________________
dropout_4 (Dropout)          (None, 1024)              0         
_________________________________________________________________
batch_normalization_7 (Batch (None, 1024)              4096      
_________________________________________________________________
dense256 (Dense)             (None, 256)               262400    


2024-01-29 11:52:36.776479: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2024-01-29 11:52:36.776500: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2024-01-29 11:52:36.776566: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.


Epoch 1/30
  6/403 [..............................] - ETA: 12s - loss: 1.4345 - categorical_accuracy: 0.5794

2024-01-29 11:52:37.544067: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2024-01-29 11:52:37.544088: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2024-01-29 11:52:37.603690: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2024-01-29 11:52:37.604771: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2024-01-29 11:52:37.606774: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: ./deepscore_logs/train/plugins/profile/2024_01_29_11_52_37

2024-01-29 11:52:37.607931: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for trace.json.gz to ./deepscore_logs/train/plugins/profile/2024_01_29_11_52_37/IJC20724.trace.json.gz
2024-01-29 11:52:37.609667: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: ./deepscore_logs/train/plugins/profile/2024_01_29_11_52_3

Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30

Evaluating model performance on unseen data (test data):


test loss: 0.21356, test accuracy:'               '0.95239
