In [1]:

#!/usr/bin/env python
# coding: utf-8

# # CODE

# In[1]:


import os
import numpy as np
import scanpy as sc
import anndata
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

from scipy import sparse
from matplotlib import rcParams
rcParams['pdf.fonttype'] = 42 # enables correct plotting of text
rcParams['figure.figsize'] = (7,7)
import seaborn as sns
from scipy import sparse

import random
import tensorflow as tf

# Single seed value shared by every random number generator seeded below.
seed = 42


# Seed Python's `random` module, NumPy's global RNG and TensorFlow's global RNG
# with the same value.
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# In[2]:


# Hex colour per cell-type label.
# Some labels share a colour: DCT2 / TL / DCT, MES_FIB / MES, and
# Unclassified / Low_Quality_RNA.
# NOTE(review): `palette` is not referenced in the visible part of the notebook.
palette = {'CNT':'#1f77b4',
 'DCT1':'#ff7f0e',
 'DCT2':'#279e68',
 'TL':'#279e68',
 'DCT':'#279e68',
 'ENDO':'#d62728',
 'FIB':'#aa40fc',
 'ICA':'#8c564b',
 'ICB':'#e377c2',
 'LEUK':'#b5bd61',
 'MES_FIB':'#17becf',
 'MES':'#17becf',
 'PC':'#aec7e8',
 'PEC':'#ffbb78',
 'PODO':'#98df8a',
 'PT':'#ff9896',
 'PT_VCAM1':'#c5b0d5',
 'TAL':'#c49c94',
 'Unclassified':'#808080',
 'Unknown':'#000000',
 'Low_Quality_RNA':'#808080'}


# In[3]:


import sys 
from deepscore import deepscore
from deepscore import marker_analysis
import pickle


# In[4]:
# Name of the `.obs` column used as the label: it is the grouping key for
# marker ranking and the `label_by` column when the reference is set for training.
cell_type = 'subclass.l1'


# # FINE GRAINED LABEL TRANSFER PER COMPARTMENT

# ## Preprocessing

# In[5]:


# Load the reference dataset and switch to the matrix stored in `.raw`.
ref_py = sc.read('../HORIZONTAL_RNA/objects/local.h5ad')
ref_py = ref_py.raw.to_adata()
# Keep a separate copy of that matrix; X itself is normalized in place further down.
# NOTE(review): presumably `.raw` holds un-normalized counts — confirm.
ref_py.layers['counts'] = ref_py.X.copy()

# Preserve the original var index in an 'ENSG' column, then index genes by 'feature_name'.
# NOTE(review): assumes the original index holds Ensembl gene IDs and that
# 'feature_name' values are unique plain strings — TODO confirm.
ref_py.var['ENSG'] = ref_py.var.index.copy()
ref_py.var.index = ref_py.var['feature_name'].copy()

# Values of obs['subclass.l1'] that are kept when the reference is filtered below.
# NOTE(review): presumably the level-1 subclasses found in cortex — confirm.
cortex_celltypes_l1= ['DCT',
 'PEC',
 'CNT',
 'POD',
 'PT',
 'IC',
 'IMM',
 'NEU',
 'VSM/P',
 'TAL',
 'EC',
 'FIB',
 'PC']

# Values of obs['subclass.l3'] that are kept when the reference is filtered below.
# NOTE(review): presumably the level-3 subclasses found in cortex — confirm.
cortex_celltypes_l3= ['DCT1',
 'DCT2',
 'IC-B',
 'B',
 'MD',
 'CNT-IC-A',
 'MC',
 'CNT',
 'PT-S1/2',
 'CCD-IC-A',
 'PEC',
 'pDC',
 'EC-GC',
 'POD',
 'VSMC',
 'aPT',
 'CNT-PC',
 'FIB',
 'cDC',
 'REN',
 'EC-PTC',
 'T',
 'PT-S3',
 'ncMON',
 'NKC/T',
 'aTAL1',
 'C-TAL',
 'PL',
 'CCD-PC',
 'SC/NEU',
 'EC-LYM',
 'MDC',
 'N',
 'MAST',
 'aTAL2',
 'aFIB',
 'MYOF',
 'MAC-M2',
 'EC-AEA',
 'VSMC/P']


# A reference cell is kept only if each of these obs columns holds one of the
# listed values; the surviving cells are then materialized as an independent
# AnnData object (not a view).
_allowed_by_column = {
    'subclass.l1': cortex_celltypes_l1,
    'subclass.l3': cortex_celltypes_l3,
    'condition.long': ['Normal Reference'],
    'state.l2': ['reference','adaptive - epi','adaptive - str'],
}
_keep_cell = np.ones(ref_py.n_obs, dtype=bool)
for _column, _allowed in _allowed_by_column.items():
    _keep_cell &= ref_py.obs[_column].isin(_allowed).to_numpy()
ref_py = ref_py[_keep_cell].copy()


# Switches read by the cells below:
#   compute     - recompute the marker ranking instead of loading the cached pickle
#   cell_type   - obs column used as the label
#   overlapping - marker-selection mode
compute, cell_type, overlapping = False, 'subclass.l1', False

# In[11]:


# Normalize every reference cell to a total of 10,000 and log1p-transform, in place on X.
ref_py.X = ref_py.X.copy()
sc.pp.normalize_total(ref_py, target_sum=1e4)
sc.pp.log1p(ref_py)


# Location of the cached marker ranking. `markers_filename` is reused by a later cell.
markers_filename = 'HCA_l1'
markers_path = f'markers_ds/{markers_filename}.pickle'

# Identify differentially expressed genes between cell types.
# The ranking is recomputed when `compute` is set, and also when no cache file
# exists yet, so a fresh checkout does not fail on the load below.
if compute or not os.path.exists(markers_path):
    sc.tl.rank_genes_groups(ref_py, cell_type, method='wilcoxon', use_raw=False)
    ranked_genes_populations = ref_py.uns['rank_genes_groups'].copy()
    # The cache directory may not exist yet; create it before writing.
    os.makedirs(os.path.dirname(markers_path), exist_ok=True)
    with open(markers_path, 'wb') as handle:
        pickle.dump(ranked_genes_populations, handle, protocol=pickle.HIGHEST_PROTOCOL)
else:
    # SECURITY: unpickling can execute arbitrary code; only load cache files
    # that were written by this notebook.
    with open(markers_path, 'rb') as handle:
        ranked_genes_populations = pickle.load(handle)

2024-08-27 15:48:42.716777: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-27 15:48:42.859244: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-08-27 15:48:42.861614: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Marker-selection mode read by the `if overlapping:` test in the marker-selection
# cell: False takes the branch that discards markers shared between labels.
overlapping = False

In [3]:
# Snapshot of the normalized, log-transformed reference with all genes.
# The marker-subset step later rebuilds `ref_py` from this copy.
ref_py_save = ref_py.copy()

In [4]:
# Display the AnnData summary of the filtered reference.
ref_py

AnnData object with n_obs × n_vars = 63638 × 33920
    obs: 'nCount_RNA', 'nFeature_RNA', 'library', 'percent.er', 'percent.mt', 'degen.score', 'aEpi.score', 'aStr.score', 'cyc.score', 'matrisome.score', 'collagen.score', 'glycoprotein.score', 'proteoglycan.score', 'S.Score', 'G2M.Score', 'experiment', 'specimen', 'condition.long', 'condition.l1', 'condition.l2', 'donor_id', 'region.l1', 'region.l2', 'percent.cortex', 'percent.medulla', 'tissue_type', 'id', 'pagoda_k100_infomap_coembed', 'subclass.full', 'subclass.l3', 'subclass.l2', 'subclass.l1', 'state.l2', 'state', 'class', 'structure', 'disease_ontology_term_id', 'sex_ontology_term_id', 'development_stage_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'eGFR', 'BMI', 'diabetes_history', 'hypertension', 'tissue_ontology_term_id', 'organism_ontology_term_id', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'is_primary_data', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self

In [6]:
# Maximum number of marker genes kept per label.
n_markers = 500

# NOTE(review): `redo` is assigned here but never read in the visible code.
redo = True
## SET PARAMETERS
# Tag of the query dataset; it is embedded in the output CSV file names.
mod = 'SmartSeq'

# NOTE(review): this check only prints a notice. The rest of the cell still runs
# and the CSV is written again at the end.
if os.path.exists(f'csv/Deepscore_HCA_l1_{mod}_CLEAN.csv'):
    print(f'{mod} already exists!')

# Load the query dataset and switch to the matrix stored in `.raw`.
adata = sc.read('../SmartSeq/data/SN0307813/seur_harmony_with_Raji_Jun6.h5ad', compression='gzip')
adata = adata.raw.to_adata()
# adata = adata[adata.obs['batch'].isin([mod])].copy()

# Reload the cached ranking of differentially expressed genes between cell types.
# SECURITY: unpickling can execute arbitrary code; only load files written by this notebook.

with open(f'markers_ds/{markers_filename}.pickle', 'rb') as handle:
    ranked_genes_populations = pickle.load(handle) 

# Build `selected_markers`: the ordered list of genes used as model input features.
#
# Ranked genes are kept in ordered lists throughout. Passing them through a
# `set` before truncating or concatenating would discard the ranking and make
# the feature order depend on string hashing, which differs between
# interpreter runs and so defeats the fixed random seeds.
if overlapping:
    # Overlapping mode: take the top `n_markers` ranked genes of every label,
    # keep those present in the query, and allow genes shared between labels.
    selected_markers = []
    for cell_type_ in ref_py.obs[cell_type].unique():
        for marker in ranked_genes_populations['names'][cell_type_][:n_markers]:
            if marker in adata.var.index:
                selected_markers.append(marker)
    # Drop repeats while preserving first-seen order.
    selected_markers = list(dict.fromkeys(selected_markers))
else:
    # Step 1: per label, the first `n_markers + 100` ranked genes that are also
    # present in the query. `ranked_candidates` keeps them in rank order;
    # `subset_markers_dict` holds the same genes as sets for overlap detection.
    subset_markers_dict = {}
    ranked_candidates = {}
    for subset in ref_py.obs[cell_type].unique():
        subset_markers = ranked_genes_populations['names'][subset]
        subset_markers = [gene for gene in subset_markers if gene in adata.var.index]
        ranked_candidates[subset] = list(dict.fromkeys(subset_markers[:n_markers+100]))
        subset_markers_dict[subset] = set(ranked_candidates[subset])

    # Step 2: collect every gene that is a candidate for more than one label.
    overlapping_markers = set()
    for subset, markers in subset_markers_dict.items():
        for other_subset, other_markers in subset_markers_dict.items():
            if subset != other_subset:
                overlapping_markers.update(markers.intersection(other_markers))

    # Step 3: per label, keep the best-ranked `n_markers` candidates that are
    # not shared with any other label.
    marker_dict = {}
    for subset, markers in ranked_candidates.items():
        unique_markers = [marker for marker in markers if marker not in overlapping_markers]
        marker_dict[subset] = unique_markers[:n_markers]
    selected_markers = [marker for subset in marker_dict for marker in marker_dict[subset]]

# print(selected_markers)
# In[13]:


# Normalize every query cell to a total of 10,000 and log1p-transform, in place.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)


# Restrict reference and query to the selected marker genes, using one shared
# column order for both. The reference is rebuilt from the all-genes snapshot.
_marker_columns = list(selected_markers)
ref_py = ref_py_save[:, _marker_columns].copy()
adata = adata[:, _marker_columns].copy()

# Bare expression in the middle of a cell: evaluated but not displayed.
len(selected_markers)

# Scale the reference and the query, each on its own.
for _dataset in (ref_py, adata):
    sc.pp.scale(_dataset)

# Re-assign the label column from a plain Python list of its values.
ref_py.obs[cell_type] = ref_py.obs[cell_type].tolist()
# Bare expression in the middle of a cell: evaluated but not displayed.
len(ref_py.obs[cell_type].unique())


import tensorflow as tf



def scheduler(epoch, lr):
    """Learning-rate schedule: hold the rate for 10 epochs, then decay it.

    For epoch indices below 10 the current rate is returned unchanged.
    From index 10 onwards each call multiplies the current rate by exp(-0.1).

    The decayed rate is returned as a plain Python float rather than a
    TensorFlow tensor, because newer Keras versions reject a learning-rate
    schedule whose output is not a float.

    Args:
        epoch: Epoch index supplied by the training loop.
        lr: Learning rate currently in use.

    Returns:
        The learning rate to use for this epoch.
    """
    # Local import so the function carries its own dependency.
    import math

    if epoch < 10:
        return lr
    return float(lr) * math.exp(-0.1)


# Model dimensions come from the marker-subset reference: the number of
# columns of `ref_py` and the number of distinct values in the label column.
_, n_feat = ref_py.shape
n_labs = len(ref_py.obs[cell_type].unique())

# Keyword arguments for the DeepScore constructor, gathered in one place.
_deepscore_kwargs = dict(
    hidden_nodes=[1024, 256],
    n_features=n_feat,
    n_labels=n_labs,
    epochs=30,
    batch_size=128,
    activation="relu",
    dropout=True,
    dropout_rate=0.1,
    batchnorm=True,
    lr=0.001,
    weight_reg=True,
)
ds = deepscore.DeepScore(**_deepscore_kwargs)



#     sc.pp.neighbors(adata, n_neighbors=10, use_rep='X_lsi_harmonized')
#     sc.tl.umap(adata)

#     sc.pl.umap(adata, color = ['Deepscore_external','Deepscore','Deepscore_RNA'], size = 15, legend_loc='on data')

# from numba import cuda 
# device = cuda.get_current_device()
# device.reset()
# In[15]:


import os
# os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"


# In[16]:


import sklearn


#     partitions =6
#     interval = int(ref_py.shape[0]/partitions)
#     for i in range(partitions-1):
#         ref_ = ref_py[sklearn.utils.resample(ref_py.obs,random_state=42,n_samples=interval,replace=True,stratify=ref_py.obs[cell_type]).index].copy()

# Output files. Their folders are created before training starts, so a missing
# directory cannot discard a finished training run at the final write.
predictions_csv = f'csv/Deepscore_HCA_l1_{mod}_CLEAN.csv'
prob_matrix_csv = f'csv/prob_matrix/Deepscore_HCA_l1_{mod}_CLEAN.csv'
os.makedirs(os.path.dirname(prob_matrix_csv), exist_ok=True)  # also creates 'csv/'

# Register the reference with its label column.
# NOTE(review): `test_prop=0.1` presumably holds out 10% of the reference for evaluation — confirm.
ds.set_reference(ref_py, label_by=cell_type, test_prop=0.1)

# Train with early stopping and the learning-rate schedule defined above.
ds.train(earlystopping=True, patience=10, lr_scheduler=scheduler)
# ds.model.save(f'models/deepscore')


# Annotate the query. With `return_pred_matrix=True` the call returns a
# prediction matrix together with the annotated AnnData.
# NOTE(review): the 'Deepscore_HCA_score' column exported below is presumably
# added by `annotate` alongside `pred_key` — confirm.
prob_df, adata = ds.annotate(adata, pred_key='Deepscore_HCA',Unclassified = False,return_pred_matrix=True)


# Export the per-cell prediction and score, then the full prediction matrix.
adata.obs[['Deepscore_HCA','Deepscore_HCA_score']].to_csv(predictions_csv)

prob_df.to_csv(prob_matrix_csv)



This is where adjacency matrices should go now.


Model: "deepscore"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization (Batch  (None, 2562)              10248     
 Normalization)                                                  
                                                                 
 dense1024 (Dense)           (None, 1024)              2624512   
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 batch_normalization_1 (Bat  (None, 1024)              4096      
 chNormalization)                                                
                                                                 
 dense256 (Dense)            (None, 256)               262400    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0 

In [7]:
mod

'SmartSeq'