In [4]:
import numpy as np
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

from matplotlib import rcParams
rcParams['pdf.fonttype'] = 42 # enables correct plotting of text
rcParams['figure.figsize'] = (7,7)

import random
import tensorflow as tf

# One seed shared by every random number generator seeded in this cell.
seed = 42

# Seed Python's `random`, NumPy's global RNG and TensorFlow, in that order.
for _seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_rng(seed)


# Import deepscore model # Add deepscore folder to path
from deepscore import deepscore 
import pickle

In [None]:
# cell_type = 'cell_type'

# Read the heart reference AnnData from its .h5ad file.
reference_h5ad = '/home/macera/Documentos/CZI/MANUSCRIPT_PREP/REVIEWS/external_data/heart/reference.h5ad'
ref_py = sc.read(reference_h5ad)

In [3]:
# ref_py.layers['counts'] = ref_py.X.copy()

# Keep a copy of the current variable names in an 'ENSG' column of .var
# (presumably Ensembl gene IDs, going by the column name — TODO confirm).
ref_py.var['ENSG'] = ref_py.var_names.copy()
# ref_py.var.index = ref_py.var['feature_name'].copy()

In [4]:
# Display the stored values of X to eyeball whether they look like raw counts
# (`.data` assumes X is a sparse matrix — TODO confirm).
ref_py.X.data

array([1., 1., 1., ..., 1., 2., 1.], dtype=float32)

In [None]:
# -*- coding: utf-8 -*-
from pathlib import Path
import os
import pickle
import scanpy as sc
import numpy as np
import pandas as pd

# ----------------------------
# Config
# ----------------------------
OVERLAPPING = False  # (unused placeholder)
COMPUTE = False      # set to True to (re)compute the marker pickles in step (2)

IN_ADATA = ref_py  # <-- assumes you already have an AnnData named ref_py in memory
CELLTYPE_KEY = "cell_type"
SUBSTATE_KEY = "cell_state"

OUT_SUBSETS = Path("objects/reference_subset_clean")
OUT_MARKERS = Path("markers_ds")

OUT_SUBSETS.mkdir(parents=True, exist_ok=True)
OUT_MARKERS.mkdir(parents=True, exist_ok=True)


def _safe_name(label):
    """Return `label` with every '/' replaced by '_' so it can be used as a file name."""
    return label.replace("/", "_")


# ----------------------------
# (0) Library-size normalize and log-transform X once.
#     NOTE: IN_ADATA is the same object as ref_py, so this rewrites ref_py.X in
#     place and is NOT idempotent: re-running this cell without reloading ref_py
#     normalizes the data a second time.
# ----------------------------
IN_ADATA.X = IN_ADATA.X.copy()
sc.pp.normalize_total(IN_ADATA, target_sum=1e4)
sc.pp.log1p(IN_ADATA)

# ----------------------------
# (1) Save per-cell_type subsets and, in the same pass, record which cell
#     types have only a single substate (no need to re-read the files).
# ----------------------------
cell_types = IN_ADATA.obs[CELLTYPE_KEY].unique().tolist()
single_cat_ct = []
for ct in cell_types:
    safe = _safe_name(ct)
    adata_ct = IN_ADATA[IN_ADATA.obs[CELLTYPE_KEY] == ct].copy()
    adata_ct.write(OUT_SUBSETS / f"{safe}.h5ad", compression="gzip")
    if adata_ct.obs[SUBSTATE_KEY].nunique() == 1:
        print(f"{safe}: Single category")
        single_cat_ct.append(ct)

multi_state_ct = [ct for ct in cell_types if ct not in single_cat_ct]

# ----------------------------
# (2) For each multi-state cell_type, run DE across substates
# ----------------------------
if COMPUTE:
    for ct in multi_state_ct:
        safe = _safe_name(ct)
        markers_filename = f"heart_cell_state_{safe}"
        markers_path = OUT_MARKERS / f"{markers_filename}.pickle"

        # Skip cell types whose ranking is already on disk. The check must use the
        # full output path; a bare file name without directory/extension never exists.
        if markers_path.exists():
            print(f"{safe}: {markers_path} already exists; skipping.")
            continue

        adata_ct = sc.read(OUT_SUBSETS / f"{safe}.h5ad")

        # Minimal QC. The subsets were written after step (0), so these thresholds
        # apply to sums of log-normalized values, not to raw counts.
        sc.pp.filter_cells(adata_ct, min_counts=3)
        sc.pp.filter_genes(adata_ct, min_counts=3)

        # The subsets on disk are already normalized + log1p-transformed (step 0).
        # Do NOT normalize/log again here: rank_genes_groups expects data that has
        # been logarithmized once.

        # Ensure grouping column is categorical and has >1 groups
        adata_ct.obs[SUBSTATE_KEY] = adata_ct.obs[SUBSTATE_KEY].astype("category")
        if adata_ct.obs[SUBSTATE_KEY].nunique() < 2:
            print(f"{safe}: <2 groups after filtering; skipping.")
            continue

        # Wilcoxon DE across substates (ties correction on)
        sc.tl.rank_genes_groups(
            adata_ct,
            groupby=SUBSTATE_KEY,
            method="wilcoxon",
            use_raw=False,
            tie_correct=True,
            n_genes=adata_ct.n_vars  # return full ranking
        )

        ranked = adata_ct.uns["rank_genes_groups"].copy()
        with open(markers_path, "wb") as handle:
            pickle.dump(ranked, handle, protocol=pickle.HIGHEST_PROTOCOL)

        print(f"{safe}: wrote {markers_path}")



Endothelial cell: wrote markers_ds/heart_cell_state_Endothelial cell.pickle
Mural cell: wrote markers_ds/heart_cell_state_Mural cell.pickle
Fibroblast: wrote markers_ds/heart_cell_state_Fibroblast.pickle
Atrial Cardiomyocyte: wrote markers_ds/heart_cell_state_Atrial Cardiomyocyte.pickle
Ventricular Cardiomyocyte: wrote markers_ds/heart_cell_state_Ventricular Cardiomyocyte.pickle


In [4]:
# Load the query AnnData and attach the level-1 DeepScore predictions from CSV.
adata = sc.read('/home/macera/Documentos/CZI/MANUSCRIPT_PREP/REVIEWS/external_data/heart/query.h5ad')

ds_hca = pd.read_csv('csv/Deepscore_HCA_l1_HEART.csv', index_col=0)

# Column assignment aligns on the obs index: cells missing from the CSV become NaN.
adata.obs['Deepscore_HCA_l1'] = ds_hca['Deepscore_HCA'].astype('category')
# The score is numeric; keep it as float rather than a categorical with one
# category per distinct value.
adata.obs['Deepscore_HCA_l1_score'] = ds_hca['Deepscore_HCA_score'].astype(float)

# Surface silent index mismatches between the query and the prediction CSV.
n_unlabelled = int(adata.obs['Deepscore_HCA_l1'].isna().sum())
if n_unlabelled:
    print(f'WARNING: {n_unlabelled} query cells have no Deepscore_HCA_l1 label')


In [7]:

# Write one gzip-compressed .h5ad per predicted level-1 label.
import os

query_subset_dir = 'objects/QUERY_Heart_subset'
# exist_ok avoids a bare `except:` that would also hide real failures
# (e.g. a missing parent directory or a permission error).
os.makedirs(query_subset_dir, exist_ok=True)

# dropna(): a NaN label is a float and has no .replace(), so skip unlabelled cells.
for i in adata.obs['Deepscore_HCA_l1'].dropna().unique():
    print(i)
    a = i.replace('/','_')
    adata[adata.obs['Deepscore_HCA_l1'].isin([i])].write(f'{query_subset_dir}/{a}.h5ad', compression='gzip')

Directory for subset saving already exists
Endothelial cell
Mural cell
Fibroblast
Ventricular Cardiomyocyte
Atrial Cardiomyocyte


In [5]:
# Show the distinct values of the query's 'cell_type' column.
adata.obs['cell_type'].unique()

['Endothelial cell', 'Mural cell', 'Fibroblast', 'Ventricular Cardiomyocyte', 'Atrial Cardiomyocyte']
Categories (5, object): ['Atrial Cardiomyocyte', 'Ventricular Cardiomyocyte', 'Fibroblast', 'Endothelial cell', 'Mural cell']

In [6]:
# Cell types to process, one model per entry.
cell_type_list = [
    'Endothelial cell',
    'Mural cell',
    'Fibroblast',
    'Ventricular Cardiomyocyte',
    'Atrial Cardiomyocyte',
]


In [3]:
import os
n_markers = 300
cell_type = 'cell_state'

cell_type_list= ['Endothelial cell', 'Mural cell', 'Fibroblast', 'Ventricular Cardiomyocyte', 'Atrial Cardiomyocyte']


def scheduler(epoch, lr):
    """Keep the learning rate for the first 10 epochs, then multiply it by exp(-0.1) each epoch."""
    if epoch < 10:
        return lr
    else:
        return lr * tf.math.exp(-0.1)


# NOTE(review): TensorFlow is already imported at this point; whether it still
# honours this variable here is not established — consider setting it before
# the first TensorFlow import.
os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"

# The probability matrices are written below; make sure the folder exists.
os.makedirs('csv/prob_matrix', exist_ok=True)

for i in cell_type_list:
    a = i.replace('/','_')

    # Cache check against the file this loop actually writes. (It previously
    # looked for '..._CLEAN.csv', which is never written, so nothing was skipped.)
    out_csv = f'csv/Deepscore_HCA_l3_{a}.csv'
    if os.path.exists(out_csv):
        print(f'{a} already exists!')
        continue

    ref_py = sc.read(f'objects/reference_subset_clean/{a}.h5ad')
    adata = sc.read(f'objects/QUERY_Heart_subset/{a}.h5ad', compression='gzip')

    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)

    # Ranked genes per cell state, as pickled by the marker-computation cell.
    # NOTE: pickle.load executes arbitrary code; only load files you produced.
    markers_filename = f"heart_cell_state_{a}"
    with open(f'markers_ds/{markers_filename}.pickle', 'rb') as handle:
        ranked_genes_populations = pickle.load(handle)

    # Step 1: per substate, shortlist the top (n_markers + 100) ranked genes that
    # are present in the query. Lists (not sets) keep the ranking order, so the
    # later truncation really keeps the best-ranked genes and the feature order
    # is reproducible between runs.
    subset_markers_dict = {}
    for subset in ref_py.obs[cell_type].unique():
        subset_markers = [gene for gene in ranked_genes_populations['names'][subset]
                          if gene in adata.var.index]
        subset_markers_dict[subset] = list(dict.fromkeys(subset_markers))[:n_markers+100]

    # Step 2: genes shortlisted for more than one substate
    seen_markers = set()
    overlapping_markers = set()
    for markers in subset_markers_dict.values():
        overlapping_markers.update(seen_markers.intersection(markers))
        seen_markers.update(markers)

    # Step 3: per substate, keep up to n_markers non-overlapping genes in rank order
    marker_dict = {}
    for subset, markers in subset_markers_dict.items():
        unique_markers = [marker for marker in markers if marker not in overlapping_markers]
        marker_dict[subset] = unique_markers[:n_markers]
    selected_markers = [marker for subset in marker_dict for marker in marker_dict[subset]]

    # Subset the data to the selected markers
    ref_py = ref_py[:, list(selected_markers)].copy()
    adata = adata[:, list(selected_markers)].copy()

    sc.pp.scale(ref_py)
    sc.pp.scale(adata)

    # Plain Python values for the label column (drops the categorical dtype).
    ref_py.obs[cell_type] = ref_py.obs[cell_type].tolist()

    n_feat = ref_py.shape[1]
    n_labs = len(ref_py.obs[cell_type].unique())

    ds = deepscore.DeepScore(hidden_nodes=[512, 256],
                n_features=n_feat,
                n_labels=n_labs,
                epochs=30,
                batch_size=128,
                activation="relu",
                dropout=True,
                dropout_rate=0.3,
                batchnorm=True,
                lr=0.001,
                weight_reg=True)

    ds.set_reference(ref_py, label_by=cell_type, test_prop=0.1)

    ds.train(earlystopping=True, patience=10, lr_scheduler=scheduler,)

    prob_df, adata = ds.annotate(adata, pred_key='Deepscore_HCA',Unclassified = False,return_pred_matrix=True)

    adata.obs[['Deepscore_HCA','Deepscore_HCA_score']].to_csv(out_csv)

    prob_df.to_csv(f'csv/prob_matrix/Deepscore_HCA_l3_{a}.csv')

Model: "deepscore"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization (Batch  (None, 1653)              6612      
 Normalization)                                                  
                                                                 
 dense512 (Dense)            (None, 512)               846848    
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 batch_normalization_1 (Bat  (None, 512)               2048      
 chNormalization)                                                
                                                                 
 dense256 (Dense)            (None, 256)               131328    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0 

In [5]:
# GRID SEARCH

"""
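Auto-tuning wrapper for DeepScore per cell type.

- Reproducibility: NumPy and TensorFlow are re-seeded with a fixed seed before every trial.
- Exhaustive search over GRID (1,728 combinations per cell type as configured below).
- Picks the best config per cell type by highest test accuracy, breaking ties on lower
  test loss; val_acc / val_loss are recorded as None because they are not read back.
- Saves the best Keras model and the CSV outputs, preserving your filenames.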
"""
import os, pickle, json, time
import numpy as np
import scanpy as sc
import tensorflow as tf
from itertools import product
from copy import deepcopy

# --- Determinism settings ---
SEED = 0

# NOTE(review): `tensorflow` is already imported above when these variables are
# assigned, so it is not established here that TensorFlow still reads them;
# PYTHONHASHSEED is only read at interpreter start-up, so it affects child
# processes, not this kernel. Move them before the imports if they must apply.
os.environ.update({
    "PYTHONHASHSEED": str(SEED),
    "TF_DETERMINISTIC_OPS": "1",
    "TF_GPU_ALLOCATOR": "cuda_malloc_async",
})

# Seed NumPy's global RNG and TensorFlow.
np.random.seed(SEED)
tf.random.set_seed(SEED)

from deepscore import deepscore  # from your deepscore.py

# ---- Your inputs ----
# Maximum number of markers kept per cell state.
n_markers = 300
# obs column holding the labels to predict.
cell_type = 'cell_state'
# Cell types to process, one hyperparameter search per entry.
cell_type_list = [
    'Endothelial cell',
    'Mural cell',
    'Fibroblast',
    'Ventricular Cardiomyocyte',
    'Atrial Cardiomyocyte',
]

# --- LR schedulers ---
# Each schedule takes (epoch, lr) and returns the new learning rate as a plain
# Python float, which is what Keras' LearningRateScheduler contract specifies.
import math


def lr_none(epoch, lr):
    """Identity schedule: return `lr` unchanged for every epoch."""
    return lr


def lr_exp_decay(epoch, lr):
    """Hold `lr` for epochs 0-9, then multiply it by exp(-0.08) on every later epoch.

    Returns a Python float (not a TensorFlow tensor) once decay starts.
    """
    if epoch < 10:
        return lr
    return float(lr) * math.exp(-0.08)


# Name -> schedule lookup used by the hyperparameter grid ("lr_sched_name").
LR_SCHEDULERS = {
    "none": lr_none,
    "exp_decay": lr_exp_decay,
}

# --- Hyperparameter search space ---
# The full Cartesian product is searched: 3*1*3*1*2*3*2*2*2*2*1*1*2 = 1,728
# configurations per cell type. Trim the lists below to shorten the run.
# Key order matters: it fixes the order in which trials are generated.
GRID = dict(
    hidden_nodes=[(512, 256), (256, 128), (512, 256, 128)],
    dropout=[True],
    dropout_rate=[0.2, 0.3, 0.4],
    batchnorm=[True],
    lr=[1e-3, 5e-4],
    l2=[0.0, 1e-4, 1e-3],         # passed to DeepScore as `l2`
    weight_reg=[True, False],     # passed to DeepScore as `weight_reg`
    batch_size=[64, 128],
    epochs=[30, 60],
    patience=[5, 10],
    val_split=[0.1],              # single value: not searched
    test_prop=[0.1],              # single value: not searched
    lr_sched_name=["none", "exp_decay"],
)

def grid_iter(grid_dict):
    """Yield one {name: value} dict per combination of the option lists in `grid_dict`.

    Combinations follow itertools.product order: the last key varies fastest.
    """
    names = tuple(grid_dict)
    option_lists = [grid_dict[name] for name in names]
    for combo in product(*option_lists):
        yield {name: value for name, value in zip(names, combo)}

# --- Utilities ---
def select_markers_for_subset(ref_py, adata, ranked_genes_populations, label_key, n_markers=300, pad=100):
    """Build a non-overlapping, rank-ordered marker list across cell states.

    For every label in ``ref_py.obs[label_key]`` the ranked genes in
    ``ranked_genes_populations['names'][label]`` are filtered to those present in
    ``adata.var.index`` and truncated to the top ``n_markers + pad``. Genes
    shortlisted for more than one label are dropped, and up to ``n_markers`` of
    the remaining genes are kept per label, best-ranked first.

    Candidates are held in lists rather than sets so that the final truncation
    keeps the best-ranked genes and the returned order is deterministic.

    Returns:
        list: selected gene names, grouped by label in the order the labels
        first appear in ``ref_py.obs[label_key]``.
    """
    query_genes = adata.var.index

    # Ordered shortlist per label (dict.fromkeys drops repeats, keeps first rank).
    candidates = {}
    for subset in ref_py.obs[label_key].unique():
        ranked = [g for g in ranked_genes_populations['names'][subset] if g in query_genes]
        candidates[subset] = list(dict.fromkeys(ranked))[:n_markers + pad]

    # A gene is overlapping if it was already shortlisted by an earlier label.
    seen = set()
    overlapping = set()
    for markers in candidates.values():
        overlapping.update(seen.intersection(markers))
        seen.update(markers)

    selected = []
    for markers in candidates.values():
        uniq = [m for m in markers if m not in overlapping]
        selected.extend(uniq[:n_markers])
    return selected

def evaluate_config(ref_py, adata, label_key, cfg):
    """Train a single DeepScore configuration and report its test metrics.

    Args:
        ref_py: scaled reference AnnData restricted to the selected markers.
        adata: query AnnData; accepted for symmetry but not used in this function.
        label_key: obs column of ``ref_py`` holding the labels.
        cfg: one configuration dict as produced by ``grid_iter(GRID)``.

    Returns:
        (summary, ds): ``summary`` holds ``test_acc`` / ``test_loss`` plus
        ``val_acc`` / ``val_loss`` placeholders (always None); ``ds`` is the
        trained DeepScore object so the caller can keep the winning model.
    """
    # Same seed before every trial.
    np.random.seed(SEED)
    tf.random.set_seed(SEED)

    model_kwargs = dict(
        hidden_nodes=list(cfg["hidden_nodes"]),
        n_features=ref_py.shape[1],
        n_labels=len(ref_py.obs[label_key].unique()),
        epochs=int(cfg["epochs"]),
        batch_size=int(cfg["batch_size"]),
        activation="relu",
        dropout=bool(cfg["dropout"]),
        dropout_rate=float(cfg["dropout_rate"]),
        batchnorm=bool(cfg["batchnorm"]),
        lr=float(cfg["lr"]),
        weight_reg=bool(cfg["weight_reg"]),
        l1=0.0,
        l2=float(cfg["l2"]),
    )
    ds = deepscore.DeepScore(**model_kwargs)

    # test_prop is handed to set_reference (presumably the held-out test fraction
    # — confirm against deepscore).
    ds.set_reference(ref_py, label_by=label_key, test_prop=float(cfg["test_prop"]))

    schedule = LR_SCHEDULERS[cfg["lr_sched_name"]]
    # With training_report=True the return value is indexed below as
    # [test_loss, test_acc] — confirm against deepscore.train.
    evaluation = ds.train(
        val_split=float(cfg["val_split"]),
        earlystopping=True,
        lr_scheduler=schedule,
        patience=int(cfg["patience"]),
        training_report=True,
    )
    test_loss = float(evaluation[0])
    test_acc = float(evaluation[1])

    # Validation metrics are not read back here, so only the test metrics are
    # filled in and the validation entries stay as None placeholders.
    summary = {
        "val_acc": None,
        "val_loss": None,
        "test_acc": test_acc,
        "test_loss": test_loss,
    }
    return summary, ds

def better_than(a, b):
    """Return True when metrics `a` beat metrics `b`.

    Anything beats ``None``. Otherwise the higher ``test_acc`` wins, and on an
    exact accuracy tie the lower ``test_loss`` wins.
    """
    if b is None:
        return True
    new_acc, old_acc = a["test_acc"], b["test_acc"]
    if new_acc > old_acc:
        return True
    return new_acc == old_acc and a["test_loss"] < b["test_loss"]

# --- Main loop over cell types ---
import pandas as pd

for _out_dir in ('csv', 'csv/prob_matrix', 'models', 'markers_ds'):
    os.makedirs(_out_dir, exist_ok=True)

SEARCH_LOG_PATH = "csv/deepscore_hparam_search_log.csv"
results_all = []

# try/finally: the full grid is long-running, so the trials completed so far are
# written to the log even if the cell is interrupted or a trial raises.
try:
    for i in cell_type_list:
        a = i.replace('/','_')

        # Cache check against the file this loop actually writes. (It previously
        # looked for '..._CLEAN.csv', which is never written, so nothing was skipped.)
        out_csv = f'csv/Deepscore_HCA_l3_{a}.csv'
        if os.path.exists(out_csv):
            print(f'{a} already exists!')
            continue

        # Load reference & query
        ref_py = sc.read(f'objects/reference_subset_clean/{a}.h5ad')
        adata  = sc.read(f'objects/QUERY_Heart_subset/{a}.h5ad', compression='gzip')

        # Normalize + log-transform the query only; the reference subset is used as read.
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)

        # NOTE: pickle.load executes arbitrary code; only load files you produced.
        markers_filename = f"heart_cell_state_{a}"
        with open(f'markers_ds/{markers_filename}.pickle', 'rb') as handle:
            ranked_genes_populations = pickle.load(handle)

        # Build unique marker panel & subset
        selected_markers = select_markers_for_subset(
            ref_py, adata, ranked_genes_populations, label_key=cell_type,
            n_markers=n_markers, pad=100
        )
        ref_use = ref_py[:, selected_markers].copy()
        ad_use  = adata[:,  selected_markers].copy()

        # Scale reference and query independently
        sc.pp.scale(ref_use)
        sc.pp.scale(ad_use)

        # Ensure label dtype
        ref_use.obs[cell_type] = ref_use.obs[cell_type].astype(str)

        # --- Hyperparameter search ---
        # NOTE(review): configurations are ranked on the held-out *test* metrics, so
        # the reported test accuracy of the winner is optimistically biased.
        best_metrics = None
        best_cfg = None
        best_model = None

        t0 = time.time()
        for tried, cfg in enumerate(grid_iter(GRID), start=1):
            metrics, ds = evaluate_config(ref_use, ad_use, cell_type, cfg)
            results_all.append(dict(cell_type=a, tried=tried, **cfg, **metrics))
            print(f"[{a}] Trial {tried}: test_acc={metrics['test_acc']:.4f}, test_loss={metrics['test_loss']:.4f}")

            if better_than(metrics, best_metrics):
                best_metrics = metrics
                best_cfg = deepcopy(cfg)
                best_model = ds  # keep the trained model of the best config so far

        elapsed = time.time() - t0
        print(f"[{a}] best_cfg: {json.dumps(best_cfg, default=str)} | metrics: {best_metrics} | {elapsed/60:.1f} min")

        # --- Final annotation with the best model ---
        prob_df, ad_scored = best_model.annotate(ad_use, pred_key='Deepscore_HCA',
                                                 Unclassified=False, return_pred_matrix=True)

        # Save outputs (your original filenames)
        ad_scored.obs[['Deepscore_HCA','Deepscore_HCA_score']].to_csv(out_csv)
        prob_df.to_csv(f'csv/prob_matrix/Deepscore_HCA_l3_{a}.csv')

        # Optional: persist best Keras model for this cell type
        model_path = f"models/deepscore_best_{a}.keras"
        best_model.model.save(model_path)
finally:
    # --- Save the search log (all cell types) ---
    # Skipped when no trial ran, so an all-cached run does not overwrite an
    # earlier log with an empty file.
    if results_all:
        pd.DataFrame(results_all).to_csv(SEARCH_LOG_PATH, index=False)
        print(f"Search log ({len(results_all)} trials) written to {SEARCH_LOG_PATH}")

print("Search complete.")


Model: "deepscore"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization (Batch  (None, 1653)              6612      
 Normalization)                                                  
                                                                 
 dense512 (Dense)            (None, 512)               846848    
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 batch_normalization_1 (Bat  (None, 512)               2048      
 chNormalization)                                                
                                                                 
 dense256 (Dense)            (None, 256)               131328    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0 

2025-08-21 19:00:11.415541: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 298306992 exceeds 10% of free system memory.
2025-08-21 19:00:11.561069: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 268473648 exceeds 10% of free system memory.


Epoch 1/30


2025-08-21 19:00:11.908546: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 268473648 exceeds 10% of free system memory.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30

Evaluating model performance on unseen data (test data):


test loss: 1.05083, test accuracy:'               '0.68941
[Endothelial cell] Trial 1: test_acc=0.6894, test_loss=1.0508
Model: "deepscore"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization_3 (Bat  (None, 1653)              6612      
 chNormalization)                                                
                                                                 
 dense512 (Dense)            (None, 512)               846848    
                                                                 
 dropout_2 (Dropout)         (None, 512)               0         
                                                                 
 batch_normalization_4 (Bat  (None, 512)               2048      
 chNormalization)                                                
                   

2025-08-21 19:00:56.750868: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 298306992 exceeds 10% of free system memory.
2025-08-21 19:00:56.864414: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 268473648 exceeds 10% of free system memory.


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30

Evaluating model performance on unseen data (test data):


test loss: 1.08696, test accuracy:'               '0.68741
[Endothelial cell] Trial 2: test_acc=0.6874, test_loss=1.0870
Model: "deepscore"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization_6 (Bat  (None, 1653)              6612      
 chNormalization)                                                
                                                                 
 dense512 (Dense)            (None, 512)               846848    
                                                                 
 dropout_4 (Dropout)         (None, 512)               0         
                                                                 
 batch_normalization_7 (Bat  (None, 512)               2048      
 chNormalization)                                                
        

KeyboardInterrupt: 

In [9]:
## SET PARAMETERS
import os

n_markers = 500 # Max number of markers to use per cell-type
overlapping = False # Parameter to control overlapping marker genes between cell types on the prediction.

mod='HEART'

# NOTE(review): this cell relies on names defined by earlier cells — `ref_py`,
# `adata`, `cell_type` and `markers_filename`. If the notebook is run top to
# bottom they hold the values left by the last per-cell-type iteration, not a
# level-1 setup. Define them explicitly for the level-1 run before executing this.

# adata.X = adata.layers['counts'].copy()

# NOTE: pickle.load executes arbitrary code; only load files you produced.
with open(f'markers_ds/{markers_filename}.pickle', 'rb') as handle:
    ranked_genes_populations = pickle.load(handle)

if overlapping:
    # Top n_markers ranked genes per label that exist in the query; genes shared
    # between labels are kept once, in first-seen order.
    shortlisted = []
    for cell_type_ in ref_py.obs[cell_type].unique():
        for marker in ranked_genes_populations['names'][cell_type_][:n_markers]:
            if marker in adata.var.index:
                shortlisted.append(marker)
    selected_markers = list(dict.fromkeys(shortlisted))

else:
    # Step 1: per label, shortlist the top (n_markers + 100) ranked genes present
    # in the query. Lists (not sets) keep the ranking order, so the truncation
    # below keeps the best-ranked genes and the feature order is reproducible.
    subset_markers_dict ={}
    for subset in ref_py.obs[cell_type].unique():
        subset_markers = [gene for gene in ranked_genes_populations['names'][subset]
                          if gene in adata.var.index]
        subset_markers_dict[subset] = list(dict.fromkeys(subset_markers))[:n_markers+100]

    # Step 2: genes shortlisted for more than one label
    seen_markers = set()
    overlapping_markers = set()
    for markers in subset_markers_dict.values():
        overlapping_markers.update(seen_markers.intersection(markers))
        seen_markers.update(markers)

    # Step 3: per label, keep up to n_markers non-overlapping genes in rank order
    marker_dict = {}
    for subset, markers in subset_markers_dict.items():
        unique_markers = [marker for marker in markers if marker not in overlapping_markers]
        marker_dict[subset] = unique_markers[:n_markers]  # Select up to TOP n_markers
    selected_markers = [marker for subset in marker_dict for marker in marker_dict[subset]]

# Everything below runs for both marker-selection modes. (It used to sit inside
# the `else:` branch, so `overlapping = True` selected markers and then did nothing.)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Subset the data to the selected markers
ref_py = ref_py[:, list(selected_markers)].copy()
adata = adata[:, list(selected_markers)].copy()

sc.pp.scale(ref_py)
sc.pp.scale(adata)

# Plain Python values for the label column (drops the categorical dtype).
ref_py.obs[cell_type] = ref_py.obs[cell_type].tolist()


def scheduler(epoch, lr):
    """Keep the learning rate for the first 10 epochs, then multiply it by exp(-0.1) each epoch."""
    if epoch < 10:
        return lr
    else:
        return lr * tf.math.exp(-0.1)


n_feat = ref_py.shape[1]
n_labs = len(ref_py.obs[cell_type].unique())

ds = deepscore.DeepScore(hidden_nodes=[1024, 256],
               n_features=n_feat,
               n_labels=n_labs,
               epochs=30,
               batch_size=128,
               activation="relu",
               dropout=True,
               dropout_rate=0.1,
               batchnorm=True,
               lr=0.001,
               weight_reg=True)

# NOTE(review): TensorFlow is already imported at this point; whether it still
# honours this variable here is not established.
os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"

ds.set_reference(ref_py, label_by=cell_type, test_prop=0.1)

ds.train(earlystopping=True, patience=10, lr_scheduler=scheduler,)
# ds.model.save(f'models/deepscore') # In case you want to save the DS model

prob_df, adata = ds.annotate(adata, pred_key='Deepscore_HCA',Unclassified = False,return_pred_matrix=True)

# SAVE the RESULTS on csv
os.makedirs('csv/prob_matrix', exist_ok=True)
adata.obs[['Deepscore_HCA','Deepscore_HCA_score']].to_csv(f'csv/Deepscore_HCA_l1_{mod}.csv')

prob_df.to_csv(f'csv/prob_matrix/Deepscore_HCA_l1_{mod}_CLEAN.csv')


Model: "deepscore"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 batch_normalization (Batch  (None, 1850)              7400      
 Normalization)                                                  
                                                                 
 dense1024 (Dense)           (None, 1024)              1895424   
                                                                 
 dropout (Dropout)           (None, 1024)              0         
                                                                 
 batch_normalization_1 (Bat  (None, 1024)              4096      
 chNormalization)                                                
                                                                 
 dense256 (Dense)            (None, 256)               262400    
                                                                 
 dropout_1 (Dropout)         (None, 256)               0 

2025-08-14 17:43:00.721280: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2488464600 exceeds 10% of free system memory.
2025-08-14 17:43:01.785926: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 2239617400 exceeds 10% of free system memory.


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30

Evaluating model performance on unseen data (test data):


test loss: 0.00604, test accuracy:'               '0.99872


2025-08-14 17:59:28.342640: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1699195400 exceeds 10% of free system memory.


  10/1794 [..............................] - ETA: 11s 

2025-08-14 17:59:29.499760: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1699195400 exceeds 10% of free system memory.


