## Notebook to cluster cells based on senescence markers to identify potentially senescent cells

Attempt to use this functionality for determining scores for the following published senescence signatures:
- Hu Y, Fryatt GL, Ghorbani M et al. Replicative senescence dictates the emergence of disease-associated microglia and contributes to Aβ pathology. Cell Rep 2021;35:109228.
https://www.sciencedirect.com/science/article/pii/S2211124721005799?via%3Dihub
    - "custom senescence signature (Cdkn2a, Cdkn1a, Cdkn2d, Casp8, Il1b, Glb1, Serpine1)"
- Dehkordi SK, Walker J, Sah E et al. Profiling senescent cells in human brains reveals neurons with CDKN2D/p19 and tau neuropathology. Nat Aging 2021;1:1107–16.
https://pubmed.ncbi.nlm.nih.gov/35531351/
- Casella G, Munk R, Kim KM et al. Transcriptome signature of cellular senescence. Nucleic Acids Res 2019;47:7294–305.
https://pubmed.ncbi.nlm.nih.gov/31251810/
    - Canonical Senescence Pathway (CDKN2D, ETS2, RB1, E2F3, CDK6, RBL2, ATM, BMI1, MDM2, CDK4, CCNE1)
    - Senescence Response Pathway (IGFBP7, VIM, FN1, SPARC, IGFBP4, TIMP1, TBX2, TBX3, COL1A1, COL3A1, IGFBP2, TGFB1I1, PTEN, CD44, NFIA, CALR, TIMP2, CXCL8)
    - Senescence Initiating Pathway (SOD1, MAP2K1, GSK3B, PIK3CA, SOD2, MAPK14, IGF1R, TP53BP1, NBN, HRAS, CITED2, CREG1, ABL1, MORC3, NFKB1, AKT1, CDKN1B, EGR1, RBL1, MAP2K6, IGF1, IRF3, PCNA, GADD45A, MAP2K3, IGFBP5, SIRT1, ING1, TGFB1, TERF2)
    - Cell Age (PEBP1, PKM, CKB, AAK1, NUAK1, MAST1, SORBS2, BRAF, SPIN1, MAP2K1, YPEL3, MAPK14, PDPK1, TOP1, ITPK1, MATK, RPS6KA6, SPOP, ITSN2, PDZD2, MAP2K2, LIMK1, DHCR24, PBRM1, MAP3K7, SIN3B, SOX5, EWSR1, PDCD10, CPEB1, NEK4, RB1, MCRS1, PNPT1, HRAS, STK32C, RAF1, ETS2, SMARCB1, FASTK, SLC13A3, TRIM28, MORC3, MAPKAPK5, MAP2K7, STK40, PMVK, CEBPB, GRK6, STAT5B, CDKN1B, PDIK1L, AKT1, MAPK12, MAP2K6, PIAS4, ADCK5, SMURF2, PCGF2, IRF3, PLA2R1, TYK2, ERRFI1, BRD7, ING2, FBXO31, NADK, PTTG1, BHLHE40, ASF1A, ING1, NINJ1, MXD4)
    - UniUp ['TMEM159', 'CHPF2', 'SLC9A7', 'PLOD1', 'FAM234B', 'DHRS7', 'SRPX', 'SRPX2', 'TNFSF13B', 'PDLIM1', 'ELMOD1', 'CCND3', 'TMEM30A', 'STAT1', 'RND3', 'TMEM59', 'SARAF', 'SLC16A14', 'SLCO2B1', 'ARRDC4', 'PAM', 'WDR78', 'NCSTN', 'GPR155', 'CLDN1', 'JCAD', 'BLCAP', 'FILIP1L', 'TAP1', 'TNFRSF10C', 'SAMD9L', 'SMCO3', 'POFUT2', 'KIAA1671', 'LRP10', 'BMS1P9', 'MAP4K3-DT', 'AC002480.1', 'LINC02154', 'TM4SF1-AS1', 'PTCHD4', 'H2AFJ', 'PURPL']

In [None]:
# print the current date/time via the shell `date` command
!date

#### import libraries

In [None]:
import scanpy as sc
from anndata import AnnData
from numpy import ndarray
from random import seed, randint
from scipy.stats import zscore, shapiro, kstest, norm
from pandas import DataFrame
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
from seaborn import scatterplot, displot, heatmap
from pandas import read_csv, DataFrame

# render matplotlib figures inline in the notebook output
%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
# use the 'retina' format for inline figures
%config InlineBackend.figure_format='retina'

import warnings
# NOTE(review): no category/module is given, so this silences every warning for
# the rest of the notebook, including deprecation notices from the libraries used
warnings.filterwarnings(action='ignore')

#### set notebook variables

In [None]:
# naming
project = 'aging'

# directories
wrk_dir = '/home/jupyter/brain_aging_phase1'
quants_dir = f'{wrk_dir}/demux'
results_dir = f'{wrk_dir}/results'

# in files
in_file = f'{quants_dir}/{project}.pegasus.leiden_085.subclustered.h5ad'
glmmtmb_file = f'{results_dir}/{project}.glmmtmb_age_diffs_fdr.csv'

# variables
DEBUG = True
dpi_value = 50
# senescence marker gene sets, keyed by a short name:
#   CSS     - custom senescence signature
#   CSP     - Canonical Senescence Pathway
#   SRP     - Senescence Response Pathway
#   SIP     - Senescence Initiating Pathway
#   CellAge - Cell Age
#   UniUp / UniDown - presumably the universally up/down regulated genes of the
#                     transcriptome signature; TODO confirm against the source paper
# gene symbols must match the symbols in the data exactly; misspelled symbols are
# silently dropped when the sets are intersected with the measured genes
gene_sets = {
             # 'CASP8' was previously misspelled 'CASO8'
             'CSS': ['CDKN2A', 'CDKN1A', 'CDKN2D', 'CASP8', 'IL1B', 'GLB1', 'SERPINE1'],
             'CSP': ['CDKN2D', 'ETS2', 'RB1', 'E2F3', 'CDK6', 'RBL2', 'ATM', 
                     'BMI1', 'MDM2', 'CDK4', 'CCNE1'],
             'SRP': ['IGFBP7', 'VIM', 'FN1', 'SPARC', 'IGFBP4', 'TIMP1', 'TBX2', 
                     'TBX3', 'COL1A1', 'COL3A1', 'IGFBP2', 'TGFB1I1', 'PTEN', 
                     'CD44', 'NFIA', 'CALR', 'TIMP2', 'CXCL8'],
             'SIP': ['SOD1', 'MAP2K1', 'GSK3B', 'PIK3CA', 'SOD2', 'MAPK14', 
                     'IGF1R', 'TP53BP1', 'NBN', 'HRAS', 'CITED2', 'CREG1', 
                     'ABL1', 'MORC3', 'NFKB1', 'AKT1', 'CDKN1B', 'EGR1', 
                     'RBL1', 'MAP2K6', 'IGF1', 'IRF3', 'PCNA', 'GADD45A', 
                     'MAP2K3', 'IGFBP5', 'SIRT1', 'ING1', 'TGFB1', 'TERF2'], 
             'CellAge': ['PEBP1', 'PKM', 'CKB', 'AAK1', 'NUAK1', 'MAST1', 
                         'SORBS2', 'BRAF', 'SPIN1', 'MAP2K1', 'YPEL3', 'MAPK14', 
                         'PDPK1', 'TOP1', 'ITPK1', 'MATK', 'RPS6KA6', 'SPOP', 
                         'ITSN2', 'PDZD2', 'MAP2K2', 'LIMK1', 'DHCR24', 'PBRM1', 
                         'MAP3K7', 'SIN3B', 'SOX5', 'EWSR1', 'PDCD10', 'CPEB1', 
                         'NEK4', 'RB1', 'MCRS1', 'PNPT1', 'HRAS', 'STK32C', 'RAF1', 
                         'ETS2', 'SMARCB1', 'FASTK', 'SLC13A3', 'TRIM28', 'MORC3', 
                         'MAPKAPK5', 'MAP2K7', 'STK40', 'PMVK', 'CEBPB', 'GRK6', 
                         'STAT5B', 'CDKN1B', 'PDIK1L', 'AKT1', 'MAPK12', 'MAP2K6', 
                         'PIAS4', 'ADCK5', 'SMURF2', 'PCGF2', 'IRF3', 'PLA2R1', 
                         'TYK2', 'ERRFI1', 'BRD7', 'ING2', 'FBXO31', 'NADK', 'PTTG1', 
                         'BHLHE40', 'ASF1A', 'ING1', 'NINJ1', 'MXD4'], 
             # 'FAM234B', 'SLCO2B1' and 'SMCO3' were previously misspelled
             # ('FAM2234B', and digit zero in place of the letter O in the other two)
             'UniUp': ['TMEM159', 'CHPF2', 'SLC9A7', 'PLOD1', 'FAM234B', 'DHRS7', 
                     'SRPX', 'SRPX2', 'TNFSF13B', 'PDLIM1', 'ELMOD1', 'CCND3', 
                     'TMEM30A', 'STAT1', 'RND3', 'TMEM59', 'SARAF', 'SLC16A14', 
                     'SLCO2B1', 'ARRDC4', 'PAM', 'WDR78', 'NCSTN', 'GPR155', 
                     'CLDN1', 'JCAD', 'BLCAP', 'FILIP1L', 'TAP1', 'TNFRSF10C', 
                     'SAMD9L', 'SMCO3', 'POFUT2', 'KIAA1671', 'LRP10', 'BMS1P9', 
                     'MAP4K3-DT', 'AC002480.1', 'LINC02154', 'TM4SF1-AS1', 
                     'PTCHD4', 'H2AFJ', 'PURPL'],
             'UniDown': ['MCUB', 'FBL', 'HIST1H1D', 'HIST1H1A', 'FAM129A', 
                         'ANP32B', 'PARP1', 'LBR', 'SSRP1', 'TMSB15A', 
                         'CBS', 'CDCA7L', 'HIST1H1E', 'CBX2', 'PTMA', 'HIST2H2AB', 
                         'ITPRIPL1', 'AC074135.1']    
}
# annotation labels (matched against `new_anno`) to drop before analysis
# alternative, broader list: ['uncertain', 'uncertain-2', 'uncertain-3', 'Astrocyte-GFAP-Hi']
exclude_cell_types = ['Astrocyte-GFAP-Hi']
# seed Python's `random` module for reproducibility
seed(42)

### read the anndata (h5ad) file

In [None]:
%%time
adata = sc.read(in_file, cache=True)
print(adata)

### Plot the clusters

In [None]:
# matplotlib >= 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later removed the old names; use whichever name this installation provides
bright_style = ('seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available
                else 'seaborn-bright')
# figure size/dpi and the style apply only inside this context
with rc_context({'figure.figsize': (9, 9), 'figure.dpi': dpi_value}):
    plt.style.use(bright_style)
    # UMAP colored by the fine (`new_anno`) and broad cell-type annotations
    sc.pl.umap(adata, color=['new_anno', 'broad_celltype'], legend_loc='on data')

In [None]:
# count observations per `broad_celltype` label
adata.obs['broad_celltype'].value_counts()

### remove cell-types that are known excludes

In [None]:
# boolean mask of observations whose `new_anno` label is not in the exclude list
keep_mask = ~adata.obs['new_anno'].isin(exclude_cell_types)
adata = adata[keep_mask]
print(adata)

### replot the clusters without the excluded celltypes

In [None]:
# matplotlib >= 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*'
# and later removed the old names; use whichever name this installation provides
bright_style = ('seaborn-v0_8-bright' if 'seaborn-v0_8-bright' in plt.style.available
                else 'seaborn-bright')
# figure size/dpi and the style apply only inside this context
with rc_context({'figure.figsize': (9, 9), 'figure.dpi': dpi_value}):
    plt.style.use(bright_style)
    # same UMAP as before, now drawn from the filtered `adata`
    sc.pl.umap(adata, color=['new_anno', 'broad_celltype'], legend_loc='on data')

In [None]:
# count observations per `new_anno` label
adata.obs['new_anno'].value_counts()

### combine the senescence gene sets into single marker set

In [None]:
# union of all genes across the senescence gene sets (a gene listed in several
# sets appears once); sorted so the list order, and therefore the DEBUG print,
# is identical on every run instead of depending on set iteration order
marker_set = sorted({gene for genes in gene_sets.values() for gene in genes})
print(f'length of marker set is {len(marker_set)}')

if DEBUG:
    print(marker_set)

### haven't had much luck getting past the cell-type signal to the senescence signal, even when using just the senescence markers; try to do so within each broad cell-type

In [None]:
# count observations per `broad_celltype` label in the current `adata`
adata.obs['broad_celltype'].value_counts()

### for each broad cell-type subset the data to just the marker genes and cluster
Hypothesis: maybe a small within-cell-type cluster that separates based on the senescence markers represents senescent cells?

In [None]:
def _seaborn_style(name):
    """Return the installed matplotlib style-sheet name for the seaborn style `name`.

    matplotlib >= 3.6 renamed the bundled seaborn style sheets to
    'seaborn-v0_8-<name>' and later removed the old 'seaborn-<name>' names.
    """
    renamed = f'seaborn-v0_8-{name}'
    return renamed if renamed in plt.style.available else f'seaborn-{name}'

bright_style = _seaborn_style('bright')
talk_style = _seaborn_style('talk')

# for each broad cell-type: restrict to the marker genes, embed, cluster and
# plot the marker expression per cluster
for target_cell_type in adata.obs.broad_celltype.unique():
    print(f'#### {target_cell_type} ####')
    # materialize the subset with .copy(): the scanpy calls below write their
    # results into the object, which should not be done on a view of `adata`
    sdata = adata[adata.obs.broad_celltype == target_cell_type, 
                  adata.var.index.isin(marker_set)].copy()
    if DEBUG:
        print(sdata)
    # n_top_genes is the marker-set size, presumably so that no marker gene is
    # filtered out by the variable-gene selection - TODO confirm
    sc.pp.highly_variable_genes(sdata, n_top_genes=len(marker_set), 
                                batch_key='pool_name', flavor='seurat')
    sc.tl.pca(sdata, svd_solver='arpack', use_highly_variable=True)
    sc.pl.pca_variance_ratio(sdata, log=True)
    sc.pp.neighbors(sdata)
    # the embedding is computed once and reused for both UMAP plots; the leiden
    # step below only adds cluster labels, so re-running umap after it is redundant
    sc.tl.umap(sdata)
    with rc_context({'figure.figsize': (9, 9), 'figure.dpi': dpi_value}):
        plt.style.use(bright_style)
        sc.pl.umap(sdata, color=['new_anno', 'broad_celltype'], legend_loc='on data')
    sc.tl.leiden(sdata, resolution=0.5)
    # NOTE(review): the 'talk' style sheet may itself set figure.figsize and so
    # override the (12, 12) requested here - confirm the rendered size is as intended
    with rc_context({'figure.figsize': (12, 12), 'figure.dpi': dpi_value}):
        plt.style.use(talk_style)
        sc.pl.umap(sdata, color=['leiden', 'new_anno', 'broad_celltype'], legend_loc='on data')
    for name, genes in gene_sets.items():
        # only genes of this set that are present in the subset can be plotted
        gene_set = sorted(set(genes) & set(sdata.var.index))
        print(f'---- {name} ----')
        if not gene_set:
            # nothing to plot for this set; avoid calling dotplot with no genes
            print(f'no {name} genes present for {target_cell_type}, skipping dotplot')
            continue
        with rc_context({'figure.figsize': (12, 12), 'figure.dpi': dpi_value}):
            plt.style.use(bright_style)
            sc.pl.dotplot(sdata, gene_set, groupby='leiden', cmap='Purples')
    # cluster sizes, as counts and as percent of this cell-type's observations
    leiden_counts = sdata.obs.leiden.value_counts()
    display(leiden_counts)
    display(leiden_counts/sdata.obs.shape[0]*100)

In [None]:
# print the current date/time via the shell `date` command
!date