## Notebook to combine replication and reference cohort data with our Brain Aging Phase 1 data and harmonize the obs attribute features. Additional public data is also included for cluster and cell-type labeling purposes

Replication data: 
NABEC snRNA from Xylena Reed

public data:
1. [Leng K, Li E, Eser R et al. Molecular characterization of selectively vulnerable neurons in Alzheimer’s disease. Nat Neurosci 2021;24:276–87.](https://pubmed.ncbi.nlm.nih.gov/33432193/)
2. [Morabito S, Miyoshi E, Michael N et al. Single-nucleus chromatin accessibility and transcriptomic characterization of Alzheimer’s disease. Nat Genet 2021, DOI: 10.1038/s41588-021-00894-z.](https://pubmed.ncbi.nlm.nih.gov/34239132/)



In [1]:
!date

Tue Nov 14 19:53:41 UTC 2023


#### import libraries

In [2]:
from pandas import read_csv, concat
from scanpy import read_h5ad, read_10x_h5
from anndata import concat as ad_concat
from random import sample

#### set notebook variables

In [3]:
# naming: project prefix and the name used for the combined output set
project = 'aging_phase1'
set_name = f'{project}_replication'

# directories, all rooted at the working directory
wrk_dir = '/home/jupyter/brain_aging_phase1'
demux_dir = f'{wrk_dir}/demux'
replication_dir = f'{wrk_dir}/replication'
figures_dir = f'{wrk_dir}/figures'
public_dir = f'{wrk_dir}/public'

# in files: phase 1 raw and processed AnnData, replication AnnData and its doublet scores
phase1_raw_h5ad = f'{demux_dir}/aging.h5ad'
phase1_final_h5ad = f'{demux_dir}/aging.pegasus.leiden_085.subclustered.h5ad'
replication_h5ad_file = f'{replication_dir}/{project}_nabec.raw.h5ad'
replication_doublets_file = f'{replication_dir}/{project}_nabec.scrublet_scores.csv'

# out files: the combined AnnData written at the end of the notebook
raw_anndata_file = f'{replication_dir}/{set_name}.raw.h5ad'

# variables
# DEBUG: when True, cells additionally display small samples of the loaded tables
DEBUG = False

### load data

#### load the replication data

In [4]:
%%time
adata_rep = read_h5ad(replication_h5ad_file)
# retain original barcode
adata_rep.obs['Barcode'] = adata_rep.obs.index.astype('category')
print(adata_rep)
if DEBUG:
    display(adata_rep.obs.sample(5))
    display(adata_rep.var.sample(5))    

AnnData object with n_obs × n_vars = 79600 × 36601
    obs: 'sample_id', 'pmi', 'sex', 'age', 'Barcode'
CPU times: user 549 ms, sys: 1.54 s, total: 2.09 s
Wall time: 8.61 s


#### load the replication data doublet predictions

In [5]:
# the first CSV column is used as the row index
rep_dblt_df = read_csv(replication_doublets_file, index_col=0)
print(rep_dblt_df.shape)
if DEBUG:
    display(rep_dblt_df.head(n=5))

(79600, 6)


#### load the reference data

##### load the Leng et al data
- for the entorhinal cortex samples only keep the Braak Stage 0 samples (n=3)
- for the superior frontal gyrus only keep the Braak Stage 0 or 2 (n=7)

In [6]:
%%time
ec_file = f'{public_dir}/cellxgene_collections/Leng_entorhinal_cortex.h5ad'
adata_ref_ec = read_h5ad(ec_file)
# retain original barcode
adata_ref_ec.obs['Barcode'] = adata_ref_ec.obs.index.astype('category')
# filter by Braak Stage
adata_ref_ec = adata_ref_ec[adata_ref_ec.obs.BraakStage == '0']
print(adata_ref_ec)
if DEBUG:
    display(adata_ref_ec.obs.sample(5))
    display(adata_ref_ec.var.sample(5))   

View of AnnData object with n_obs × n_vars = 9730 × 32826
    obs: 'SampleID', 'donor_id', 'BraakStage', 'SampleBatch', 'nUMI', 'nGene', 'initialClusterAssignments', 'seurat.clusters', 'clusterAssignment', 'clusterCellType', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'Barcode'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    uns: 'schema_version', 'title'
    obsm: 'X_cca', 'X_cca.aligned', 'X_tsne'
CPU times: user 2.23 s, sys: 206 ms, total: 2.44 s
Wall time: 2.52 s


##### switch var attribute to use gene name instead of ID

In [7]:
# keep the current var index as a 'gene_id' column before replacing it
adata_ref_ec.var['gene_id'] = adata_ref_ec.var_names.astype('category')
# index the genes by the feature_name column instead
adata_ref_ec.var.index = adata_ref_ec.var['feature_name']
if DEBUG:
    display(adata_ref_ec.var.head(n=10))

  """Entry point for launching an IPython kernel.


In [8]:
%%time
sfg_file = f'{public_dir}/cellxgene_collections/Leng_superior_frontal_gyrus.h5ad'
adata_ref_sfg = read_h5ad(sfg_file)
# retain original barcode
adata_ref_sfg.obs['Barcode'] = adata_ref_sfg.obs.index.astype('category')
# filter by Braak Stage
adata_ref_sfg = adata_ref_sfg[(adata_ref_sfg.obs.BraakStage == '0') | (adata_ref_sfg.obs.BraakStage == '2')]
print(adata_ref_sfg)
if DEBUG:
    display(adata_ref_sfg.obs.sample(5))
    display(adata_ref_sfg.var.sample(5))  

View of AnnData object with n_obs × n_vars = 32240 × 32826
    obs: 'SampleID', 'donor_id', 'BraakStage', 'SampleBatch', 'nUMI', 'nGene', 'initialClusterAssignments', 'seurat.clusters', 'clusterAssignment', 'clusterCellType', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'Barcode'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'
    uns: 'schema_version', 'title'
    obsm: 'X_cca', 'X_cca.aligned', 'X_tsne'
CPU times: user 4.29 s, sys: 301 ms, total: 4.59 s
Wall time: 4.69 s


##### switch var attribute to use gene name instead of ID

In [9]:
# keep the current var index as a 'gene_id' column before replacing it
adata_ref_sfg.var['gene_id'] = adata_ref_sfg.var_names.astype('category')
# index the genes by the feature_name column instead
adata_ref_sfg.var.index = adata_ref_sfg.var['feature_name']
if DEBUG:
    display(adata_ref_sfg.var.head(n=10))

  """Entry point for launching an IPython kernel.


##### combine the Leng data

In [10]:
# stack the two Leng brain-region datasets into one object
leng_parts = [adata_ref_ec, adata_ref_sfg]
adata_leng = ad_concat(leng_parts)
adata_leng.obs_names_make_unique()
print(adata_leng)
if DEBUG:
    for table in (adata_leng.obs, adata_leng.var):
        display(table.sample(5))

AnnData object with n_obs × n_vars = 41970 × 32826
    obs: 'SampleID', 'donor_id', 'BraakStage', 'SampleBatch', 'nUMI', 'nGene', 'initialClusterAssignments', 'seurat.clusters', 'clusterAssignment', 'clusterCellType', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'Barcode'
    obsm: 'X_cca', 'X_cca.aligned', 'X_tsne'




#### load the Morabito data

In [11]:
%%time
morabita_data_file = f'{public_dir}/Morabita_snRNA_ATAC/GSE174367_snRNA-seq_filtered_feature_bc_matrix.h5'
adata_morabita = read_10x_h5(morabita_data_file)
# retain original barcode
adata_morabita.obs['Barcode'] = adata_morabita.obs.index.astype('category')
adata_morabita.var_names_make_unique()
print(adata_morabita)
if DEBUG:
    display(adata_morabita.obs.sample(5))
    display(adata_morabita.var.sample(5))    

  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 61770 × 58721
    obs: 'Barcode'
    var: 'gene_ids', 'feature_types', 'genome'
CPU times: user 9.29 s, sys: 1.55 s, total: 10.8 s
Wall time: 10.9 s


In [12]:
morabita_info_file = f'{public_dir}/Morabita_snRNA_ATAC/GSE174367_snRNA-seq_cell_meta.csv.gz'
morabita_info = read_csv(morabita_info_file)
print(morabita_info.shape)
# index the metadata by cell barcode
morabita_info = morabita_info.set_index('Barcode')
# restrict the metadata to barcodes that exist in the count matrix
in_matrix = morabita_info.index.isin(adata_morabita.obs_names)
morabita_info = morabita_info.loc[in_matrix]
print(morabita_info.shape)
if DEBUG:
    display(morabita_info.sample(5))

(61472, 12)
(61472, 11)


##### add Morabito info into the obs attribute

In [13]:
# number of barcodes present in only one of the metadata table and the count matrix
print(len(set(morabita_info.index) ^ set(adata_morabita.obs.index)))
# drop any cells there wasn't info for; copy so later obs edits act on a real
# AnnData rather than on a view of the full matrix
adata_morabita = adata_morabita[adata_morabita.obs.index.isin(morabita_info.index)].copy()
print(adata_morabita)
# after filtering both sides must describe exactly the same set of cells
assert set(morabita_info.index) == set(adata_morabita.obs.index), \
    'cell metadata and count matrix barcodes differ'
# put the metadata rows in the matrix's cell order before attaching them, so
# that every cell receives its own row regardless of the CSV row order
morabita_info_aligned = morabita_info.loc[adata_morabita.obs.index]
adata_morabita.obs = concat([adata_morabita.obs, morabita_info_aligned], axis='columns')
# keep only the control samples; copy for the same view reason as above
adata_morabita = adata_morabita[adata_morabita.obs.Diagnosis == 'Control'].copy()
print(adata_morabita)
if DEBUG:
    display(adata_morabita.obs.sample(5))
    display(adata_morabita.var.sample(5))

298
View of AnnData object with n_obs × n_vars = 61472 × 58721
    obs: 'Barcode'
    var: 'gene_ids', 'feature_types', 'genome'
View of AnnData object with n_obs × n_vars = 22796 × 58721
    obs: 'Barcode', 'SampleID', 'Diagnosis', 'Batch', 'Cell.Type', 'cluster', 'Age', 'Sex', 'PMI', 'Tangle.Stage', 'Plaque.Stage', 'RIN'
    var: 'gene_ids', 'feature_types', 'genome'


#### load the phase 1 discovery data

In [14]:
%%time
adata_disc = read_h5ad(phase1_raw_h5ad)
# retain original barcode
adata_disc.obs['Barcode'] = adata_disc.obs.index.astype('category')
print(adata_disc)
if DEBUG:
    display(adata_disc.obs.sample(5))
    display(adata_disc.var.sample(5))    

AnnData object with n_obs × n_vars = 168090 × 36601
    obs: 'pool_name', 'Sample_id', 'Tissue_source', 'Brain_region', 'Clinical_diagnosis', 'Age', 'Sex', 'donor_id', 'lane_num', 'Barcode'
CPU times: user 1.8 s, sys: 3.32 s, total: 5.12 s
Wall time: 22.3 s


##### load the Phase 1 processed discovery data
so the labelled cell-types can be accessed

In [15]:
%%time
adata_disc_final = read_h5ad(phase1_final_h5ad)
print(adata_disc_final)
disc_cell_types = adata_disc_final.obs[['new_anno', 'broad_celltype']].copy()
print(disc_cell_types.shape)
if DEBUG:
    display(adata_disc_final.obs.sample(5))
    display(disc_cell_types.sample(5))  

AnnData object with n_obs × n_vars = 167945 × 35441
    obs: 'pool_name', 'Sample_id', 'Tissue_source', 'Brain_region', 'Clinical_diagnosis', 'Age', 'Sex', 'donor_id', 'lane_num', 'Channel', 'n_genes', 'n_counts', 'percent_mito', 'scale', 'Group', 'leiden_labels', 'anno', 'leiden_labels_085', 'new_anno', 'Age_group', 'broad_celltype'
    var: 'n_cells', 'percent_cells', 'robust', 'highly_variable_features', 'mean', 'var', 'hvf_loess', 'hvf_rank'
    uns: 'Channels', 'Groups', 'PCs', 'W_diffmap', 'W_pca_harmony', 'broad_celltype_colors', 'c2gid', 'df_qcplot', 'diffmap_evals', 'diffmap_knn_distances', 'diffmap_knn_indices', 'genome', 'gncells', 'leiden_resolution', 'modality', 'ncells', 'new_anno_colors', 'norm_count', 'pca', 'pca_features', 'pca_harmony_knn_distances', 'pca_harmony_knn_indices', 'stdzn_max_value', 'stdzn_mean', 'stdzn_std'
    obsm: 'X_diffmap', 'X_fle', 'X_pca', 'X_pca_harmony', 'X_phi', 'X_umap', 'X_umap_085'
    varm: 'de_res', 'gmeans', 'gstds', 'means', 'partial_su

### remove doublets from the replication data

In [16]:
# rows whose predicted_doublet flag is not set
is_doublet = rep_dblt_df['predicted_doublet']
not_doublets = rep_dblt_df.loc[~is_doublet]
print(not_doublets.shape)

(75133, 6)


In [17]:
# keep only the cells not predicted to be doublets; copy so that the obs edits
# made in later cells act on a real AnnData rather than on a view (writing to a
# view forces an implicit copy and raises a modification warning)
adata_rep = adata_rep[adata_rep.obs.index.isin(not_doublets.index)].copy()
print(adata_rep)
if DEBUG:
    display(adata_rep.obs.sample(10))

View of AnnData object with n_obs × n_vars = 75133 × 36601
    obs: 'sample_id', 'pmi', 'sex', 'age', 'Barcode'


### set the study IDs
will set these to author's last name

In [18]:
# (dataset, Study label, Study_type) for each source, in assignment order
study_labels = [
    (adata_rep, 'Reed', 'replication'),
    (adata_leng, 'Leng', 'reference'),
    (adata_morabita, 'Morabita', 'reference'),
    (adata_disc, 'Duffy', 'discovery'),
]
for dataset, study, study_type in study_labels:
    dataset.obs['Study'] = study
    dataset.obs['Study_type'] = study_type

  
  import sys


### harmonize study obs attribute features

#### harmonize replication data obs

In [19]:
# these columns are filled with the string placeholder 'None'
for placeholder_col in ('Batch', 'Cluster', 'Cell_type'):
    adata_rep.obs[placeholder_col] = 'None'
adata_rep.obs['Brain_region'] = 'frontal cortex'
# drop pmi and rename the remaining columns to the shared names
rep_renames = {'sample_id': 'Sample_ID', 'sex': 'Sex', 'age': 'Age'}
adata_rep.obs = adata_rep.obs.drop(columns=['pmi']).rename(columns=rep_renames)

#### harmonize Leng et al reference data obs

In [20]:
# Age is taken from the development_stage label by stripping its text suffixes;
# regex=False makes these literal replacements independent of the pandas
# version's default for the regex argument
adata_leng.obs['Age'] = (adata_leng.obs.development_stage
                         .str.replace('-year-old human stage', '', regex=False)
                         .str.replace(' year-old and over human stage', '', regex=False))
# columns not carried into the combined data (each listed exactly once)
drop_cols = ['donor_id', 'BraakStage', 'nUMI', 'nGene', 'initialClusterAssignments',
             'cell_type', 'tissue_ontology_term_id', 'cell_type_ontology_term_id',
             'assay_ontology_term_id', 'disease_ontology_term_id',
             'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id',
             'is_primary_data', 'organism_ontology_term_id', 'suspension_type',
             'assay', 'disease', 'organism', 'self_reported_ethnicity',
             'seurat.clusters', 'sex_ontology_term_id', 'development_stage']
adata_leng.obs = adata_leng.obs.drop(columns=drop_cols)
# rename the remaining columns to the shared names
adata_leng.obs = adata_leng.obs.rename(columns={'SampleID': 'Sample_ID',
                                                'SampleBatch': 'Batch',
                                                'clusterAssignment': 'Cluster',
                                                'clusterCellType': 'Cell_type',
                                                'tissue': 'Brain_region',
                                                'sex': 'Sex'})

#### harmonize Morabito et al reference data obs

In [21]:
adata_morabita.obs['Brain_region'] = 'prefrontal cortex'
# columns to discard, and new names for the ones that stay
morabita_unused = ['Diagnosis', 'Tangle.Stage', 'Plaque.Stage', 'PMI', 'RIN']
morabita_renames = {'SampleID': 'Sample_ID',
                    'cluster': 'Cluster',
                    'Cell.Type': 'Cell_type'}
adata_morabita.obs = (adata_morabita.obs
                      .drop(columns=morabita_unused)
                      .rename(columns=morabita_renames))

#### harmonize Phase 1 discovery data obs

##### merge the cell-types onto the discovery data

In [22]:
# remember the cell order so it can be checked after the merge
index_before = adata_disc.obs.index.copy()
# left merge on the index: every raw cell is kept, annotations added where present
merged_obs = adata_disc.obs.merge(disc_cell_types, how='left',
                                  left_index=True, right_index=True)
adata_disc.obs = merged_obs
print(adata_disc.obs.index.equals(index_before))
print(adata_disc)
if DEBUG:
    display(adata_disc.obs.sample(5))

True
AnnData object with n_obs × n_vars = 168090 × 36601
    obs: 'pool_name', 'Sample_id', 'Tissue_source', 'Brain_region', 'Clinical_diagnosis', 'Age', 'Sex', 'donor_id', 'lane_num', 'Barcode', 'Study', 'Study_type', 'new_anno', 'broad_celltype'


In [23]:
# columns to discard, and new names for the ones that stay
disc_unused = ['Tissue_source', 'Clinical_diagnosis', 'lane_num', 'donor_id']
disc_renames = {'Sample_id': 'Sample_ID',
                'pool_name': 'Batch',
                'new_anno': 'Cluster',
                'broad_celltype': 'Cell_type'}
adata_disc.obs = adata_disc.obs.drop(columns=disc_unused).rename(columns=disc_renames)

In [24]:
# show the obs columns of every dataset
for study_adata in (adata_rep, adata_leng, adata_morabita, adata_disc):
    print(study_adata.obs.columns)

Index(['Sample_ID', 'Sex', 'Age', 'Barcode', 'Study', 'Study_type', 'Batch',
       'Cluster', 'Cell_type', 'Brain_region'],
      dtype='object')
Index(['Sample_ID', 'Batch', 'Cluster', 'Cell_type', 'Sex', 'Brain_region',
       'Barcode', 'Study', 'Study_type', 'Age'],
      dtype='object')
Index(['Barcode', 'Sample_ID', 'Batch', 'Cell_type', 'Cluster', 'Age', 'Sex',
       'Study', 'Study_type', 'Brain_region'],
      dtype='object')
Index(['Batch', 'Sample_ID', 'Brain_region', 'Age', 'Sex', 'Barcode', 'Study',
       'Study_type', 'Cluster', 'Cell_type'],
      dtype='object')


### harmonize study var attribute features

ok if we lose some features, just need to retain most for clustering purposes

In [25]:
def count_var_diffs(data_vars, genes, n_show=20):
    """Print how many entries of data_vars are missing from genes, with examples.

    Parameters
    ----------
    data_vars : iterable
        Feature names of one dataset.
    genes : iterable
        Reference feature names to compare against.
    n_show : int, default 20
        If fewer than this many names differ, the whole set is printed;
        otherwise a random sample of this many names is printed.

    Returns
    -------
    None
        The count and the example names are written to stdout.
    """
    diff_genes = set(data_vars) - set(genes)
    print(len(diff_genes))
    if len(diff_genes) < n_show:
        print(diff_genes)
    else:
        # random.sample no longer accepts a set (deprecated in Python 3.9,
        # TypeError from 3.11); sample from a sorted list instead, which also
        # makes the draw reproducible under a fixed seed
        print(sample(sorted(diff_genes), n_show))

# feature names of each study, keyed by the label printed in the report
study_vars = {
    'replication data': adata_rep.var.index,
    'discovery data': adata_disc.var.index,
    'Leng reference data': adata_leng.var.index,
    'Morabita reference data': adata_morabita.var.index,
}

# genes present in all four datasets
gene_intersect = set(adata_rep.var.index).intersection(
    adata_disc.var.index, adata_leng.var.index, adata_morabita.var.index)

print(f'found {len(gene_intersect)} genes shared between datasets')

# for every study, report the genes that fall outside the shared set
for label, var_index in study_vars.items():
    print(label)
    count_var_diffs(var_index, gene_intersect)

found 23007 genes shared between datasets
replication data
13594
['AC104964.4', 'AC026412.3', 'AL021707.8', 'AC090809.1', 'AF121898.1', 'AC055720.2', 'AC106772.1', 'AC026415.1', 'AL596257.1', 'AC010280.2', 'AC108925.1', 'AC104633.1', 'AC244517.12', 'AL031963.3', 'AC015845.2', 'AL365295.2', 'AC079171.1', 'AC093801.1', 'AC010336.6', 'AC003965.1']
discovery data
13594
['AC034229.3', 'AC007038.1', 'AL139397.1', 'AC125613.1', 'AC107072.1', 'AL158834.2', 'AL450468.1', 'AC113189.1', 'AC092574.2', 'AL353708.1', 'AP001469.3', 'AC091814.1', 'AC069366.2', 'AC092687.1', 'AC110992.1', 'AP002755.1', 'AL356805.1', 'MPP7-DT', 'AC090578.3', 'H2BFWT']
Leng reference data
9819
['RP11-122L9.1', 'CTD-2561F5.1', 'RP11-293M10.2', 'RP11-354O24.1', 'RP11-10N23.4', 'RP11-157B13.7', 'RP11-351O2.1', 'RP11-145M9.5', 'RP11-497G19.1', 'RP11-484M3.5', 'RP11-298E9.7', 'AC010746.3', 'DARS1-AS1', 'RP11-634H22.1', 'TAFA2', 'RP11-250B2.5', 'RP11-115D19.1', 'RP11-555E9.1', 'RP11-162D9.3', 'CYP2U1-AS1']
Morabita reference d

### combine the different studies

#### make sure obs and var indices are unique

In [26]:
# make the cell and gene names unique within each dataset before combining
for study_adata in (adata_rep, adata_leng, adata_morabita, adata_disc):
    study_adata.obs_names_make_unique()
    study_adata.var_names_make_unique()

In [27]:
# stack all four studies into one object
studies = [adata_rep, adata_leng, adata_morabita, adata_disc]
adata = ad_concat(studies)
# cell names repeated across studies are made unique using '_' as the joiner
adata.obs_names_make_unique(join='_')
print(adata)
if DEBUG:
    for table in (adata.obs, adata.var):
        display(table.sample(10))

AnnData object with n_obs × n_vars = 307989 × 23007
    obs: 'Sample_ID', 'Sex', 'Age', 'Barcode', 'Study', 'Study_type', 'Batch', 'Cluster', 'Cell_type', 'Brain_region'


  utils.warn_names_duplicates("obs")


#### convert the age obs attribute feature to float from string

In [28]:
# cast the combined Age column to float
adata.obs['Age'] = adata.obs['Age'].astype('float')

#### harmonize sex values

In [29]:
# map the different spellings of each sex onto lowercase 'male' / 'female'
sex_aliases = {'Male': 'male', 'Female': 'female',
               'M': 'male', 'F': 'female'}
adata.obs['Sex'] = adata.obs['Sex'].replace(sex_aliases)

#### harmonize cell-type names

In [30]:
# map the different cell-type labels onto one shared abbreviation per type
cell_type_aliases = {
    'Oligodendrocyte': 'Oligo',
    'ODC': 'Oligo',
    'Exc': 'ExN',
    'EX': 'ExN',
    'Astrocyte': 'Astro',
    'ASC': 'Astro',
    'Inh': 'InN',
    'INH': 'InN',
    'Microglia': 'Micro',
    'MG': 'Micro',
    'Endothelial': 'Endo',
    'PER.END': 'Endo',
}
adata.obs['Cell_type'] = adata.obs['Cell_type'].replace(cell_type_aliases)

##### convert the Batch column from object to string

In [31]:
# cast the combined Batch column to the pandas 'string' dtype
adata.obs['Batch'] = adata.obs['Batch'].astype('string')

### save the data

In [32]:
# write the combined, harmonized AnnData object to raw_anndata_file
adata.write(raw_anndata_file)

In [33]:
!date

Tue Nov 14 19:56:07 UTC 2023
