## Notebook to migrate phase1 GEX pools that contained Entorhinal cortex samples to phase2
- using the demultiplexed raw anndata files
- pool format renaming to be consistent with phase 2 data

In [1]:
# shell out to `date` so the wall-clock time of this run is recorded in the notebook output
!date

Wed Aug 30 12:33:22 EDT 2023


#### import libraries

In [2]:
from pandas import read_csv, DataFrame
from scanpy import read
from os.path import exists
from scanpy import read_h5ad
from anndata import AnnData

#### set notebook variables

In [3]:
# project label; used as the prefix of the sample info file name
project = 'aging_phase2'

# root locations: phase1 demultiplexed inputs and the phase2 working area
phase1_demux_path = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase1/demux'
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
# sub-directories of the phase2 working area
info_dir = '/'.join([wrk_dir, 'sample_info'])
demux_dir = '/'.join([wrk_dir, 'demux'])

# input files
info_file = '/'.join([info_dir, project + '.sample_info.csv'])
phase1_final_file = '/'.join([phase1_demux_path,
                              'aging.pegasus.leiden_085.subclustered.h5ad'])

# run settings
DEBUG = False        # when True, cells print/display extra diagnostics
pools = [4, 5]       # phase1 pool numbers to migrate
lanes = range(1, 9)  # lane numbers 1 through 8

#### functions

In [4]:
def make_new_obs(data: AnnData, info: DataFrame, prev_info: DataFrame, 
                 verbose: bool=False) -> AnnData:
    """Subset a pool to Entorhinal cortex cells and rebuild its obs table.

    The returned object keeps only the cells whose ``Brain_region`` is
    'Entorhinal cortex'. Its obs is replaced by a table, indexed by cell
    barcode, built from:
      * ``sample_id`` (renamed from the ``Sample_id`` obs column),
      * the columns of ``info`` left-joined on ``sample_id``,
      * the columns of ``prev_info`` left-joined on the barcode index,
      * ``donor_id``, copied from the ``hbcc_id`` column, which is then dropped.

    Args:
        data: pool AnnData; obs must contain 'Brain_region' and 'Sample_id'.
        info: per-sample table; must contain 'sample_id' and 'hbcc_id'.
        prev_info: per-cell table indexed by barcode.
        verbose: when True, print an index check and the new obs shape.

    Returns:
        A new AnnData (an explicit copy of the subset) with the rebuilt obs.

    Raises:
        ValueError: if the joins change the number of rows, i.e. a join key
            is duplicated in ``info`` or ``prev_info``.
    """
    # make sure to subset to only 'Brain_region' == 'Entorhinal cortex';
    # copy explicitly so obs is assigned on a real object rather than a view
    is_entorhinal = data.obs['Brain_region'] == 'Entorhinal cortex'
    data = data[is_entorhinal].copy()
    cell_index = data.obs.index
    # adjust and subset keep columns
    new_obs = data.obs.rename(columns={'Sample_id': 'sample_id'})[['sample_id']].copy()
    # merging on a column resets the index, so carry the barcodes along as a column
    new_obs['barcodes'] = new_obs.index
    new_obs = new_obs.merge(info, how='left', on='sample_id')
    new_obs = new_obs.set_index('barcodes', drop=True)
    new_obs = new_obs.merge(prev_info, how='left', left_index=True, right_index=True)
    # a left join only changes the row count when a join key is duplicated on
    # the right side; fail here with a clear message instead of at the reindex
    if new_obs.shape[0] != cell_index.shape[0]:
        raise ValueError(
            f'merging sample info duplicated cells: expected {cell_index.shape[0]} '
            f'rows but got {new_obs.shape[0]}; check for duplicate sample_id values '
            'in info or duplicate barcodes in prev_info'
        )
    new_obs['donor_id'] = new_obs['hbcc_id']
    new_obs = new_obs.drop(columns=['hbcc_id'])
    if not new_obs.index.equals(cell_index):
        print('re-indexing ...')
        new_obs = new_obs.reindex(cell_index)
    # now replace the original obs
    data.obs = new_obs
    if verbose:
        print(new_obs.index.equals(data.obs.index))
        print(f'shape of temp obs {new_obs.shape}')
    return data

### load the sample info data

In [5]:
# read the per-sample table that is later joined onto each pool's cells
info_df = read_csv(filepath_or_buffer=info_file)
print(f'shape of info {info_df.shape}')
# optional peek at the first rows when debugging
if DEBUG:
    display(info_df.head(5))

shape of info (36, 13)


### load the phase1 final data
used to migrate the phase1 cluster and cell-type labels for comparison purposes

In [6]:
%%time
# the file is already known to be h5ad, so read it directly; the generic
# reader inspects the file name and warns about the multi-dot suffix
phase1_data = read_h5ad(phase1_final_file)
print(phase1_data)

Only considering the two last: ['.subclustered', '.h5ad'].
Only considering the two last: ['.subclustered', '.h5ad'].
AnnData object with n_obs × n_vars = 167945 × 35441
    obs: 'pool_name', 'Sample_id', 'Tissue_source', 'Brain_region', 'Clinical_diagnosis', 'Age', 'Sex', 'donor_id', 'lane_num', 'Channel', 'n_genes', 'n_counts', 'percent_mito', 'scale', 'Group', 'leiden_labels', 'anno', 'leiden_labels_085', 'new_anno', 'Age_group', 'broad_celltype'
    var: 'n_cells', 'percent_cells', 'robust', 'highly_variable_features', 'mean', 'var', 'hvf_loess', 'hvf_rank'
    uns: 'Channels', 'Groups', 'PCs', 'W_diffmap', 'W_pca_harmony', 'broad_celltype_colors', 'c2gid', 'df_qcplot', 'diffmap_evals', 'diffmap_knn_distances', 'diffmap_knn_indices', 'genome', 'gncells', 'leiden_resolution', 'modality', 'ncells', 'new_anno_colors', 'norm_count', 'pca', 'pca_features', 'pca_harmony_knn_distances', 'pca_harmony_knn_indices', 'stdzn_max_value', 'stdzn_mean', 'stdzn_std'
    obsm: 'X_diffmap', 'X_fle',

#### keep and rename only the info we want to migrate for comparison purposes

In [7]:
# phase1 obs column -> name it carries after migration
phase1_keep_cols = {'leiden_labels_085': 'phase1_cluster',
                    'new_anno': 'phase1_celltype'}
# select just those columns and rename them in one pass
phase1_df = phase1_data.obs[list(phase1_keep_cols)].rename(columns=phase1_keep_cols)
print(f'shape of previous info to keep {phase1_df.shape}')
if DEBUG:
    display(phase1_df.head())
    # category counts for each migrated column
    for migrated_col in phase1_keep_cols.values():
        display(phase1_df[migrated_col].value_counts())

shape of previous info to keep (167945, 2)


### migrate the pools

In [8]:
# migrate every pool/lane file that exists in the phase1 demux directory
for pool in pools:
    for lane in lanes:
        # phase1 file names carry the pool number zero padded to three digits
        # (e.g. P004); padding instead of a fixed 'P00' prefix keeps the name
        # well formed for pool numbers with more than one digit
        pool_name = f'Aging_P{str(pool).zfill(3)}_SCRN_{lane}'
        pool_path = f'{phase1_demux_path}/{pool_name}.h5ad'
        if not exists(pool_path):
            # not every pool/lane combination has a file; report only when debugging
            if DEBUG:
                print(f'skipping {pool_name}, file not found: {pool_path}')
            continue
        print(pool_name)
        adata = read_h5ad(pool_path)
        if DEBUG:
            print(adata)
            print(f'shape of adata obs {adata.obs.shape}')
            display(adata.obs.head())
        # subset to Entorhinal cortex cells and rebuild obs from the sample info
        adata = make_new_obs(adata, info_df, phase1_df)
        # save modified anndata using the phase2 pool naming
        out_path = f'{demux_dir}/GEX_P{pool}_{lane}.h5ad'
        adata.write(out_path)
        print(f'created: {out_path}')
        if DEBUG:
            print(adata)
            print(f'shape of modified adata obs {adata.obs.shape}')
            display(adata.obs.head())

Aging_P004_SCRN_1
created: /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/demux/GEX_P4_1.h5ad
Aging_P004_SCRN_2
created: /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/demux/GEX_P4_2.h5ad
Aging_P004_SCRN_3
created: /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/demux/GEX_P4_3.h5ad
Aging_P004_SCRN_4
created: /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/demux/GEX_P4_4.h5ad
Aging_P004_SCRN_5
created: /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/demux/GEX_P4_5.h5ad
Aging_P004_SCRN_6
created: /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/demux/GEX_P4_6.h5ad
Aging_P004_SCRN_7
created: /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/demux/GEX_P4_7.h5ad
Aging_P004_SCRN_8
created: /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/demux/GEX_P4_8.h5ad
Aging_P005_SCRN_1
created: /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/demux/GEX_P5_1.h5ad
Aging_P005_SCRN_2
created: /labshare/raph/datasets/adrd_neuro/brain_aging/phase2/demux/GEX_

In [9]:
# shell out to `date` again so the output shows when the notebook finished
!date

Wed Aug 30 12:33:36 EDT 2023
