## Notebook to migrate phase1 GEX pools that contained Entorhinal cortex samples to phase2
- using the demultiplexed raw anndata files
- rename the pool files so their format is consistent with the phase2 data

In [1]:
!date

Tue Aug 15 17:08:18 EDT 2023


#### import libraries

In [2]:
from pandas import read_csv, DataFrame
from os.path import exists
from scanpy import read_h5ad
from anndata import AnnData

#### set notebook variables

In [3]:
# project label, used when building file names
project = 'aging_phase2'

# phase2 working tree (destination) and its sub-directories
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
demux_dir = f'{wrk_dir}/demux'
info_dir = f'{wrk_dir}/sample_info'

# location of the phase1 demultiplexed anndata files (source)
phase1_demux_path = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase1/demux'

# input: sample info table
info_file = f'{info_dir}/{project}.sample_info.csv'

# run options
lanes = range(1, 9)  # lane numbers 1 through 8
pools = [4, 5]       # pool numbers looped over below
DEBUG = False        # True turns on the extra prints/displays

#### functions

In [4]:
def make_new_obs(data: AnnData, info: DataFrame, verbose: bool=False) -> AnnData:
    """Subset to Entorhinal cortex cells and rebuild obs from the sample info.

    Parameters
    ----------
    data : AnnData
        Demultiplexed pool; its obs must have `Brain_region` and `Sample_id`
        columns.
    info : DataFrame
        Sample info table; must have `sample_id` and `hbcc_id` columns and at
        most one row for every sample_id present in `data`.
    verbose : bool
        If True, print whether the rebuilt obs is aligned with the cell
        barcodes, and the shape of the rebuilt obs.

    Returns
    -------
    AnnData
        A copy of the Entorhinal cortex subset of `data`. Its obs is indexed
        by the original barcodes and holds `sample_id`, the other `info`
        columns, and `donor_id` (taken from `hbcc_id`, which is dropped).
        Cells whose sample_id is absent from `info` get missing values.

    Raises
    ------
    ValueError
        If the merge produces more rows than cells, i.e. some sample_id
        matches several rows of `info`.
    """
    # subset to only 'Brain_region' == 'Entorhinal cortex'; take a real copy
    # so replacing .obs below does not act on a view of the input
    # (which would trigger an implicit copy and a warning)
    data = data[data.obs.Brain_region == 'Entorhinal cortex'].copy()
    # keep just the sample identifier, renamed to the phase2 column name
    cell_obs = data.obs.rename(columns={'Sample_id': 'sample_id'})[['sample_id']].copy()
    # carry the barcodes through the merge, which does not keep the index
    cell_obs['barcodes'] = cell_obs.index
    new_obs = cell_obs.merge(info, how='left', on='sample_id')
    if new_obs.shape[0] != cell_obs.shape[0]:
        # a left merge only grows when a key matches several right-hand rows
        raise ValueError('sample info has multiple rows for a sample_id; '
                         'cannot assign a single row of sample info per cell')
    new_obs = new_obs.set_index('barcodes', drop=True)
    new_obs['donor_id'] = new_obs.hbcc_id
    new_obs = new_obs.drop(columns=['hbcc_id'])
    if not new_obs.index.equals(data.obs.index):
        print('re-indexing ...')
        new_obs = new_obs.reindex(data.obs.index)
    if verbose:
        # compare before the assignment; afterwards the check is trivially True
        print(new_obs.index.equals(data.obs.index))
        print(f'shape of temp obs {new_obs.shape}')
    # now replace the original obs
    data.obs = new_obs
    return data

### load the sample info data

In [5]:
# load the sample info table that make_new_obs merges onto each cell's obs
info_df = read_csv(info_file)
print(f'shape of info {info_df.shape}')
# optional preview of the first rows when debugging
if DEBUG:
    display(info_df.head())

shape of info (36, 13)


In [6]:
# for every pool/lane: read the phase1 demultiplexed anndata, rebuild its obs
# with make_new_obs and write it to the phase2 demux directory under the
# GEX_P{pool}_{lane} file name
for pool in pools:
    for lane in lanes:
        # zero-pad the pool number to three digits (4 -> P004) rather than
        # hard-coding a '00' prefix that only fits single-digit pools
        pool_name = f'Aging_P{str(pool).zfill(3)}_SCRN_{lane}'
        pool_path = f'{phase1_demux_path}/{pool_name}.h5ad'
        if not exists(pool_path):
            # make a missing input visible instead of skipping it silently
            print(f'{pool_name} not found, skipping')
            continue
        print(pool_name)
        adata = read_h5ad(pool_path)
        if DEBUG:
            print(adata)
            print(f'shape of adata obs {adata.obs.shape}')
            display(adata.obs.head())
        adata = make_new_obs(adata, info_df)
        # save modified anndata
        out_path = f'{demux_dir}/GEX_P{pool}_{lane}.h5ad'
        adata.write(out_path)
        if DEBUG:
            print(adata)
            print(f'shape of modified adata obs {adata.obs.shape}')
            display(adata.obs.head())

Aging_P004_SCRN_1
Aging_P004_SCRN_2
Aging_P004_SCRN_3
Aging_P004_SCRN_4
Aging_P004_SCRN_5
Aging_P004_SCRN_6
Aging_P004_SCRN_7
Aging_P004_SCRN_8
Aging_P005_SCRN_1
Aging_P005_SCRN_2
Aging_P005_SCRN_3
Aging_P005_SCRN_4
Aging_P005_SCRN_5
Aging_P005_SCRN_6
Aging_P005_SCRN_7
Aging_P005_SCRN_8


In [7]:
!date

Tue Aug 15 17:08:27 EDT 2023
