## Notebook to migrate phase1 GEX pools that contain Entorhinal cortex samples to phase2
- using the demultiplexed raw anndata files
- migrate the demux output
- migrate the Cellbender output
- pool format renaming to be consistent with phase 2 data

In [None]:
!date

#### import libraries

In [None]:
from pandas import read_csv, DataFrame
from scanpy import read
from os.path import exists
from os import listdir
from shutil import copyfile
from scanpy import read_h5ad
from anndata import AnnData

#### set notebook variables

In [None]:
# naming
# project label; used below to build the sample info file name
project = 'aging_phase2'

# directories
# phase2 working directory: root of the destination folders defined below
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
# phase1 directory: root of the source folders the files are migrated from
phase1_path = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase1'
phase1_demux_path = f'{phase1_path}/demux'
phase1_cellbender_path = f'{phase1_path}/cellbender'
info_dir = f'{wrk_dir}/sample_info'
demux_dir = f'{wrk_dir}/demux'
cellbender_dir = f'{wrk_dir}/cellbender'

# in files
# sample info table (CSV), loaded with read_csv in a later cell
info_file = f'{info_dir}/{project}.sample_info.csv'
# final phase1 anndata; its cluster and cell-type labels are carried over
# for comparison purposes
phase1_final_file = f'{phase1_demux_path}/aging.pegasus.leiden_085.subclustered.h5ad'

# variables
# when True, the cells below print/display extra intermediate output
DEBUG = False
# phase1 pool numbers to migrate
phase1_pools = [4, 5]
# lane numbers 1 through 8, tried for every pool
lanes = range(1, 9)

#### functions

In [None]:
def make_new_obs(data: AnnData, info: DataFrame, prev_info: DataFrame,
                 verbose: bool=False) -> AnnData:
    """Subset to Entorhinal cortex cells and rebuild ``obs`` from sample info.

    The cells whose ``obs.Brain_region`` equals ``'Entorhinal cortex'`` are
    kept, ``obs`` is reduced to a single ``sample_id`` column (renamed from
    ``Sample_id``) and then extended by two left joins: ``info`` on
    ``sample_id`` and ``prev_info`` on the barcode index. Cells without a
    match in either table keep their row, with missing values in the joined
    columns.

    Args:
        data: anndata whose ``obs`` has ``Brain_region`` and ``Sample_id``
            columns and is indexed by cell barcode.
        info: per-sample table with a ``sample_id`` column. After both joins
            an ``hbcc_id`` column must be present; it is renamed to
            ``donor_id`` (presumably supplied by this table -- confirm).
        prev_info: per-cell table indexed by barcode, joined on the index.
        verbose: if True, print an index check and the shape of the new obs.

    Returns:
        A new AnnData restricted to Entorhinal cortex cells with the rebuilt
        ``obs``; the object passed in is not modified.
    """
    # make sure to subset to only 'Brain_region' == 'Entorhinal cortex';
    # boolean indexing returns a view, so copy explicitly instead of relying
    # on anndata's implicit copy (and warning) when .obs is assigned below
    data = data[data.obs.Brain_region == 'Entorhinal cortex'].copy()
    # adjust and subset keep columns
    data.obs = data.obs.rename(columns={'Sample_id': 'sample_id'})[['sample_id']]
    # update the anndata obs with sample info
    new_obs = data.obs.copy()
    # merging on a column discards the index, so carry the barcodes along as
    # a column and restore them as the index afterwards
    new_obs['barcodes'] = new_obs.index
    new_obs = new_obs.merge(info, how='left', left_on='sample_id',
                            right_on='sample_id')
    new_obs = new_obs.set_index('barcodes', drop=True)
    new_obs = new_obs.merge(prev_info, how='left', left_index=True,
                            right_index=True)
    # phase2 naming: the donor identifier column is called donor_id
    new_obs['donor_id'] = new_obs.hbcc_id
    new_obs = new_obs.drop(columns=['hbcc_id'])
    # obs rows must line up with the cells of the anndata before replacing
    if not new_obs.index.equals(data.obs.index):
        print('re-indexing ...')
        new_obs = new_obs.reindex(data.obs.index)
    # now replace the original obs
    data.obs = new_obs
    if verbose:
        print(new_obs.index.equals(data.obs.index))
        print(f'shape of temp obs {new_obs.shape}')
    return data

### load the sample info data

In [None]:
# read the sample info table (CSV) into a DataFrame and report its size
info_df = read_csv(info_file)
print(f'shape of info {info_df.shape}')
# display is not imported in this notebook; presumably the IPython/Jupyter
# built-in -- only reached when DEBUG is on
if DEBUG:
    display(info_df.head())

### load the phase1 final data
for migrating the cluster and cell-types for comparison purposes

In [None]:
%%time
# load the final phase1 anndata with scanpy's read (imported above); its obs
# cluster and cell-type columns are extracted in the next cell
phase1_data = read(phase1_final_file)
# print the object's summary
print(phase1_data)

#### keep and rename only the info we want to migrate for comparison purposes

In [None]:
# phase1 obs columns to carry over, mapped to the names used for comparison
migrate_cols = {'leiden_labels_085': 'phase1_cluster',
                'new_anno': 'phase1_celltype'}
phase1_df = phase1_data.obs[list(migrate_cols)].rename(columns=migrate_cols)
print(f'shape of previous info to keep {phase1_df.shape}')
if DEBUG:
    # preview the table, then the cell counts per cluster and per cell-type
    for preview in (phase1_df.head(),
                    phase1_df.phase1_cluster.value_counts(),
                    phase1_df.phase1_celltype.value_counts()):
        display(preview)

### migrate the pool anndata files

In [None]:
# walk every phase1 pool/lane combination; a combination with no
# demultiplexed anndata file is skipped
for pool in phase1_pools:
    for lane in lanes:
        phase1_name = f'Aging_P00{pool}_SCRN_{lane}'
        src_path = f'{phase1_demux_path}/{phase1_name}.h5ad'
        if not exists(src_path):
            continue
        print(phase1_name)
        adata = read_h5ad(src_path)
        if DEBUG:
            print(adata)
            print(f'shape of adata obs {adata.obs.shape}')
            display(adata.obs.head())
        # subset to Entorhinal cortex cells and rebuild obs from the sample
        # info plus the phase1 cluster/cell-type labels
        adata = make_new_obs(adata, info_df, phase1_df)
        # save the modified anndata under the phase2 pool naming
        dest_path = f'{demux_dir}/GEX_P{pool}_{lane}.h5ad'
        adata.write(dest_path)
        print(f'created: {dest_path}')
        if DEBUG:
            print(adata)
            print(f'shape of modified adata obs {adata.obs.shape}')
            display(adata.obs.head())

### migrate the pool demux outputs

In [None]:
%%time
# copy each existing phase1 demux '.best' file into the phase2 demux
# directory under the phase2 pool naming
for pool in phase1_pools:
    for lane in lanes:
        pool_name = f'Aging_P00{pool}_SCRN_{lane}'
        new_name = f'GEX_P{pool}_{lane}'
        best_src = f'{phase1_demux_path}/{pool_name}.best'
        if not exists(best_src):
            # no demux output for this pool/lane
            continue
        best_dest = f'{demux_dir}/{new_name}.best'
        copyfile(best_src, best_dest)

### migrate the pool cellbender outputs

In [None]:
%%time
# copy every phase1 cellbender file belonging to a pool/lane into the phase2
# cellbender directory, swapping the phase1 pool name for the phase2 one
for pool in phase1_pools:
    for lane in lanes:
        pool_name = f'Aging_P00{pool}_SCRN_{lane}'
        new_name = f'sample_ec_GEX_P{pool}_{lane}'
        pool_path = f'{phase1_cellbender_path}/{pool_name}_out.h5'
        if not exists(pool_path):
            # the '_out.h5' file marks a pool/lane that has cellbender output
            continue
        for pool_file in listdir(phase1_cellbender_path):
            if not pool_file.startswith(pool_name):
                continue
            renamed_file = pool_file.replace(pool_name, new_name)
            copyfile(f'{phase1_cellbender_path}/{pool_file}',
                     f'{cellbender_dir}/{renamed_file}')

In [None]:
!date