In [None]:
# Import libraries
import os
import math
import magpy as mp
import scanpy as sc
import numpy as np
import pandas as pd
import anndata as ad
import diffxpy.api as de
import matplotlib.pyplot as plt

# Root directory of the combined CDS010/CDS014/CDS015 dataset; the cells below
# pass it to mp.load / mp.save for every versioned .h5ad file.
combined_path = "/proj/magness/CDS010-014-015_combined"

In [None]:
### CHANGELOG ###

# v1.2
# -> Added .obs['organ'] annotation to specify SI v Colon
# -> Removed .obs['type']== '15-?' cluster, comprised of high mitochondrial counts
# -> Changed .obs['type']=='21-?' cluster to M_cells
# -> Changed .uns['leiden_colors'] and .uns['type_colors'] to new colors and made consistent
# -> Randomly shuffled cell indexes to make better overlays
# -> Changed .obs['donor'] to 'Donor 1', 'Donor 2', and 'Donor 3'
# -> Added .obs['cds'] containing 'CDS010', 'CDS014', and 'CDS015'

# v1.3
# -> Updated lineage to separate EEC and goblet_prog into SI_EEC/C_EEC and SI_goblet_prog/C_goblet_prog
# -> Reordered lineage categories for easier plotting
# -> Reordered Donor for numerical order
# -> Reordered region by proximal to distal
# -> Added in palettes for select .obs annotations
# -> Added layer for raw annotations
# -> Update colors for each lineage, organ, and donor and fig 1c

# v1.4
# -> Changed names in .obs['leiden']: Paneth/PLC -> SI_Best4/C_Best4 | goblet -> SI_secretory
# -> Changed names in .obs['lineage']: Paneth/PLC -> SI_Best4/C_Best4 | split SI_secretory -> paneth/goblet
# -> Changed .obs['type'] labels to Best4 and paneth
# -> Added annotation for Parikh et al 2019 crypt-axis score
# -> Added additional color for lineage=='paneth'
# -> Added annotation of ['XXXXXXXX'] for donor+region adata.obs['donor_region'] = adata.obs.donor.str.cat(adata.obs.region)

# v1.5
# -> Added pseudotime info
# -> Added raw_scaled data to object
# -> Removed SI_paneth lineage
# -> Cluster name updates in .obs['lineage']: SI_6-? > SI_AE2, SI_ & C_goblet_prog > SI_secretory_prog & C_secretory_prog, C_earlyCC > C_earlyACC, C_lateCC > C_lateACC

# v1.6
# -> update file name to be k10 NOT k25
# -> add types for Secretory prog, AE2?
# -> consolidating and reorganizing annotations for several var annotations that are duplicated across donors
# -> concatenate gene_ids, feature_types, genome, mt
# -> remove ribo, hb, all of .varm and .uns for cell type specific annotations
# -> add donor annotation to n_cells, n_cells_by_counts, mean_counts, pct_dropout, total counts
# -> move .raw layer to ln_rpdm-normalized and raw_normalized to raw_rpdm-normalized
# -> add combined and preprocessed dataset to .raw layer

# v1.7
# -> deleted old aspects from the previous version (adata.raw and the adata.layers entries 'raw', 'raw_rpdm-normalized', and 'ln_rpdm-normalized')
# -> consolidated .var genes that shared ENSG ids based on the most recent annotation. these genes were summed in .raw and the raw_rpdm-normalized and ln_rpdm-normalized layers were updated accordingly
# -> the values in adata.X which is the default layer for dimensional reduction, clustering, UMAP, etc were not adjusted. Only duplicate values were removed to correspond to the other updates from above  

In [None]:
### Update of main data object to v1.2 ###
# Only the load and the neighbors printout are live in this cell; the v1.2
# edit steps that follow are kept commented out.

clustered_file = 'clustered_adata_k25_lr0.92.h5ad'
adata = mp.load(combined_path, clustered_file)

neighbors_info = adata.uns['neighbors']
print(neighbors_info)
# contrast_palette = ['#ff0000','#2f4f4f','#808000','#b22222','#3cb371','#7f007f','#d3d3d3','#ff8c00',
#                     '#ffd700','#0000cd','#00ff7f','#4169e1','#ffa07a','#00bfff','#adff2f','#ff00ff',
#                     '#f0e68c','#dda0dd','#ff1493','#00ffff']

# adata.uns['type_colors'] = contrast_palette
# adata.uns['leiden_colors'] = contrast_palette

# print(adata)

# print(adata.obs['donor'].value_counts())

# adata = adata[~adata.obs['lineage'].isin(['15-?'])]

# lineage_map = {'0':'SI_matureAE',
#               '1':'SI_intermAE',
#               '2':'C_earlyCC',
#               '3':'SI_earlyAE',
#               '4':'C_goblet',
#               '5':'SI_ISC',
#               '6':'SI_6-?',
#               '7':'C_lateCC',
#               '8':'C_tuft',
#               '9':'C_TA',
#               '10':'C_plc',
#               '11':'SI_goblet',
#               '12':'SI_TA',
#               '13':'C_ISC',
#               '14':'SI_paneth',
#               '16':'SI_tuft',
#               '17':'EEC',
#               '18':'goblet_prog',
#               '19':'SI_TA2',
#               '21':'SI_FAE',
#               }
# adata.obs['lineage'] = adata.obs['leiden'].map(lineage_map)      

# print(adata.obs['lineage'].value_counts())

# type_map = {
#     'SI_matureAE':'absorptive',
#     'SI_intermAE':'absorptive',
#     'C_earlyCC':'absorptive',
#     'SI_earlyAE':'absorptive',
#     'C_goblet':'goblet',
#     'SI_ISC':'ISC',
#     'SI_6-?':'SI_6-?',
#     'C_lateCC':'absorptive',
#     'C_tuft':'tuft',
#     'C_TA':'TA',
#     'C_plc':'paneth+plc',
#     'SI_goblet':'goblet',
#     'SI_TA':'TA',
#     'C_ISC':'ISC',
#     'SI_paneth':'paneth+plc',
#     'SI_tuft':'tuft',
#     'EEC':'EEC',
#     'goblet_prog':'goblet',
#     'SI_TA2':'TA',
#     'SI_FAE':'FAE',
#               }

# adata.obs['type'] = adata.obs['lineage'].map(type_map) 

# print(adata.obs['type'].value_counts())

# sc.pl.umap(adata, color = ['lineage','type'], palette = contrast_palette, ncols = 1)

# mask1 = adata.obs['region'].isin(['Duo','Jej','Ile'])
# mask2 = ~adata.obs['lineage'].str.startswith('C_')
# is_SI = mask1 & mask2

# mask3 = adata.obs['region'].isin(['AC','TC','DC'])
# mask4 = ~adata.obs['lineage'].str.startswith('SI_')
# is_colon = mask3 & mask4

# adata.obs['organ'] = is_SI.map({True:'SI',False:'Colon'})

# adata = adata[((is_SI) | (is_colon)),:]

# sc.pl.umap(adata, color = 'organ')

# print(adata.obs['organ'].value_counts())

# adata.obs['cds'] = adata.obs['donor']

# donor_map = {'CDS010':'Donor 2',
#               'CDS014':'Donor 1',
#               'CDS015':'Donor 3'
#               }

# adata.obs['donor'] = adata.obs['donor'].map(donor_map) 

# print(adata.obs['cds'].value_counts())
# print(adata.obs['donor'].value_counts())

# sc.pl.umap(adata, color = 'donor')

# new_index = np.random.permutation(adata.obs_names)
# print(new_index)
# adata = adata[new_index,:]

# sc.pl.umap(adata, color = 'donor')

# mp.save(adata, combined_path, 'clustered_annotated_adata_k25_lr0.92_v1.2.h5ad')

In [None]:
### Update of main data object to v1.3 ###

expt_path = "/proj/magness/CDS010-014-015_combined"
adata = mp.load(expt_path, "clustered_annotated_adata_k25_lr0.92_v1.2.h5ad")


### REPLACE LEIDEN CLUSTER NUMBERS WITH LINEAGE NAMES ###
# The table is written per organ prefix and then merged; cluster ids that have
# no entry here come out of Series.map as NaN.
si_clusters = {'0':'SI_matureAE', '1':'SI_intermAE', '3':'SI_earlyAE', '5':'SI_ISC',
               '6':'SI_6-?', '11':'SI_goblet', '12':'SI_TA', '14':'SI_paneth',
               '16':'SI_tuft', '19':'SI_TA2', '21':'SI_FAE'}
colon_clusters = {'2':'C_earlyCC', '4':'C_goblet', '7':'C_lateCC', '8':'C_tuft',
                  '9':'C_TA', '10':'C_plc', '13':'C_ISC'}
unprefixed_clusters = {'17':'EEC', '18':'goblet_prog'}

lineage_map = {**si_clusters, **colon_clusters, **unprefixed_clusters}

adata.obs['leiden'] = adata.obs['leiden'].map(lineage_map)


### SPLIT .OBS.LINEAGE EEC AND GOBLET_PROG INTO SI AND COLON ###
# Each organ-agnostic lineage label is replaced by a C_/SI_ prefixed label
# chosen from .obs['organ'].
# The categorical edits are reassigned rather than done with `inplace=True`:
# the `inplace` argument of the `.cat` methods was deprecated in pandas 1.3 and
# removed in pandas 2.0, so the reassignment form runs on both old and new pandas.
colon_GPs = (adata.obs['lineage']=='goblet_prog') & (adata.obs['organ']=='Colon')
SI_GPs = (adata.obs['lineage']=='goblet_prog') & (adata.obs['organ']=='SI')

lineage = adata.obs['lineage'].cat.add_categories(['C_goblet_prog','SI_goblet_prog'])
lineage = lineage.mask(colon_GPs, other='C_goblet_prog')
lineage = lineage.mask(SI_GPs, other='SI_goblet_prog')
# Dropping the old category turns any cell still labelled 'goblet_prog' into NaN.
adata.obs['lineage'] = lineage.cat.remove_categories('goblet_prog')

colon_EECs = (adata.obs['lineage']=='EEC') & (adata.obs['organ']=='Colon')
SI_EECs = (adata.obs['lineage']=='EEC') & (adata.obs['organ']=='SI')

lineage = adata.obs['lineage'].cat.add_categories(['C_EEC','SI_EEC'])
lineage = lineage.mask(colon_EECs, other='C_EEC')
lineage = lineage.mask(SI_EECs, other='SI_EEC')
# Same as above: leftover 'EEC' cells (neither mask matched) become NaN.
adata.obs['lineage'] = lineage.cat.remove_categories('EEC')


### REORDER LINEAGES, DONORS AND REGIONS ###
# reorder_categories requires each list to contain exactly the existing
# categories. The result is assigned back instead of passing `inplace=True`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
lineage_order = ['SI_ISC','C_ISC','SI_TA','SI_TA2','C_TA',
                 'SI_earlyAE','C_earlyCC','SI_intermAE','SI_matureAE','SI_6-?','C_lateCC',
                 'SI_paneth','C_plc','SI_tuft','C_tuft',
                 'SI_goblet_prog','C_goblet_prog','SI_goblet','C_goblet',
                 'SI_EEC','C_EEC','SI_FAE']

adata.obs['lineage'] = adata.obs['lineage'].cat.reorder_categories(lineage_order)

# Donors in numerical order.
donor_order = ['Donor 1','Donor 2','Donor 3']

adata.obs['donor'] = adata.obs['donor'].cat.reorder_categories(donor_order)

# Regions listed proximal to distal (per the v1.3 changelog entry).
region_order = ['Duo','Jej','Ile','AC','TC','DC']

adata.obs['region'] = adata.obs['region'].cat.reorder_categories(region_order)

### ADD RAW AND RAW_NORMALIZED LAYERS TO ADATA OBJECT ###
# NOTE(review): what the positional `2` selects in mp.load is not visible in
# this notebook -- presumably an earlier pipeline stage holding raw counts; confirm.
adata2 = mp.load(expt_path,2)

# Align to the cells and genes of the annotated object and materialise the
# subset, so the normalisation below acts on a real object and not on a view.
adata2 = adata2[adata.obs_names, adata.var_names].copy()

# Snapshot the un-normalised matrix with an explicit copy: normalize_total
# rescales adata2.X, and the 'raw' layer must not share memory with it.
adata.layers['raw'] = adata2.X.copy()

sc.pp.normalize_total(adata2)
adata.layers['raw_normalized'] = adata2.X

### SAVE ###
mp.save(adata,expt_path,"clustered_annotated_adata_k25_lr0.92_v1.3.h5ad")

In [None]:
### Update of main data object to v1.4 ###

expt_path = "/proj/magness/CDS010-014-015_combined"
adata = mp.load(expt_path, "clustered_annotated_adata_k25_lr0.92_v1.3.h5ad")

### Rename .obs['leiden'] labels: Paneth/PLC -> SI_BEST4/C_BEST4 | SI_goblet -> SI_secretory | goblet_prog -> secretory_prog
# Labels that keep their name still need an (identity) entry, because
# Series.map turns anything missing from the table into NaN.
leiden_renamed = {'SI_paneth':'SI_BEST4',
                  'SI_goblet':'SI_secretory',
                  'C_plc':'C_BEST4',
                  'goblet_prog':'secretory_prog'}
leiden_unchanged = ['SI_matureAE','SI_intermAE','C_earlyCC','SI_earlyAE','C_goblet','SI_ISC',
                    'SI_6-?','C_lateCC','C_tuft','C_TA','SI_TA','C_ISC','SI_tuft','EEC',
                    'SI_TA2','SI_FAE']

leiden_map = dict(leiden_renamed)
leiden_map.update({label: label for label in leiden_unchanged})

adata.obs['leiden'] = adata.obs['leiden'].map(leiden_map)

### Change names in .obs['lineage']: Paneth/PLC -> SI_Best4/C_Best4 | split SI_secretory -> paneth/goblet
# Subclustering and recalling paneth cells out of goblet cells

mp.settings.leiden_resolution = 0.15
mp.settings.num_neighbors = 15
mp.settings.num_pcs = 40
kwargs = {'max_iter_harmony':20}

# NOTE(review): the write_file name mentions lr0.2/nn40 while the settings above
# are 0.15/15; with save=False the name is presumably unused -- confirm.
subset = adata[adata.obs['leiden']=='SI_secretory'].copy()
subset = mp.pipeline.recluster(expt_path=combined_path, data=subset,
                               write_file = 'goblet_subscluster_lr0.2_nn40_pcs40.h5ad',
                               neighbors_key=None, harmonize=True, calc_leiden = True, reprocess = False,
                               calc_hvg = True, annotate_cell_cycle = False, regress_cell_cycle = False,
                               reembed = False, save = False, recalc_pca = False, **kwargs)

# sc.pl.umap(subset, color = ['leiden','lineage','DEFA5','DEFA6','MUC2','LYZ'], ncols = 1)

# Subcluster '0' is treated as goblet and '1' as paneth.
# NOTE(review): assumes recluster rewrote subset.obs['leiden'] with the new
# subcluster ids; verify against the marker plot above.
goblet_subset = subset[subset.obs['leiden']=='0']
paneth_subset = subset[subset.obs['leiden']=='1']

# The relabelling is done on a local Series and written back once. The original
# chained `adata.obs['lineage'].mask(..., inplace=True)` calls do not propagate
# under pandas copy-on-write, and the `inplace` argument of the `.cat` methods
# was removed in pandas 2.0.
lineage = adata.obs['lineage'].cat.add_categories(["SI_BEST4","C_BEST4"])

# Order matters: the old 'SI_paneth' cells are moved to 'SI_BEST4' first, and
# only afterwards is 'SI_paneth' reused for the recalled paneth subcluster.
lineage = lineage.mask(lineage=='SI_paneth', 'SI_BEST4')
lineage = lineage.mask(lineage=='C_plc', 'C_BEST4')
lineage = lineage.mask(adata.obs_names.isin(goblet_subset.obs_names.tolist()), 'SI_goblet')
lineage = lineage.mask(adata.obs_names.isin(paneth_subset.obs_names.tolist()), 'SI_paneth')

adata.obs['lineage'] = lineage.cat.remove_categories(['C_plc'])

### Rebuild .obs['type'] from .obs['lineage'] (BEST4 and paneth labels, secretory prog)
# adata.obs.lineage.cat.rename_categories({'SI_goblet_prog':'SI_secretory_prog','C_goblet_prog':'C_secretory_prog','paneth+plc':'paneth'},inplace = True)

# NOTE(review): the rename above is commented out, so the table's
# 'SI_secretory_prog'/'C_secretory_prog' keys presumably never match the
# '*_goblet_prog' lineage labels and those cells map to NaN -- confirm.
lineages_by_type = {
    'absorptive': ['SI_matureAE','SI_intermAE','C_earlyCC','SI_earlyAE','C_lateCC'],
    'goblet': ['C_goblet','SI_goblet'],
    'ISC': ['SI_ISC','C_ISC'],
    'SI_6-?': ['SI_6-?'],
    'tuft': ['C_tuft','SI_tuft'],
    'TA': ['C_TA','SI_TA','SI_TA2'],
    'BEST4': ['SI_BEST4','C_BEST4'],
    'paneth': ['SI_paneth'],
    'EEC': ['SI_EEC','C_EEC'],
    'secretory_prog': ['SI_secretory_prog','C_secretory_prog'],
    'FAE': ['SI_FAE'],
}

# Invert the grouping into the lineage -> type lookup used by Series.map.
type_map = {lineage_name: cell_type
            for cell_type, members in lineages_by_type.items()
            for lineage_name in members}

adata.obs['type'] = adata.obs['lineage'].map(type_map)

# sc.pl.umap(adata, color = 'type')
# print(adata.obs.type.unique())

### Added donor_region annotation ###
# Joined as "<donor>_<region>".
adata.obs['donor_region'] = adata.obs['donor'].str.cat(adata.obs['region'], sep="_")

### ANNOTATE DATA WITH MEAN EXPRESSION, PCTEXP, and NCELLS ###
import gc

# Work on a copy whose X is the dense 'raw_normalized' layer.
raw_adata = adata.copy()
raw_adata.X = raw_adata.layers['raw_normalized'].todense()

for lineage in adata.obs.lineage.unique():
    gc.collect()
    in_lineage = raw_adata[raw_adata.obs.lineage == lineage]

    means_key = f'{lineage}_means'
    pctexp_key = f'{lineage}_pctexp'
    ncells_key = f'{lineage}_ncells'

    # Axes: gene, donor (slot 0 = all donors combined), region.
    adata.varm[means_key] = np.zeros((adata.n_vars, 4, 3))
    adata.varm[pctexp_key] = np.zeros((adata.n_vars, 4, 3))
    # Axes: donor, region.
    adata.uns[ncells_key] = np.zeros((4, 3))

    # Lineages containing 'SI_' are summarised over the small-intestine
    # regions, everything else over the colon regions.
    regions = ['Duo','Jej','Ile'] if 'SI_' in lineage else ['AC','TC','DC']

    for i, donor in enumerate(['combined','Donor 1','Donor 2','Donor 3']):
        donor_cells = in_lineage if i == 0 else in_lineage[in_lineage.obs['donor']==donor]

        for j, region in enumerate(regions):
            cells = donor_cells[donor_cells.obs['region']==region].copy()
            expr = cells.X

            adata.varm[means_key][:,i,j] = expr.mean(axis=0)
            adata.varm[pctexp_key][:,i,j] = (expr > 0).sum(axis=0) / cells.n_obs
            adata.uns[ncells_key][i,j] = cells.n_obs


### SAVE NEW DATA VERSION ###
mp.save(adata,expt_path,"clustered_annotated_adata_k25_lr0.92_v1.4.h5ad")

In [None]:
### Update of main data object to v1.5
# NOTE(review): the v1.42 input file is not written by any cell visible in this
# notebook -- presumably produced elsewhere; confirm its provenance.
combined_path = "/proj/magness/CDS010-014-015_combined"
adata = mp.load(combined_path, "clustered_annotated_adata_k25_lr0.92_v1.42.h5ad")

### -> Cluster name updates in .obs['lineage'] (old > new):
lineage_renamed = {'SI_6-?':'SI_AE2',
                   'SI_goblet_prog':'SI_secretory_prog',
                   'C_goblet_prog':'C_secretory_prog',
                   'C_earlyCC':'C_earlyACC',
                   'C_lateCC':'C_lateACC'}

# Every other lineage keeps its name, but still needs an identity entry so
# that Series.map does not turn it into NaN.
lineage_unchanged = ['SI_paneth','SI_goblet','C_BEST4','SI_BEST4','SI_matureAE','SI_intermAE',
                     'SI_earlyAE','C_goblet','SI_ISC','C_tuft','C_TA','SI_TA','C_ISC','SI_tuft',
                     'SI_EEC','C_EEC','SI_TA2','SI_FAE']

#print(adata.obs.lineage.value_counts())

lineage_map = {name: name for name in lineage_unchanged}
lineage_map.update(lineage_renamed)

adata.obs['lineage'] = adata.obs['lineage'].map(lineage_map)

#adata.obs.lineage.cat.remove_categories(['SI_goblet_prog','C_goblet_prog','C_earlyCC','C_lateCC','SI_6-?'], inplace = True)

lineage_counts = adata.obs['lineage'].value_counts()
print(lineage_counts)

sc.pl.umap(adata, color = 'lineage')
# sc.pl.dotplot(adata, 'LGR5', groupby = 'lineage')

mp.save(adata, combined_path, 'clustered_annotated_adata_k25_lr0.92_v1.5.h5ad')



In [None]:
##### Update of main data object to v1.6
adata = mp.load(combined_path, "clustered_annotated_adata_k25_lr0.92_v1.5.h5ad")
print(adata)

### Purpose of this cell's lists:
## - the per-lineage .varm (means/pctexp) and .uns (ncells) summaries still carry the
##   pre-v1.5 lineage names (SI_6-?, *_goblet_prog, C_earlyCC, C_lateCC) and are removed
## - .var annotations that are duplicated across donors (suffixes -0/-1/-2) are consolidated:
#      concat_list -> merged into one un-suffixed column (gene_ids, feature_types, genome, mt)
#      del_list    -> removed (ribo, hb, and the per-lineage summaries)
#      add_list    -> kept per donor, renamed with a donor prefix
#                     (n_cells, n_cells_by_counts, mean_counts, pct_dropout_by_counts, total_counts)

# var_list: every annotation name the loop in this cell iterates over; it holds the
# names from concat_list, add_list and del_list in the order they are processed.
var_list = ['gene_ids', 'feature_types', 'genome','mt', 'n_cells','ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts','C_BEST4_means',
            'C_BEST4_pctexp','C_EEC_means','C_EEC_pctexp','C_ISC_means','C_ISC_pctexp','C_TA_means','C_TA_pctexp','C_earlyCC_means','C_earlyCC_pctexp',
            'C_goblet_means','C_goblet_pctexp','C_lateCC_means','C_lateCC_pctexp','C_tuft_means','C_tuft_pctexp','SI_BEST4_means','SI_BEST4_pctexp',
            'SI_EEC_means','SI_EEC_pctexp','SI_FAE_means','SI_FAE_pctexp','SI_ISC_means','SI_ISC_pctexp','SI_TA2_means','SI_TA2_pctexp','SI_TA_means',
            'SI_TA_pctexp','SI_earlyAE_means','SI_earlyAE_pctexp','SI_goblet_means','SI_goblet_pctexp','SI_intermAE_means','SI_intermAE_pctexp',
            'SI_matureAE_means','SI_matureAE_pctexp','SI_paneth_means','SI_paneth_pctexp','SI_tuft_means','SI_tuft_pctexp','SI_6-?_means','SI_6-?_pctexp',
            'C_goblet_prog_means','C_goblet_prog_pctexp','SI_goblet_prog_means','SI_goblet_prog_pctexp','C_BEST4_ncells','C_EEC_ncells','C_ISC_ncells','C_TA_ncells','C_earlyCC_ncells','C_goblet_ncells',
            'C_goblet_prog_ncells','C_lateCC_ncells','C_tuft_ncells','SI_6-?_ncells','SI_BEST4_ncells','SI_EEC_ncells','SI_FAE_ncells','SI_ISC_ncells','SI_TA2_ncells',
            'SI_TA_ncells','SI_earlyAE_ncells','SI_goblet_ncells','SI_goblet_prog_ncells','SI_intermAE_ncells','SI_matureAE_ncells','SI_paneth_ncells','SI_tuft_ncells']
# Annotations whose per-donor columns are merged into a single column.
concat_list = ['gene_ids','feature_types', 'genome', 'mt']
# Annotations that are deleted: '*_means'/'*_pctexp' live in .varm, '*_ncells' in .uns,
# and 'ribo'/'hb' are per-donor .var columns.
del_list = ['ribo','hb','C_BEST4_means','C_BEST4_pctexp','C_EEC_means','C_EEC_pctexp','C_ISC_means','C_ISC_pctexp','C_TA_means','C_TA_pctexp','C_earlyCC_means',
            'C_earlyCC_pctexp','C_goblet_means','C_goblet_pctexp','C_lateCC_means','C_lateCC_pctexp','C_tuft_means','C_tuft_pctexp','SI_BEST4_means','SI_BEST4_pctexp',
            'SI_EEC_means','SI_EEC_pctexp','SI_FAE_means','SI_FAE_pctexp','SI_ISC_means','SI_ISC_pctexp','SI_TA2_means','SI_TA2_pctexp','SI_TA_means','SI_TA_pctexp',
            'SI_earlyAE_means','SI_earlyAE_pctexp','SI_goblet_means','SI_goblet_pctexp','SI_intermAE_means','SI_intermAE_pctexp','SI_matureAE_means','SI_matureAE_pctexp',
            'SI_paneth_means','SI_paneth_pctexp','SI_tuft_means','SI_tuft_pctexp','SI_6-?_means','SI_6-?_pctexp','C_goblet_prog_means','C_goblet_prog_pctexp',
            'SI_goblet_prog_means','SI_goblet_prog_pctexp','C_BEST4_ncells','C_EEC_ncells','C_ISC_ncells','C_TA_ncells','C_earlyCC_ncells','C_goblet_ncells',
            'C_goblet_prog_ncells','C_lateCC_ncells','C_tuft_ncells','SI_6-?_ncells','SI_BEST4_ncells','SI_EEC_ncells','SI_FAE_ncells','SI_ISC_ncells','SI_TA2_ncells',
            'SI_TA_ncells','SI_earlyAE_ncells','SI_goblet_ncells','SI_goblet_prog_ncells','SI_intermAE_ncells','SI_matureAE_ncells','SI_paneth_ncells','SI_tuft_ncells']
# Annotations kept per donor and renamed with a 'Donor_N-' prefix.
add_list = ['n_cells','n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts']

# Move the PCA loadings from .varm['PCs'] to .varm['principle_components'].
adata.varm['principle_components'] = adata.varm['PCs']
del adata.varm['PCs']

# Consolidate the donor-suffixed annotations ('<name>-0', '<name>-1', '<name>-2').
# Each name in var_list is handled by up to three independent steps:
#   1. concat_list names are merged into a single un-suffixed .var column,
#   2. del_list names stored in .varm / .uns are deleted,
#   3. per-donor .var columns are deleted (del_list) or renamed (add_list).
for annotation in var_list:
    # print(annotation)
    if annotation in concat_list:
        if annotation == 'gene_ids':
            # Columns are cast to str, so missing values show up as the text 'nan'.
            # Take the '-2' value, fall back to '-1' and then '-0' where it is missing.
            concat_df = adata.var[[a for a in adata.var if annotation in a]].astype('str')
            foo = (concat_df['gene_ids-2'].str.upper() == 'NAN')
            bar = concat_df['gene_ids-2'].mask(foo,concat_df['gene_ids-1'])
            rah = (bar.str.upper() == 'NAN')
            dah = bar.mask(rah,concat_df['gene_ids-0'])
            adata.var[f'{annotation}'] = dah 
            del adata.var[f'{annotation}-0'],adata.var[f'{annotation}-1'], adata.var[f'{annotation}-2']
        elif annotation == 'mt': 
            # Same fallback chain as gene_ids. Because of the astype('str') the merged
            # 'mt' column holds strings rather than booleans.
            concat_df = adata.var[[a for a in adata.var if annotation in a]].astype('str')
            foo = (concat_df['mt-2'].str.upper() == 'NAN')
            bar = concat_df['mt-2'].mask(foo,concat_df['mt-1'])
            rah = (bar.str.upper() == 'NAN')
            dah = bar.mask(rah,concat_df['mt-0'])
            adata.var[f'{annotation}'] = dah 
            del adata.var[f'{annotation}-0'],adata.var[f'{annotation}-1'], adata.var[f'{annotation}-2']
        else:
            # feature_types / genome: the per-donor values are discarded and the merged
            # column is filled with one constant for every gene.
            foo = pd.DataFrame({f'{annotation}':adata.var[f'{annotation}-0']}, index = adata.var_names.tolist())
            if annotation == 'feature_types': 
                foo[f'{annotation}'] = 'Gene Expression'
                adata.var[f'{annotation}'] = foo
                del adata.var[f'feature_types-0'],adata.var[f'{annotation}-1'], adata.var[f'{annotation}-2']
            if annotation == 'genome': 
                foo[f'{annotation}'] = 'GRCh38'
                adata.var[f'{annotation}'] = foo
                del adata.var[f'{annotation}-0'],adata.var[f'{annotation}-1'], adata.var[f'{annotation}-2']
    if annotation in del_list:
        # Per-lineage summaries: means/pctexp are stored in .varm, ncells in .uns.
        if ('means' in annotation) or ('pctexp' in annotation): 
            del adata.varm[f'{annotation}']
        elif ('_ncells' in annotation):
            del adata.uns[f'{annotation}']
    # Only the index i is used below; `donor` itself is never read, so the loop
    # effectively runs once per distinct donor.
    for i, donor in enumerate(adata.obs['donor'].unique()):
        # print(annotation)
        if (annotation in del_list) and (f'{annotation}-{i}' in adata.var):
            print(annotation)
            del adata.var[f'{annotation}-{i}']
        if annotation in add_list:
            # Suffix -> donor prefix is hard-coded: -0 -> Donor_2, -1 -> Donor_1, -2 -> Donor_3.
            # NOTE(review): presumably the suffixes follow the CDS010/CDS014/CDS015
            # concatenation order -- confirm against the donor map used for v1.2.
            if i == 0:
                adata.var[f'Donor_2-{annotation}'] = adata.var[f'{annotation}-{i}']
                del adata.var[f'{annotation}-{i}']
            if i == 1: 
                adata.var[f'Donor_1-{annotation}'] = adata.var[f'{annotation}-{i}']
                del adata.var[f'{annotation}-{i}']
            elif i==2:
                adata.var[f'Donor_3-{annotation}'] = adata.var[f'{annotation}-{i}']
                del adata.var[f'{annotation}-{i}'] 
print(adata.uns['neighbors'])

# Collapse .obs['type'] onto the v1.6 label set ('SI_6-?' joins 'absorptive',
# 'BEST4' becomes 'BEST4+').
# NOTE(review): the 'nan' key only matches a literal string 'nan'; a real
# missing value stays missing after .map -- confirm how the secretory
# progenitor cells are encoded in the v1.5 file.
d = {'absorptive':'absorptive', 'SI_6-?':'absorptive', 'tuft':'tuft', 'goblet':'goblet', 'BEST4':'BEST4+', 'EEC':'EEC', 'ISC':'ISC', 'TA':'TA',
     'nan':'secretory_prog', 'paneth':'paneth', 'FAE':'FAE'}

adata.obs['type'] = adata.obs['type'].map(d)

print([a for a in adata.obs['type'].unique()])

# Combined, preprocessed dataset restricted to the cells kept in `adata`;
# it becomes the new .raw below.
pp_adata = mp.load(combined_path, 'preprocessed_adata.h5ad')

pp_adata = pp_adata[adata.obs_names.tolist()]
raw_adata=adata.raw.to_adata()

# Move the old .raw matrix into a layer. A layer has to be the expression
# matrix itself: the original stored the whole AnnData (`raw_adata.copy()`),
# which breaks the `.min()` / `.max()` calls made on this layer later on.
adata.layers['ln_rpdm-normalized'] = raw_adata.X.copy()
# Rename 'raw_normalized' -> 'raw_rpdm-normalized'.
adata.layers['raw_rpdm-normalized'] = adata.layers['raw_normalized']
del adata.layers['raw_normalized']
adata.raw = pp_adata

mp.save(adata, combined_path, 'clustered_annotated_adata_k10_lr0.92_v1.6.h5ad')

In [None]:
import gc

# Table of ENSG ids shared by more than one .var entry; the merge step reads
# its 'ENSG ID' and 'Current gene symbol' columns.
data = pd.read_csv(combined_path+'/shared_ENSG_ids_v2.0.csv')
adata = mp.load(combined_path, 'clustered_annotated_adata_k10_lr0.92_v1.6.h5ad')

raw_adata = adata.raw.to_adata()

print('----- Before Modifications ------')

# Value ranges of every matrix, as a sanity check to compare with the
# "after" printout.
for stat, heading in (('min', '\nminimum values'), ('max', '\nmax values')):
    print(heading)
    print(f'raw_adata {stat}: ', getattr(raw_adata.X, stat)())
    print(f'raw_{stat}: ', getattr(adata.raw.X, stat)())
    print(f'raw_rpdm {stat}: ', getattr(adata.layers['raw_rpdm-normalized'], stat)())
    print(f'ln_rpdm {stat}: ', getattr(adata.layers['ln_rpdm-normalized'], stat)())
    print(f'scaled {stat}: ', getattr(adata.X, stat)())

# Both normalised layers are dropped here and recomputed after the merge.
for stale_layer in ('raw_rpdm-normalized', 'ln_rpdm-normalized'):
    del adata.layers[stale_layer]

print(adata)

# print(adata[:,[0,1,2,3,11]])

# print(adata[:,[0,1,2,3,11]])

# Merge .var entries that share an ENSG id (one row of `data` per shared id).
#
# For every id the FIRST matching gene (in .var order) is kept and renamed to the
# current gene symbol; all later matches are removed. Raw counts of the removed
# genes are summed into the kept gene in `raw_adata`, which is the source of the
# final .raw and of the raw / normalised layers rebuilt after this step.
# adata.X (scaled values) is only subset, not summed.
#
# Changes versus the previous per-gene loop:
#   * The sum used to be written to adata.layers['raw'] only, which is deleted
#     right after this step, while raw_adata merely dropped the duplicate column,
#     so the duplicate's counts were lost. The sum is now applied to raw_adata.X.
#   * All ids are resolved first and applied in one pass, instead of copying both
#     objects once per id.
#   * Ids matching more than two genes are merged completely (previously only the
#     first two were handled).
from scipy import sparse  # local import: only needed for the merge matrix

if raw_adata.n_vars != adata.n_vars:
    raise ValueError('raw_adata and adata must hold the same genes in the same order '
                     f'({raw_adata.n_vars} vs {adata.n_vars} columns)')

n_vars_before = adata.n_vars
merged_names = np.array(adata.var_names, dtype=object)
target_of = np.arange(n_vars_before)         # original column -> original column receiving its counts
is_kept = np.ones(n_vars_before, dtype=bool)  # False for columns that are merged away

for row, geneID in enumerate(data['ENSG ID']):
    positions = np.flatnonzero((adata.var['gene_ids'] == geneID).to_numpy())
    if len(positions) < 2:
        # Fail before anything is modified (the old loop raised an IndexError here).
        raise ValueError(f'{geneID}: expected at least two .var entries sharing this '
                         f'ENSG id, found {len(positions)}')
    keep_pos = positions[0]
    merged_names[keep_pos] = data.iloc[row]['Current gene symbol']
    target_of[positions[1:]] = keep_pos
    is_kept[positions[1:]] = False

# (old column -> new column) selector: multiplying by it sums every removed
# column into its kept partner and drops the removed columns in one step.
new_position = np.cumsum(is_kept) - 1
raw_counts = raw_adata.X
merge_matrix = sparse.csr_matrix(
    (np.ones(n_vars_before, dtype=raw_counts.dtype),
     (np.arange(n_vars_before), new_position[target_of])),
    shape=(n_vars_before, int(is_kept.sum())),
)
if sparse.issparse(raw_counts):
    summed_counts = (raw_counts @ merge_matrix).asformat(raw_counts.format)
else:
    summed_counts = np.ascontiguousarray((merge_matrix.T @ np.asarray(raw_counts).T).T)

kept_names = list(merged_names[is_kept])

# Remove the merged-away columns and apply the updated gene names.
raw_adata = raw_adata[:, is_kept].copy()
raw_adata.X = summed_counts
raw_adata.var.index = kept_names

adata = adata[:, is_kept].copy()
adata.var.index = kept_names
# print(adata.shape)
# print(raw_adata.shape)

# Drop the pre-merge raw data, then rebuild .raw and every layer from raw_adata.
del adata.layers['raw']
del adata.raw

# NOTE(review): names are made unique on adata only; raw_adata keeps whatever
# names it had, so the two can differ if a new symbol collided -- confirm.
adata.var_names_make_unique()

adata.layers['raw'] = raw_adata.X

# Total-count normalisation of the raw matrix, returned without touching raw_adata.
normalized = sc.pp.normalize_total(raw_adata, inplace = False)
adata.layers['raw_rpdm-normalized'] = normalized['X']

# log1p of the normalised layer, computed on a copy.
adata.layers['ln_rpdm-normalized'] = sc.pp.log1p(adata.layers['raw_rpdm-normalized'], copy = True)

adata.raw = raw_adata

print('----- After Modifications ------')

# Same value-range report as printed before the merge.
for stat, heading in (('min', '\nminimum values'), ('max', '\nmax values')):
    print(heading)
    print(f'raw_adata {stat}: ', getattr(raw_adata.X, stat)())
    print(f'raw_{stat}: ', getattr(adata.raw.X, stat)())
    print(f'raw_rpdm {stat}: ', getattr(adata.layers['raw_rpdm-normalized'], stat)())
    print(f'ln_rpdm {stat}: ', getattr(adata.layers['ln_rpdm-normalized'], stat)())
    print(f'scaled {stat}: ', getattr(adata.X, stat)())

mp.save(adata, combined_path, 'clustered_annotated_adata_k10_lr0.92_v1.7.h5ad')

In [None]:
# Reload the saved v1.7 object and print its value ranges again, to check that
# the matrices survived the save/load round trip.
adata = mp.load(combined_path, 'clustered_annotated_adata_k10_lr0.92_v1.7.h5ad')

print(adata)

print('----- After Modifications ------')

for stat, heading in (('min', '\nminimum values'), ('max', '\nmax values')):
    print(heading)
    print(f'raw_{stat}: ', getattr(adata.raw.X, stat)())
    print(f'raw_rpdm {stat}: ', getattr(adata.layers['raw_rpdm-normalized'], stat)())
    print(f'ln_rpdm {stat}: ', getattr(adata.layers['ln_rpdm-normalized'], stat)())
    print(f'scaled {stat}: ', getattr(adata.X, stat)())

In [None]:
# Export the gene -> ENSG id column of the v1.6 object as a CSV next to the data.
adata = mp.load(combined_path, "clustered_annotated_adata_k10_lr0.92_v1.6.h5ad")

ensg_csv = combined_path+'/v1.6_ENSG_values.csv'
gene_ids = adata.var['gene_ids']
gene_ids.to_csv(ensg_csv)

In [None]:
## work in progress updates for standardizing colors for all mappings
### Added additional color for lineage=='paneth'
# NOTE(review): this cell uses whatever `adata` is currently loaded and expects a
# 'raw_normalized' layer, which the v1.6+ objects no longer have -- presumably it
# targets a v1.4/v1.5 object; confirm before running.
# NOTE(review): reorder_categories needs type_order to list exactly the existing
# categories; 'SI-6?' here differs from the 'SI_6-?' label used by the type maps
# in the other cells -- confirm the intended spelling.
type_order = ['absorptive','tuft','goblet','BEST4','EEC','ISC','TA','secretory_prog','paneth','FAE','SI-6?']
adata.obs['type'] = adata.obs['type'].astype('category')
print(adata.obs['type'].unique())
# Assigned back instead of `inplace=True` (removed from the .cat methods in pandas 2.0).
adata.obs['type'] = adata.obs['type'].cat.reorder_categories(type_order)


sc.pl.umap(adata, color = ['leiden','lineage','type'], ncols = 1)

### Add annotation for Parikh et al 2019 crypt-axis score

## Toggle for raw_normalized 
adata.X = adata.layers['raw_normalized'].copy().todense()
print(adata.X.shape)

crypt_axis = ['SELENOP','CEACAM7','PLAC8','CEACAM1','TSPAN1','CEACAM5','CEACAM6','IFI27','DHRS9','KRT20',
              'RHOC','CD177','PKIB','HPGD','LYPD8']

sc.tl.score_genes(adata, crypt_axis, score_name = 'crypt_axis_score_scanpy', use_raw = False)

subset = adata[:,crypt_axis].copy()

# Score = sum over the crypt-axis genes of (expression / that gene's maximum).
# The accumulator is created up front: the original read
# subset.obs['crypt_axis_score_parikh'] before it existed, raising a KeyError.
parikh_score = np.zeros(subset.n_obs)

for gene in crypt_axis:
    score_set = subset[:,gene].X
    # Flatten to a plain 1-D array, whether X is sparse, np.matrix or ndarray.
    score_set = score_set.toarray() if hasattr(score_set, 'toarray') else np.asarray(score_set)
    score_set = score_set.ravel()

    denom = score_set.max()
    if denom == 0:
        # Gene not detected in any cell: it contributes nothing (avoids 0/0 -> NaN).
        continue
    scale_factor = 1/denom
    parikh_score += score_set*scale_factor #- min(score_set.X.ravel()))* scale_factor)

# NOTE(review): as in the original, the score is stored on `subset`, not on `adata`.
subset.obs['crypt_axis_score_parikh'] = parikh_score

crypt_axis_order = ['C_ISC','C_TA','C_earlyCC','secretory_prog','C_goblet','C_tuft','C_BEST4','EEC','C_lateCC']




