## Figure to display the proportion of age-associated features per cell type

In [None]:
# print the current date/time via the shell (first cell of the notebook)
!date

#### import libraries

In [None]:
from pandas import read_csv, read_parquet, DataFrame as PandasDF
from scanpy import read_h5ad
from os.path import exists
from seaborn import scatterplot, barplot
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context

%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# parameters
# NOTE(review): this looks like a papermill-style parameters cell — confirm before restructuring it
# modality is embedded in the results, quantification and output file names
modality = 'GEX' # 'GEX' or 'ATAC'
# category is the adata.obs column used to group cells and decides the 'broad'/'specific' file prefix
category = 'cluster_name' # 'curated_type' for broad and 'cluster_name' for specific
# regression method label; only used here as part of the results and output file names
REGRESSION_TYPE = 'glm_tweedie'

In [None]:
# parameters
project = 'aging_phase2'
# translate the annotation column into the cell-type resolution label used in file names;
# fail right here on an unsupported value instead of hitting a NameError on prefix_type below
if category == 'curated_type':
    prefix_type = 'broad'
elif category == 'cluster_name':
    prefix_type = 'specific'
else:
    raise ValueError(f"unsupported category '{category}'; "
                     "expected 'curated_type' or 'cluster_name'")

# directories
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
results_dir = f'{wrk_dir}/results'
quants_dir = f'{wrk_dir}/quants'
figures_dir = f'{wrk_dir}/figures'

# in files
results_file = f'{results_dir}/{project}.{modality}.{prefix_type}.{REGRESSION_TYPE}_fdr_filtered.age.csv'
anndata_file = f'{quants_dir}/{project}.multivi.curated_final.h5ad'

# out files
fig_filename = f'{figures_dir}/{project}.{modality}.{prefix_type}.{REGRESSION_TYPE}.summary.png'
bar_fig_filename = f'{figures_dir}/{project}.{modality}.{prefix_type}.{REGRESSION_TYPE}.summary_bar.png'
out_filename = f'{figures_dir}/{project}.{modality}.{prefix_type}.{REGRESSION_TYPE}.summary.csv'

# constants
DEBUG = False
ALPHA = 0.05
if DEBUG:
    # echo every input and output path so a misconfigured run is easy to spot
    print(f'results_file = {results_file}')
    print(f'anndata_file = {anndata_file}')
    print(f'fig_filename = {fig_filename}')
    print(f'bar_fig_filename = {bar_fig_filename}')
    print(f'out_filename = {out_filename}')

#### functions

In [None]:
def load_quantification(cell_name: str, verbose: bool=False) -> PandasDF:
    """Read the pseudobulk quantification parquet file for one cell type.

    The file path is built from the notebook-level `quants_dir`, `project`,
    `modality` and `prefix_type` settings plus `cell_name`.

    Args:
        cell_name: cell-type label embedded in the parquet file name.
        verbose: when True, print the shape of the loaded table and display
            five randomly sampled rows.

    Returns:
        The loaded DataFrame, or None when the parquet file does not exist.
    """
    parquet_path = f'{quants_dir}/{project}.{modality}.{prefix_type}.{cell_name}.pb.parquet'
    # a missing file is reported to the caller as None rather than raised
    quants = read_parquet(parquet_path) if exists(parquet_path) else None
    if verbose and quants is not None:
        print(f'shape of read {cell_name} quantifications {quants.shape}')
        display(quants.sample(5))
    return quants

### load the anndata file

The anndata file is needed to get the per-cell-type cell counts used to decorate the plot.

In [None]:
%%time
# read the curated AnnData object; the cells below only use its `obs` table
# (cell counts per cell type and per cell type / sample_id)
adata = read_h5ad(anndata_file)
print(adata)
if DEBUG:
    # peek at a few random rows of the per-cell annotations
    display(adata.obs.sample(5))

### load the modality's results

In [None]:
# read the age regression results table for the selected modality
results_df = read_csv(results_file)
print('shape of', modality, 'results', results_df.shape)
if DEBUG:
    display(results_df.sample(n=5))

In [None]:
# number of distinct features in the results, then number of result rows per cell type
print(results_df['feature'].nunique())
display(results_df['tissue'].value_counts())

### create stub dataframe to use for computing percent of features with age effect

In [None]:
# one row per cell type (index); aaf_cnt = number of result rows for that cell type
props_df = results_df['tissue'].value_counts().rename('aaf_cnt').to_frame()
print(f'shape of props_df is {props_df.shape}')
if DEBUG:
    display(props_df)

### find the total count of features tested

In [None]:
%%time
props_df['feature_cnt'] = 0
for row in props_df.itertuples():
    data_df = load_quantification(row.Index)
    # col count is the feature cnt + 1
    props_df.loc[row.Index, 'feature_cnt'] = data_df.shape[1]    
    if DEBUG:
        print(row.Index, data_df.shape)
print(f'new shape of props_df is {props_df.shape}')
if DEBUG:
    display(props_df)        

### compute the percent of tested features that are age associated

In [None]:
# share of tested features that are age associated, as a percentage with two decimals
aaf_fraction = props_df['aaf_cnt'] / props_df['feature_cnt']
props_df['percent_aaf'] = (aaf_fraction * 100).round(2)
print(f'new shape of props_df is {props_df.shape}')
if DEBUG:
    display(props_df)

### visualize the proportions

In [None]:
# Bar plot of the percent of age-associated features per cell type, highest first.
# The style is applied first and the figure size is then passed explicitly, so a
# figure size defined by the style sheet (if any) cannot override the requested one.
with plt.style.context('seaborn-v0_8-talk'):
    fig, ax = plt.subplots(figsize=(15, 11), dpi=100)
    # name the index explicitly so the x column does not depend on how pandas
    # happened to name the value_counts index
    bar_data = (props_df.sort_values('percent_aaf', ascending=False)
                .rename_axis('tissue').reset_index())
    barplot(data=bar_data, x='tissue', y='percent_aaf', color='purple', ax=ax)
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_title(f'% of {modality} features that are age associated ')
    ax.set_xlabel('Cell types')
    ax.set_ylabel('% of features')
    # lay out only after title and labels exist so they are included in the fit
    fig.tight_layout()
    fig.savefig(bar_fig_filename)
    plt.show()

#### annotate the cell-type's cell count

In [None]:
# number of cells per cell type, named up front instead of renaming the auto-generated column 0;
# observed=False is spelled out so the result does not depend on the pandas default
cell_counts = adata.obs.groupby(category, observed=False).size().rename('cell_count')

# the merge below is an inner join on the index: report cell types it will drop
missing_types = sorted(set(props_df.index) - set(cell_counts.index))
if missing_types:
    print(f'WARNING: cell types not found in adata.obs.{category} will be dropped: {missing_types}')

# drop columns from a previous run of this cell so re-running does not create duplicates
props_df = (props_df.drop(columns=['cell_count', 'percent_of_total_cells'], errors='ignore')
            .merge(cell_counts, left_index=True, right_index=True))
# share of all cells in the anndata object that belong to each cell type
props_df['percent_of_total_cells'] = (props_df['cell_count'] / adata.obs.shape[0] * 100).round(2)
print(f'new shape of props_df is {props_df.shape}')
if DEBUG:
    display(props_df)

#### annotate the mean number of cells per donor for each cell-type

In [None]:
# cells per (cell type, sample_id) pair, then averaged over sample_id within each cell type.
# observed=False is spelled out so the result does not depend on the pandas default.
# NOTE(review): if both columns are categorical, observed=False keeps (cell type, sample)
# pairs with zero cells in the mean — confirm this is the intended definition
cells_per_donor = adata.obs.groupby([category, 'sample_id'], observed=False).size()
mean_cells_per_donor = (cells_per_donor.groupby(level=0).mean()
                        .round(0).rename('mean_cell_per_donor'))

# drop the column from a previous run of this cell so re-running does not create duplicates
props_df = (props_df.drop(columns=['mean_cell_per_donor'], errors='ignore')
            .merge(mean_cells_per_donor, left_index=True, right_index=True))
print(f'new shape of props_df is {props_df.shape}')
if DEBUG:
    display(props_df)

In [None]:
# Scatter plot of the percent of age-associated features per cell type, highest first;
# point colour encodes the cell type's share of all cells and point size the mean cells per donor.
# scatterplot is already imported in the notebook's import cell, so it is not re-imported here.
# The style is applied first and the figure size is then passed explicitly, so a
# figure size defined by the style sheet (if any) cannot override the requested one.
with plt.style.context('seaborn-v0_8-talk'):
    fig, ax = plt.subplots(figsize=(15, 11), dpi=100)
    # name the index explicitly so the x column does not depend on whether earlier
    # merges kept or dropped the index name
    scatter_data = (props_df.sort_values('percent_aaf', ascending=False)
                    .rename_axis('cell_type').reset_index())
    scatterplot(data=scatter_data, x='cell_type', y='percent_aaf',
                hue='percent_of_total_cells', size='mean_cell_per_donor',
                palette='colorblind', ax=ax)
    ax.legend(bbox_to_anchor=(1.3, 1), loc='upper right', borderaxespad=0, fontsize=8)
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_title(f'% of {modality} features that are age associated ')
    ax.set_xlabel('Cell types')
    ax.set_ylabel('% of features')
    # lay out only after title and labels exist so they are included in the fit
    fig.tight_layout()
    fig.savefig(fig_filename)
    plt.show()

### save the summary table

In [None]:
# write the per-cell-type summary table (index = cell type) next to the figures
props_df.to_csv(out_filename)

In [None]:
# print the current date/time via the shell (last cell of the notebook)
!date