# Visualize SNCA expression and predicted neutrophil cell fraction

In [None]:
# print the current date/time into the notebook output (shell `date` run through IPython's `!` escape)
!date

#### import libraries

In [None]:
from pandas import read_csv
from numpy import log2
from seaborn import scatterplot, lmplot
import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context

import warnings
# silence every warning category for the remainder of the session
# NOTE(review): this blanket filter also hides deprecation and numerical
# (e.g. divide-by-zero) warnings — confirm that is intended
warnings.filterwarnings('ignore')

# render matplotlib figures inline in the notebook output
%matplotlib inline
# for white background of figures (only for docs rendering)
%config InlineBackend.print_figure_kwargs={'facecolor' : "w"}
# use the 'retina' format for inline figures
%config InlineBackend.figure_format='retina'

#### set notebook variables

In [None]:
# naming
set_name = 'amppd'
gene = 'SNCA'
gene_id = 'ENSG00000145335.15'

# directories
# NOTE(review): absolute path tied to one machine; change wrk_dir when running elsewhere
wrk_dir = '/labshare/raph/datasets/amppd'
info_dir = f'{wrk_dir}/info'
quants_dir = f'{wrk_dir}/quants'

# in files
quants_file = f'{quants_dir}/matrix.genes.tsv'
info_file = f'{info_dir}/amp_pd_case_control.csv'
mutation_status_file = f'{info_dir}/amp_pd_participant_mutations.csv'
cell_fracs_file = f'{wrk_dir}/deconvolution/automl_tables/amppd.wb.pred_cell_fracs.csv'

# variables and constants
DEBUG = True
dpi_value = 100

if DEBUG:
    # echo the resolved input paths so a wrong location is spotted before loading
    print(f'quants_file is {quants_file}')
    print(f'info_file is {info_file}')
    print(f'mutation_status_file is {mutation_status_file}')
    print(f'cell_fracs_file is {cell_fracs_file}')

## load the inputs

### load the sample/subject information

In [None]:
# subject-level information; the first CSV column becomes the index
info_df = read_csv(info_file, index_col=0)
print(f'shape of info_df is {info_df.shape}')
if DEBUG:
    display(info_df.head())
    # tabulate the levels of the two status columns
    for count_col in ('diagnosis_latest', 'case_control_other_latest'):
        display(info_df[count_col].value_counts())

### load the subject mutation status from WGS

In [None]:
def _short_mutation_name(col_name):
    """Shorten a mutation-status column name by removing the WGS boilerplate."""
    # replacement order matters: '_mutation_in_WGS' has to be handled before
    # the shorter '_in_WGS' suffix it contains
    return (col_name.replace('has_known_', '')
                    .replace('_mutation_in_WGS', '+')
                    .replace('_in_WGS', '+'))

mut_df = read_csv(mutation_status_file, index_col=0)
print(f'mut_df shape is {mut_df.shape}')
# simplify the column names
mut_df = mut_df.rename(columns=_short_mutation_name)
if DEBUG:
    display(mut_df.head())
    # per-column tally of each status value
    display(mut_df.apply(lambda status_col: status_col.value_counts()))

### load the gene quantifications

In [None]:
%%time
quants_df = read_csv(quants_file, index_col=0, sep='\t').transpose()
print(f'shape of quants_df is {quants_df.shape}')
if DEBUG:
    display(quants_df.head())

### load the predicted whole-blood cell-type fractions

In [None]:
# predicted cell-type fractions; the first CSV column becomes the index
fracs_df = read_csv(filepath_or_buffer=cell_fracs_file, index_col=0)
print(f'shape of fracs_df is {fracs_df.shape}')
if DEBUG:
    display(fracs_df.head())

## merge the gene of interest's quantification with the predicted cell-fractions

In [None]:
# keep only the gene of interest (as a one-column frame) and join it to the
# cell fractions on the shared index; 'inner' keeps rows present in both
gene_quants = quants_df[[gene_id]]
fq_df = fracs_df.merge(gene_quants, left_index=True, right_index=True, how='inner')
print(f'shape of fq_df is {fq_df.shape}')
if DEBUG:
    display(fq_df.head())

## merge the mutation info

### to merge on subject, first split the subject ID out of the sample ID

In [None]:
# split each index value on '-'; expand=True gives one level per piece
# NOTE(review): the three column names below assume every ID has exactly three pieces — confirm
id_parts = fq_df.index.str.split('-', expand=True)
ids_df = id_parts.to_frame()
ids_df.columns = ['group', 'individual', 'visit']
# the subject ID is the first two pieces re-joined
ids_df['subject'] = ids_df['group'] + '-' + ids_df['individual']
if DEBUG:
    display(ids_df.head())

# assign by position (.values): ids_df rows are in the same order as fq_df rows
for id_col in ('subject', 'visit'):
    fq_df[id_col] = ids_df[id_col].values
print(f'new shape of fq_df is {fq_df.shape}')
if DEBUG:
    display(fq_df.head())

### now we can actually merge the additional subject info

In [None]:
# both tables are joined the same way: left join (every row of fq_df is kept),
# matching the 'subject' column against the other table's index
subject_merge_args = dict(how='left', left_on='subject', right_index=True)
merged_df = (fq_df.merge(info_df, **subject_merge_args)
                  .merge(mut_df, **subject_merge_args))
print(f'shape of merged_df is {merged_df.shape}')
if DEBUG:
    display(merged_df.head())

## log the gene's counts

In [None]:
# log2-transform the gene's counts.
# The untransformed values are kept in a separate column and the transform is
# always computed from that column, so re-running this cell cannot take the
# log of already-logged values.
raw_col = f'{gene_id}_raw'
if raw_col not in merged_df.columns:
    merged_df[raw_col] = merged_df[gene_id]
# NOTE: a count of exactly 0 becomes -inf under log2
merged_df[gene_id] = log2(merged_df[raw_col])

## generate the plot

In [None]:
def frac_gene_scatter(df, feature_id, feature_name, cell_name, hue_cat, dpi_val: int=100,
                      random_state=None):
    """Plot a feature against a cell fraction with one regression line per hue level.

    Draws a seaborn ``lmplot`` of ``df[feature_id]`` (y) versus ``df[cell_name]``
    (x), coloured and fitted separately for each level of ``df[hue_cat]``, then
    shows the figure. ``df`` itself is not modified.

    Parameters
    ----------
    df : DataFrame holding the ``cell_name``, ``feature_id`` and ``hue_cat`` columns.
    feature_id : column name of the y values.
    feature_name : display name used for the y-axis label and the title.
    cell_name : column name of the x values; also used as the x-axis label.
    hue_cat : column name of the grouping variable.
    dpi_val : figure resolution applied while plotting.
    random_state : optional seed for the row shuffle; the default (None) gives
        a different draw order on each call.

    Returns
    -------
    None
    """
    # Rows whose x or y is missing or infinite cannot contribute to a regression
    # fit; infinite values are not treated as missing by default, so they are
    # removed here explicitly.
    xy_vals = df[[cell_name, feature_id]].replace([float('inf'), float('-inf')], float('nan'))
    plot_df = df.loc[xy_vals.notna().all(axis=1)]
    n_excluded = len(df) - len(plot_df)
    if n_excluded > 0:
        print(f'excluded {n_excluded} of {len(df)} rows with missing or infinite '
              f'{cell_name}/{feature_id} values')

    # rc_context restores the rcParams on exit, so the style applied inside it
    # does not leak into later cells. The figure size is controlled by lmplot's
    # `height`, so no figsize entry is set here.
    with rc_context({'figure.dpi': dpi_val}):
        plt.style.use('seaborn-v0_8-poster')
        # shuffle the rows so that no hue level is systematically drawn on top
        grid = lmplot(x=cell_name,
                      y=feature_id, hue=hue_cat,
                      height=12,
                      data=plot_df.sample(frac=1, random_state=random_state))
        # label through the returned grid rather than pyplot's "current axes"
        grid.set_axis_labels(cell_name, feature_name)
        grid.ax.set_title(f'{feature_name} quantification and {cell_name} fraction with {hue_cat} interaction',
                          fontsize='large')
        plt.show()

In [None]:
# gene vs predicted neutrophil fraction, grouped by the case_control_other_latest column
frac_gene_scatter(df=merged_df, feature_id=gene_id, feature_name=gene,
                  cell_name='predicted_Neutrophils',
                  hue_cat='case_control_other_latest', dpi_val=50)

In [None]:
# gene vs predicted neutrophil fraction, grouped by the GBA+ column
frac_gene_scatter(df=merged_df, feature_id=gene_id, feature_name=gene,
                  cell_name='predicted_Neutrophils',
                  hue_cat='GBA+', dpi_val=50)

In [None]:
# gene vs predicted neutrophil fraction, grouped by the LRRK2+ column
frac_gene_scatter(df=merged_df, feature_id=gene_id, feature_name=gene,
                  cell_name='predicted_Neutrophils',
                  hue_cat='LRRK2+', dpi_val=50)

In [None]:
# gene vs predicted neutrophil fraction, grouped by the SNCA+ column
frac_gene_scatter(df=merged_df, feature_id=gene_id, feature_name=gene,
                  cell_name='predicted_Neutrophils',
                  hue_cat='SNCA+', dpi_val=50)

In [None]:
# gene vs predicted neutrophil fraction, grouped by the APOE_E4+ column
frac_gene_scatter(df=merged_df, feature_id=gene_id, feature_name=gene,
                  cell_name='predicted_Neutrophils',
                  hue_cat='APOE_E4+', dpi_val=50)

In [None]:
# gene vs predicted neutrophil fraction, grouped by the PD_Mutation+ column
frac_gene_scatter(df=merged_df, feature_id=gene_id, feature_name=gene,
                  cell_name='predicted_Neutrophils',
                  hue_cat='PD_Mutation+', dpi_val=50)