## Notebook that pulls together known subject and sample covariates, cleans them up as necessary, and writes them to a single file

In [None]:
# shell escape: print the current date/time into the notebook output
!date

#### import libraries and set notebook variables

In [None]:
import pandas as pd
import ppscore as pps
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

In [None]:
# notebook parameters: the cohort name used when building paths and file names,
# and the abbreviation matched against the sample info's 'cohort' column
cohort, cohort_abrv = 'pdbp', 'PD'

In [None]:
# constants
# a covariate missing in more than this fraction of the samples gets dropped
max_cov_missing_rate = 0.5

# directory layout, each level built from the one above it
home_dir = '/home/jupyter'
wrk_dir = f'{home_dir}/{cohort}'
info_dir = f'{wrk_dir}/sample_info'

# inputs: the sample info csv and this cohort's genotype PCA eigenvector file
samples_covs_files = f'{home_dir}/amppd/sample_info/amppd_rna_sample_info.csv'
genos_pca_file = f'{info_dir}/{cohort}.freeze9.pca.eigenvec'

# output: the per-cohort covariates csv written near the end of the notebook
assay_covs_files = f'{info_dir}/{cohort}_rna_sample_info.csv'

#### load sample info

In [None]:
# read the sample info table, using its first column as the index
info_df = pd.read_csv(samples_covs_files, index_col=0)

# keep only the rows that belong to the cohort being processed
in_cohort = info_df['cohort'].eq(cohort_abrv)
info_df = info_df.loc[in_cohort]

# quick sanity checks: table size, then the cohort and visit tallies
print(info_df.shape)
for tally_col in ('cohort', 'visit'):
    print(info_df[tally_col].value_counts())

#### load and merge in the genetics PCs

In [None]:
# The eigenvec file is whitespace-delimited. The separator is a regex, so it is
# written as a raw string: a plain '\s+' is an invalid Python string escape and
# triggers a SyntaxWarning on recent Python versions. The file's second column
# becomes the index.
genetic_components_df = pd.read_csv(genos_pca_file, sep=r'\s+', index_col=1)
# '#FID' is not used downstream; drop it by reassignment rather than in place
genetic_components_df = genetic_components_df.drop(columns=['#FID'])
print(genetic_components_df.shape)

# left-join the PCs onto the sample info by matching 'wgsid' against the PCA
# index, so samples with no matching PCA row are kept (their PCs will be NaN)
info_df = info_df.merge(genetic_components_df, how='left', left_on='wgsid', right_index=True)
print(info_df.shape)
display(info_df.head())

In [None]:
# show the column labels of the merged sample info + genetic PCs table
info_df.columns

#### check the dtypes and fix as necessary

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
# The 'display.max_rows' option does not influence info(); verbose=True is what
# forces the full per-column listing (dtype and non-null information).
info_df.info(verbose=True)

#### not every expression file has WGS, so some samples may not have genetic PCs; fill those with the mean
all the other covariates have already had their missing values filled
fill the PCs with their means

In [None]:
# Fill missing genetic PCs with the per-column mean over this cohort's samples.
# The filled values are assigned back explicitly: calling
# info_df[col].fillna(..., inplace=True) operates on a temporary Series, which
# emits a FutureWarning in pandas 2.x and leaves info_df unchanged under
# Copy-on-Write.
pc_cols = genetic_components_df.columns
info_df[pc_cols] = info_df[pc_cols].fillna(info_df[pc_cols].mean())
print(info_df.shape)

In [None]:
# Re-check dtypes and non-null information after the fill above.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
# The 'display.max_rows' option does not influence info(); verbose=True is what
# forces the full per-column listing.
info_df.info(verbose=True)

#### get rid of the columns that have a single value or a lot of missingness

In [None]:
# Flag covariates that carry no usable information:
#   - missing in more than max_cov_missing_rate of the samples
#   - a single distinct value across all samples (NaN counts as a value)
#   - object dtype with a different value for every sample
cols_to_drop = []
n_samples = info_df.shape[0]
for this_col in info_df.columns:
    try:
        col_values = info_df[this_col]
        percent_miss = col_values.isna().sum() / n_samples
        if percent_miss > max_cov_missing_rate:
            drop_col = True
        else:
            total_unique = len(col_values.unique())
            drop_col = total_unique == 1 or (total_unique == n_samples
                                             and col_values.dtype == 'object')
    except Exception as err:
        # best effort: a column that cannot be evaluated is still dropped, but
        # the reason is reported instead of silently swallowed; catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through
        warnings.warn(f'dropping {this_col}, could not evaluate it: {err!r}')
        drop_col = True

    if drop_col:
        cols_to_drop.append(this_col)

print(cols_to_drop)

In [None]:
# tally the number of samples for each distinct 'visit_month' value
info_df['visit_month'].value_counts()

In [None]:
# 'visit' and 'wgsid' must survive even when they were flagged above
# (e.g. a cohort with only one visit has a constant 'visit' column),
# so take them back off the drop list by hand
for keep_col in ('visit', 'wgsid'):
    if keep_col in cols_to_drop:
        cols_to_drop.remove(keep_col)

In [None]:
# show any samples that have no WGS id, i.e. the ones for which no
# genetic PCs could have been computed
lacks_wgsid = info_df['wgsid'].isna()
info_df.loc[lacks_wgsid]

#### if those columns look useless, drop them

In [None]:
# remove the flagged covariates and report the resulting table size
info_df = info_df.drop(columns=cols_to_drop)
print(info_df.shape)

#### save the complete covariates file

In [None]:
# write the cleaned covariates table to the cohort's output csv path
info_df.to_csv(assay_covs_files)

#### take a look to see how correlated or predictive the covariates are and visualize

In [None]:
# compute the predictive power score (ppscore) table for the covariates
matrix_df = pps.matrix(info_df)
# keep only the pairs with a positive score; .copy() makes the filtered result
# an independent frame so the column assignment below does not raise
# SettingWithCopyWarning
matrix_df = matrix_df.loc[matrix_df['ppscore'] > 0].copy()
print(matrix_df.shape)

# round for readable heatmap annotations, then reshape into a grid with
# 'y' values as rows and 'x' values as columns
matrix_df['ppscore'] = matrix_df['ppscore'].round(2)
plot_matrix_df = matrix_df[['x', 'y', 'ppscore']].pivot(columns='x', index='y', values='ppscore')
print(plot_matrix_df.shape)
display(plot_matrix_df)

In [None]:
# render the score grid as an annotated heatmap on a large square canvas,
# with the colour scale fixed to the 0..1 range
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(plot_matrix_df, ax=ax, vmin=0, vmax=1, cmap="Blues",
            linewidths=0.05, annot=True, annot_kws={"fontsize": 10})
plt.show()