#### Notebook to compare the significant independent signals for a cohort by visit

In [None]:
!date

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import concurrent.futures

In [None]:
# parameters
# cohort name; used as the working sub-directory name and as the file-name prefix
cohort = 'ppmi'
# visit months to compare; each is appended to the tissue code to form a per-visit build name
months = [0, 6 , 12, 24, 36]
# tissue code used in file names (presumably 'wb' = whole blood -- TODO confirm)
tissue = 'wb'

In [None]:
# naming

# directories
home_dir = '/home/jupyter'
# per-cohort working directory; the input and output locations below live under it
wrk_dir = f'{home_dir}/{cohort}'
# the per-chromosome cis QTL pair parquet files are read from here
tensorqtl_dir = f'{wrk_dir}/tensorqtl'
# the per-visit independent-signal CSVs are read from here and the combined CSV is written here
results_dir = f'{wrk_dir}/results'

# input files

# out files
# combined table, across all visits, of the statistics for the independent cis pairs
indep_results_file = f'{results_dir}/{cohort}.{tissue}.cis.csv'

# constants
# chromosome labels '1' through '22', used to build the per-chromosome result file names
autosomes = [str(x) for x in range(1, 23)]
# IPython shell capture of `nproc`; the first output line is parsed as the processor count
capture_out = !(nproc)
max_threads = int(capture_out[0])

#### load the results

In [None]:
# Load the independent-signal table for every visit month and stack them into indep_df.
# The frames are collected and concatenated once at the end rather than re-concatenating
# the growing frame on every pass (which re-copies all earlier rows each time).
month_frames = []
total_rows = 0
seen_columns = set()
for month in months:
    cohort_build = f'{cohort}.{tissue}{month}'
    cis_indep_file = f'{results_dir}/{cohort_build}.cis.indep.csv'
    this_df = pd.read_csv(cis_indep_file, index_col=0)
    gene_cnt = len(this_df['phenotype_id'].unique())
    print(f'month {month} shape is {this_df.shape} for {gene_cnt} genes')
    # tag each row with its visit so rows stay distinguishable after stacking
    this_df['month'] = month
    month_frames.append(this_df)
    # running shape of the eventual concat: rows add up, columns are the union seen so far
    total_rows += len(this_df)
    seen_columns.update(this_df.columns)
    print(f'after month {month} total shape {(total_rows, len(seen_columns))}')
# NOTE: raises ValueError if `months` is empty (nothing to concatenate)
indep_df = pd.concat(month_frames)

In [None]:
# sanity check: combined dimensions plus five randomly sampled rows
print(indep_df.shape)
display(indep_df.sample(5))

#### make a key for the variant/pheno pair

In [None]:
# 'phenotype_id:variant_id' string that identifies a phenotype/variant pair regardless of visit
indep_df['cispair'] = indep_df['phenotype_id'] + ':' + indep_df['variant_id']
print(indep_df.shape)
display(indep_df.sample(5))

In [None]:
# count the rows per cis pair across the combined visits, then tally how many pairs share each count
indep_df['cispair'].value_counts().value_counts()

In [None]:
# number of independent-signal rows contributed by each visit month
indep_df['month'].value_counts()

#### what is the max p-value in the independent signals

In [None]:
# largest nominal p-value among the independent signals; used later in the notebook as the
# cutoff when flagging a result as significant
max_pvalue = indep_df['pval_nominal'].max()
print(f'max nominal pvalue {max_pvalue}')

#### now that all the cispairs of interest are known get complete data for these

In [None]:
def load_missing_qtl_results(find_items, in_file, month):
    eqtl_df = pd.read_parquet(in_file)
    eqtl_df['cispair'] = eqtl_df['phenotype_id'] + ':' + eqtl_df['variant_id']
    found_df = eqtl_df.loc[eqtl_df['cispair'].isin(find_items)].copy()
    found_df['month'] = month
    return found_df

In [None]:
%%time
# Read every per-chromosome pairs file for every visit in parallel, keeping only the
# cis pairs that appear in the independent signals.
# De-duplicate the keys once up front so every worker filters against the same compact array.
wanted_pairs = indep_df['cispair'].unique()
fs_list = []
with concurrent.futures.ThreadPoolExecutor(max_workers=max_threads*2) as tpe:
    for month in months:
        cohort_build = f'{cohort}.{tissue}{month}'
        for chrom in autosomes:
            this_result_file = f'{tensorqtl_dir}/{cohort_build}.cis_qtl_pairs.chr{chrom}.parquet'
            fs_list.append(tpe.submit(load_missing_qtl_results, wanted_pairs,
                                      this_result_file, month))
# Leaving the with-block waits for all tasks to finish. Collect in submission order
# (month, then chromosome) so the combined row order is the same on every run;
# as_completed would yield them in an arbitrary order.
lm_results = [future.result() for future in fs_list]

In [None]:
# combine the read results
# stack the per-file matches into one frame; each piece keeps its own row index
results_df = pd.concat(lm_results)
print(results_df.shape)
# display(results_df.sample(10))

In [None]:
# %%time
# results_df = None
# for month in months:
#     cohort_build = f'{cohort}.{tissue}{month}'
#     for chrom in autosomes:
#         this_result_file = f'{tensorqtl_dir}/{cohort_build}.cis_qtl_pairs.chr{chrom}.parquet'
#         chrom_eqtl_df = pd.read_parquet(this_result_file)
#         chrom_eqtl_df['cispair'] = chrom_eqtl_df['phenotype_id'] + ':' + chrom_eqtl_df['variant_id']
#         this_keep_df = chrom_eqtl_df.loc[chrom_eqtl_df['cispair'].isin(indep_df['cispair'])].copy()
#         this_keep_df['month'] = month
#         results_df = pd.concat([results_df, this_keep_df])
#     print(f'{month} total shape {results_df.shape}')

In [None]:
# count the rows found per cis pair, then tally how many pairs share each count
results_df['cispair'].value_counts().value_counts()

In [None]:
# number of result rows found for each visit month
results_df['month'].value_counts()

In [None]:
# derived statistics for comparing signal strength across visits
# -log10 of the nominal p-value, so larger values mean smaller p-values
results_df['log10_pvalue'] = np.log10(results_df['pval_nominal'])*-1
# effect estimate divided by its standard error
results_df['t_score'] = results_df['slope']/results_df['slope_se']
results_df['t_score_abs'] = np.abs(results_df['t_score'])
# flag results whose p-value is no larger than max_pvalue; the comparison is inclusive
# so that the independent signal that defines max_pvalue is itself flagged
results_df['is_sig'] = np.where(results_df['pval_nominal'] <= max_pvalue, 1, 0)

In [None]:
# spot-check five random rows, including the derived columns
results_df.sample(5)

In [None]:
# how many rows are flagged significant (1) versus not (0)
results_df['is_sig'].value_counts()

#### save the combined results

In [None]:
# write the combined table; the row index is omitted from the file
results_df.to_csv(indep_results_file, index=False)

In [None]:
# per cis pair, the number of rows flagged significant
temp = results_df.groupby('cispair')['is_sig'].sum()
# first few rows that are not flagged significant
not_sig_mask = results_df['is_sig'] == 0
display(results_df[not_sig_mask].head())
# how many pairs have each significant-row count
display(temp.value_counts())

In [None]:
# first few cis pairs with no rows flagged significant
temp[temp == 0].head()

In [None]:
# for each cis pair sum the significant flags, then tally how many pairs share each sum
results_df.groupby(['cispair'])['is_sig'].agg('sum').value_counts()

#### excluding the largest powered group (month 0), does any visit stand out

In [None]:
# result rows per visit month, leaving out month 0
results_df.loc[results_df['month'] != 0]['month'].value_counts()

In [None]:
# leaving out month 0: sum the significant flags per cis pair, then tally how many pairs share each sum
results_df.loc[results_df['month'] != 0].groupby(['cispair'])['is_sig'].agg('sum').value_counts()

#### annotate direction of effect for plotting

In [None]:
# label each result by the sign of its slope for use as the plot hue;
# slopes that are not greater than zero are labelled 'Decrease'
results_df['Direction'] = np.where(results_df['slope'] > 0,
                                   'Increase', 'Decrease')
display(results_df.sample(5))

#### do some quick plotting

In [None]:
# scatter with a fitted regression line per Direction: -log10 p-value against visit month
sns.lmplot(x='month', y='log10_pvalue', hue='Direction', 
           data=results_df, palette='Set1')

In [None]:
# scatter with a fitted regression line per Direction: signed t-score against visit month
sns.lmplot(x='month', y='t_score', hue='Direction', 
           data=results_df, palette='Set1')

In [None]:
# scatter with a fitted regression line per Direction: slope against visit month
sns.lmplot(x='month', y='slope', hue='Direction', 
           data=results_df, palette='Set1')

In [None]:
# scatter with a fitted regression line per Direction: absolute t-score against visit month
sns.lmplot(x='month', y='t_score_abs', hue='Direction', 
           data=results_df, palette='Set1')

In [None]:
# scatter with a fitted regression line per Direction: the 'af' column against visit month
# (presumably allele frequency -- TODO confirm)
sns.lmplot(x='month', y='af', hue='Direction', 
           data=results_df, palette='Set1')

In [None]:
# scatter with a fitted regression line per Direction: the 'ma_count' column against visit month
# (presumably minor allele count -- TODO confirm)
sns.lmplot(x='month', y='ma_count', hue='Direction', 
           data=results_df, palette='Set1')

In [None]:
# scatter with a fitted regression line per Direction: the 'tss_distance' column against visit month
sns.lmplot(x='month', y='tss_distance', hue='Direction', 
           data=results_df, palette='Set1')

In [None]:
# line plot per Direction of the signed t-score across visit months
sns.relplot(x='month', y='t_score', kind='line', hue='Direction',
            data=results_df, palette='Set1')

In [None]:
# line plot per Direction of the absolute t-score across visit months
sns.relplot(x='month', y='t_score_abs', kind='line', hue='Direction',
            data=results_df, palette='Set1')

In [None]:
# line plot per Direction of the slope across visit months
sns.relplot(x='month', y='slope', kind='line', hue='Direction',
            data=results_df, palette='Set1')

In [None]:
# line plot per Direction of the -log10 p-value across visit months
sns.relplot(x='month', y='log10_pvalue', kind='line', hue='Direction',
            data=results_df, palette='Set1')

In [None]:
# -log10 p-value against the 'tss_distance' column, coloured by Direction
sns.relplot(x='tss_distance', y='log10_pvalue',hue='Direction',
            data=results_df, palette='Set1')

In [None]:
# -log10 p-value against the 'tss_distance' column, coloured by visit month;
# sample(frac=1) shuffles the rows first (presumably so that no single month is
# always drawn on top -- TODO confirm)
sns.scatterplot(x='tss_distance', y='log10_pvalue', hue='month',
                data=results_df.sample(frac=1), palette='Set1')

In [None]:
# signed t-score against the 'tss_distance' column, coloured by Direction
sns.relplot(x='tss_distance', y='t_score',hue='Direction',
            data=results_df, palette='Set1')

In [None]:
# signed t-score against the 'tss_distance' column, coloured by visit month;
# sample(frac=1) shuffles the rows first (presumably so that no single month is
# always drawn on top -- TODO confirm)
sns.relplot(x='tss_distance', y='t_score',hue='month',
            data=results_df.sample(frac=1), palette='Set1')