## Notebook to filter out statistically significant results that differ substantially between the GLM and RLM linear regressions

This is used as a method to remove outlier-driven results from the GLM results while keeping the interpretability of the GLM coefficients.

Filter out results where:
- the result is not statistically significant in both result sets
- the effect direction is not consistent between result sets
- the difference in effect between the result sets is an outlier

In [None]:
# print the current date and time via the shell
!date

#### import libraries

In [None]:
from pandas import read_csv
from random import choice
from scipy import stats
from seaborn import displot

#### set notebook variables

In [None]:
# parameters
# names of the two regression result sets being compared
GENERAL_TYPE = 'glm_tweedie'
ROBUST_TYPE = 'rlm'
# data modality to process: 'GEX' or 'ATAC'
modality = 'GEX'
# cell-type resolution: 'curated_type' for broad and 'cluster_name' for specific
category = 'curated_type'

In [None]:
# parameters
project = 'aging_phase2'
# translate the cell-type category into the prefix used in the result file names
if category == 'curated_type':
    prefix_type = 'broad'
elif category == 'cluster_name':
    prefix_type = 'specific'
else:
    # stop here with a clear message; otherwise prefix_type stays undefined
    # and the file names below fail with a NameError
    raise ValueError(f"unsupported category '{category}'; "
                     "expected 'curated_type' or 'cluster_name'")

# directories
wrk_dir = '/labshare/raph/datasets/adrd_neuro/brain_aging/phase2'
results_dir = f'{wrk_dir}/results'
figures_dir = f'{wrk_dir}/figures'

# in files
# note: only the general model file name carries the '_fdr' suffix
general_fdr_file = f'{results_dir}/{project}.{modality}.{prefix_type}.{GENERAL_TYPE}_fdr.age.csv'
robust_fdr_file = f'{results_dir}/{project}.{modality}.{prefix_type}.{ROBUST_TYPE}.age.csv'

# out files
results_file = f'{results_dir}/{project}.{modality}.{prefix_type}.{GENERAL_TYPE}_fdr_filtered.age.csv'

# constants
# when True, cells display random sample rows of intermediate tables
DEBUG = False
# absolute z-score limit applied to the difference between model effects
MAX_Z = 3

### load data

In [None]:
%%time
general_results = read_csv(general_fdr_file)
robust_results = read_csv(robust_fdr_file)
print(f'shape of general results {general_results.shape}')
print(f'shape of robust results {robust_results.shape}')
if DEBUG:
    display(general_results.sample(4))
    display(robust_results.sample(4))

#### filter the robust results based on nominal alpha

In [None]:
# retain only the robust results with a p-value at or below 0.05
is_nominal = robust_results['p-value'].le(0.05)
robust_results = robust_results[is_nominal]
print(f'shape of robust results {robust_results.shape}')
if DEBUG:
    display(robust_results.sample(4))

### create index/key for result pairs; i.e. feature-tissue

In [None]:
# build a 'feature_tissue' key so rows of the two result sets can be matched.
# assign() returns a new frame instead of writing into the existing one;
# robust_results is a boolean-filtered subset at this point, so an in-place
# column assignment would raise pandas' SettingWithCopyWarning
general_results = general_results.assign(
    pair=general_results.feature + '_' + general_results.tissue)
robust_results = robust_results.assign(
    pair=robust_results.feature + '_' + robust_results.tissue)

### find intersect of just the statistically significant results

In [None]:
# pairs present in both the general results and the filtered robust results
pair_intersect = set(general_results.pair) & set(robust_results.pair)
# guard the ratio so an empty general result table does not raise ZeroDivisionError
n_general = general_results.shape[0]
fraction_found = len(pair_intersect) / n_general if n_general else 0.0
# label the number as a percentage so the printed value is unambiguous
print(f'{len(pair_intersect)} found in both which is {fraction_found * 100:.2f}% of general results')

#### see an example of an excluded result

In [None]:
# pairs in the general results with no match among the filtered robust results
not_found = set(general_results.pair) - set(robust_results.pair)
if not_found:
    sampled_item = choice(list(not_found))
    # select rows by the pair key itself instead of splitting it on '_':
    # a feature or tissue name containing an underscore yields more than two
    # parts and the two-way unpack would raise ValueError
    general_rows = general_results.loc[general_results.pair == sampled_item]
    feature, tissue = general_rows.iloc[0][['feature', 'tissue']]
    print(sampled_item, feature, tissue)
    print(GENERAL_TYPE)
    display(general_rows)
    print(ROBUST_TYPE)
    # empty by construction: the sampled pair is absent from the robust results
    display(robust_results.loc[robust_results.pair == sampled_item])
else:
    print('none found')

In [None]:
# spot check one hard-coded feature/tissue combination in both result sets
feature, tissue = 'HSPD1', 'Astro'
print(feature, tissue)
for _results in (general_results, robust_results):
    _matches = (_results.feature == feature) & (_results.tissue == tissue)
    display(_results.loc[_matches])

#### filter on this intersect

In [None]:
# keep the general results whose pair key is also in the robust results
in_both = general_results.pair.isin(pair_intersect)
filtered_results = general_results[in_both]
print(f'shape of general results {filtered_results.shape}')
if DEBUG:
    display(filtered_results.sample(4))

### find intersect based on consistent direction of effect

In [None]:
# place each general result next to its robust counterpart; columns shared by
# both tables get a model-specific suffix
model_suffixes = (f'_{GENERAL_TYPE}', f'_{ROBUST_TYPE}')
merged = filtered_results.merge(robust_results, on='pair', how='left',
                                suffixes=model_suffixes)
print(merged.shape)
# a non-negative product means the two coefficients do not have opposite signs
coef_product = merged[f'coef_{GENERAL_TYPE}'] * merged[f'coef_{ROBUST_TYPE}']
kept = merged.loc[coef_product >= 0]
print(kept.shape)
if DEBUG:
    for _frame in (merged, kept):
        display(_frame.sample(5))

#### see an example of an excluded result

In [None]:
# pairs dropped because the two models disagree on the direction of effect
not_found = set(filtered_results.pair) - set(kept.pair)
if not_found:
    sampled_item = choice(list(not_found))
    # the pair key is joined with '_', so splitting on ':' never produces two
    # parts and the unpack raised ValueError; select rows by the key instead
    general_rows = general_results.loc[general_results.pair == sampled_item]
    feature, tissue = general_rows.iloc[0][['feature', 'tissue']]
    print(sampled_item, feature, tissue)
    print(GENERAL_TYPE)
    display(general_rows)
    print(ROBUST_TYPE)
    display(robust_results.loc[robust_results.pair == sampled_item])
else:
    print('none found')

#### filter out the effects that differ in direction

In [None]:
# restrict the general results to pairs that passed the direction check
direction_ok = filtered_results.pair.isin(kept.pair)
filtered_results = filtered_results[direction_ok]
print(f'shape of general results {filtered_results.shape}')
if DEBUG:
    display(filtered_results.sample(4))

### find results where the difference in effect between the two models is an outlier relative to the rest

In [None]:
# difference between the general and robust coefficients for every pair
effect_delta = kept[f'coef_{GENERAL_TYPE}'] - kept[f'coef_{ROBUST_TYPE}']
# assign() builds a new frame; kept is a filtered subset of merged, so writing
# columns into it directly would raise pandas' SettingWithCopyWarning
kept = kept.assign(effect_delta=effect_delta,
                   effect_delta_z=stats.zscore(effect_delta))
print(kept.shape)
if DEBUG:
    display(kept.sample(5))
    display(kept.effect_delta_z.describe())

In [None]:
# plot a kernel density estimate of the effect-difference z-scores
displot(kept.effect_delta_z, kind='kde')

In [None]:
# preview the first rows whose absolute z-score exceeds MAX_Z
exceeds_max = kept.effect_delta_z.abs() > MAX_Z
kept[exceeds_max].head()

#### filter the effect outliers

In [None]:
# keep rows whose absolute z-score is strictly below MAX_Z; a value exactly
# equal to MAX_Z is dropped as well
within_limit = kept.effect_delta_z.abs() < MAX_Z
kept = kept[within_limit]
print(kept.shape)
if DEBUG:
    display(kept.sample(5))

In [None]:
# restrict the general results to pairs that survived the outlier filter
not_outlier = filtered_results.pair.isin(kept.pair)
filtered_results = filtered_results[not_outlier]
print(f'shape of general results {filtered_results.shape}')
if DEBUG:
    display(filtered_results.sample(4))

### save the filtered results

In [None]:
%%time
# drop the temp pair column that was added before write
filtered_results.drop(columns=['pair']).to_csv(results_file, index=False)

In [None]:
# print the current date and time via the shell
!date