In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import ldspec
import time
import pickle

%load_ext autoreload
%autoreload 2

In [2]:
# dic_config_ld
# Build the LD-dependent architecture coefficients (`dic_config_ld`) and the
# all-zero MAF-bin coefficients (`dic_config_mbin`) that are written into every
# simulation config file further down in the notebook.
df_annot = ldspec.util.read_annot(
    '/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001/baseline_annot/baseline_165annots_chr22.annot.gz'
)

# Starting effect for each continuous annotation; where both a `_common` and a
# `_lf` version exist they share the same starting value.
dic_tau_star = {
    'AN:alleleage_common' : -0.24,
    'AN:LLD_AFR_common' : -0.20,
    'AN:LLD_AFR_lf' : -0.20,
    'AN:recomb_rate_common' : -0.20,
    'AN:recomb_rate_lf' : -0.20,
    'AN:nucleotide_div_common' : -0.13,
    'AN:nucleotide_div_lf' : -0.13,
    'AN:Backgrd_Selection_Stat_common' : 0.11,
    'AN:Backgrd_Selection_Stat_lf' : 0.11,
    'AN:CpG_common' : 0.23,
    'AN:CpG_lf' : 0.23,
}

print('Rescale tau_star by SD and a factor such that h2 are positive')
# Boolean SNP masks: a SNP is selected when any of the corresponding MAF-bin
# annotation columns is non-zero for it.
ind_common = df_annot[['AN:mbin%d_common'%x for x in range(10)]].sum(axis=1).values>0
ind_lf = df_annot[['AN:mbin%d_lf'%x for x in range(5)]].sum(axis=1).values>0


def _ld_h2_profile(dic_coef):
    """Return the per-SNP sum over annotations of coefficient * annotation value.

    `dic_coef` maps annotation column names of `df_annot` to coefficients.
    """
    # Index with a list of column names: pandas >= 2.0 raises TypeError when a
    # dict is passed as a DataFrame indexer.
    return df_annot[list(dic_coef)].dot(list(dic_coef.values()))


def _quantile_group_means(v_h2, annot_name):
    """Return (avg_all, avg_q20, avg_q80) of `v_h2` for one annotation.

    SNPs are restricted to `ind_common` or `ind_lf` depending on the annotation
    name suffix. `avg_q20` / `avg_q80` are the means of `v_h2` over the selected
    SNPs whose annotation value is <= the 20% quantile / >= the 80% quantile
    (quantiles computed among the selected SNPs).
    """
    ind_select = ind_common if annot_name.endswith('_common') else ind_lf
    annot_values = df_annot[annot_name].values
    q20 = np.quantile(df_annot.loc[ind_select, annot_name], 0.2)
    q80 = np.quantile(df_annot.loc[ind_select, annot_name], 0.8)
    avg_all = v_h2[ind_select].mean()
    avg_q80 = v_h2[ind_select & (annot_values >= q80)].mean()
    avg_q20 = v_h2[ind_select & (annot_values <= q20)].mean()
    return avg_all, avg_q20, avg_q80


for AN in dic_tau_star:
    ind_select = ind_common if AN.endswith('_common') else ind_lf
    # Divide by the annotation SD among the selected SNPs, then by a fixed
    # factor of 20 (provisional scale; the final scale is chosen below).
    dic_tau_star[AN] = dic_tau_star[AN] / df_annot.loc[ind_select, AN].std() / 20
v_h2_ld = _ld_h2_profile(dic_tau_star)
print('    v_h2_ld    min=%0.3f, max=%0.3f' % (v_h2_ld.min(), v_h2_ld.max()))

enrich_max = 1.5 # maximum enrich of 1.5
scale_list = []
for AN in dic_tau_star:
    avg_all, avg_q20, avg_q80 = _quantile_group_means(v_h2_ld, AN)
    avg_qmax = max(avg_q20, avg_q80)
    # Scale s solving (1 + s*avg_qmax) / (1 + s*avg_all) == enrich_max, i.e. the
    # enrichment statistic printed below hits enrich_max for this annotation.
    scale_list.append( (enrich_max - 1) / (avg_qmax - avg_all * enrich_max))
# Scale at which the most negative per-SNP value becomes exactly -0.8.
scale_factor_legit = -0.8 / v_h2_ld.min()
print('    scale_factor_legit=%0.3f' % scale_factor_legit)
scale_factor_enrich = np.min(scale_list)
print('    scale_factor_enrich=%0.3f' % scale_factor_enrich)
# Use the more conservative of the two constraints.
scale_factor = min(scale_factor_enrich, scale_factor_legit)
print('    scale_factor=%0.3f' % scale_factor)

print('LD architecture')
dic_config_ld = {AN : dic_tau_star[AN] * scale_factor for AN in dic_tau_star}

# Recompute the per-SNP profile with the final coefficients and report, for each
# annotation, the enrichment of its bottom and top quintiles.
v_h2_ld = _ld_h2_profile(dic_config_ld)
print('    v_h2_ld    min=%0.3f, max=%0.3f' % (v_h2_ld.min(), v_h2_ld.max()))
for AN in dic_tau_star:
    avg_all, avg_q20, avg_q80 = _quantile_group_means(v_h2_ld, AN)
    print('    %-40s %0.3f, enrich_q20=%0.3f, enrich_q80=%0.3f' % (
        AN, dic_config_ld[AN], (avg_q20+1) / (avg_all+1), (avg_q80+1) / (avg_all+1),
    ))

# MAF-bin annotations are listed in the config with coefficient 0.
dic_config_mbin = {}
dic_config_mbin.update({'AN:mbin%d_common'%x : 0 for x in range(10)})
dic_config_mbin.update({'AN:mbin%d_lf'%x : 0 for x in range(5)})

# AN_list, pAN_list
# Collect the annotation names available to the simulations:
#   AN_list  - "AN:"-prefixed column names found in each `.annot.gz` file
#   pAN_list - annotation names of each `.pannot_mat.npz` file
reg_annot_list = [
    '/home/jz286/WES_analysis/LDSPEC/experiments/job.analysis_imp_geno_chimp/reg_annot_file/'
    'reg_annot_file.prox_gene_fct_all_ld.txt',
]

annot_file_list = []
AN_list = []
pAN_list = []

# Each regression-annotation file lists one annotation file path per line; skip
# paths that were already collected from an earlier list file.
for reg_annot_file in reg_annot_list:
    listed_paths = pd.read_csv(reg_annot_file, header=None)[0]
    annot_file_list.extend([x for x in listed_paths if x not in annot_file_list])

for annot_file in annot_file_list:
    annot_name = ldspec.util.get_annot_name_from_file(annot_file)
    if annot_file.endswith(".annot.gz"):
        # Only the header is needed, so read a few rows of the file obtained by
        # substituting '1' for the '@' placeholder in the path.
        temp_df = ldspec.util.read_annot(annot_file.replace('@', '1'), nrows=5)
        AN_list += [col for col in temp_df if col.startswith("AN:")]
    elif annot_file.endswith(".pannot_mat.npz"):
        pAN_list += [annot_name]

print('AN_list', len(AN_list))
print('pAN_list', len(pAN_list))

Rescale tau_star by SD and a factor such that h2 are positive
    v_h2_ld    min=-0.114, max=0.153
    scale_factor_legit=6.997
    scale_factor_enrich=19.680
    scale_factor=6.997
LD architecture
    v_h2_ld    min=-0.800, max=1.070
    AN:alleleage_common                      -0.094, enrich_q20=1.148, enrich_q80=0.808
    AN:LLD_AFR_common                        -0.085, enrich_q20=1.088, enrich_q80=0.952
    AN:LLD_AFR_lf                            -0.094, enrich_q20=1.070, enrich_q80=0.905
    AN:recomb_rate_common                    -0.022, enrich_q20=1.094, enrich_q80=0.867
    AN:recomb_rate_lf                        -0.023, enrich_q20=1.126, enrich_q80=0.841
    AN:nucleotide_div_common                 -0.024, enrich_q20=1.130, enrich_q80=0.853
    AN:nucleotide_div_lf                     -0.024, enrich_q20=1.125, enrich_q80=0.857
    AN:Backgrd_Selection_Stat_common         0.172, enrich_q20=0.916, enrich_q80=1.123
    AN:Backgrd_Selection_Stat_lf             0.158, enrich_q20

In [3]:
# Simulation configurations: dic_config maps a simulation name to a dict with
#   'basic'       -> (h2g, p_causal, alpha, LD-dependent flag)
#   'AN:...'      -> coefficient of a single-SNP annotation
#   'pAN:...'     -> coefficient of a SNP-pair annotation
dic_config = {}
alpha = -0.38
OUT_PATH = '/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001_chimp/simulation.100123'

# (key template, coefficient magnitude) for the SNP-pair annotations; '%s' is
# filled with the LD-range tag ('p0_p100' for pos_ld, 'n100_p0' for neg_ld).
_PANNOT_TEMPLATES = [
    ('pAN:proxy_0_100_ld_%s_maf_common_block', 0.5),
    ('pAN:proxy_0_100_ld_%s_maf_lf_block', 0.5),
    ('pAN:proxy_100_1000_ld_%s_maf_common_block', 0.3),
    ('pAN:proxy_100_1000_ld_%s_maf_lf_block', 0.3),
    ('pAN:SuperEnhancer_Hnisz_proxy_0_1000_ld_%s_maf_common_block', 0.3),
    ('pAN:SuperEnhancer_Hnisz_proxy_0_1000_ld_%s_maf_lf_block', 0.3),
]


def _pannot_coefs(ld_tag, sign):
    """Return {pAN name: sign * magnitude} for one LD-range tag, in template order."""
    return {template % ld_tag : sign * size for template, size in _PANNOT_TEMPLATES}


for h2g,p_causal in [[0.5, 0.2], [0.5, 0.1], [0.2, 0.2]]:
    # round() before '%d': the products are floats (0.2*100 == 20.000000000000004)
    # and '%d' truncates, so a value such as 0.29*100 would otherwise become 28.
    suffix = 'h2g%d_p%d' % (round(h2g*100), round(p_causal*100))

    # Entries shared by every simulation of this (h2g, p_causal) setting.
    base = {
        'basic' : (h2g, p_causal, alpha, True), # h2g, sparsity, alpha, LD-dependent
        'AN:all' : 1,
        'AN:SuperEnhancer_Hnisz_common' : 2,
    }

    # null: no SNP-pair terms.
    dic_config['null_' + suffix] = dict(base)

    # causal: negative coefficients on the pos_ld SNP-pair annotations only.
    dic_config['causal_' + suffix] = {**base, **_pannot_coefs('p0_p100', -1)}

    # The two extra architectures are only simulated for the default setting.
    if [h2g,p_causal] != [0.5, 0.2]:
        continue

    # causal_neg: negative coefficients on both pos_ld and neg_ld annotations.
    dic_config['causal_neg_' + suffix] = {
        **base, **_pannot_coefs('p0_p100', -1), **_pannot_coefs('n100_p0', -1),
    }

    # causal_pos: positive coefficients on both pos_ld and neg_ld annotations.
    dic_config['causal_pos_' + suffix] = {
        **base, **_pannot_coefs('p0_p100', 1), **_pannot_coefs('n100_p0', 1),
    }

# Write one tab-separated `config` file per simulation under OUT_PATH/<simu>/ and
# an index file (simu_list.txt) listing every simulation directory.
fpath_list = []
known_annots = set(AN_list + pAN_list)
for simu in dic_config:
    print(simu)
    # Report annotation terms that are absent from the loaded annotation names;
    # 'basic' is the parameter tuple, not an annotation, so it is never reported.
    missing = [x for x in dic_config[simu] if x != 'basic' and x not in known_annots]
    if missing:
        print('    Missing: ' + ','.join(missing))

    simu_path = OUT_PATH+'/%s' % simu
    # Tolerate an existing directory, but let real failures (permissions,
    # unwritable parent) surface here instead of being silently ignored.
    os.makedirs(simu_path, exist_ok=True)

    fpath_list.append(simu_path)

    with open(simu_path+'/config', 'w') as f:
        # Unpack into dedicated names so the notebook-level h2g / p_causal /
        # alpha are not overwritten.
        h2g_simu,p_causal_simu,alpha_simu,flag_ld = dic_config[simu]['basic']
        f.write('h2g\t%0.3f\n' % h2g_simu)
        f.write('p_causal\t%0.3f\n' % p_causal_simu)
        f.write('alpha\t%0.3f\n' % alpha_simu)
        # LD-dependent architecture terms are only written when the flag is set.
        if flag_ld:
            for AN in dic_config_ld:
                f.write('%s\t%0.3f\n' % (AN, dic_config_ld[AN]))
        for AN in dic_config_mbin:
            f.write('%s\t%0.3f\n' % (AN, dic_config_mbin[AN]))
        for AN in [x for x in dic_config[simu] if x!='basic']:
            f.write('%s\t%0.3f\n' % (AN, dic_config[simu][AN]))

with open(OUT_PATH+'/simu_list.txt', 'w') as f:
    for fpath in fpath_list:
        f.write('%s\n' % fpath)

null_h2g50_p20
causal_h2g50_p20
causal_neg_h2g50_p20
causal_pos_h2g50_p20
null_h2g50_p10
causal_h2g50_p10
null_h2g20_p20
causal_h2g20_p20


In [4]:
# Convert config files into a table
# Reads the `config` file of every simulation listed in simu_list.txt that is
# part of `simu_list_paper`, keeps the non-zero terms, and writes one long table
# with columns (simulation, term, value).
df_list = []
simu_list_paper = [
    'null_h2g50_p20', 'null_h2g50_p10', 'null_h2g20_p20',
    'causal_h2g50_p20', 'causal_h2g50_p10', 'causal_h2g20_p20',
    'causal_neg_h2g50_p20', 'causal_pos_h2g50_p20',
]
for simu in sorted(pd.read_csv(OUT_PATH+'/simu_list.txt', header=None)[0]):
    # simu is a directory path; its last component is the simulation name.
    simu_sf = simu.split('/')[-1]
    if simu_sf not in simu_list_paper:
        continue
    print(simu)
    # sep=r'\s+' replaces delim_whitespace=True, which is deprecated in pandas 2.2.
    temp_df = pd.read_csv(simu+'/config', header=None, sep=r'\s+')
    # Drop zero-valued terms; .copy() so the column assignment below acts on an
    # independent frame rather than on a slice of the parsed one.
    temp_df = temp_df.loc[temp_df[1]!=0].copy()
    temp_df[2] = simu_sf
    temp_df = temp_df[[2,0,1]]
    temp_df.columns = ['simulation', 'term', 'value']
    df_list.append(temp_df)
df_config = pd.concat(df_list, axis=0)
df_config.to_csv('/n/groups/price/martin/LDSPEC_data/results/tables/simu_param.tsv', sep='\t', index=False)

/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001_chimp/simulation.100123/causal_h2g20_p20
/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001_chimp/simulation.100123/causal_h2g50_p10
/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001_chimp/simulation.100123/causal_h2g50_p20
/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001_chimp/simulation.100123/causal_neg_h2g50_p20
/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001_chimp/simulation.100123/causal_pos_h2g50_p20
/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001_chimp/simulation.100123/null_h2g20_p20
/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001_chimp/simulation.100123/null_h2g50_p10
/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001_chimp/simulation.100123/null_h2g50_p20


### Check

In [5]:
# df_phen = pd.read_csv(
#     '/n/groups/price/martin/data_GDREG/UKBimp_337K_MAF001_chimp/simulation.040123/null_h2g50_p20/rep0.phen',
#     sep='\t'
# )