In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import time
import os
import ldspec
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from statsmodels.stats.multitest import multipletests
import statsmodels.api as sm


# autoreload
%load_ext autoreload
%autoreload 2

In [2]:
# Chromosomes to load. For a quick single-chromosome run use: CHR_LIST = [22]
CHR_LIST = np.arange(1, 23)

# Root directory of the data set; the score files live under `ldspec_score_file/`.
DATA_PATH = '/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001_chimp'

# Score-file path patterns. "@" stands for the chromosome number and is
# substituted when the files are loaded.
SCORE_FILE_LIST = [
    DATA_PATH + '/ldspec_score_file/' + fname
    for fname in (
        'baseline.c@_score.tsv.gz',
        'pannot_prox_ld.c@_score.tsv.gz',
        'pannot_gene_ld.c@_score.tsv.gz',
        'pannot_baseline_0_100_ld.c@_score.tsv.gz',
        'pannot_baseline_0_1000_ld.c@_score.tsv.gz',
    )
]

In [3]:
# Load --score_file
# For each score-file pattern: read every per-chromosome table that exists on
# disk, stack them, and merge the columns not yet present into `df_score`
# (joined on the SNP identifier).
print("# Loading --score_file")
df_score = None
for score_file in SCORE_FILE_LIST:
    df_list = []
    for CHR in CHR_LIST:
        # "@" in the pattern is the chromosome placeholder.
        fpath = score_file.replace("@", "%d" % CHR)
        if not os.path.exists(fpath):
            # Chromosomes without a file are skipped.
            continue
        temp_df = pd.read_csv(fpath, sep="\t", index_col=None)
        # Down-cast the score columns (names starting with E / LD / DLD) to
        # float32 to limit memory use.
        col_list = [x for x in temp_df if x.startswith(("E", "LD", "DLD"))]
        temp_df[col_list] = temp_df[col_list].astype(np.float32)
        # No defensive copy: the frame is not touched again before pd.concat.
        df_list.append(temp_df)

    if len(df_list) == 0:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # fail with the offending pattern instead (same exception type).
        raise ValueError("No score files found for pattern: %s" % score_file)

    temp_df = pd.concat(df_list, axis=0)
    del df_list  # release the per-chromosome frames as early as possible
    temp_df.index = temp_df["SNP"]
    if df_score is None:
        # temp_df is a fresh frame produced by pd.concat; no extra copy needed.
        df_score = temp_df
    else:
        # Only add columns that df_score does not have yet (left join on SNP).
        col_list = [x for x in temp_df if x not in df_score]
        df_score = df_score.join(temp_df[col_list])
    del temp_df

# Keep only SNPs with a value in every column, ordered by genomic position.
# The index is already the SNP identifier (set before the joins and preserved
# by dropna / sort_values), so it does not need to be reassigned.
df_score = df_score.dropna()
df_score = df_score.sort_values(["CHR", "BP"])
# Strip the 'DLD:' / 'LD:' markers from the column names.
df_score.columns = [x.replace('DLD:', '').replace('LD:', '') for x in df_score]
n_snp = df_score.shape[0]
n_LD = len([x for x in df_score if x.startswith("AN:")])
n_DLD = len([x for x in df_score if x.startswith("pAN:")])
print("    Loaded: %d SNPs, %d LD scores, %d DLD scores" % (n_snp, n_LD, n_DLD))

# Loading --score_file
    Loaded: 14820648 SNPs, 165 LD scores, 164 DLD scores


In [7]:
# Score columns whose name starts with 'AN:'.
AN_list = [col for col in df_score if col.startswith('AN:')]
print('n_annot=%d' % len(AN_list))

# Load one pickled result; its 'summary' -> 'omega' table is used below to
# decide which 'pAN:' columns to keep.
# NOTE(review): pickle.load executes arbitrary code; only use trusted files.
with open(
    '/n/groups/price/martin/LDSPEC_data/UKBimp_337K_MAF001_chimp/'
    'ldspec_res_071623.prox_gene_fct_all_ld/blood_PLATELET_COUNT.pickle', 'br'
) as f:
    temp_df = pickle.load(f)

# Score columns whose name starts with 'pAN:' and that also appear in the
# index of the 'omega' table.
omega_index = temp_df['summary']['omega'].index
pAN_list = [
    col for col in df_score
    if col.startswith('pAN:') and col in omega_index
]
print('n_pannot=%d' % len(pAN_list))

# Pairwise correlation between all selected score columns, saved as a TSV.
score_cols = AN_list + pAN_list
df_cor = df_score[score_cols].corr()
df_cor.to_csv(
    '/n/groups/price/martin/LDSPEC_data/results/tables/score_corr.tsv',
    sep='\t',
)

n_annot=165
n_pannot=136


### Analysis

In [8]:
# Reload the saved correlation table; its first column holds the row labels.
df_cor = pd.read_csv(
    '/n/groups/price/martin/LDSPEC_data/results/tables/score_corr.tsv',
    index_col=0,
    sep='\t',
)

In [9]:
# AN vs. pAN
# Print every ('AN:' column, 'pAN:' column) pair whose correlation in df_cor
# is above 0.9 (signed comparison: strong negative correlations are not listed).
for an_col in AN_list:
    for pan_col in pAN_list:
        r = df_cor.loc[an_col, pan_col]
        if r > 0.9:
            print('%-40s %-40s %0.3f' % (
                an_col.replace('AN:', ''), pan_col.replace('pAN:', ''), r
            ))

In [12]:
# pAN vs. pAN
# For each pair of bin labels, print the pairs of 'pAN:' columns whose
# absolute correlation in df_cor exceeds 0.95.
for mbin1, mbin2 in [['common', 'common'], ['lf', 'lf']]:
    print('%s %s' % (mbin1, mbin2))
    # Columns are assigned to a bin by substring match on the column name.
    pAN_list1 = [name for name in pAN_list if mbin1 in name]
    pAN_list2 = [name for name in pAN_list if mbin2 in name]
    for i, pAN_i in enumerate(pAN_list1):
        # When both lists are the same, start past the diagonal so each
        # unordered pair is reported once and self-pairs are skipped.
        start_ind = i + 1 if mbin1 == mbin2 else 0
        for pAN_j in pAN_list2[start_ind:]:
            r = df_cor.loc[pAN_i, pAN_j]
            if np.absolute(r) > 0.95:
                print('%-40s %-40s %0.5f' % (
                    pAN_i.replace('pAN:', ''), pAN_j.replace('pAN:', ''), r
                ))

    print('')

common common
proxy_0_100_ld_p0_p100_maf_common_block  proxy_100_1000_ld_p0_p100_maf_common_block 0.99877
proxy_0_100_ld_p0_p100_maf_common_block  proxy_1000_10000_ld_p0_p100_maf_common_block 0.99575
proxy_0_100_ld_p0_p100_maf_common_block  proxy_0_100_ld_n100_p0_maf_common_block  -0.98103
proxy_0_100_ld_p0_p100_maf_common_block  proxy_100_1000_ld_n100_p0_maf_common_block -0.98400
proxy_0_100_ld_p0_p100_maf_common_block  proxy_1000_10000_ld_n100_p0_maf_common_block -0.98273
proxy_0_100_ld_p0_p100_maf_common_block  H3K4me1_Trynka_proxy_0_100_ld_p0_p100_maf_common_block 0.95096
proxy_0_100_ld_p0_p100_maf_common_block  H3K4me3_Trynka_proxy_0_1000_ld_p0_p100_maf_common_block 0.96173
proxy_0_100_ld_p0_p100_maf_common_block  H3K4me3_Trynka_proxy_0_1000_ld_n100_p0_maf_common_block -0.95456
proxy_100_1000_ld_p0_p100_maf_common_block proxy_1000_10000_ld_p0_p100_maf_common_block 0.99682
proxy_100_1000_ld_p0_p100_maf_common_block proxy_0_100_ld_n100_p0_maf_common_block  -0.98240
proxy_100_1000_ld

In [None]:
# One annotated correlation heatmap per bin label, restricted to the df_cor
# columns whose name contains both 'p0' and the bin label.
for mbin in ['common', 'lf']:
    temp_list = [x for x in df_cor if 'p0' in x and mbin in x]
    # Shortened labels for the axes; rows and columns share the same order.
    short_labels = [
        x.replace('pAN:proxy_', '').replace('_maf', '').replace('_block', '')
        for x in temp_list
    ]
    temp_df = df_cor.loc[temp_list, temp_list].copy()
    temp_df.columns = short_labels
    temp_df.index = short_labels
    plt.figure(figsize=[5,5])
    # Fixed symmetric colour scale so both figures are directly comparable.
    sns.heatmap(temp_df, vmax=1, vmin=-1, cmap='RdBu_r', annot=temp_df, fmt='0.4g')
    plt.show()

In [18]:
# Correlation coefficient between the two score columns ([0, 1] picks the
# off-diagonal entry of the 2x2 matrix returned by np.corrcoef).
np.corrcoef(df_score['pAN:proxy_0_100_common_common'], df_score['pAN:proxy_0_1000_common_common'])[0,1]

0.9928360190214107

In [19]:
# Correlation between the 0_100 score column and the difference
# (0_1000 column minus 0_100 column).
# NOTE(review): presumably the difference isolates the contribution beyond the
# 0_100 range -- confirm against how these scores are defined.
np.corrcoef(df_score['pAN:proxy_0_100_common_common'], 
            df_score['pAN:proxy_0_1000_common_common'] - df_score['pAN:proxy_0_100_common_common'])[0,1]

0.9905246158453058

In [16]:
# Display the two score columns side by side (rows are indexed by SNP).
df_score[['pAN:proxy_0_100_common_common', 'pAN:proxy_0_1000_common_common']]

Unnamed: 0_level_0,pAN:proxy_0_100_common_common,pAN:proxy_0_1000_common_common
SNP,Unnamed: 1_level_1,Unnamed: 2_level_1
1:66435_ATT_A,0.059866,0.146189
rs555652149,0.295387,1.184963
rs531531651,0.139579,0.603959
rs185487977,0.340735,0.599530
rs548107800,0.003909,0.147143
...,...,...
rs370652263,-1.189226,1.232365
rs541098394,0.133695,0.215772
rs553081191,0.026397,0.196966
rs149712012,-0.009371,0.087071


In [15]:
# Display the 'pAN:' column names retained for the correlation analysis.
pAN_list

['pAN:proxy_0_100_common_common',
 'pAN:proxy_0_100_lf_lf',
 'pAN:proxy_0_1000_common_common',
 'pAN:proxy_0_1000_lf_lf',
 'pAN:proxy_0_10000_common_common',
 'pAN:proxy_0_10000_lf_lf',
 'pAN:exon_common_common',
 'pAN:exon_lf_lf',
 'pAN:exonic_gene_common_common',
 'pAN:exonic_gene_lf_lf',
 'pAN:gene_common_common',
 'pAN:gene_lf_lf',
 'pAN:cS2G_promoter_common_common',
 'pAN:cS2G_promoter_lf_lf',
 'pAN:protein_domain_common_common',
 'pAN:protein_domain_lf_lf',
 'pAN:H3K27ac_Hnisz_0_100_common_common',
 'pAN:H3K27ac_Hnisz_0_100_lf_lf',
 'pAN:H3K27ac_PGC2_0_100_common_common',
 'pAN:H3K27ac_PGC2_0_100_lf_lf',
 'pAN:H3K4me1_Trynka_0_100_common_common',
 'pAN:H3K4me1_Trynka_0_100_lf_lf',
 'pAN:Intron_UCSC_0_100_common_common',
 'pAN:Intron_UCSC_0_100_lf_lf',
 'pAN:Repressed_Hoffman_0_100_common_common',
 'pAN:Repressed_Hoffman_0_100_lf_lf',
 'pAN:Transcribed_Hoffman_0_100_common_common',
 'pAN:Transcribed_Hoffman_0_100_lf_lf',
 'pAN:DGF_ENCODE_0_1000_common_common',
 'pAN:DGF_ENCODE_0_1