In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import time
import os
import argparse
import gdreg
import matplotlib.pyplot as plt

# autoreload
%load_ext autoreload
%autoreload 2
%load_ext memory_profiler
%load_ext line_profiler

In [2]:
# Run configuration: the notebook counterparts of the run_gdreg.py options
# that are echoed in the "Call: run_gdreg.py" header.
# Every toy-data input/output lives under one root directory, so relocating
# the data only requires editing DATA_DIR.
DATA_DIR = "/n/groups/price/martin/WES_analysis/toy_1K"

JOB = "regress"
# "@" is a per-chromosome placeholder: the loading code substitutes the
# chromosome number for it when building per-chromosome file names.
PGEN_FILE = DATA_DIR + "/chr@_v1.SPB.hg19.toy_1K"
LD_FILE = DATA_DIR + "/results/top_1K_chr@.ld.npy"
SUMSTATS_FILE = DATA_DIR + "/sanity_nd_rep0.sumstats.gz"
# Comma-separated list of annotation files; the loader dispatches on the
# file suffix (.annot.gz / .pannot.gz / .pannot_hr.gz).
ANNOT_FILE = ",".join(
    [
        DATA_DIR + "/toy.annot.gz",
        DATA_DIR + "/toy.pannot.gz",
        DATA_DIR + "/toy.pannot_hr.gz",
    ]
)
PREFIX_OUT = DATA_DIR + "/results/toy_1K"
MEMORY = 64
RANDOM_SEED = 0
SNP_RANGE = None

In [3]:
# Reference timestamp handed to gdreg.util.get_sys_info for the elapsed-time
# reports printed after each loading step.
sys_start_time = time.time()

###########################################################################################
######                                    Parse Options                              ######
###########################################################################################

# NOTE(review): in the command-line script the option variables are presumably
# filled from parsed arguments (JOB = args.job, PGEN_FILE = args.pgen_file,
# LD_FILE = args.ld_file, SUMSTATS_FILE = args.sumstats_file,
# ANNOT_FILE = args.annot_file, PREFIX_OUT = args.prefix_out,
# MEMORY = args.memory, RANDOM_SEED = args.random_seed,
# SNP_RANGE = args.snp_range); in this notebook they are plain variables.

# Parse and check arguments
LEGAL_JOB_LIST = ["compute_ld", "compute_score", "regress"]
err_msg = "# run_gdreg: --job=%s not supported" % JOB
assert JOB in LEGAL_JOB_LIST, err_msg

# (option name, option value, jobs that require it); checked in this order.
required_options = [
    ("--ld_file", LD_FILE, ("compute_score", "regress")),
    ("--sumstats_file", SUMSTATS_FILE, ("regress",)),
    ("--annot_file", ANNOT_FILE, ("compute_score", "regress")),
    ("--snp_range", SNP_RANGE, ("compute_ld",)),
]
for opt_name, opt_value, opt_jobs in required_options:
    if JOB in opt_jobs:
        assert opt_value is not None, "%s required for --job=%s" % (opt_name, JOB)

# The parsed SNP range is only defined for --job=compute_ld.
if JOB == "compute_ld":
    DIC_RANGE = gdreg.util.parse_snp_range(SNP_RANGE)

# Print input options
# Echo the configuration as the equivalent run_gdreg.py command line: one
# option per line, each line but the last ending in a shell continuation "\".
option_lines = [
    "--job %s" % JOB,
    "--pgen_file %s" % PGEN_FILE,
    "--ld_file %s" % LD_FILE,
    "--sumstats_file %s" % SUMSTATS_FILE,
    "--annot_file %s" % ANNOT_FILE,
    "--prefix_out %s" % PREFIX_OUT,
    "--snp_range %s" % SNP_RANGE,
    "--memory %d" % MEMORY,
    "--random_seed %d" % RANDOM_SEED,
]
header = gdreg.util.get_cli_head()
header += "Call: run_gdreg.py \\\n"
header += "\\\n".join(option_lines) + "\n"
print(header)

******************************************************************************
* Gene-level directional effect regression (GDReg)
* Version 0.0.2
* Martin Jinye Zhang
* HSPH / Broad Institute
* MIT License
******************************************************************************
Call: run_gdreg.py \
--job regress\
--pgen_file /n/groups/price/martin/WES_analysis/toy_1K/chr@_v1.SPB.hg19.toy_1K\
--ld_file /n/groups/price/martin/WES_analysis/toy_1K/results/top_1K_chr@.ld.npy\
--sumstats_file /n/groups/price/martin/WES_analysis/toy_1K/sanity_nd_rep0.sumstats.gz\
--annot_file /n/groups/price/martin/WES_analysis/toy_1K/toy.annot.gz,/n/groups/price/martin/WES_analysis/toy_1K/toy.pannot.gz,/n/groups/price/martin/WES_analysis/toy_1K/toy.pannot_hr.gz\
--prefix_out /n/groups/price/martin/WES_analysis/toy_1K/results/toy_1K\
--snp_range None\
--memory 64\
--random_seed 0



In [70]:
###########################################################################################
######                                   Data Loading                                ######
###########################################################################################
# Load --pgen_file
# dic_data maps a chromosome to the dict returned by gdreg.util.read_pgen
# (the keys "pgen", "psam" and "pvar" are the ones used in this notebook).
if JOB in ["compute_ld", "compute_score", "regress"]:
    print("# Loading --pgen_file")
    dic_data = {}
    if "@" in PGEN_FILE:
        # Per-chromosome file sets: keep only chromosomes whose .pgen exists.
        for CHR in range(1, 23):
            chr_prefix = PGEN_FILE.replace("@", "%s" % CHR)
            if os.path.exists(chr_prefix + ".pgen"):
                dic_data[CHR] = gdreg.util.read_pgen(chr_prefix)
    else:
        # Single file set: key it by the CHR value of its first .pvar row.
        temp_dic = gdreg.util.read_pgen(PGEN_FILE)
        dic_data[temp_dic["pvar"]["CHR"][0]] = temp_dic.copy()

    # Per-chromosome summary; the non-zero rate is computed on the genotype
    # block requested with bounds 0 and 50.
    summary_fmt = "    CHR%2d: %d samples, %d SNPs, %0.1f%% non-zeros for first 50 SNPs"
    for CHR, dic_chr in dic_data.items():
        n_sample = dic_chr["psam"].shape[0]
        n_snp = dic_chr["pvar"].shape[0]
        mat_X = gdreg.util.read_geno(dic_chr["pgen"], 0, 50, n_sample=None, n_snp=None)
        sparsity = (mat_X != 0).mean()
        print(summary_fmt % (CHR, n_sample, n_snp, sparsity * 100))
    print("    " + gdreg.util.get_sys_info(sys_start_time))

# Load --ld_file
# dic_ld maps each chromosome present in dic_data to the array stored in its
# .npy LD file; the first dimension must match the number of .pvar rows.
if JOB in ["compute_score", "regress"]:
    print("# Loading --ld_file")
    dic_ld = {}
    for CHR in dic_data:
        ld_path = LD_FILE.replace("@", "%s" % CHR)
        err_msg = "--ld_file missing for CHR%d" % CHR
        assert os.path.exists(ld_path), err_msg
        dic_ld[CHR] = np.load(ld_path)  # TODO : sp.sparse

        n_snp_ld = dic_ld[CHR].shape[0]
        n_snp_pvar = dic_data[CHR]["pvar"].shape[0]
        err_msg = "CHR%2d n_snp=%d, mismatch with --pgen_file" % (CHR, n_snp_ld)
        assert n_snp_ld == n_snp_pvar, err_msg
    print("    LD info loaded, matching --pgen_file")
    print("    " + gdreg.util.get_sys_info(sys_start_time))

# Load --sumstats_file
# The summary statistics are read as a tab-separated table with no index column.
if JOB == "regress":
    print("# Loading --sumstats_file")
    df_sumstats = pd.read_csv(SUMSTATS_FILE, sep="\t", index_col=None)
    n_snp_sumstats = df_sumstats.shape[0]
    print("    .sumstats.gz loaded, %d SNPs" % n_snp_sumstats)
    print("    " + gdreg.util.get_sys_info(sys_start_time))

# Load --annot_file
# ANNOT_FILE is a comma-separated list; each file is routed by its suffix:
#   .annot.gz     -> merged column-wise into df_annot (indexed by SNP)
#   .pannot.gz    -> appended to pannot_list
#   .pannot_hr.gz -> appended to pannot_hr_list
# Files with any other suffix are read but not used.
if JOB in ["compute_score", "regress"]:
    print("# Loading --annot_file")
    df_annot = None
    pannot_list = []
    pannot_hr_list = []
    for annot_file in ANNOT_FILE.split(","):
        err_msg = "--annot_file missing : '%s'" % annot_file
        assert os.path.exists(annot_file), err_msg
        temp_df = gdreg.util.read_annot(annot_file)

        # The three suffixes are mutually exclusive, hence the elif chain.
        if annot_file.endswith(".annot.gz"):
            temp_df.index = temp_df["SNP"]
            if df_annot is None:
                df_annot = temp_df.copy()
            else:
                # Only the "AN:" annotation columns are joined onto the
                # first .annot.gz table; other columns come from that first file.
                col_list = [x for x in temp_df if x.startswith("AN:")]
                df_annot = df_annot.join(temp_df[col_list])
        elif annot_file.endswith(".pannot.gz"):
            pannot_list.append(temp_df.copy())
        elif annot_file.endswith(".pannot_hr.gz"):
            pannot_hr_list.append(temp_df.copy())

    # Without any .annot.gz file df_annot would still be None and the summary
    # below would fail with an opaque TypeError; fail early with a clear message.
    err_msg = "--annot_file must include at least one '.annot.gz' file : '%s'" % ANNOT_FILE
    assert df_annot is not None, err_msg

    AN_list = [x for x in df_annot if x.startswith("AN:")]
    print(
        "    .annot.gz (%d SNPs and %d annots): %s"
        % (df_annot.shape[0], len(AN_list), ",".join(AN_list))
    )
    # Each pannot table is labelled by its last column name.
    temp_list = ["%s (%d SNPs)" % (x.columns[-1], x.shape[0]) for x in pannot_list]
    print(
        "    .pannot.gz (%d pannots): %s" % (len(pannot_list), ",".join(temp_list)),
    )
    temp_list = [
        "%s (%d pairs)" % (x.columns[-1], x.shape[0]) for x in pannot_hr_list
    ]
    print(
        "    .pannot_hr.gz (%d pannots): %s"
        % (len(pannot_hr_list), ",".join(temp_list)),
    )
    print("    " + gdreg.util.get_sys_info(sys_start_time))

# Loading --pgen_file
    CHR 1: 1000 samples, 1400 SNPs, 25.9% non-zeros for first 50 SNPs
    CHR 2: 1000 samples, 1322 SNPs, 43.1% non-zeros for first 50 SNPs
    CHR 3: 1000 samples, 1326 SNPs, 33.2% non-zeros for first 50 SNPs
    CHR 4: 1000 samples, 1341 SNPs, 24.1% non-zeros for first 50 SNPs
    CHR 5: 1000 samples, 1311 SNPs, 36.9% non-zeros for first 50 SNPs
    CHR 6: 1000 samples, 1341 SNPs, 38.7% non-zeros for first 50 SNPs
    CHR 7: 1000 samples, 1375 SNPs, 41.9% non-zeros for first 50 SNPs
    CHR 8: 1000 samples, 1320 SNPs, 32.8% non-zeros for first 50 SNPs
    CHR 9: 1000 samples, 1307 SNPs, 40.0% non-zeros for first 50 SNPs
    CHR10: 1000 samples, 1280 SNPs, 44.0% non-zeros for first 50 SNPs
    sys_time=0.9s, sys_mem=0.19GB
# Loading --ld_file
    LD info loaded, matching --pgen_file
    sys_time=1.0s, sys_mem=0.19GB
# Loading --sumstats_file
    .sumstats.gz loaded, 13323 SNPs
    sys_time=1.0s, sys_mem=0.19GB
# Loading --annot_file
    .annot.gz (13323 SNPs and 

In [71]:
# Compute the score table from the genotype data, LD arrays and single-SNP
# annotations (no SNP-pair annotations are passed here).
score_options = {"verbose": True, "win_size": 1e6}
df_score = gdreg.score.compute_score(dic_data, dic_ld, df_annot, **score_options)

# Call: gdreg.score.compute_score
    13323 SNPs from 10 CHRs: CHR1 (1280 SNPs), CHR2 (1280 SNPs), CHR3 (1280 SNPs), CHR4 (1280 SNPs), CHR5 (1280 SNPs), CHR6 (1280 SNPs), CHR7 (1280 SNPs), CHR8 (1280 SNPs), CHR9 (1280 SNPs), CHR10 (1280 SNPs)
    Single-SNP annots : AN:ALL, AN:CHR1, AN:ODD
    SNP-pair annots : 
    win_size=1.0MB, memory=128MB
    Completed, time=3.4s


In [72]:
# Run the regression, reusing the precomputed df_score.
estimate_options = dict(
    pannot_list=pannot_list,
    pannot_hr_list=pannot_hr_list,
    df_score=df_score,
    n_jn_block=100,
    sym_non_pAN="non-pAN",
    win_size=int(1e6),
    memory=128,
    verbose=True,
)
dic_res = gdreg.regress.estimate(
    dic_data, dic_ld, df_sumstats, df_annot, **estimate_options
)

# Regressor names, then the two coefficient vectors returned under
# "coef" and "coef_jn".
for res_key in ["term", "coef", "coef_jn"]:
    print(dic_res[res_key])
# Square roots of the diagonal of coef_jn_cov (standard errors, assuming
# coef_jn_cov is a covariance matrix of the estimates -- TODO confirm).
print(np.sqrt(np.diag(dic_res["coef_jn_cov"])))

# Call: gdreg.regress.estimate
    df_sumstats : n_snp=13323, n_sample_zsq=1000
        Remove duplicates or ZSQ>80.0 SNPs, 13323 remaining, avg. zsq=1.44
    dic_data : n_snp=13323, n_sample=13323
    Regression : n_snp=13323, n_block=100
# Call: gdreg.regress.regress
    n_snp=13323, n_block=100, n_sample_zsq=1000
    4 regressors : LD:AN:ALL, LD:AN:CHR1, LD:AN:ODD, E
    Completed, time=0.0s
['LD:AN:ALL', 'LD:AN:CHR1', 'LD:AN:ODD', 'E']
[ 7.29445273e-05 -9.96738047e-06 -1.81773528e-06 -7.70946702e-02]
[ 7.43887043e-05 -1.29410174e-05 -2.64721405e-06 -8.93448563e-02]
[1.7546739e-05 2.4852536e-05 1.8103095e-05 2.3912357e-01]


In [75]:
# Effect file for the same replicate as the summary statistics.
df_eff = pd.read_csv(
    "/n/groups/price/martin/WES_analysis/toy_1K/sanity_nd_rep0.eff.gz",
    sep="\t",
)
# SNP -> squared EFF value (note: the dict stores EFF**2, not EFF).
dic_eff = dict(zip(df_eff["SNP"], [eff**2 for eff in df_eff["EFF"]]))

In [76]:
# SNP -> squared Z value from the summary statistics.
dic_zsq = dict(zip(df_sumstats["SNP"], [z**2 for z in df_sumstats["Z"]]))

In [77]:
# Attach ZSQ and EFFSQ to a copy of the score table; a SNP missing from
# either lookup raises KeyError.
temp_df = df_score.copy()
temp_df["ZSQ"] = list(map(dic_zsq.__getitem__, temp_df["SNP"]))
temp_df["EFFSQ"] = list(map(dic_eff.__getitem__, temp_df["SNP"]))

In [78]:
# Mean ZSQ and mean EFFSQ over SNPs on chromosome 1.
is_chr1 = temp_df["CHR"] == 1
print(temp_df.loc[is_chr1, "ZSQ"].mean(), temp_df.loc[is_chr1, "EFFSQ"].mean())

1.6246130128083283 6.593789775767758e-05


In [79]:
# Mean ZSQ and mean EFFSQ over SNPs on odd-numbered chromosomes.
is_odd_chr = temp_df["CHR"] % 2 == 1
print(temp_df.loc[is_odd_chr, "ZSQ"].mean(), temp_df.loc[is_odd_chr, "EFFSQ"].mean())

1.3814940852157935 4.529622099183642e-05


In [80]:
# Mean ZSQ and mean EFFSQ over SNPs on even-numbered chromosomes.
is_even_chr = temp_df["CHR"] % 2 == 0
print(temp_df.loc[is_even_chr, "ZSQ"].mean(), temp_df.loc[is_even_chr, "EFFSQ"].mean())

1.4900900441050557 2.6552901228308824e-05


In [63]:
# Mean ZSQ over SNPs on even-numbered chromosomes.
# NOTE(review): this cell's execution count is older than the cells above, so
# its saved output may not reflect the current temp_df -- re-run to confirm.
even_chr_rows = temp_df["CHR"] % 2 == 0
temp_df.loc[even_chr_rows, "ZSQ"].mean()

1.5692726874599383

In [51]:
# Display the per-SNP table built from df_score for visual inspection.
temp_df

Unnamed: 0_level_0,CHR,SNP,BP,E,LD:AN:ALL,LD:AN:CHR1,LD:AN:ODD,ZSQ
SNP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1:69761:A:T,1,1:69761:A:T,69761,1.576045,16.098497,4.175498,9.494497,7.971064e-01
1:931131:I:4.01,1,1:931131:I:4.01,866511,1.018779,17.292175,5.369176,10.688176,3.852517e+00
1:935954:G:T,1,1:935954:G:T,871334,0.986834,21.083496,9.160498,14.479498,5.033972e+00
1:939436:I:1,1,1:939436:I:1,874816,0.997427,16.382248,4.459248,9.778248,6.570135e+00
1:941119:A:G,1,1:941119:A:G,876499,0.972234,33.504211,21.581213,26.900213,1.325128e-01
...,...,...,...,...,...,...,...,...
10:32922299:T:G,10,10:32922299:T:G,33211227,1.052189,22.531540,1.400000,6.719000,1.001930e-01
10:32925874:A:G,10,10:32925874:A:G,33214802,0.992592,17.438335,1.400000,6.719000,9.999468e-07
10:32928182:G:A,10,10:32928182:G:A,33217110,1.050863,22.543594,1.400000,6.719000,1.087097e-01
10:32930080:T:G,10,10:32930080:T:G,33219008,1.064022,19.813097,1.400000,6.719000,2.373483e-01


In [35]:
# Display the full coef_jn_cov matrix (rows/columns presumably follow dic_res["term"]).
dic_res["coef_jn_cov"]

array([[ 3.0477796e-09, -3.2312857e-09, -1.9074742e-09, -3.1509055e-05],
       [-3.2312857e-09,  1.8975375e-07, -1.9631807e-10, -1.9752802e-04],
       [-1.9074742e-09, -1.9631807e-10,  1.5272847e-09,  2.0051453e-05],
       [-3.1509055e-05, -1.9752802e-04,  2.0051453e-05,  6.3156730e-01]],
      dtype=float32)

In [39]:
# Inspect the per-SNP block assignment that gdreg.regress.get_block returns
# when df_score is split into 10 blocks.
gdreg.regress.get_block(df_score, n_block=10)

0 0 1333
1 1333 2666
2 2666 3999
3 3999 5332
4 5332 6665
5 6665 7998
6 7998 9331
7 9331 10664
8 10664 11997
9 11997 13330


array([0, 0, 0, ..., 9, 9, 9])

### Test for correctness

In [39]:
# Read in all LD matrices
# dic_ld_full[(CHR, CHR_REF)] holds the array saved for that ordered pair of
# chromosomes (1..10 on both sides, i.e. 100 files).
dic_ld_full = {}
temp_path = "/n/groups/price/martin/WES_analysis/toy_1K/results/full_ld"
ld_pair_template = temp_path + "/top_1K_chr%d_chr%d.ld.npy"
for CHR in range(1, 11):
    for CHR_REF in range(1, 11):
        chr_pair = (CHR, CHR_REF)
        dic_ld_full[chr_pair] = np.load(ld_pair_template % chr_pair)
n_sample = 1000

### Test for gdreg.score.compute_dld_score

In [184]:
# Check gdreg.score.compute_dld_score against a brute-force reference on
# random inputs, for both a dense and a scipy-sparse pair mask mat_G.
n_snp = 10
n_snp_ref = 50
np.random.seed(0)
mat_ld = np.random.randn(n_snp_ref, n_snp)
# Random symmetric boolean mask over reference-SNP pairs.
mat_G = np.random.choice([0, 1], size=[n_snp_ref, n_snp_ref])
mat_G = (mat_G + mat_G.T).astype(bool)
v_annot = np.random.randn(n_snp_ref)
v_ps_sd = np.random.rand(n_snp_ref)

# Reference weight matrix, built once (it does not depend on the SNP):
#   W[i, j] = G[i, j] * 0.5 * (annot[i] + annot[j]) * sd[i] * sd[j]
mat_W = (
    mat_G
    * 0.5
    * (np.outer(v_annot, np.ones(n_snp_ref)) + np.outer(np.ones(n_snp_ref), v_annot))
    * np.outer(v_ps_sd, v_ps_sd)
)
# Reference score of SNP x is the quadratic form ld_x^T W ld_x, where ld_x is
# column x of mat_ld.
v_score_gold = np.array(
    [mat_ld[:, x].dot(mat_W).dot(mat_ld[:, x]) for x in range(n_snp)]
)

# Same check for the dense mask and its CSR counterpart; the printed abs_dif
# is kept for eyeballing, the assertion makes a regression fail loudly.
for mat_G_input in [mat_G, sp.sparse.csr_matrix(mat_G)]:
    v_score = gdreg.score.compute_dld_score(mat_ld, mat_G_input, v_annot, v_ps_sd)
    print('abs_dif=%0.3g'%np.absolute(v_score_gold - v_score).sum())
    np.testing.assert_allclose(v_score, v_score_gold, rtol=1e-8, atol=1e-10)

abs_dif=1.22e-14
abs_dif=1.22e-14
