In [24]:
import pandas as pd
import numpy as np
import scipy as sp
import time
import os
import re
import argparse
import gdreg
import matplotlib.pyplot as plt
import pickle

# autoreload
%load_ext autoreload
%autoreload 2
%load_ext memory_profiler
%load_ext line_profiler

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler
The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [25]:
# Notebook stand-ins for the run_gdreg.py command-line options.
JOB = "regress"
PGEN_FILE = "/n/groups/price/martin/WES_analysis/toy_10K/chr@_v1.SPB.hg19.toy_10K"
SCORE_FILE = "/n/groups/price/martin/WES_analysis/toy_10K/gdreg_file_score_cross_term/toy_10K.@_score.tsv.gz"
SUMSTATS_FILE = "/n/groups/price/martin/WES_analysis/toy_10K/trait_sanity/sanity_rep1.sumstats.gz"
# Comma-separated list; the loader splits on "," and dispatches on the file suffix.
ANNOT_FILE = "/n/groups/price/martin/WES_analysis/toy_10K/toy.annot.gz," + \
    "/n/groups/price/martin/WES_analysis/toy_10K/toy.pannot.gz," + \
    "/n/groups/price/martin/WES_analysis/toy_10K/toy.pannot_hr.gz"
PREFIX_OUT = "/n/groups/price/martin/WES_analysis/toy_10K/gdreg_res/sanity_rep1"

# "Not provided" must be None, not False: the option check tests `LD_FILE is not None`
# (False would pass it), and False is the integer 0, which os.path.exists would treat
# as a file descriptor rather than a missing path.
LD_FILE = None
MEMORY = 512
RANDOM_SEED = 0
SNP_RANGE = None
FLAG_CROSS_TERM = False
FLAG_FULL_LD = False

In [26]:
# Start-of-run timestamp; passed to gdreg.util.get_sys_info in the cells below.
sys_start_time = time.time()

###########################################################################################
######                                    Parse Options                              ######
###########################################################################################

# In the command-line script these values come from argparse
# (JOB = args.job, PGEN_FILE = args.pgen_file, LD_FILE = args.ld_file,
#  SCORE_FILE = args.score_file, SUMSTATS_FILE = args.sumstats_file,
#  ANNOT_FILE = args.annot_file, PREFIX_OUT = args.prefix_out, MEMORY = args.memory,
#  RANDOM_SEED = args.random_seed, SNP_RANGE = args.snp_range,
#  FLAG_FULL_LD = args.flag_full_ld, FLAG_CROSS_TERM = args.flag_cross_term).
# In this notebook they are set by hand in the configuration cell.

# Validate the requested job and the options each job depends on
LEGAL_JOB_LIST = ["get_snp_block", "compute_ld", "compute_score", "regress"]
err_msg = "# run_gdreg: --job=%s not supported" % JOB
assert JOB in LEGAL_JOB_LIST, err_msg

if JOB == "compute_score":
    assert LD_FILE is not None, "--ld_file required for --job=%s" % JOB
if JOB == "regress":
    assert SCORE_FILE is not None, "--score_file required for --job=%s" % JOB
    assert SUMSTATS_FILE is not None, "--sumstats_file required for --job=%s" % JOB
if JOB in ("compute_score", "regress"):
    assert ANNOT_FILE is not None, "--annot_file required for --job=%s" % JOB
if JOB == "compute_ld":
    assert SNP_RANGE is not None, "--snp_range required for --job=%s" % JOB
    DIC_RANGE = gdreg.util.parse_snp_range(SNP_RANGE)

# Echo the effective options in the same layout as a shell invocation
header = gdreg.util.get_cli_head()
header += "".join(
    [
        "Call: run_gdreg.py \\\n",
        "--job %s\\\n" % JOB,
        "--pgen_file %s\\\n" % PGEN_FILE,
        "--ld_file %s\\\n" % LD_FILE,
        "--score_file %s\\\n" % SCORE_FILE,
        "--sumstats_file %s\\\n" % SUMSTATS_FILE,
        "--annot_file %s\\\n" % ANNOT_FILE,
        "--prefix_out %s\\\n" % PREFIX_OUT,
        "--snp_range %s\\\n" % SNP_RANGE,
        "--memory %d\\\n" % MEMORY,
        "--random_seed %d\\\n" % RANDOM_SEED,
        "--flag_full_ld %s\\\n" % FLAG_FULL_LD,
        "--flag_cross_term %s\n" % FLAG_CROSS_TERM,
    ]
)
print(header)

******************************************************************************
* Gene-level directional effect regression (GDREG)
* Version 0.0.2
* Martin Jinye Zhang
* HSPH / Broad Institute
* MIT License
******************************************************************************
Call: run_gdreg.py \
--job regress\
--pgen_file /n/groups/price/martin/WES_analysis/toy_10K/chr@_v1.SPB.hg19.toy_10K\
--ld_file False\
--score_file /n/groups/price/martin/WES_analysis/toy_10K/gdreg_file_score_cross_term/toy_10K.@_score.tsv.gz\
--sumstats_file /n/groups/price/martin/WES_analysis/toy_10K/trait_sanity/sanity_rep1.sumstats.gz\
--annot_file /n/groups/price/martin/WES_analysis/toy_10K/toy.annot.gz,/n/groups/price/martin/WES_analysis/toy_10K/toy.pannot.gz,/n/groups/price/martin/WES_analysis/toy_10K/toy.pannot_hr.gz\
--prefix_out /n/groups/price/martin/WES_analysis/toy_10K/gdreg_res/sanity_rep1\
--snp_range None\
--memory 512\
--random_seed 0\
--flag_full_ld False\
--flag_cross_term False



In [27]:
###########################################################################################
######                                   Data Loading                                ######
###########################################################################################
# Load --pgen_file
# Builds dic_data: one gdreg.util.read_pgen result per chromosome, keyed by chromosome.
if JOB in ("get_snp_block", "compute_ld", "compute_score", "regress"):
    print("# Loading --pgen_file")
    dic_data = {}
    if "@" in PGEN_FILE:
        # "@" is a chromosome placeholder: try CHR 1-22 and keep those whose .pgen exists
        for CHR in range(1, 23):
            pgen_prefix = PGEN_FILE.replace("@", "%s" % CHR)
            if os.path.exists(pgen_prefix + ".pgen"):
                dic_data[CHR] = gdreg.util.read_pgen(pgen_prefix)
    else:
        # Single file: key it by the CHR value of its first pvar row
        temp_dic = gdreg.util.read_pgen(PGEN_FILE)
        dic_data[temp_dic["pvar"]["CHR"][0]] = temp_dic.copy()

    # Per-chromosome summary, with the non-zero fraction of a small genotype slice
    for CHR in dic_data:
        dic_chr = dic_data[CHR]
        n_sample = dic_chr["psam"].shape[0]
        n_snp = dic_chr["pvar"].shape[0]
        # presumably (0, 50) is the SNP index range, matching the message below — TODO confirm
        mat_X = gdreg.util.read_geno(dic_chr["pgen"], 0, 50, n_sample=None, n_snp=None)
        sparsity = (mat_X != 0).mean()
        summary_fmt = "    CHR%2d: %d samples, %d SNPs, %0.1f%% non-zeros for first 50 SNPs"
        print(summary_fmt % (CHR, n_sample, n_snp, sparsity * 100))
    print("    " + gdreg.util.get_sys_info(sys_start_time))

# Load --ld_file
# Reads the LD matrix and its SNP range, then checks it against the loaded genotype data.
if JOB == "compute_score":
    print("# Loading --ld_file")
    assert os.path.exists(LD_FILE), "--ld_file does not exist"
    mat_ld, dic_range = gdreg.util.read_ld(LD_FILE)
    # A missing reference chromosome defaults to the chromosome of the range itself
    if dic_range["chr_ref"] is None:
        dic_range["chr_ref"] = dic_range["chr"]

    # The first axis of mat_ld must match the SNP count of that chromosome's pvar table
    ld_shape = mat_ld.shape
    err_msg = "n_snp=%d, mismatch with --pgen_file" % ld_shape[0]
    assert ld_shape[0] == dic_data[dic_range["chr"]]["pvar"].shape[0], err_msg

    range_fields = tuple(dic_range[key] for key in ("chr", "start", "end", "chr_ref"))
    print("    chr=%d, start=%d, end=%d, chr_ref=%d" % range_fields)
    print("    n_snp=%d, n_snp_ref=%d" % (ld_shape[1], ld_shape[0]))
    print("    LD info loaded, matching --pgen_file")
    print("    " + gdreg.util.get_sys_info(sys_start_time))

# Load --score_file
# Reads every file matching SCORE_FILE into one frame, df_score, sorted by (CHR, BP).
# Columns prefixed "LD:" / "DLD:" are counted as LD / DLD scores.
if JOB in ["regress"]:
    print("# Loading --score_file")
    flist = sorted(gdreg.util.from_filepattern(SCORE_FILE))
    print("    find %d score files" % len(flist))
    # Fail with a clear message instead of an AttributeError on None further down
    assert len(flist) > 0, "--score_file matched no files: '%s'" % SCORE_FILE

    # Read all files first and concatenate once; concatenating inside the loop
    # re-copies the accumulated frame on every iteration.
    df_score = pd.concat(
        [pd.read_csv(fpath, sep="\t", index_col=None) for fpath in flist], axis=0
    ).sort_values(["CHR", "BP"])
    LD_list = [x for x in df_score if x.startswith("LD:")]
    DLD_list = [x for x in df_score if x.startswith("DLD:")]

    print(
        "    score file loaded for %d SNPs, %d LD scores, %d DLD scores"
        % (df_score.shape[0], len(LD_list), len(DLD_list))
    )
    print("    " + gdreg.util.get_sys_info(sys_start_time))

# Load --sumstats_file
# Tab-separated summary statistics, read into df_sumstats without an index column.
if JOB == "regress":
    print("# Loading --sumstats_file")
    df_sumstats = pd.read_csv(SUMSTATS_FILE, sep="\t", index_col=None)
    n_sumstats_snp = len(df_sumstats)
    print("    .sumstats.gz loaded, %d SNPs" % n_sumstats_snp)
    print("    " + gdreg.util.get_sys_info(sys_start_time))

# Load --annot_file
# ANNOT_FILE is a comma-separated list; each file is routed by its suffix:
#   .annot.gz     -> joined column-wise into df_annot (indexed by SNP, "AN:" columns)
#   .pannot.gz    -> appended to pannot_list
#   .pannot_hr.gz -> appended to pannot_hr_list
if JOB in ["compute_score", "regress"]:
    print("# Loading --annot_file")
    df_annot = None
    pannot_list = []
    pannot_hr_list = []
    for annot_file in ANNOT_FILE.split(","):
        err_msg = "--annot_file missing : '%s'" % annot_file
        assert os.path.exists(annot_file), err_msg
        temp_df = gdreg.util.read_annot(annot_file)

        # The three suffixes are mutually exclusive, so an elif chain is equivalent
        if annot_file.endswith(".annot.gz"):
            temp_df.index = temp_df["SNP"]
            if df_annot is None:
                df_annot = temp_df.copy()
            else:
                # Later .annot.gz files contribute only their "AN:" columns
                col_list = [x for x in temp_df if x.startswith("AN:")]
                df_annot = df_annot.join(temp_df[col_list])
        elif annot_file.endswith(".pannot.gz"):
            pannot_list.append(temp_df.copy())
        elif annot_file.endswith(".pannot_hr.gz"):
            pannot_hr_list.append(temp_df.copy())

    # Without any .annot.gz file df_annot stays None and the summary below would
    # fail with an unhelpful TypeError; report the actual problem instead.
    assert df_annot is not None, (
        "--annot_file must include at least one '.annot.gz' file: '%s'" % ANNOT_FILE
    )

    AN_list = [x for x in df_annot if x.startswith("AN:")]
    print(
        "    .annot.gz (%d SNPs and %d annots): %s"
        % (df_annot.shape[0], len(AN_list), ",".join(AN_list))
    )
    # Each pannot frame is labelled by its last column in the summaries below
    temp_list = ["%s (%d SNPs)" % (x.columns[-1], x.shape[0]) for x in pannot_list]
    print(
        "    .pannot.gz (%d pannots): %s" % (len(pannot_list), ",".join(temp_list)),
    )
    temp_list = [
        "%s (%d pairs)" % (x.columns[-1], x.shape[0]) for x in pannot_hr_list
    ]
    print(
        "    .pannot_hr.gz (%d pannots): %s"
        % (len(pannot_hr_list), ",".join(temp_list)),
    )
    print("    " + gdreg.util.get_sys_info(sys_start_time))

# Loading --pgen_file
    CHR 1: 10000 samples, 4232 SNPs, 26.7% non-zeros for first 50 SNPs
    CHR 2: 10000 samples, 4056 SNPs, 43.5% non-zeros for first 50 SNPs
    CHR 3: 10000 samples, 4067 SNPs, 33.9% non-zeros for first 50 SNPs
    CHR 4: 10000 samples, 4027 SNPs, 24.2% non-zeros for first 50 SNPs
    CHR 5: 10000 samples, 4106 SNPs, 36.7% non-zeros for first 50 SNPs
    CHR 6: 10000 samples, 4154 SNPs, 39.6% non-zeros for first 50 SNPs
    CHR 7: 10000 samples, 4071 SNPs, 41.8% non-zeros for first 50 SNPs
    CHR 8: 10000 samples, 3891 SNPs, 33.3% non-zeros for first 50 SNPs
    CHR 9: 10000 samples, 4149 SNPs, 40.1% non-zeros for first 50 SNPs
    CHR10: 10000 samples, 4129 SNPs, 40.3% non-zeros for first 50 SNPs
    sys_time=1.3s, sys_mem=0.33GB
# Loading --score_file
    find 20 score files
    score file loaded for 214936 SNPs, 4 LD scores, 2 DLD scores
    sys_time=2.3s, sys_mem=0.32GB
# Loading --sumstats_file
    .sumstats.gz loaded, 40882 SNPs
    sys_time=2.4s, sys_mem

In [32]:
if JOB == "regress":
    print("# Running --job regress")

    # NOTE(review): cross_term is hard-coded to True here, while the configuration cell
    # sets FLAG_CROSS_TERM = False and the printed header reports that value — confirm
    # which one is intended.
    estimate_options = dict(
        pannot_list=pannot_list,
        pannot_hr_list=pannot_hr_list,
        cross_term=True,
        n_jn_block=100,
        sym_non_pAN="non-pAN",
        verbose=True,
    )
    dic_res = gdreg.regress.estimate(
        dic_data, df_score, df_sumstats, df_annot, **estimate_options
    )

    # Result export is disabled in this notebook; the original statements are kept
    # below for reference.
#     # Store the entire file and a summary df
#     dbfile = open(PREFIX_OUT + ".pickle", "wb")
#     pickle.dump(dic_res, dbfile)
#     dbfile.close()
#     dic_res[0]["summary"]["tau"].to_csv(
#         PREFIX_OUT + ".tau.tsv", sep="\t", index=False
#     )
#     dic_res[1]["summary"]["tau"].to_csv(
#         PREFIX_OUT + ".joint_tau.tsv", sep="\t", index=False
#     )
#     dic_res[1]["summary"]["rho"].to_csv(
#         PREFIX_OUT + ".joint_rho.tsv", sep="\t", index=False
#     )

    print("    " + gdreg.util.get_sys_info(sys_start_time))

# Running --job regress
# Call: gdreg.regress.estimate
    dic_data : n_snp=40882, n_sample=40882
    df_score : n_snp=214936, 4 LD scores, 2 DLD scores
    df_sumstats : n_snp=40882, n_sample_zsq=10000
        Remove duplicate or ZSQ>80.0 SNPs, 40882 remaining, avg. zsq=1.68
    Regression : n_sample=214936 (SNP or SNP pairs), n_block=100
    # Call: gdreg.regress.regress
        n_snp=40882, n_block=103, n_sample_zsq=10000
        5 regressors : LD:AN:CHR1t5_common, LD:AN:CHR1t5_lf, LD:AN:all_common, LD:AN:all_lf, E
40882
here
        Completed, time=0.2s


  self._set_arrayXarray(i, j, x)


    # Call: gdreg.regress.regress
        n_snp=214936, n_block=98, n_sample_zsq=10000
        7 regressors : LD:AN:CHR1t5_common, LD:AN:CHR1t5_lf, LD:AN:all_common, LD:AN:all_lf, DLD:pAN:gene, DLD:pAN:proxy, E
40882
here
        Completed, time=1.1s


  self._set_arrayXarray(i, j, x)


    Completed, time=6.6s
    sys_time=116.0s, sys_mem=6.8GB


In [33]:
# Show the tau table of result 0, then the tau and rho tables of result 1
for res_idx, table_name in [(0, "tau"), (1, "tau"), (1, "rho")]:
    display(dic_res[res_idx]["summary"][table_name])

Unnamed: 0,annot,n_snp,tau,tau_se,h2,h2_se,enrich,enrich_se
AN:CHR1t5_common,AN:CHR1t5_common,10135,-3e-06,2e-06,0.062,0.012225,0.805104,0.098344
AN:CHR1t5_lf,AN:CHR1t5_lf,10353,2e-06,2e-06,0.122708,0.021048,1.094208,0.101741
AN:all_common,AN:all_common,20025,9e-06,1e-06,0.152156,0.019958,,
AN:all_lf,AN:all_lf,20857,1e-05,2e-06,0.225921,0.032292,,


Unnamed: 0,annot,n_snp,tau,tau_se,h2,h2_se,enrich,enrich_se
AN:CHR1t5_common,AN:CHR1t5_common,10135,5.496487e-07,2e-06,0.053615,0.022561,1.054091,0.234194
AN:CHR1t5_lf,AN:CHR1t5_lf,10353,4.313293e-06,5e-06,0.168513,0.042786,1.154012,0.181602
AN:all_common,AN:all_common,20025,4.740414e-06,2e-06,0.100497,0.035002,,
AN:all_lf,AN:all_lf,20857,1.196343e-05,4e-06,0.294177,0.071867,,


Unnamed: 0,pannot,n_snp_pair,rho,rho_se,cov,cov_se,r2,r2_se
pAN:gene,pAN:gene,122500,3.447009e-06,7.230072e-07,0.391289,0.09281,0.346093,0.08209
pAN:proxy,pAN:proxy,242838,-3.064979e-07,1.46607e-07,0.340891,0.106997,0.150127,0.047121


In [6]:
# Show the tau table of result 0, then the tau and rho tables of result 1
for res_idx, table_name in [(0, "tau"), (1, "tau"), (1, "rho")]:
    display(dic_res[res_idx]["summary"][table_name])

Unnamed: 0,annot,n_snp,tau,tau_se,h2,h2_se,enrich,enrich_se
AN:CHR1t5_common,AN:CHR1t5_common,10135,-3e-06,2e-06,0.062985,0.013088,0.803116,0.109984
AN:CHR1t5_lf,AN:CHR1t5_lf,10353,3e-06,2e-06,0.121436,0.024513,1.155466,0.122887
AN:all_common,AN:all_common,20025,9e-06,2e-06,0.154956,0.022625,,
AN:all_lf,AN:all_lf,20857,9e-06,2e-06,0.211726,0.034986,,


Unnamed: 0,annot,n_snp,tau,tau_se,h2,h2_se,enrich,enrich_se
AN:CHR1t5_common,AN:CHR1t5_common,10135,-3e-06,2e-06,0.06869,0.013203,0.808125,0.097858
AN:CHR1t5_lf,AN:CHR1t5_lf,10353,1e-06,2e-06,0.108185,0.017031,1.073105,0.104956
AN:all_common,AN:all_common,20025,1e-05,2e-06,0.167944,0.023443,,
AN:all_lf,AN:all_lf,20857,9e-06,2e-06,0.2031,0.029204,,


Unnamed: 0,pannot,n_snp_pair,rho,rho_se,cov,cov_se,r2,r2_se
pAN:gene,pAN:gene,122500,1.973282e-06,9.419935e-07,0.208967,0.115836,0.188476,0.104477
pAN:proxy,pAN:proxy,242838,-2.918325e-07,9.155464e-08,0.155536,0.119065,0.070717,0.054135


### gdreg.regress.summarize

In [35]:
# Re-summarize the second regression result against the loaded annotations
temp_dic = gdreg.regress.summarize(
    dic_res[1],
    df_annot,
    pannot_list=pannot_list,
    pannot_hr_list=pannot_hr_list,
)
for table_name in ("tau", "rho"):
    display(temp_dic[table_name])

  self._set_arrayXarray(i, j, x)


Unnamed: 0,annot,n_snp,tau,tau_se,h2,h2_se,enrich,enrich_se
AN:ALL,AN:ALL,40882,7e-06,3e-06,0.274499,0.072075,,
AN:CHR1t5,AN:CHR1t5,20488,-2e-06,2e-06,0.124001,0.02707,0.901397,0.146869
AN:ODD,AN:ODD,20625,2e-06,2e-06,0.154597,0.037006,1.116344,0.125943


Unnamed: 0,pannot,n_snp_pair,rho,rho_se,cov,cov_se,r2,r2_se
pAN:gene,pAN:gene,122500,3.332093e-07,2.417477e-07,0.001545,0.031186,0.001881,0.037953
pAN:proxy,pAN:proxy,242838,-3.105122e-07,7.965019e-08,-0.062654,0.039794,-0.037855,0.024043


In [34]:
# Hand check using values copied from the summarize tables above:
#   0.001545 = cov of pAN:gene, 122500 = n_snp_pair of pAN:gene,
#   0.274499 = h2 of AN:ALL,    40882  = n_snp of AN:ALL.
# NOTE(review): the result (~0.00188) is close to the reported r2 of pAN:gene (0.001881);
# presumably this checks r2 = (cov / n_snp_pair) / (h2 / n_snp) — confirm against
# gdreg.regress.summarize.
0.001545 / 122500 / 0.274499 * 40882

0.0018783813271391419

### gdreg.regress.get_block

In [40]:
# Toy input for gdreg.regress.get_block: 14 SNPs on three chromosomes (7 / 5 / 2)
temp_df = pd.DataFrame(data={"CHR": [1] * 7 + [2] * 5 + [3] * 2})

# One pair annotation; "non-pAN" marks SNPs that belong to no group
toy_gene_labels = [
    "a", "a", "a", "non-pAN", "c", "c", "non-pAN",  # CHR 1
    "non-pAN", "b", "b", "b", "b",                  # CHR 2
    "non-pAN", "non-pAN",                           # CHR 3
]
temp_pannot_list = [pd.DataFrame(data={"pAN:gene": toy_gene_labels})]

gdreg.regress.get_block(temp_df, temp_pannot_list, n_block=3)

{0: (0, 6), 1: (6, 7), 2: (7, 12), 3: (12, 14)}

### Sanity check: does the data look OK?

In [9]:
# NOTE(review): this effect file lives under toy_1K, while the configuration cell
# points at toy_10K — confirm it belongs to the same simulation.
df_eff = pd.read_csv(
    "/n/groups/price/martin/WES_analysis/toy_1K/sanity_nd_rep0.eff.gz", sep="\t"
)

# SNP -> squared effect size and SNP -> squared z-score lookups
dic_eff = dict(zip(df_eff["SNP"], (eff**2 for eff in df_eff["EFF"])))
dic_zsq = dict(zip(df_sumstats["SNP"], (z**2 for z in df_sumstats["Z"])))

# Copy of the score table with both squared quantities attached per SNP
# (a SNP missing from either lookup raises KeyError)
temp_df = df_score.assign(
    ZSQ=[dic_zsq[snp] for snp in df_score["SNP"]],
    EFFSQ=[dic_eff[snp] for snp in df_score["SNP"]],
)

In [19]:
# Total of the squared effect sizes over all rows of temp_df
temp_df["EFFSQ"].sum()

0.4894335692078715

In [32]:
# Mean ZSQ and mean EFFSQ on chromosome 1.
# NOTE(review): needs the temp_df that carries ZSQ/EFFSQ; the get_block example cell
# rebinds temp_df to a CHR-only frame, which makes this lookup fail with KeyError.
chr1_rows = temp_df["CHR"] == 1
print(temp_df.loc[chr1_rows, "ZSQ"].mean(), temp_df.loc[chr1_rows, "EFFSQ"].mean())

KeyError: 'ZSQ'

In [15]:
# Mean ZSQ and mean EFFSQ over even-numbered chromosomes
even_chr_rows = temp_df["CHR"] % 2 == 0
print(temp_df.loc[even_chr_rows, "ZSQ"].mean(), temp_df.loc[even_chr_rows, "EFFSQ"].mean())

1.039649622100858 6.992735878720677e-06


In [16]:
# Mean ZSQ and mean EFFSQ over odd-numbered chromosomes
odd_chr_rows = temp_df["CHR"] % 2 == 1
print(temp_df.loc[odd_chr_rows, "ZSQ"].mean(), temp_df.loc[odd_chr_rows, "EFFSQ"].mean())

1.103828699394063 1.5495125005248987e-05


### Global LD scores: give the same result

In [None]:
# Read in all LD matrices, one .npy file per (CHR, CHR_REF) pair for chromosomes 1-10
# NOTE(review): these files live under toy_1K, while the configuration cell points
# at toy_10K — confirm they match the loaded data.
dic_ld_full = {}
temp_path = "/n/groups/price/martin/WES_analysis/toy_1K/results/full_ld"
for CHR in range(1, 11):
    for CHR_REF in range(1, 11):
        ld_fname = "/top_1K_chr%d_chr%d.ld.npy" % (CHR, CHR_REF)
        dic_ld_full[(CHR, CHR_REF)] = np.load(temp_path + ld_fname)

# df_score_g: copy of df_score whose "LD:<annot>" columns are recomputed from the
# full matrices as annotation-weighted sums of squared LD entries
df_score_g = df_score.copy()
for CHR in range(1, 11):
    # Stack this chromosome's blocks against every reference chromosome along axis 0
    mat_ld_chr = np.concatenate(
        [dic_ld_full[(CHR, x)] for x in range(1, 11)], axis=0
    )
    ld_sq_t = (mat_ld_chr**2).T
    chr_rows = df_score_g["CHR"] == CHR
    for AN in ["AN:ALL", "AN:CHR1", "AN:ODD"]:
        v_annot = df_annot[AN].values
        v_ld_score = (ld_sq_t * v_annot).sum(axis=1)
        df_score_g.loc[chr_rows, "LD:%s" % AN] = v_ld_score
        