In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import time
import os
import re
import argparse
import gdreg
import matplotlib.pyplot as plt
import pickle

# autoreload
%load_ext autoreload
%autoreload 2
%load_ext memory_profiler
%load_ext line_profiler

In [11]:
# Pairwise correlations among every column whose name contains "LD", plus column "E".
df_score.loc[:, [col for col in df_score.columns if "LD" in col] + ["E"]].corr()

Unnamed: 0,LD:AN:CHR1t5_common,LD:AN:CHR1t5_lf,LD:AN:all_common,LD:AN:all_lf,DLD:pAN:gene,DLD:pAN:proxy,E
LD:AN:CHR1t5_common,1.0,0.194349,0.702512,-0.006646,0.051507,0.149948,0.017776
LD:AN:CHR1t5_lf,0.194349,1.0,0.006568,0.64999,0.26652,0.283807,-0.029946
LD:AN:all_common,0.702512,0.006568,1.0,-0.059985,0.024448,0.362602,0.01515
LD:AN:all_lf,-0.006646,0.64999,-0.059985,1.0,0.21872,0.534622,-0.05664
DLD:pAN:gene,0.051507,0.26652,0.024448,0.21872,1.0,0.131682,-0.01165
DLD:pAN:proxy,0.149948,0.283807,0.362602,0.534622,0.131682,1.0,-0.067066
E,0.017776,-0.029946,0.01515,-0.05664,-0.01165,-0.067066,1.0


In [2]:
# Notebook stand-ins for the run_gdreg.py command-line options.
DATA_DIR = "/n/groups/price/martin/WES_analysis/toy_10K"

JOB = "regress"
# "@" is replaced by the chromosome number when --pgen_file is loaded.
PGEN_FILE = DATA_DIR + "/chr@_v1.SPB.hg19.toy_10K"
SCORE_FILE = DATA_DIR + "/gdreg_file/toy_10K.@_score.tsv.gz"
SUMSTATS_FILE = DATA_DIR + "/sanity_rep1.sumstats.gz"
# Comma-separated list; the loader splits on "," and dispatches on the file suffix.
ANNOT_FILE = ",".join(
    [
        DATA_DIR + "/toy.annot.gz",
        DATA_DIR + "/toy.pannot.gz",
        DATA_DIR + "/toy.pannot_hr.gz",
    ]
)
PREFIX_OUT = DATA_DIR + "/gdreg_res/sanity_rep1"

# None (not False) marks an option that was not supplied: the option checks test
# `is not None`, and os.path.exists(False) would be read as file descriptor 0.
LD_FILE = None
MEMORY = 512
RANDOM_SEED = 0
SNP_RANGE = None
FLAG_FULL_LD = False

In [3]:
# Reference time handed to gdreg.util.get_sys_info for the progress lines.
sys_start_time = time.time()

###########################################################################################
######                                    Parse Options                              ######
###########################################################################################

# The option values (JOB, PGEN_FILE, LD_FILE, SCORE_FILE, SUMSTATS_FILE, ANNOT_FILE,
# PREFIX_OUT, MEMORY, RANDOM_SEED, SNP_RANGE, FLAG_FULL_LD) are assigned directly in
# the notebook; the script form takes each from argparse, e.g. `JOB = args.job`.

# Validate the job name, then the options each job depends on
LEGAL_JOB_LIST = ["get_snp_block", "compute_ld", "compute_score", "regress"]
err_msg = "# run_gdreg: --job=%s not supported" % JOB
assert JOB in LEGAL_JOB_LIST, err_msg

if JOB == "compute_score":
    assert LD_FILE is not None, "--ld_file required for --job=%s" % JOB
    assert ANNOT_FILE is not None, "--annot_file required for --job=%s" % JOB
elif JOB == "regress":
    assert SCORE_FILE is not None, "--score_file required for --job=%s" % JOB
    assert SUMSTATS_FILE is not None, "--sumstats_file required for --job=%s" % JOB
    assert ANNOT_FILE is not None, "--annot_file required for --job=%s" % JOB
elif JOB == "compute_ld":
    assert SNP_RANGE is not None, "--snp_range required for --job=%s" % JOB
    DIC_RANGE = gdreg.util.parse_snp_range(SNP_RANGE)

# Echo the options as an equivalent command line
header = gdreg.util.get_cli_head()
header += "".join(
    [
        "Call: run_gdreg.py \\\n",
        "--job %s\\\n" % JOB,
        "--pgen_file %s\\\n" % PGEN_FILE,
        "--ld_file %s\\\n" % LD_FILE,
        "--score_file %s\\\n" % SCORE_FILE,
        "--sumstats_file %s\\\n" % SUMSTATS_FILE,
        "--annot_file %s\\\n" % ANNOT_FILE,
        "--prefix_out %s\\\n" % PREFIX_OUT,
        "--snp_range %s\\\n" % SNP_RANGE,
        "--memory %d\\\n" % MEMORY,
        "--random_seed %d\\\n" % RANDOM_SEED,
        "--flag_full_ld %s\n" % FLAG_FULL_LD,
    ]
)
print(header)

******************************************************************************
* Gene-level directional effect regression (GDREG)
* Version 0.0.2
* Martin Jinye Zhang
* HSPH / Broad Institute
* MIT License
******************************************************************************
Call: run_gdreg.py \
--job regress\
--pgen_file /n/groups/price/martin/WES_analysis/toy_10K/chr@_v1.SPB.hg19.toy_10K\
--ld_file False\
--score_file /n/groups/price/martin/WES_analysis/toy_10K/gdreg_file/toy_10K.@_score.tsv.gz\
--sumstats_file /n/groups/price/martin/WES_analysis/toy_10K/sanity_rep1.sumstats.gz\
--annot_file /n/groups/price/martin/WES_analysis/toy_10K/toy.annot.gz,/n/groups/price/martin/WES_analysis/toy_10K/toy.pannot.gz,/n/groups/price/martin/WES_analysis/toy_10K/toy.pannot_hr.gz\
--prefix_out /n/groups/price/martin/WES_analysis/toy_10K/gdreg_res/sanity_rep1\
--snp_range None\
--memory 512\
--random_seed 0\
--flag_full_ld False



In [4]:
###########################################################################################
######                                   Data Loading                                ######
###########################################################################################
# Load --pgen_file
if JOB in ["get_snp_block", "compute_ld", "compute_score", "regress"]:
    print("# Loading --pgen_file")
    dic_data = {}
    if "@" in PGEN_FILE:
        # Try chromosomes 1-22; those without a .pgen file are skipped.
        for CHR in range(1, 23):
            pgen_prefix = PGEN_FILE.replace("@", str(CHR))
            if os.path.exists(pgen_prefix + ".pgen"):
                dic_data[CHR] = gdreg.util.read_pgen(pgen_prefix)
    else:
        # Single fileset: key it by the CHR value of its first pvar row.
        temp_dic = gdreg.util.read_pgen(PGEN_FILE)
        dic_data[temp_dic["pvar"]["CHR"][0]] = temp_dic.copy()

    # Per-chromosome size report, plus the non-zero rate of the first 50 SNPs.
    for CHR, chr_data in dic_data.items():
        n_sample = chr_data["psam"].shape[0]
        n_snp = chr_data["pvar"].shape[0]
        mat_X = gdreg.util.read_geno(
            chr_data["pgen"], 0, 50, n_sample=None, n_snp=None
        )
        sparsity = (mat_X != 0).mean()
        print(
            "    CHR%2d: %d samples, %d SNPs, %0.1f%% non-zeros for first 50 SNPs"
            % (CHR, n_sample, n_snp, sparsity * 100)
        )
    print("    " + gdreg.util.get_sys_info(sys_start_time))

# Load --ld_file
if JOB == "compute_score":
    print("# Loading --ld_file")
    assert os.path.exists(LD_FILE), "--ld_file does not exist"
    mat_ld, dic_range = gdreg.util.read_ld(LD_FILE)
    # No reference chromosome recorded: fall back to the range's own chromosome.
    if dic_range["chr_ref"] is None:
        dic_range["chr_ref"] = dic_range["chr"]

    # The LD matrix must have as many rows as --pgen_file has SNPs on that chromosome.
    err_msg = "n_snp=%d, mismatch with --pgen_file" % mat_ld.shape[0]
    n_snp_pgen = dic_data[dic_range["chr"]]["pvar"].shape[0]
    assert mat_ld.shape[0] == n_snp_pgen, err_msg

    print(
        "    chr=%d, start=%d, end=%d, chr_ref=%d"
        % tuple(dic_range[key] for key in ("chr", "start", "end", "chr_ref"))
    )
    # NOTE(review): the check above treats shape[0] as the SNP count of `chr`,
    # while this line reports shape[1] as n_snp and shape[0] as n_snp_ref —
    # confirm which axis indexes which SNP set.
    print("    n_snp=%d, n_snp_ref=%d" % (mat_ld.shape[1], mat_ld.shape[0]))
    print("    LD info loaded, matching --pgen_file")
    print("    " + gdreg.util.get_sys_info(sys_start_time))
    
# Load --score_file
if JOB in ["regress"]:
    print("# Loading --score_file")
    flist = sorted(gdreg.util.from_filepattern(SCORE_FILE))
    print("    find %d score files" % len(flist))
    # Fail with a clear message instead of an AttributeError on None further down.
    assert len(flist) > 0, "--score_file matched no files : '%s'" % SCORE_FILE

    # Read every file first and concatenate once; growing the frame inside the
    # loop re-copies all previously loaded rows on each iteration.
    df_score = pd.concat(
        [pd.read_csv(fpath, sep="\t", index_col=None) for fpath in flist], axis=0
    )

    df_score.sort_values(["CHR", "BP"], inplace=True)
    # Score columns are identified by their name prefix.
    LD_list = [x for x in df_score if x.startswith("LD:")]
    DLD_list = [x for x in df_score if x.startswith("DLD:")]

    print(
        "    score file loaded for %d SNPs, %d LD scores, %d DLD scores"
        % (df_score.shape[0], len(LD_list), len(DLD_list))
    )
    print("    " + gdreg.util.get_sys_info(sys_start_time))

# Load --sumstats_file
if JOB == "regress":
    print("# Loading --sumstats_file")
    # Tab-separated table, one row per SNP.
    df_sumstats = pd.read_csv(SUMSTATS_FILE, sep="\t")
    n_sumstats_snp = len(df_sumstats)
    print("    .sumstats.gz loaded, %d SNPs" % n_sumstats_snp)
    print("    " + gdreg.util.get_sys_info(sys_start_time))

# Load --annot_file
if JOB in ["compute_score", "regress"]:
    print("# Loading --annot_file")
    df_annot = None  # single-SNP annotations, joined across all .annot.gz files
    pannot_list = []  # one frame per .pannot.gz file
    pannot_hr_list = []  # one frame per .pannot_hr.gz file
    for annot_file in ANNOT_FILE.split(","):
        err_msg = "--annot_file missing : '%s'" % annot_file
        assert os.path.exists(annot_file), err_msg
        temp_df = gdreg.util.read_annot(annot_file)

        # The three suffixes are mutually exclusive, so dispatch with elif.
        if annot_file.endswith(".annot.gz"):
            temp_df.index = temp_df["SNP"]
            if df_annot is None:
                df_annot = temp_df.copy()
            else:
                # Later files contribute only their "AN:" columns, aligned on SNP.
                col_list = [x for x in temp_df if x.startswith("AN:")]
                df_annot = df_annot.join(temp_df[col_list])
        elif annot_file.endswith(".pannot.gz"):
            pannot_list.append(temp_df.copy())
        elif annot_file.endswith(".pannot_hr.gz"):
            pannot_hr_list.append(temp_df.copy())

    # Without any .annot.gz input df_annot is still None and the summary below
    # would fail with an opaque TypeError; report the real cause instead.
    err_msg = "--annot_file must include at least one .annot.gz file"
    assert df_annot is not None, err_msg

    AN_list = [x for x in df_annot if x.startswith("AN:")]
    print(
        "    .annot.gz (%d SNPs and %d annots): %s"
        % (df_annot.shape[0], len(AN_list), ",".join(AN_list))
    )
    # Each pair-annotation frame is labelled by its last column.
    temp_list = ["%s (%d SNPs)" % (x.columns[-1], x.shape[0]) for x in pannot_list]
    print(
        "    .pannot.gz (%d pannots): %s" % (len(pannot_list), ",".join(temp_list)),
    )
    temp_list = [
        "%s (%d pairs)" % (x.columns[-1], x.shape[0]) for x in pannot_hr_list
    ]
    print(
        "    .pannot_hr.gz (%d pannots): %s"
        % (len(pannot_hr_list), ",".join(temp_list)),
    )
    print("    " + gdreg.util.get_sys_info(sys_start_time))

# Loading --pgen_file
    CHR 1: 10000 samples, 4232 SNPs, 26.7% non-zeros for first 50 SNPs
    CHR 2: 10000 samples, 4056 SNPs, 43.5% non-zeros for first 50 SNPs
    CHR 3: 10000 samples, 4067 SNPs, 33.9% non-zeros for first 50 SNPs
    CHR 4: 10000 samples, 4027 SNPs, 24.2% non-zeros for first 50 SNPs
    CHR 5: 10000 samples, 4106 SNPs, 36.7% non-zeros for first 50 SNPs
    CHR 6: 10000 samples, 4154 SNPs, 39.6% non-zeros for first 50 SNPs
    CHR 7: 10000 samples, 4071 SNPs, 41.8% non-zeros for first 50 SNPs
    CHR 8: 10000 samples, 3891 SNPs, 33.3% non-zeros for first 50 SNPs
    CHR 9: 10000 samples, 4149 SNPs, 40.1% non-zeros for first 50 SNPs
    CHR10: 10000 samples, 4129 SNPs, 40.3% non-zeros for first 50 SNPs
    sys_time=2.1s, sys_mem=0.11GB
# Loading --score_file
    find 20 score files
    score file loaded for 40882 SNPs, 4 LD scores, 2 DLD scores
    sys_time=2.4s, sys_mem=0.12GB
# Loading --sumstats_file
    .sumstats.gz loaded, 40882 SNPs
    sys_time=2.5s, sys_mem=

In [5]:
if JOB == "regress":
    print("# Running --job regress")

    dic_res = gdreg.regress.estimate(
        dic_data,
        df_score,
        df_sumstats,
        df_annot,
        pannot_list=pannot_list,
        pannot_hr_list=pannot_hr_list,
        n_jn_block=100,
        sym_non_pAN="non-pAN",
        verbose=True,
    )

    # Store the entire result object; the context manager closes the file even
    # if pickling fails.
    with open(PREFIX_OUT + ".pickle", "wb") as dbfile:
        pickle.dump(dic_res, dbfile)

    # Summary tables. Per the run log, result 0 is the fit with LD-score
    # regressors only and result 1 the joint fit that also includes DLD scores.
    dic_res[0]["summary"]["tau"].to_csv(PREFIX_OUT + ".tau.tsv", sep="\t", index=False)
    dic_res[1]["summary"]["tau"].to_csv(
        PREFIX_OUT + ".joint_tau.tsv", sep="\t", index=False
    )
    # The rho table (not a second copy of tau) belongs in .joint_rho.tsv.
    dic_res[1]["summary"]["rho"].to_csv(
        PREFIX_OUT + ".joint_rho.tsv", sep="\t", index=False
    )

    print("    " + gdreg.util.get_sys_info(sys_start_time))

# Running --job regress
# Call: gdreg.regress.estimate
    dic_data : n_snp=40882, n_sample=40882
    df_score : n_snp=40882, 4 LD scores, 2 DLD scores
    df_sumstats : n_snp=40882, n_sample_zsq=10000
        Remove duplicate or ZSQ>80.0 SNPs, 40882 remaining, avg. zsq=1.68
    Regression : n_snp=40882, n_block=103
    # Call: gdreg.regress.regress
        n_snp=40882, n_block=103, n_sample_zsq=10000
        5 regressors : LD:AN:CHR1t5_common, LD:AN:CHR1t5_lf, LD:AN:all_common, LD:AN:all_lf, E
        Completed, time=0.0s
    # Call: gdreg.regress.regress
        n_snp=40882, n_block=103, n_sample_zsq=10000
        7 regressors : LD:AN:CHR1t5_common, LD:AN:CHR1t5_lf, LD:AN:all_common, LD:AN:all_lf, DLD:pAN:gene, DLD:pAN:proxy, E
        Completed, time=0.0s
    Completed, time=0.4s


In [7]:
# Summarise result 1 and show its per-annotation (tau) and per-pair-annotation
# (rho) tables.
temp_dic = gdreg.regress.summarize(
    dic_res[1],
    df_annot,
    pannot_list=pannot_list,
    pannot_hr_list=pannot_hr_list,
    sym_non_pAN="non-pAN",
)
display(temp_dic["tau"], temp_dic["rho"])

  self._set_arrayXarray(i, j, x)


Unnamed: 0,annot,n_snp,tau,tau_se,h2,h2_se,enrich,enrich_se
AN:CHR1t5_common,AN:CHR1t5_common,10135,-2e-06,3e-06,0.056444,0.018166,0.84106,0.192418
AN:CHR1t5_lf,AN:CHR1t5_lf,10353,6e-06,3e-06,0.102088,0.027676,1.422452,0.206174
AN:all_common,AN:all_common,20025,8e-06,2e-06,0.132598,0.033991,,
AN:all_lf,AN:all_lf,20857,4e-06,3e-06,0.144585,0.046931,,


Unnamed: 0,pannot,n_snp_pair,rho,rho_se,cov,cov_se,r2,r2_se
pAN:gene,pAN:gene,122500,2.219741e-06,1.194626e-06,0.26726,0.149245,0.327872,0.183092
pAN:proxy,pAN:proxy,242838,-7.903239e-08,1.020353e-07,0.259801,0.15644,0.159198,0.095861


In [8]:
# Display the single-SNP annotation table assembled from the .annot.gz input(s).
df_annot

Unnamed: 0_level_0,CHR,SNP,BP,CM,AN:CHR1t5_common,AN:CHR1t5_lf,AN:all_common,AN:all_lf
SNP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1:931131:I:4.01,1,1:931131:I:4.01,866511,0,1,0,1,0
1:935954:G:T,1,1:935954:G:T,871334,0,1,0,1,0
1:939436:I:1,1,1:939436:I:1,874816,0,0,1,0,1
1:941119:A:G,1,1:941119:A:G,876499,0,0,1,0,1
1:942335:C:G,1,1:942335:C:G,877715,0,0,1,0,1
...,...,...,...,...,...,...,...,...
10:116645109:A:G,10,10:116645109:A:G,118404620,0,0,0,1,0
10:116664061:G:A,10,10:116664061:G:A,118423572,0,0,0,0,1
10:116676557:G:T,10,10:116676557:G:T,118436068,0,0,0,1,0
10:116692299:A:G,10,10:116692299:A:G,118451810,0,0,0,1,0


In [38]:
if JOB == "regress":
    print("# Running --job regress")

    # NOTE(review): the two calls below pass different positional arguments
    # (df_score, df_sumstats, df_annot vs. df_sumstats, dic_ld, df_annot), the
    # first result is overwritten by the second, and `dic_ld` is not defined in
    # the cells shown here. Confirm which call matches the installed gdreg API
    # and drop the other.
    dic_res = gdreg.regress.estimate(
        dic_data,
        df_score,
        df_sumstats,
        df_annot,
        pannot_list=pannot_list,
        pannot_hr_list=pannot_hr_list,
        n_jn_block=100,
        sym_non_pAN="non-pAN",
        win_size=int(1e7),
        memory=MEMORY,
        verbose=True,
    )

    dic_res = gdreg.regress.estimate(
        dic_data,
        df_sumstats,
        dic_ld,
        df_annot,
        pannot_list=pannot_list,
        pannot_hr_list=pannot_hr_list,
        n_jn_block=100,
        sym_non_pAN="non-pAN",
        win_size=int(1e7),
        memory=MEMORY,
        verbose=True,
        n_iter=5,
    )

    # Store the entire result object; the context manager closes the file even
    # if pickling fails.
    with open(PREFIX_OUT + ".pickle", "wb") as dbfile:
        pickle.dump(dic_res, dbfile)

    # One summary table per result key. The loop already covers the last key,
    # so no extra write is needed after it.
    for res in dic_res:
        dic_res[res]["summary"].to_csv(
            PREFIX_OUT + "_res%s.tsv" % res, sep="\t", index=False
        )
    print("    " + gdreg.util.get_sys_info(sys_start_time))

# Running --job regress
# Call: gdreg.regress.estimate
    dic_data : n_snp=40882, n_sample=40882
    df_sumstats : n_snp=40882, n_sample_zsq=10000
    Remove duplicates or ZSQ>80.0 SNPs, 40882 remaining, avg. zsq=1.96
    Regression : n_snp=40882, n_block=103
# Call: gdreg.score.compute_score
    40882 SNPs from 10 CHRs: CHR1 (4232 SNPs), CHR2 (4056 SNPs), CHR3 (4067 SNPs), CHR4 (4027 SNPs), CHR5 (4106 SNPs), CHR6 (4154 SNPs), CHR7 (4071 SNPs), CHR8 (3891 SNPs), CHR9 (4149 SNPs), CHR10 (4129 SNPs)
    Single-SNP annots : AN:ALL, AN:CHR1t5, AN:ODD
    SNP-pair annots : 
    win_size=10.0MB, memory=512MB
    Completed, time=18.7s
    # Call: gdreg.regress.regress
        n_snp=40882, n_block=103, n_sample_zsq=10000
        4 regressors : LD:AN:ALL, LD:AN:CHR1t5, LD:AN:ODD, E
        Completed, time=0.0s
    # Call: gdreg.score.compute_score
        40882 SNPs from 10 CHRs: CHR1 (4232 SNPs), CHR2 (4056 SNPs), CHR3 (4067 SNPs), CHR4 (4027 SNPs), CHR5 (4106 SNPs), CHR6 (4154 SNPs), CHR7 (4

In [5]:
# Recompute the score table in-notebook. The window size and memory budget use
# the same values as the gdreg.regress.estimate calls in this notebook
# (int(1e7) and the MEMORY config constant) rather than separate literals.
# NOTE(review): `dic_ld` is not defined in the cells shown here — confirm where
# it is loaded before re-running.
df_score = gdreg.score.compute_score(
    dic_data,
    dic_ld,
    df_annot,
    pannot_list=pannot_list,
    pannot_hr_list=pannot_hr_list,
    verbose=True,
    win_size=int(1e7),
    memory=MEMORY,
)

# Call: gdreg.score.compute_score
    40882 SNPs from 10 CHRs: CHR1 (4232 SNPs), CHR2 (4056 SNPs), CHR3 (4067 SNPs), CHR4 (4027 SNPs), CHR5 (4106 SNPs), CHR6 (4154 SNPs), CHR7 (4071 SNPs), CHR8 (3891 SNPs), CHR9 (4149 SNPs), CHR10 (4129 SNPs)
    Single-SNP annots : AN:ALL, AN:CHR1t5, AN:ODD
    SNP-pair annots : pAN:gene, pAN:proxy (hr)
    win_size=10.0MB, memory=512MB
    Completed, time=47.6s


In [37]:
# Re-run the estimation on the current score table. The memory budget comes from
# the MEMORY config constant, as in the `JOB == "regress"` cell, instead of a
# repeated literal.
dic_res = gdreg.regress.estimate(
    dic_data,
    df_score,
    df_sumstats,
    df_annot,
    pannot_list=pannot_list,
    pannot_hr_list=pannot_hr_list,
    n_jn_block=100,
    sym_non_pAN="non-pAN",
    win_size=int(1e7),
    memory=MEMORY,
    verbose=True,
)

# Debugging prints, kept disabled:
# print(dic_res['term'])
# print(dic_res['coef'])
# print(dic_res['coef_jn'])
# print(np.sqrt(np.diag(dic_res['coef_jn_cov'])))

# Call: gdreg.regress.estimate
    dic_data : n_snp=40882, n_sample=40882
    df_score : n_snp=40882, 3 LD scores, 2 DLD scores
    df_sumstats : n_snp=40882, n_sample_zsq=10000
        Remove duplicate or ZSQ>80.0 SNPs, 40882 remaining, avg. zsq=1.56
    Regression : n_snp=40882, n_block=103
    # Call: gdreg.regress.regress
        n_snp=40882, n_block=103, n_sample_zsq=10000
        4 regressors : LD:AN:ALL, LD:AN:CHR1t5, LD:AN:ODD, E
        Completed, time=0.0s


  self._set_arrayXarray(i, j, x)


    # Call: gdreg.regress.regress
        n_snp=40882, n_block=103, n_sample_zsq=10000
        6 regressors : LD:AN:ALL, LD:AN:CHR1t5, LD:AN:ODD, DLD:pAN:gene, DLD:pAN:proxy, E
        Completed, time=0.0s


  self._set_arrayXarray(i, j, x)


    Completed, time=3.4s


### gdreg.regress.summarize

In [35]:
# Summarise result 1 (sym_non_pAN left at its default) and show both tables.
temp_dic = gdreg.regress.summarize(
    dic_res[1],
    df_annot,
    pannot_list=pannot_list,
    pannot_hr_list=pannot_hr_list,
)
for table_name in ("tau", "rho"):
    display(temp_dic[table_name])

  self._set_arrayXarray(i, j, x)


Unnamed: 0,annot,n_snp,tau,tau_se,h2,h2_se,enrich,enrich_se
AN:ALL,AN:ALL,40882,7e-06,3e-06,0.274499,0.072075,,
AN:CHR1t5,AN:CHR1t5,20488,-2e-06,2e-06,0.124001,0.02707,0.901397,0.146869
AN:ODD,AN:ODD,20625,2e-06,2e-06,0.154597,0.037006,1.116344,0.125943


Unnamed: 0,pannot,n_snp_pair,rho,rho_se,cov,cov_se,r2,r2_se
pAN:gene,pAN:gene,122500,3.332093e-07,2.417477e-07,0.001545,0.031186,0.001881,0.037953
pAN:proxy,pAN:proxy,242838,-3.105122e-07,7.965019e-08,-0.062654,0.039794,-0.037855,0.024043


In [34]:
# Hand check of the pAN:gene r2 using values from the summary tables:
# cov (0.001545) / n_snp_pair (122500) / AN:ALL h2 (0.274499) * AN:ALL n_snp (40882).
# The result is close to the reported r2 of 0.001881.
0.001545 / 122500 / 0.274499 * 40882

0.0018783813271391419

In [48]:
# NOTE(review): `temp_df` is rebound by several cells in this notebook; the output
# recorded for this cell is a rho summary table, so the object indexed here appears
# to come from kernel state that the visible cells do not recreate — confirm
# before relying on it.
temp_df[1]

Unnamed: 0,pannot,n_snp_pair,rho,rho_se,cov,cov_se
pAN:gene,pAN:gene,122500,3.332093e-07,2.417477e-07,0.00154541,0.0311862
pAN:proxy,pAN:proxy,242838,-3.105122e-07,7.965019e-08,-0.0626536,0.0397941


In [25]:
# Size reported by gdreg.util.sizeof_sparse for the 'pAN:gene' entry of temp_df.
# NOTE(review): the unit of the returned number and the type held under
# 'pAN:gene' are not visible here — presumably a sparse pair-annotation matrix; confirm.
gdreg.util.sizeof_sparse(temp_df['pAN:gene'])

0.740081787109375

In [9]:
# Scratch check of np.repeat: each element of the length-2 zero vector is repeated
# 3 times, giving a length-6 vector.
np.repeat(np.zeros(2), 3)

array([0., 0., 0., 0., 0., 0.])

In [22]:
# Jackknife coefficient vector stored under the 'rho_0' entry of dic_res.
# NOTE(review): other cells index dic_res with integer keys (dic_res[0], dic_res[1]);
# the 'rho_0' key presumably belongs to a different result layout — confirm.
dic_res['rho_0']['coef_jn']

[autoreload of gdreg.regress failed: Traceback (most recent call last):
  File "/home/jz286/myenv/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/home/jz286/myenv/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 434, in superreload
    module = reload(module)
  File "/home/jz286/myenv/lib/python3.7/imp.py", line 314, in reload
    return importlib.reload(module)
  File "/home/jz286/myenv/lib/python3.7/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 630, in _exec
  File "<frozen importlib._bootstrap_external>", line 724, in exec_module
  File "<frozen importlib._bootstrap_external>", line 860, in get_code
  File "<frozen importlib._bootstrap_external>", line 791, in source_to_code
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/home/jz286/WES_analysis/GDReg/gdreg/regr

array([ 1.31643489e-05, -1.05026951e-05,  1.00880876e-05,  1.51030537e-01,
        4.34122391e-01, -1.54721691e-01, -7.90281414e-02,  4.68305434e-02,
        4.05449622e-02,  3.00883501e-01])

### gdreg.regress.get_block

In [40]:
# Toy input for get_block: 14 SNPs on three chromosomes (7 / 5 / 2).
temp_df = pd.DataFrame({"CHR": [1] * 7 + [2] * 5 + [3] * 2})

# One gene label per SNP, in the same row order as temp_df.
gene_labels = (
    ["a"] * 3 + ["non-pAN"] + ["c"] * 2 + ["non-pAN"]  # CHR 1
    + ["non-pAN"] + ["b"] * 4  # CHR 2
    + ["non-pAN"] * 2  # CHR 3
)
temp_pannot_list = [pd.DataFrame({"pAN:gene": gene_labels})]

gdreg.regress.get_block(temp_df, temp_pannot_list, n_block=3)

{0: (0, 6), 1: (6, 7), 2: (7, 12), 3: (12, 14)}

### Sanity check: does the data look OK?

In [9]:
# Lookup tables keyed by SNP id: squared effect (EFF) and squared Z-score (Z).
df_eff = pd.read_csv(
    "/n/groups/price/martin/WES_analysis/toy_1K/sanity_nd_rep0.eff.gz", sep="\t"
)
dic_eff = dict((snp, eff ** 2) for snp, eff in zip(df_eff["SNP"], df_eff["EFF"]))
dic_zsq = dict((snp, z ** 2) for snp, z in zip(df_sumstats["SNP"], df_sumstats["Z"]))

# Attach both to a copy of the score table, row by row; a SNP missing from
# either lookup raises KeyError.
temp_df = df_score.copy()
temp_df["ZSQ"] = list(map(dic_zsq.__getitem__, temp_df["SNP"]))
temp_df["EFFSQ"] = list(map(dic_eff.__getitem__, temp_df["SNP"]))

In [19]:
# Total of the squared effects over all SNPs in temp_df.
temp_df["EFFSQ"].sum()

0.4894335692078715

In [32]:
# Mean ZSQ and mean EFFSQ over the SNPs on chromosome 1.
rows_chr1 = temp_df.loc[temp_df["CHR"] == 1]
print(rows_chr1["ZSQ"].mean(), rows_chr1["EFFSQ"].mean())

KeyError: 'ZSQ'

In [15]:
# Mean ZSQ and mean EFFSQ over the SNPs on even-numbered chromosomes.
rows_even = temp_df.loc[temp_df["CHR"] % 2 == 0]
print(rows_even["ZSQ"].mean(), rows_even["EFFSQ"].mean())

1.039649622100858 6.992735878720677e-06


In [16]:
# Mean ZSQ and mean EFFSQ over the SNPs on odd-numbered chromosomes.
rows_odd = temp_df.loc[temp_df["CHR"] % 2 == 1]
print(rows_odd["ZSQ"].mean(), rows_odd["EFFSQ"].mean())

1.103828699394063 1.5495125005248987e-05


### Global LD score: gives the same result

In [None]:
# Read in all LD matrices, keyed by (CHR, CHR_REF)
# NOTE(review): this cell reads toy_1K files and uses 'AN:CHR1', while other
# cells in this notebook use toy_10K paths and 'AN:CHR1t5' — confirm that the
# inputs match the loaded df_score / df_annot before running.
temp_path = "/n/groups/price/martin/WES_analysis/toy_1K/results/full_ld"
CHR_LIST = range(1, 11)
dic_ld_full = {
    (CHR, CHR_REF): np.load(
        temp_path + "/top_1K_chr%d_chr%d.ld.npy" % (CHR, CHR_REF)
    )
    for CHR in CHR_LIST
    for CHR_REF in CHR_LIST
}

# df_score_g: copy of df_score whose LD-score columns are recomputed from the
# full LD matrices
df_score_g = df_score.copy()
for CHR in CHR_LIST:
    # Stack the blocks of every reference chromosome along axis 0 and square
    # them once; neither this nor the row mask depends on the annotation, so
    # both stay outside the AN loop.
    mat_ld_sq = np.concatenate([dic_ld_full[(CHR, x)] for x in CHR_LIST], axis=0) ** 2
    ind_chr = df_score_g["CHR"] == CHR
    for AN in ["AN:ALL", "AN:CHR1", "AN:ODD"]:
        v_annot = df_annot[AN].values
        # Annotation-weighted sum of squared LD over the stacked (axis-0) entries.
        v_ld_score = (mat_ld_sq.T * v_annot).sum(axis=1)
        df_score_g.loc[ind_chr, "LD:%s" % AN] = v_ld_score
        