In [1]:
import numpy as np
import pandas as pd
import gdreg
import time
import os
import matplotlib.pyplot as plt

# autoreload
%load_ext autoreload
%autoreload 2

In [19]:
# JOB = "compute_phen"
# PGEN_FILE = "/n/groups/price/martin/WES_analysis/toy_1K/chr@_v1.SPB.hg19.toy_1K"
# EFF_FILE = "/n/groups/price/martin/WES_analysis/toy_1K/sanity_rep0.eff.gz"
# PHEN_FILE = None
# PREFIX_OUT = "/n/groups/price/martin/WES_analysis/toy_1K/sanity_rep0"
# RANDOM_SEED = 0

In [5]:
# Run configuration for the summary-statistics job on the toy 1K data set.
# All input/output paths live under a single directory, so it is named once.
TOY_DIR = "/n/groups/price/martin/WES_analysis/toy_1K"

JOB = "compute_sumstats"
# "@" is a placeholder that is substituted with the chromosome number at load time.
PGEN_FILE = TOY_DIR + "/chr@_v1.SPB.hg19.toy_1K"
# No effect file is supplied: it is only required when JOB is "compute_phen".
EFF_FILE = None
PHEN_FILE = TOY_DIR + "/sanity_rep0.phen"
PREFIX_OUT = TOY_DIR + "/sanity_rep0"
RANDOM_SEED = 0

In [7]:
# Wall-clock reference point; passed to gdreg.util.get_sys_info() by later cells.
sys_start_time = time.time()

###########################################################################################
######                                    Parse Options                              ######
###########################################################################################

# The assignments below look like the command-line form of these options
# (presumably from run_simulation.py -- confirm). In the notebook the same names
# are set directly in the configuration cell, so they stay commented out here.
# JOB = args.job
# PGEN_FILE = args.pgen_file
# EFF_FILE = args.eff_file
# PHEN_FILE = args.phen_file
# PREFIX_OUT = args.prefix_out
# RANDOM_SEED = args.random_seed

# Check if the options are legal.
# Explicit raises are used instead of `assert` so the validation is not stripped
# when Python runs with -O, and so every option error is a ValueError.
LEGAL_JOB_LIST = [
    "compute_phen",
    "compute_sumstats",
]
err_msg = "# run_simulation: --job=%s not supported" % JOB
if JOB not in LEGAL_JOB_LIST:
    raise ValueError(err_msg)

# Each job needs its own input file: effects for simulating phenotypes,
# phenotypes for computing summary statistics.
if EFF_FILE is None and JOB in ["compute_phen"]:
    raise ValueError("# run_simulation.py: --eff_file required for --job=%s" % JOB)
if PHEN_FILE is None and JOB in ["compute_sumstats"]:
    raise ValueError("# run_simulation.py: --phen_file required for --job=%s" % JOB)

# Print input options as a banner followed by the equivalent command-line call.
header = gdreg.util.get_cli_head()
header += "Call: run_simulation.py \\\n"
header += "--job %s\\\n" % JOB
header += "--pgen_file %s\\\n" % PGEN_FILE
header += "--eff_file %s\\\n" % EFF_FILE
header += "--phen_file %s\\\n" % PHEN_FILE
header += "--random_seed %d\\\n" % RANDOM_SEED
header += "--prefix_out %s\n" % PREFIX_OUT
print(header)

******************************************************************************
* Gene-level directional effect regression (GDReg)
* Version 0.0.2
* Martin Jinye Zhang
* HSPH / Broad Institute
* MIT License
******************************************************************************
Call: run_simulation.py \
--job compute_sumstats\
--pgen_file /n/groups/price/martin/WES_analysis/toy_1K/chr@_v1.SPB.hg19.toy_1K\
--eff_file None\
--phen_file /n/groups/price/martin/WES_analysis/toy_1K/sanity_rep0.phen\
--random_seed 0\
--prefix_out /n/groups/price/martin/WES_analysis/toy_1K/sanity_rep0



In [9]:
###########################################################################################
######                                   Data Loading                                ######
###########################################################################################
# Genotype data: one entry per autosome (1-22) whose .pgen file exists on disk.
# Chromosomes without a file are skipped silently.
if JOB in ("compute_phen", "compute_sumstats"):
    print("# Loading --pgen")

    dic_data = {}
    for CHR in range(1, 23):
        # Substitute the "@" placeholder with the chromosome number.
        pgen_prefix = PGEN_FILE.replace("@", str(CHR))
        if not os.path.exists(pgen_prefix + ".pgen"):
            continue
        dic_data[CHR] = gdreg.util.read_pgen(pgen_prefix)

    print("    Genotype data for %d CHRs:" % len(dic_data))
    for CHR, chr_data in dic_data.items():
        n_sample = chr_data["psam"].shape[0]
        n_snp = chr_data["pvar"].shape[0]
        print("        CHR%2d (%d samples %d SNPs)" % (CHR, n_sample, n_snp))
    print("    " + gdreg.util.get_sys_info(sys_start_time))

# Effect table: only read when phenotypes are being simulated.
if JOB == "compute_phen":
    print("# Loading --eff_file")

    df_effect = pd.read_csv(EFF_FILE, sep="\t", index_col=None)

    # Summary of the table: row count, distinct CHR values, sum of squared EFF.
    n_snp = len(df_effect)
    n_CHR = len(set(df_effect["CHR"]))
    squared_eff = df_effect["EFF"] ** 2
    h2g = squared_eff.sum()
    print("    %s SNPs from %d CHRs, h2g=%0.3f" % (n_snp, n_CHR, h2g))
    print("    " + gdreg.util.get_sys_info(sys_start_time))

# Phenotype table: only read when summary statistics are being computed.
if JOB == "compute_sumstats":
    print("# Loading --phen_file")
    df_phen = pd.read_csv(PHEN_FILE, sep="\t", index_col=None)
    # The third column header is used as the phenotype name
    # (presumably the first two columns are sample identifiers -- confirm).
    phen_name = df_phen.columns[2]

    n_phen_sample = df_phen.shape[0]
    print("    %d samples, phen_name=%s" % (n_phen_sample, phen_name))
    print("    " + gdreg.util.get_sys_info(sys_start_time))
    

# Loading --pgen
    Genotype data for 10 CHRs:
        CHR 1 (1000 samples 1400 SNPs)
        CHR 2 (1000 samples 1322 SNPs)
        CHR 3 (1000 samples 1326 SNPs)
        CHR 4 (1000 samples 1341 SNPs)
        CHR 5 (1000 samples 1311 SNPs)
        CHR 6 (1000 samples 1341 SNPs)
        CHR 7 (1000 samples 1375 SNPs)
        CHR 8 (1000 samples 1320 SNPs)
        CHR 9 (1000 samples 1307 SNPs)
        CHR10 (1000 samples 1280 SNPs)
    sys_time=425.2s, sys_mem=0.098GB
# Loading --phen_file
    1000 samples, phen_name=TRAIT
    sys_time=425.2s, sys_mem=0.098GB
