# Description

It runs the GWAS harmonization step from the script in: https://github.com/hakyimlab/summary-gwas-imputation

# Modules

In [1]:
import os
from glob import glob
from pathlib import Path

from utils import read_log_file_and_check_line_exists
import conf

# Settings

Apparently, there is no easy way to get the parent directory of
a notebook in Jupyter, so here I get that information either from
the parameter sent by `nbs/run_nbs.sh` (if called from command-line) or
from `os.getcwd()` (if called from browser).

In [2]:
PHENOPLIER_NOTEBOOK_FILEPATH = None
PHENOPLIER_NOTEBOOK_DIR = os.getcwd()

PROJECTS_TRAIT_KEY = "CHRONOTYPE"

In [3]:
# Parameters
PHENOPLIER_NOTEBOOK_FILEPATH = (
    "projects/chronotype/nbs/10_gwas_harmonization/01-run_harmonization.ipynb"
)

In [4]:
if PHENOPLIER_NOTEBOOK_FILEPATH is not None:
    PHENOPLIER_NOTEBOOK_DIR = str(Path(PHENOPLIER_NOTEBOOK_FILEPATH).parent)

display(PHENOPLIER_NOTEBOOK_DIR)

'projects/chronotype/nbs/10_gwas_harmonization'

In [5]:
OUTPUT_DIR = conf.PROJECTS[PROJECTS_TRAIT_KEY]["RESULTS_DIR"] / "harmonized_gwas"
display(OUTPUT_DIR)

OUTPUT_DIR_STR = str(OUTPUT_DIR)
display(OUTPUT_DIR_STR)

PosixPath('/opt/data/projects/chronotype/results/harmonized_gwas')

'/opt/data/projects/chronotype/results/harmonized_gwas'

# Run

In [6]:
%%bash -s "$PHENOPLIER_NOTEBOOK_DIR" "$PROJECTS_TRAIT_KEY" "$OUTPUT_DIR_STR"
set -euo pipefail
# IFS=$'\n\t'

# read the notebook directory parameter and remove $1
export PHENOPLIER_NOTEBOOK_DIR="${PHENOPLIER_CODE_DIR}/$1"
shift

# read trait key
export PROJECTS_TRAIT_KEY="$1"
shift

# read output dir
export OUTPUT_DIR="$1"
shift

run_job () {
    # run_job is a standard function name that performs a particular job
    # depending on the context. It will be called by GNU Parallel below.
    #
    # The implementation here runs the GLS model of PhenoPLIER on a trait.

    # read trait information
    # the first parameter to this function is a string with values separated by
    # commas (,). So here I split those into different variables.
    IFS=',' read -r pheno_id file sample_size n_cases <<< "$1"

#     INPUT_FILENAME="${file%.*}"
#
#     # get input GWAS file, there should be a single file
#     # here I make sure that there are no other files in the folder that
#     # match this phenotype/trait filename prefix
#     env_name="PHENOPLIER_PROJECTS_${PROJECTS_TRAIT_KEY}_DATA_DIR"
#     GWAS_DIR="${!env_name}/gwas"
#     N_GWAS_FILES=$(ls ${GWAS_DIR}/${INPUT_FILENAME}* | wc -l)
#     if [ "${N_GWAS_FILES}" != "1" ]; then
#         echo "ERROR: found ${N_GWAS_FILES} GWAS files instead of one"
#         exit 1
#     fi
#     INPUT_GWAS_FILEPATH=$(ls ${GWAS_DIR}/${INPUT_FILENAME}*)

    env_name="PHENOPLIER_PROJECTS_${PROJECTS_TRAIT_KEY}_DATA_DIR"
    GWAS_DIR="${!env_name}/gwas"

    INPUT_GWAS_FILEPATH="${GWAS_DIR}/${file}"

    mkdir -p "${OUTPUT_DIR}"

    # make sure we are not also parallelizing within numpy, etc
    export NUMBA_NUM_THREADS=1
    export MKL_NUM_THREADS=1
    export OPEN_BLAS_NUM_THREADS=1
    export NUMEXPR_NUM_THREADS=1
    export OMP_NUM_THREADS=1

    echo "Running for $pheno_id"
    echo "Saving results in ${OUTPUT_DIR}"

    bash "${PHENOPLIER_CODE_DIR}/scripts/gwas_harmonize.sh" \
        --input-gwas-file "${INPUT_GWAS_FILEPATH}" \
        --sample-size "${sample_size}" \
        --sample-n-cases "${n_cases}" \
        --liftover-chain-file "${PHENOPLIER_GENERAL_LIFTOVER_HG19_TO_HG38}" \
        --output-dir "${OUTPUT_DIR}"

    # print errors here in the notebook
    LOG_FILE="${INPUT_GWAS_FILEPATH}.log"

#     # first, look for the log file for this trait
#     pattern="${OUTPUT_DIR}/${INPUT_FILENAME}*.log"
#
#     N_LOG_FILES=$(ls ${pattern} | wc -l)
#     if [ "${N_LOG_FILES}" != "1" ]; then
#         echo "ERROR: found ${N_LOG_FILES} log files instead of one"
#         exit 1
#     fi
#     LOG_FILE=$(ls ${pattern})

    cat "${LOG_FILE}" | grep -iE "warning|error"

    echo
}

# export function so GNU Parallel can see it
export -f run_job

# generate a list of run_job calls for GNU Parallel
# here I read a file with information about traits (one trait per line)
env_name="PHENOPLIER_PROJECTS_${PROJECTS_TRAIT_KEY}_TRAITS_INFO_FILE"
while IFS= read -r line; do
    echo run_job "${line}"
done < <(tail -n "+2" "${!env_name}") |
    parallel -k --lb --halt 2 -j${PHENOPLIER_GENERAL_N_JOBS}

Running for chronotype
Saving results in /opt/data/projects/chronotype/results/harmonized_gwas
Parameter --samples-n-cases is NA, skipping
Loading GWAS config file: /opt/data/projects/chronotype/data/gwas/chronotype_raw_BOLT.output_HRC.only_plus.metrics_maf0.001_hwep1em12_info0.3.txt.gz.config


+ /opt/data/software/conda_envs/summary_gwas_imputation/bin/python /opt/data/software/summary-gwas-imputation/src/gwas_parsing.py -gwas_file /opt/data/projects/chronotype/data/gwas/chronotype_raw_BOLT.output_HRC.only_plus.metrics_maf0.001_hwep1em12_info0.3.txt.gz -separator '	' -snp_reference_metadata /opt/data/data/phenomexcan/ld_blocks/reference_panel_1000G/variant_metadata.txt.gz METADATA --chromosome_format -output_column_map CHR chromosome -output_column_map SNP variant_id -output_column_map BP position -output_column_map ALLELE1 effect_allele -output_column_map ALLELE0 non_effect_allele -output_column_map BETA effect_size -output_column_map SE standard_error -output_column_map P_BOLT_LMM pvalue --insert_value sample_size 449734 -output_order variant_id panel_variant_id chromosome position effect_allele non_effect_allele frequency pvalue zscore effect_size standard_error sample_size n_cases -liftover /opt/data/data/liftover/chains/hg19ToHg38.over.chain.gz -output /opt/data/project




+ set +x
cat: /opt/data/projects/chronotype/data/gwas/chronotype_raw_BOLT.output_HRC.only_plus.metrics_maf0.001_hwep1em12_info0.3.txt.gz.log: No such file or directory


# Perform some checks in output and log files

In [7]:
assert OUTPUT_DIR.exists()

In [8]:
log_files = OUTPUT_DIR.glob("*.log")

In [9]:
for f in log_files:
    read_log_file_and_check_line_exists(
        f,
        [
            "INFO - Finished converting GWAS in",
        ],
    )