# Description

It runs PLINK2 on GWAS results to check that the genomic inflation factor is withint acceptable limits.

# Modules

In [6]:
import os
import re
import subprocess
from pathlib import Path
import tempfile
import shutil
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd

In [4]:
# Utils
def chunker(seq, size):
    """
    Divides a sequence in chunks according to the given size. For example, if
    given a list

        [0,1,2,3,4,5,6,7]

    and size is 3, it will return

        [[0, 1, 2], [3, 4, 5], [6, 7]]
    """
    return (seq[pos : pos + size] for pos in range(0, len(seq), size))


In [30]:
# Configs
N_JOBS = 2

# Paths

In [18]:
GWAS_DIR = Path(os.environ["PHENOPLIER_RESULTS_GLS_NULL_SIMS_UKB_40PCS"]) / "uncompressed_gwas"
display(GWAS_DIR)

PosixPath('/tmp/phenoplier/results/gls/gwas/null_sims/ukb_40pcs/uncompressed_gwas')

In [13]:
PLINK2 = Path(os.environ["PHENOPLIER_DEPENDENCIES_PLINK_EXECUTABLE_VERSION_2"])
display(PLINK2)

PosixPath('/tmp/phenoplier/software/plink/plink2')

# GWAS results files

In [27]:
gwas_files = sorted(list(GWAS_DIR.glob("*.glm.linear.assoc.txt")))
display(len(gwas_files))
display(gwas_files[:10])

10

[PosixPath('/tmp/phenoplier/results/gls/gwas/null_sims/ukb_40pcs/uncompressed_gwas/plink_gwas.plink2.pheno0.glm.linear.assoc.txt'),
 PosixPath('/tmp/phenoplier/results/gls/gwas/null_sims/ukb_40pcs/uncompressed_gwas/plink_gwas.plink2.pheno1.glm.linear.assoc.txt'),
 PosixPath('/tmp/phenoplier/results/gls/gwas/null_sims/ukb_40pcs/uncompressed_gwas/plink_gwas.plink2.pheno2.glm.linear.assoc.txt'),
 PosixPath('/tmp/phenoplier/results/gls/gwas/null_sims/ukb_40pcs/uncompressed_gwas/plink_gwas.plink2.pheno3.glm.linear.assoc.txt'),
 PosixPath('/tmp/phenoplier/results/gls/gwas/null_sims/ukb_40pcs/uncompressed_gwas/plink_gwas.plink2.pheno4.glm.linear.assoc.txt'),
 PosixPath('/tmp/phenoplier/results/gls/gwas/null_sims/ukb_40pcs/uncompressed_gwas/plink_gwas.plink2.pheno5.glm.linear.assoc.txt'),
 PosixPath('/tmp/phenoplier/results/gls/gwas/null_sims/ukb_40pcs/uncompressed_gwas/plink_gwas.plink2.pheno6.glm.linear.assoc.txt'),
 PosixPath('/tmp/phenoplier/results/gls/gwas/null_sims/ukb_40pcs/uncompresse

# Check inflation factor

In [20]:
PAT = re.compile(
    r"Genomic inflation est\. lambda \(based on median chisq\) = (?P<inf_factor>[0-9\.]+)\."
)

In [21]:
# testing
input_text = """
PLINK v2.00a3LM 64-bit Intel (26 Apr 2022)     www.cog-genomics.org/plink/2.0/
(C) 2005-2022 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to plink2.log.
Options in effect:
  --adjust-file /opt/data/data/1000g/genotypes/gwas/random.pheno0.glm.linear test=ADD

Start time: Fri Apr 29 16:12:24 2022
64185 MiB RAM detected; reserving 32092 MiB for main workspace.
Using up to 4 compute threads.
--adjust: Genomic inflation est. lambda (based on median chisq) = 1.00316.
--adjust-file values (5923554 tests) written to plink2.adjusted .
End time: Fri Apr 29 16:12:33 2022
"""

m = PAT.search(input_text)
assert m.group("inf_factor") == "1.00316"

In [22]:
# testing
input_text = """
PLINK v2.00a3LM 64-bit Intel (26 Apr 2022)     www.cog-genomics.org/plink/2.0/
(C) 2005-2022 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to plink2.log.
Options in effect:
  --adjust-file base/data/1000g/genotypes/gwas/random.pheno1.glm.linear test=ADD

Start time: Fri Apr 29 12:19:51 2022
64185 MiB RAM detected; reserving 32092 MiB for main workspace.
Using up to 4 compute threads.
--adjust: Genomic inflation est. lambda (based on median chisq) = 1.
--adjust-file values (5923554 tests) written to plink2.adjusted .
End time: Fri Apr 29 12:19:59 2022
"""

m = PAT.search(input_text)
display(m.group("inf_factor"))
assert m.group("inf_factor") == "1"

'1'

In [24]:
def _compute_inflation_factor(gwas_files_group):
    res = {}
    for gwas_file in gwas_files_group:
        output_dir = Path(tempfile.mkdtemp(prefix="plink-adjust-"))
        output_file = output_dir / "outfile"
        result = subprocess.run(
            [
                PLINK2,
                "--adjust-file",
                str(gwas_file),
                "test=ADD",
                "--threads",
                str(N_JOBS),
                "--out",
                str(output_file),
            ],
            stdout=subprocess.PIPE,
        )

        assert result.returncode == 0

        result_output = result.stdout.decode("utf-8")
        inf_factor = float(PAT.search(result_output).group("inf_factor"))
        res[gwas_file.name] = inf_factor

        # delete temporary folder
        shutil.rmtree(output_dir)

    return res

In [28]:
# testing
_gwas_file = gwas_files[0]
display(_gwas_file)

_tmp = _compute_inflation_factor([_gwas_file])
assert _tmp is not None
assert _gwas_file.name in _tmp
display(_tmp)
assert 1.005 >= _tmp[_gwas_file.name] >= 1.0

PosixPath('/tmp/phenoplier/results/gls/gwas/null_sims/ukb_40pcs/uncompressed_gwas/plink_gwas.plink2.pheno0.glm.linear.assoc.txt')

{'plink_gwas.plink2.pheno0.glm.linear.assoc.txt': 1.01129}

AssertionError: 

In [31]:
gwas_files_chunks = list(
    chunker(
        gwas_files,
        int(min(10, len(gwas_files) / N_JOBS)),
    )
)

In [32]:
len(gwas_files_chunks)

2

In [33]:
all_results = {}

with ProcessPoolExecutor(max_workers=N_JOBS) as executor:
    tasks = [
        executor.submit(_compute_inflation_factor, chunk) for chunk in gwas_files_chunks
    ]
    for future in as_completed(tasks):
        res = future.result()
        all_results.update(res)

In [34]:
assert len(all_results) == len(gwas_files)

# Create dataframe

In [35]:
all_results_df = pd.Series(all_results, name="inflation_factor").rename_axis(
    "phenotype_code"
)

In [36]:
all_results_df.shape

(10,)

In [37]:
all_results_df.head()

phenotype_code
plink_gwas.plink2.pheno5.glm.linear.assoc.txt    1.00149
plink_gwas.plink2.pheno6.glm.linear.assoc.txt    1.00000
plink_gwas.plink2.pheno7.glm.linear.assoc.txt    1.00633
plink_gwas.plink2.pheno8.glm.linear.assoc.txt    1.00000
plink_gwas.plink2.pheno9.glm.linear.assoc.txt    1.00000
Name: inflation_factor, dtype: float64

# Checks

In [38]:
all_results_df.describe()

count    10.000000
mean      1.002274
std       0.003784
min       1.000000
25%       1.000000
50%       1.000180
75%       1.002825
max       1.011290
Name: inflation_factor, dtype: float64

In [39]:
assert all_results_df.min() >= 1.0
assert all_results_df.max() <= 1.04

In [40]:
all_results_df.sort_values(ascending=False).head(20)

phenotype_code
plink_gwas.plink2.pheno0.glm.linear.assoc.txt    1.01129
plink_gwas.plink2.pheno7.glm.linear.assoc.txt    1.00633
plink_gwas.plink2.pheno4.glm.linear.assoc.txt    1.00327
plink_gwas.plink2.pheno5.glm.linear.assoc.txt    1.00149
plink_gwas.plink2.pheno3.glm.linear.assoc.txt    1.00036
plink_gwas.plink2.pheno6.glm.linear.assoc.txt    1.00000
plink_gwas.plink2.pheno9.glm.linear.assoc.txt    1.00000
plink_gwas.plink2.pheno8.glm.linear.assoc.txt    1.00000
plink_gwas.plink2.pheno2.glm.linear.assoc.txt    1.00000
plink_gwas.plink2.pheno1.glm.linear.assoc.txt    1.00000
Name: inflation_factor, dtype: float64

# Save

In [41]:
all_results_df.to_csv(
    GWAS_DIR / "random_pheno-gwas-inflation_factors.tsv", sep="\t"
)