# Description

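Explores the structure of the GWAS results files for the chronotype project: which files are available, which columns they contain, and how large they are.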

# Modules

In [1]:
import re
import subprocess
from pathlib import Path
import tempfile
import shutil
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import pandas as pd

import conf
from utils import chunker

# Settings

In [2]:
# Key used to look up this project's settings in conf.PROJECTS
PROJECTS_TRAIT_KEY = "CHRONOTYPE"

In [3]:
# Parameters
# NOTE(review): looks like the repository-relative path of this notebook,
# presumably injected by the tool that executes it (TODO confirm); it is not
# referenced by the cells that follow.
PHENOPLIER_NOTEBOOK_FILEPATH = "projects/chronotype/nbs/05_gwas/03-gwas-explore.ipynb"

# Paths

In [4]:
# Directory with the GWAS result files: the "gwas" subfolder of the
# selected project's data directory
project_settings = conf.PROJECTS[PROJECTS_TRAIT_KEY]
INPUT_GWAS_DIR = project_settings["DATA_DIR"] / "gwas"

display(INPUT_GWAS_DIR)
# stop here if the directory is missing
assert INPUT_GWAS_DIR.exists()

PosixPath('/opt/data/projects/chronotype/data/gwas')

In [5]:
# Path to the PLINK 2 executable, taken from the project configuration
plink_settings = conf.PLINK
PLINK2 = plink_settings["EXECUTABLE_VERSION_2"]

display(PLINK2)
# stop here if the executable is missing
assert PLINK2.exists()

PosixPath('/opt/data/software/plink/plink2')

# GWAS results files

In [6]:
# list the gzip-compressed files present in the GWAS directory;
# sorted() consumes the glob generator directly and returns a list
gwas_files = sorted(INPUT_GWAS_DIR.glob("*.gz"))

# show how many files were found, then (at most) the first ten
display(len(gwas_files), gwas_files[:10])

1

[PosixPath('/opt/data/projects/chronotype/data/gwas/chronotype_raw_BOLT.output_HRC.only_plus.metrics_maf0.001_hwep1em12_info0.3.txt.gz')]

In [7]:
# load the traits info table; its gwas_file column names the GWAS file of each trait
traits_info_path = conf.PROJECTS[PROJECTS_TRAIT_KEY]["TRAITS_INFO_FILE"]
traits_info = pd.read_csv(traits_info_path)

In [8]:
# (rows, columns) of the traits info table
traits_info.shape

(1, 4)

In [9]:
# show the whole table to inspect its columns and values
traits_info

Unnamed: 0,id,gwas_file,sample_size,n_cases
0,chronotype,chronotype_raw_BOLT.output_HRC.only_plus.metri...,449734,


In [10]:
# Build the full path of each GWAS file declared in the traits info table.
# This replaces the directory listing obtained earlier, so only the files
# named in the traits info file are explored from here on.
# Iterating the column directly avoids building a Series per row with iterrows().
gwas_files = [INPUT_GWAS_DIR / gwas_file for gwas_file in traits_info["gwas_file"]]

# fail early, like the path checks above, if a declared file is not on disk
_missing_gwas_files = [f for f in gwas_files if not f.exists()]
assert not _missing_gwas_files, f"GWAS files not found: {_missing_gwas_files}"

In [11]:
# one path was built per row of traits_info, so this equals the number of rows there
len(gwas_files)

1

In [12]:
# full paths (INPUT_GWAS_DIR joined with each gwas_file entry)
gwas_files

[PosixPath('/opt/data/projects/chronotype/data/gwas/chronotype_raw_BOLT.output_HRC.only_plus.metrics_maf0.001_hwep1em12_info0.3.txt.gz')]

# Load GWAS

In [13]:
# peek at the first 10 rows only: enough to see the column layout of the
# tab-separated file without parsing all of it
gwas_preview_path = gwas_files[0]
df = pd.read_csv(gwas_preview_path, sep="\t", nrows=10)

In [14]:
# at this point df holds only the 10-row preview
df

Unnamed: 0,SNP,CHR,BP,ALLELE1,ALLELE0,A1FREQ,INFO,BETA,SE,P_BOLT_LMM,HWE_P
0,rs10399793,1,49298,T,C,0.376391,0.342797,-0.003118,0.004764,0.52,0.82583
1,rs2462492,1,54676,C,T,0.599144,0.340158,-0.000357,0.004721,0.96,0.633757
2,rs3107975,1,55326,T,C,0.991604,0.324228,0.009961,0.026451,0.72,0.405636
3,1:70728_C_T,1,70728,C,T,0.997841,0.365713,0.025184,0.047746,0.73,0.666235
4,rs2462495,1,79033,A,G,0.001262,0.536566,0.026157,0.0614,0.65,0.411469
5,rs114608975,1,86028,T,C,0.896414,0.340885,0.00977,0.007551,0.24,0.992579
6,rs6702460,1,91536,G,T,0.542929,0.340746,-0.005976,0.004649,0.25,0.846043
7,rs8179466,1,234313,C,T,0.925428,0.311447,-0.00097,0.009158,0.96,0.901059
8,rs6680723,1,534192,C,T,0.759155,0.349843,0.003497,0.005314,0.52,0.839277
9,rs6683466,1,534583,C,G,0.993323,0.345195,0.030821,0.030115,0.27,0.516151


In [15]:
# load the complete tab-separated file; df now holds the full table and
# replaces the 10-row preview read before
gwas_full_path = gwas_files[0]
df = pd.read_csv(gwas_full_path, sep="\t")

In [16]:
# (rows, columns) of the fully loaded file
df.shape

(11977111, 11)

In [17]:
# first rows of the fully loaded table
df.head()

Unnamed: 0,SNP,CHR,BP,ALLELE1,ALLELE0,A1FREQ,INFO,BETA,SE,P_BOLT_LMM,HWE_P
0,rs10399793,1,49298,T,C,0.376391,0.342797,-0.003118,0.004764,0.52,0.82583
1,rs2462492,1,54676,C,T,0.599144,0.340158,-0.000357,0.004721,0.96,0.633757
2,rs3107975,1,55326,T,C,0.991604,0.324228,0.009961,0.026451,0.72,0.405636
3,1:70728_C_T,1,70728,C,T,0.997841,0.365713,0.025184,0.047746,0.73,0.666235
4,rs2462495,1,79033,A,G,0.001262,0.536566,0.026157,0.0614,0.65,0.411469
