In [6]:
import pandas as pd
import polars as pl
from tqdm import tqdm

In [16]:
# Consequence labels that this notebook treats as non-exonic.
NON_EXONIC = [
    "intergenic_variant",
    "intron_variant",
    "upstream_gene_variant",
    "downstream_gene_variant",
]

# Regulatory-element class labels (presumably ENCODE cCRE classes — TODO confirm),
# plus a "<class>_flank" counterpart for each one.
cre_classes = [
    "PLS",
    "pELS",
    "dELS",
    "DNase-H3K4me3",
    "CTCF-only",
]
cre_flank_classes = [label + "_flank" for label in cre_classes]

In [4]:
# Load the trait list: a whitespace-separated text file without a header row,
# read as three columns (trait, source, name). Only the distinct values of the
# `trait` column are kept, as a NumPy array.
# `sep=r"\s+"` is the documented replacement for `delim_whitespace=True`,
# which is deprecated since pandas 2.2 and emits a FutureWarning.
traits = pd.read_csv(
    "../../results/gwas_gokcen/raw/disease_list.txt",
    sep=r"\s+",
    header=None,
    names=["trait", "source", "name"],
)["trait"].unique()
traits

array(['AD', 'ADHD', 'AIT', 'ASM', 'BMD-HT', 'BMI', 'BP', 'Breast_cancer',
       'CAD', 'CD', 'Celiac', 'CVD', 'ECOL', 'Eczema', 'EY', 'FG',
       'Glucose', 'HbA1c', 'HDL', 'Height', 'HT', 'HTN', 'IBD',
       'Insomnia', 'Intel', 'LDL', 'MCH', 'MDD', 'MNP', 'MNR', 'MS',
       'NRT', 'PLT', 'RA', 'RBC', 'RDW', 'RR-ENT', 'SCZ', 'SLE', 'SWB',
       'T2D', 'TC', 'TG', 'WHR'], dtype=object)

In [17]:
# Read the variant annotation table and keep only the rows whose `consequence`
# is one of the non-exonic labels, a CRE class, or a CRE flank class.
V_cre = pl.read_parquet(
    "../../results/gwas_gokcen/coords.annot_with_cre.parquet"
).filter(
    pl.col("consequence").is_in(NON_EXONIC + cre_classes + cre_flank_classes)
)
V_cre

chrom,pos,ref,alt,consequence
str,i64,str,str,str
"""1""",950296,"""C""","""A""","""intron_variant"""
"""1""",951408,"""G""","""A""","""CTCF-only_flank"""
"""1""",952180,"""A""","""C""","""CTCF-only_flank"""
"""1""",955679,"""C""","""T""","""dELS"""
"""1""",956565,"""A""","""G""","""intron_variant"""
…,…,…,…,…
"""22""",50712045,"""G""","""A""","""intron_variant"""
"""22""",50712922,"""C""","""T""","""intron_variant"""
"""22""",50713203,"""C""","""T""","""dELS_flank"""
"""22""",50713214,"""C""","""T""","""dELS_flank"""


In [9]:
# For every trait, count the variants with PIP > 0.9 ("n_pos") and with
# PIP < 0.01 ("n_neg") in that trait's processed table.
pip_counts = []
for trait in tqdm(traits):
    V = pl.read_parquet(f"../../results/gwas_gokcen/processed/{trait}.parquet")
    pip_counts.append(
        {
            "trait": trait,
            "n_pos": (V["PIP"] > 0.9).sum(),
            "n_neg": (V["PIP"] < 0.01).sum(),
        }
    )

# One row per trait, traits with the most high-PIP variants first.
res = pd.DataFrame(pip_counts, columns=["trait", "n_pos", "n_neg"]).sort_values(
    "n_pos", ascending=False
)
res

100%|████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 104.98it/s]


Unnamed: 0,trait,n_pos,n_neg
19,Height,571,258346
4,BMD-HT,258,107833
32,PLT,252,110446
26,MCH,240,85326
18,HDL,198,50863
34,RBC,196,71452
35,RDW,189,67325
42,TG,146,45949
22,IBD,125,1530
17,HbA1c,113,37285


In [19]:
# Same per-trait tally of PIP > 0.9 ("n_pos") and PIP < 0.01 ("n_neg"), but
# restricted to variants that are also present in V_cre.
# NOTE(review): an inner join repeats a variant once per matching V_cre row, so
# these counts assume (chrom, pos, ref, alt) is unique in V_cre — TODO confirm.
pip_counts = []
for trait in tqdm(traits):
    V = pl.read_parquet(
        f"../../results/gwas_gokcen/processed/{trait}.parquet"
    ).join(V_cre, on=["chrom", "pos", "ref", "alt"], how="inner")
    pip_counts.append(
        {
            "trait": trait,
            "n_pos": (V["PIP"] > 0.9).sum(),
            "n_neg": (V["PIP"] < 0.01).sum(),
        }
    )

# One row per trait, traits with the most high-PIP variants first.
res = pd.DataFrame(pip_counts, columns=["trait", "n_pos", "n_neg"]).sort_values(
    "n_pos", ascending=False
)
res

100%|█████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 54.90it/s]


Unnamed: 0,trait,n_pos,n_neg
19,Height,372,240293
4,BMD-HT,195,101474
32,PLT,187,100804
26,MCH,170,78005
18,HDL,161,46626
35,RDW,128,61335
34,RBC,128,65382
42,TG,119,42399
22,IBD,112,1327
9,CD,83,1249
