In [1]:
import pandas as pd
import polars as pl
from tqdm import tqdm

In [16]:
# Consequence labels that this notebook groups under the name NON_EXONIC.
NON_EXONIC = [
    "intergenic_variant",
    "intron_variant",
    "upstream_gene_variant",
    "downstream_gene_variant",
]

# CRE class labels, and the matching "<class>_flank" label for each one.
cre_classes = [
    "PLS",
    "pELS",
    "dELS",
    "DNase-H3K4me3",
    "CTCF-only",
]
cre_flank_classes = [cre_class + "_flank" for cre_class in cre_classes]

In [4]:
# Load the disease list and keep the distinct values of its first column.
# The file is whitespace-delimited and has no header row, so the three column
# names are supplied here. `sep=r"\s+"` is the documented replacement for
# `delim_whitespace=True`, which pandas deprecated in 2.2 (FutureWarning).
traits = pd.read_csv(
    "../../results/gwas_gokcen/raw/disease_list.txt",
    sep=r"\s+",
    header=None,
    names=["trait", "source", "name"],
).trait.unique()
traits

array(['AD', 'ADHD', 'AIT', 'ASM', 'BMD-HT', 'BMI', 'BP', 'Breast_cancer',
       'CAD', 'CD', 'Celiac', 'CVD', 'ECOL', 'Eczema', 'EY', 'FG',
       'Glucose', 'HbA1c', 'HDL', 'Height', 'HT', 'HTN', 'IBD',
       'Insomnia', 'Intel', 'LDL', 'MCH', 'MDD', 'MNP', 'MNR', 'MS',
       'NRT', 'PLT', 'RA', 'RBC', 'RDW', 'RR-ENT', 'SCZ', 'SLE', 'SWB',
       'T2D', 'TC', 'TG', 'WHR'], dtype=object)

In [17]:
# Load coords.annot_with_cre.parquet and keep only the rows whose
# `consequence` is one of the NON_EXONIC, CRE-class, or CRE-flank labels.
allowed_consequences = NON_EXONIC + cre_classes + cre_flank_classes
V_cre = pl.read_parquet(
    "../../results/gwas_gokcen/coords.annot_with_cre.parquet"
).filter(pl.col("consequence").is_in(allowed_consequences))
V_cre

chrom,pos,ref,alt,consequence
str,i64,str,str,str
"""1""",950296,"""C""","""A""","""intron_variant"""
"""1""",951408,"""G""","""A""","""CTCF-only_flank"""
"""1""",952180,"""A""","""C""","""CTCF-only_flank"""
"""1""",955679,"""C""","""T""","""dELS"""
"""1""",956565,"""A""","""G""","""intron_variant"""
…,…,…,…,…
"""22""",50712045,"""G""","""A""","""intron_variant"""
"""22""",50712922,"""C""","""T""","""intron_variant"""
"""22""",50713203,"""C""","""T""","""dELS_flank"""
"""22""",50713214,"""C""","""T""","""dELS_flank"""


In [9]:
# For every trait, count the variants with PIP > 0.9 (n_pos) and with
# PIP < 0.01 (n_neg) in that trait's processed parquet file, then rank the
# traits by n_pos, largest first.
count_rows = []
for trait in tqdm(traits):
    V = pl.read_parquet(f"../../results/gwas_gokcen/processed/{trait}.parquet")
    pip = V["PIP"]
    n_pos = (pip > 0.9).sum()
    n_neg = (pip < 0.01).sum()
    count_rows.append([trait, n_pos, n_neg])
res = pd.DataFrame(count_rows, columns=["trait", "n_pos", "n_neg"]).sort_values(
    "n_pos", ascending=False
)
res

100%|████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 104.98it/s]


Unnamed: 0,trait,n_pos,n_neg
19,Height,571,258346
4,BMD-HT,258,107833
32,PLT,252,110446
26,MCH,240,85326
18,HDL,198,50863
34,RBC,196,71452
35,RDW,189,67325
42,TG,146,45949
22,IBD,125,1530
17,HbA1c,113,37285


In [19]:
# Per-trait counts of variants with PIP > 0.9 (n_pos) and PIP < 0.01 (n_neg),
# restricted to variants that also appear in V_cre (inner join on
# chrom/pos/ref/alt). Traits are ranked by n_pos, largest first.
# NOTE(review): if V_cre can hold more than one row per variant key, the inner
# join repeats that variant and inflates the counts — confirm keys are unique.
variant_key = ["chrom", "pos", "ref", "alt"]
count_rows = []
for trait in tqdm(traits):
    V = pl.read_parquet(f"../../results/gwas_gokcen/processed/{trait}.parquet")
    V = V.join(V_cre, on=variant_key, how="inner")
    pip = V["PIP"]
    n_pos = (pip > 0.9).sum()
    n_neg = (pip < 0.01).sum()
    count_rows.append([trait, n_pos, n_neg])
res = pd.DataFrame(count_rows, columns=["trait", "n_pos", "n_neg"]).sort_values(
    "n_pos", ascending=False
)
res

100%|█████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 54.90it/s]


Unnamed: 0,trait,n_pos,n_neg
19,Height,372,240293
4,BMD-HT,195,101474
32,PLT,187,100804
26,MCH,170,78005
18,HDL,161,46626
35,RDW,128,61335
34,RBC,128,65382
42,TG,119,42399
22,IBD,112,1327
9,CD,83,1249


In [4]:
# Load the IBD-deLange2017 table from the raw results directory; fields are
# tab-separated.
ibd_table_path = "../../results/gwas_gokcen/raw/IBD-deLange2017.susie.gwfinemap.b38.gz"
df = pl.read_csv(ibd_table_path, separator="\t")
df

CHR,SNP,BP,A1,A2,SNPVAR,MAF,N,Z,P,PIP,BETA_MEAN,BETA_SD,CREDIBLE_SET
str,str,i64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,i64
"""chr1""","""rs61769339""",727242,"""G""","""A""",2.1580e-9,0.11408,58331.0,1.0238,0.30593,0.0,6.2064e-179,8.1394e-91,0
"""chr1""","""rs12238997""",758351,"""A""","""G""",2.1580e-9,0.11937,58331.0,-0.90083,0.367679,0.0,-6.7713e-181,9.2966e-92,0
"""chr1""","""rs189800799""",766455,"""T""","""C""",2.1580e-9,0.032521,58331.0,-1.8394,0.0658564,0.0,-1.3112e-179,3.4150e-91,0
"""chr1""","""rs114983708""",778639,"""A""","""G""",2.1580e-9,0.044318,58331.0,-1.2408,0.21468,0.0,-3.6255e-179,6.0389e-91,0
"""chr1""","""rs138660747""",778897,"""C""","""A""",2.1580e-9,0.0071805,58331.0,0.084495,0.932663,0.0,1.0589e-180,9.3934e-92,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr22""","""rs189416433""",50788192,"""T""","""A""",2.1580e-9,0.010137,58331.0,-0.011561,0.990776,0.000425,1.4565e-7,0.0000746,0
"""chr22""","""rs9616985""",50791377,"""T""","""C""",2.1580e-9,0.073586,58331.0,-0.41176,0.680515,0.000451,-5.7109e-7,0.000081,0
"""chr22""","""rs144549712""",50791427,"""G""","""A""",2.1580e-9,0.14197,58331.0,-0.74009,0.459245,0.000516,-0.000001,0.000097,0
"""chr22""","""rs191117135""",50796371,"""G""","""A""",2.1580e-9,0.015159,58331.0,-2.7733,0.005549,0.00537,-0.000044,0.000651,0


In [5]:
# Keep only the rows whose PIP is strictly greater than 0.9.
# NOTE: this rebinds `df`, so the unfiltered table is no longer reachable
# under that name after this cell runs.
high_pip = pl.col("PIP").gt(0.9)
df = df.filter(high_pip)
df

CHR,SNP,BP,A1,A2,SNPVAR,MAF,N,Z,P,PIP,BETA_MEAN,BETA_SD,CREDIBLE_SET
str,str,i64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,i64
"""chr1""","""rs2275916""",1439425,"""G""","""C""",2.1580e-9,0.012386,58331.0,-3.2356,0.001214,1.0,-0.270234,0.003744,9
"""chr1""","""rs1892289""",1439454,"""A""","""G""",1.3956e-8,0.23856,58331.0,4.0179,0.000059,1.0,1.22924,0.003744,6
"""chr1""","""rs4590""",1440430,"""C""","""G""",2.1580e-9,0.22371,58331.0,3.5207,0.00043,1.0,1.06555,0.003744,4
"""chr1""","""rs711180""",1442793,"""G""","""A""",2.1580e-9,0.16582,58331.0,-2.6368,0.008369,1.0,-2.0247,0.003744,5
"""chr1""","""rs147959152""",1443620,"""G""","""A""",2.1580e-9,0.04146,58331.0,-0.93684,0.348841,1.0,-1.06692,0.003744,3
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr22""","""rs9611176""",39441998,"""A""","""G""",2.1580e-9,0.25461,58331.0,2.3214,0.0202653,1.0,0.269764,0.004086,9
"""chr22""","""rs113200473""",39457735,"""G""","""A""",2.1580e-9,0.24594,58331.0,-1.6667,0.0955741,1.0,-0.493977,0.004083,7
"""chr22""","""rs7286917""",39464863,"""A""","""G""",2.1580e-9,0.24642,58331.0,1.7241,0.0846898,1.0,0.234959,0.004083,2
"""chr22""","""rs151114897""",39497671,"""C""","""T""",2.1580e-9,0.015723,58331.0,2.0734,0.0381351,0.999537,-0.087313,0.00449,8


In [6]:
# Show the rows of the current `df` whose P is strictly below 5e-8.
df.filter(pl.col("P").lt(5e-8))

CHR,SNP,BP,A1,A2,SNPVAR,MAF,N,Z,P,PIP,BETA_MEAN,BETA_SD,CREDIBLE_SET
str,str,i64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,i64
"""chr1""","""rs59523164""",7911767,"""G""","""A""",2.1580e-9,0.17623,58331.0,6.3432,2.2504e-10,1.0,0.189047,0.005769,3
"""chr1""","""rs2641116""",7969434,"""T""","""G""",2.1580e-9,0.17445,58331.0,-7.1287,1.0132e-12,0.921796,-0.097197,0.0394404,6
"""chr1""","""rs1883679""",8040391,"""T""","""G""",2.1580e-9,0.17091,58331.0,-7.2544,4.0345e-13,1.0,-0.211305,0.006536,2
"""chr1""","""rs34045935""",8126172,"""C""","""T""",2.1580e-9,0.16619,58331.0,7.343,2.0886e-13,1.0,0.230465,0.005774,1
"""chr1""","""rs72635736""",8229192,"""T""","""C""",2.1580e-9,0.2295,58331.0,-5.7712,7.8709e-9,1.0,-0.092932,0.00408,4
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr21""","""rs1736161""",15460903,"""G""","""A""",2.1580e-9,0.44324,58331.0,11.551,7.2966e-31,0.909562,0.137112,0.0592834,8
"""chr21""","""rs2876932""",44198653,"""A""","""G""",2.1580e-9,0.38458,58331.0,-10.079,6.8419e-24,1.0,-0.090716,0.004122,1
"""chr22""","""rs5754467""",21630805,"""A""","""G""",2.1580e-9,0.19923,58331.0,7.5409,4.6674e-14,1.0,0.0893791,0.004118,1
"""chr22""","""rs714027""",30181782,"""A""","""G""",1.4870e-7,0.45413,58331.0,-5.5484,2.8830e-8,1.0,-0.059962,0.004113,2


In [7]:
# Show the rows of the current `df` whose P is strictly below 0.05.
df.filter(pl.col("P").lt(0.05))

CHR,SNP,BP,A1,A2,SNPVAR,MAF,N,Z,P,PIP,BETA_MEAN,BETA_SD,CREDIBLE_SET
str,str,i64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,i64
"""chr1""","""rs2275916""",1439425,"""G""","""C""",2.1580e-9,0.012386,58331.0,-3.2356,0.001214,1.0,-0.270234,0.003744,9
"""chr1""","""rs1892289""",1439454,"""A""","""G""",1.3956e-8,0.23856,58331.0,4.0179,0.000059,1.0,1.22924,0.003744,6
"""chr1""","""rs4590""",1440430,"""C""","""G""",2.1580e-9,0.22371,58331.0,3.5207,0.00043,1.0,1.06555,0.003744,4
"""chr1""","""rs711180""",1442793,"""G""","""A""",2.1580e-9,0.16582,58331.0,-2.6368,0.008369,1.0,-2.0247,0.003744,5
"""chr1""","""rs34388881""",1446303,"""G""","""A""",2.1580e-9,0.012799,58331.0,-2.8913,0.003837,1.0,-0.61367,0.003744,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""chr22""","""rs149953174""",39329225,"""G""","""A""",2.1580e-9,0.012864,58331.0,2.3612,0.0182159,1.0,0.121061,0.004081,5
"""chr22""","""rs5750808""",39394982,"""G""","""A""",2.1580e-9,0.30277,58331.0,-3.8872,0.000101,0.986959,-0.471763,0.0543812,3
"""chr22""","""rs7288760""",39423964,"""A""","""G""",2.1580e-9,0.28478,58331.0,3.25,0.001154,1.0,0.479741,0.004083,6
"""chr22""","""rs9611176""",39441998,"""A""","""G""",2.1580e-9,0.25461,58331.0,2.3214,0.0202653,1.0,0.269764,0.004086,9
