In [4]:
import numpy as np
import polars as pl

In [38]:
# Coordinate column names: chrom, pos, ref, alt
# (presumably chromosome, position, reference and alternate allele — confirm).
COORDINATES = ["chrom", "pos", "ref", "alt"]

In [32]:
dataset = "gwas_matched_9"
# Load the test split and keep only rows whose boolean `label` is True and
# whose consequence is not a missense variant.
V = (
    pl.read_parquet(f"../../results/dataset/{dataset}/test.parquet")
    .filter(pl.col("label"))
    .filter(pl.col("consequence") != "missense_variant")
    .with_columns(
        # Comma-separated trait string -> list of trait names.
        pl.col("trait").str.split(","),
        # Variant id "chrom_pos_ref_alt". It is built from column expressions
        # on the frame being transformed; indexing `V[...]` here would either
        # raise NameError on a fresh kernel or silently reuse rows from a
        # previously defined (possibly differently filtered) `V`.
        id=(
            pl.col("chrom")
            + "_"
            + pl.col("pos").cast(str)
            + "_"
            + pl.col("ref")
            + "_"
            + pl.col("alt")
        ),
    )
)
V

chrom,pos,ref,alt,pip,trait,label,maf,ld_score,consequence,tss_dist,match_group
str,i64,str,str,f64,list[str],bool,f64,f64,str,i64,str
"""1""",2293397,"""G""","""A""",0.999932,"[""Height""]",true,0.37057,32.302,"""dELS""",65077,"""dELS_0"""
"""1""",3080038,"""T""","""C""",0.999895,"[""MCH"", ""MCV"", … ""RBC""]",true,0.23272,31.606,"""dELS""",10826,"""dELS_1"""
"""1""",3774964,"""A""","""G""",0.999973,"[""Hb"", ""HbA1c"", … ""RBC""]",true,0.23057,95.317,"""dELS""",2138,"""dELS_2"""
"""1""",9181780,"""G""","""A""",1.0,"[""AST"", ""Mono""]",true,0.074322,35.472,"""non_coding_transcript_exon_var…",52677,"""non_coding_transcript_exon_var…"
"""1""",9295877,"""G""","""T""",0.993319,"[""DVT""]",true,0.26506,46.307,"""dELS""",1348,"""dELS_3"""
…,…,…,…,…,…,…,…,…,…,…,…
"""22""",45969257,"""G""","""A""",0.999905,"[""eBMD""]",true,0.0026413,9.5349,"""dELS""",3457,"""dELS_207"""
"""22""",45979731,"""C""","""T""",0.98026,"[""eBMD""]",true,0.16714,37.776,"""dELS""",2568,"""dELS_208"""
"""22""",46219479,"""G""","""A""",0.945802,"[""BW""]",true,0.12871,164.85,"""dELS_flank""",28805,"""dELS_flank_108"""
"""22""",47990921,"""C""","""T""",0.997536,"[""BMI""]",true,0.43761,53.514,"""intron_variant""",498631,"""intron_variant_124"""


In [None]:
def n_common_hits(x, y):
    """Count the variant ids in the global frame ``V`` shared by two traits.

    Parameters
    ----------
    x, y
        Trait names looked up in the ``trait`` list column of ``V``.

    Returns
    -------
    int
        Number of distinct ``id`` values whose rows contain ``x`` and also
        appear among the rows containing ``y``.
    """
    ids_x, ids_y = (
        set(V.filter(pl.col("trait").list.contains(name))["id"])
        for name in (x, y)
    )
    return len(ids_x.intersection(ids_y))

In [33]:
# Concatenate the per-row trait lists of V into one array and keep each
# distinct trait name once; np.unique returns the names sorted.
traits = np.unique(np.concatenate(V["trait"]))
traits

array(['AFib', 'AG', 'AID_Combined', 'ALP', 'ALT', 'AST',
       'Age_at_Menarche', 'Age_at_Menopause', 'Alb', 'Alzheimer_LTFH',
       'ApoA', 'ApoB', 'Asthma', 'BFP', 'BMI', 'BW', 'Balding_Type4',
       'Baso', 'BrC', 'CAD', 'CRC', 'CRP', 'Ca', 'Cholelithiasis', 'DBP',
       'DVT', 'Eosino', 'FEV1FVC', 'Fibroblastic_Disorders', 'GGT',
       'Glaucoma_Combined', 'Glucose', 'HDLC', 'Hb', 'HbA1c', 'Height',
       'Ht', 'Hypothyroidism', 'IBD', 'IGF1', 'Insomnia', 'Irritability',
       'LDLC', 'LOY', 'LipoA', 'Lym', 'MAP', 'MCH', 'MCHC', 'MCP', 'MCV',
       'Migraine_Self', 'Miserableness', 'Mono', 'Mood_Swings',
       'Morning_Person', 'Neutro', 'PP', 'Plt', 'PrC', 'RBC',
       'Risk_Taking', 'SBP', 'SHBG', 'Sensitivity', 'Smoking_Ever_Never',
       'Suffer_from_Nerves', 'T2D', 'T2D_BMI', 'TBil', 'TC', 'TG', 'TP',
       'Testosterone', 'UA', 'Urea', 'VitD', 'WBC', 'WHRadjBMI',
       'Worrier', 'eBMD', 'eGFR', 'eGFRcys'], dtype='<U22')

In [34]:
# Per-trait summary over V: how many rows list the trait (n_hits) and on how
# many distinct chromosomes those rows fall (n_chroms).
n_hits = [
    V.filter(pl.col("trait").list.contains(name)).height
    for name in traits
]
n_chroms = [
    V.filter(pl.col("trait").list.contains(name))["chrom"].n_unique()
    for name in traits
]
# One row per trait, in the same order as `traits`.
df = pl.DataFrame({"trait": traits, "n_hits": n_hits, "n_chroms": n_chroms})
df

trait,n_hits,n_chroms
str,i64,i64
"""AFib""",5,4
"""AG""",40,15
"""AID_Combined""",2,2
"""ALP""",41,17
"""ALT""",23,16
…,…,…
"""WHRadjBMI""",24,12
"""Worrier""",1,1
"""eBMD""",68,21
"""eGFR""",32,17


In [95]:
# Keep traits with at least 30 hits spread over at least 3 chromosomes,
# ordered from most to fewest hits.
# maintain_order=True makes the sort stable, so traits with equal n_hits keep
# their relative order from `df`. Row order decides which trait wins in the
# greedy overlap pruning that consumes `df2`, so ties must be reproducible.
df2 = (
    df
    .filter(pl.col("n_hits") >= 30, pl.col("n_chroms") >= 3)
    .sort("n_hits", descending=True, maintain_order=True)
)
df2

trait,n_hits,n_chroms
str,i64,i64
"""Height""",81,18
"""MCV""",70,20
"""Plt""",70,17
"""MCH""",68,21
"""eBMD""",68,21
…,…,…
"""CRP""",33,16
"""ApoA""",32,16
"""eGFR""",32,17
"""BW""",30,17


In [96]:
# Greedy overlap pruning over df2 (rows taken in their current order):
# every trait that is still kept excludes each later trait sharing more than
# 10% of the kept trait's own hit count with it.
trait_names = df2["trait"].to_list()
hit_counts = df2["n_hits"].to_list()
exclude = np.full(len(df2), False)

for i, (trait1, n1) in enumerate(zip(trait_names, hit_counts)):
    if exclude[i]:
        # An already-excluded trait does not exclude others.
        continue
    for j in range(i + 1, len(trait_names)):
        if exclude[j]:
            continue
        if n_common_hits(trait1, trait_names[j]) > n1 * 0.1:
            exclude[j] = True
                

In [97]:
# Keep only the rows of df2 that were not flagged in `exclude`.
df3 = df2.filter(np.logical_not(exclude))
df3

trait,n_hits,n_chroms
str,i64,i64
"""Height""",81,18
"""MCV""",70,20
"""Plt""",70,17
"""eBMD""",68,21
"""Mono""",66,19
…,…,…
"""Hb""",36,16
"""Neutro""",35,15
"""CRP""",33,16
"""BW""",30,17


In [101]:
# Write the retained trait names, one per line and without a header row.
(
    df3
    .select("trait")
    .write_csv(
        "../../config/gwas/independent_traits_filtered.csv",
        include_header=False,
    )
)