In [1]:
import numpy as np
import polars as pl

## Mendelian traits

In [13]:
# Load the Mendelian test split, keep rows where `label` is true and the
# consequence is not "missense_variant", then derive a `trait` column from
# the "OMIM" column.
dataset = "mendelian_matched_9"
test_path = f"../../results/dataset/{dataset}/test.parquet"

# Split "OMIM" on its first space into a two-field struct; the first field is
# a throwaway ("dummy"), the second becomes `trait`.
omim_struct = (
    pl.col("OMIM")
    .str.split_exact(" ", 1)
    .struct.rename_fields(["dummy", "trait"])
)

V = (
    pl.read_parquet(test_path)
    .filter(
        pl.col("label"),
        pl.col("consequence") != "missense_variant",
    )
    .with_columns(omim_struct)
    .unnest("OMIM")
    .drop("dummy")
)
V

chrom,pos,ref,alt,consequence,source,trait,label,tss_dist,match_group
str,i64,str,str,str,str,str,bool,i64,str
"""1""",7961859,"""C""","""G""","""PLS""","""OMIM""","""606324""",true,34,"""PLS_0"""
"""1""",9943502,"""A""","""T""","""5_prime_UTR_variant""","""OMIM""","""608553""",true,26,"""5_prime_UTR_variant_0"""
"""1""",9943503,"""C""","""T""","""5_prime_UTR_variant""","""OMIM""","""608553""",true,27,"""5_prime_UTR_variant_1"""
"""1""",11023351,"""G""","""A""","""3_prime_UTR_variant""","""OMIM""","""612069""",true,1206,"""3_prime_UTR_variant_0"""
"""1""",21509427,"""C""","""T""","""5_prime_UTR_variant""","""OMIM""","""241500""",true,0,"""5_prime_UTR_variant_2"""
…,…,…,…,…,…,…,…,…,…
"""X""",155022770,"""A""","""G""","""PLS""","""OMIM""","""306700""",true,46,"""PLS_69"""
"""X""",155022771,"""G""","""A""","""PLS""","""OMIM""","""306700""",true,47,"""PLS_70"""
"""X""",155022773,"""A""","""T""","""PLS""","""OMIM""","""306700""",true,49,"""PLS_71"""
"""X""",155022807,"""T""","""C""","""PLS""","""OMIM""","""306700""",true,83,"""PLS_72"""


In [17]:
# Number of variants per trait in V, restricted to traits with at least 10.
# `value_counts` does not guarantee any row order, so ties on "count" are
# broken by the trait identifier; this keeps the table (and the trait list
# later written from `df`) identical from run to run.
df = (
    V["trait"]
    .value_counts()
    .filter(pl.col("count") >= 10)
    .sort(["count", "trait"], descending=[True, False])
)
df

trait,count
str,u32
"""600886""",25
"""613985""",24
"""614743""",22
"""306900""",21
"""250250""",21
"""174500""",14
"""143890""",12
"""210710""",10


In [22]:
# Ad-hoc inspection of individual traits taken from the count table above.
# Each line filters V down to one trait; the trailing note is the author's
# annotation for that trait. Every call is commented out, so running this
# cell as written displays nothing; uncomment one line to view that trait.
# V.filter(trait="600886")  #5' UTR
#V.filter(trait="613985")  # 3' UTR
#V.filter(trait="614743")  # ncRNA
#V.filter(trait="306900")  # promoter
#V.filter(trait="250250")  # ncRNA
#V.filter(trait="174500")  # Enhancer

chrom,pos,ref,alt,consequence,source,trait,label,tss_dist,match_group
str,i64,str,str,str,str,str,bool,i64,str
"""7""",156791137,"""T""","""C""","""dELS_flank""","""OMIM""","""174500""",true,101849,"""dELS_flank_1"""
"""7""",156791255,"""G""","""C""","""dELS_flank""","""OMIM""","""174500""",true,101731,"""dELS_flank_2"""
"""7""",156791257,"""G""","""A""","""dELS_flank""","""OMIM""","""174500""",true,101729,"""dELS_flank_3"""
"""7""",156791413,"""A""","""C""","""dELS""","""OMIM""","""174500""",true,101573,"""dELS_1"""
"""7""",156791459,"""T""","""C""","""dELS""","""OMIM""","""174500""",true,101527,"""dELS_2"""
…,…,…,…,…,…,…,…,…,…
"""7""",156791571,"""T""","""A""","""dELS""","""OMIM""","""174500""",true,101415,"""dELS_9"""
"""7""",156791579,"""C""","""T""","""dELS""","""OMIM""","""174500""",true,101407,"""dELS_10"""
"""7""",156791581,"""A""","""G""","""dELS""","""OMIM""","""174500""",true,101405,"""dELS_11"""
"""7""",156791771,"""G""","""C""","""dELS_flank""","""OMIM""","""174500""",true,101215,"""dELS_flank_4"""


In [30]:
# Persist the retained trait identifiers, one per line, without a header row.
filtered_traits = df.select("trait")
filtered_traits.write_csv(
    "../../config/omim/filtered_traits.txt",
    include_header=False,
)

## Complex traits

In [17]:
# Load the GWAS test split, keep rows where `label` is true and the
# consequence is not "missense_variant", split the comma-separated `trait`
# string into a list, and add an `id` column of the form chrom_pos_ref_alt.
dataset = "gwas_matched_9"
V = (
    pl.read_parquet(f"../../results/dataset/{dataset}/test.parquet")
    .filter(
        pl.col("label"),
        pl.col("consequence") != "missense_variant",
    )
    .with_columns(
        pl.col("trait").str.split(","),
        id=(
            pl.col("chrom")
            + "_"
            + pl.col("pos").cast(str)
            + "_"
            + pl.col("ref")
            + "_"
            + pl.col("alt")
        ),
    )
)
V

chrom,pos,ref,alt,pip,trait,label,maf,ld_score,consequence,tss_dist,match_group,id
str,i64,str,str,f64,list[str],bool,f64,f64,str,i64,str,str
"""1""",2293397,"""G""","""A""",0.999932,"[""Height""]",true,0.37057,32.302,"""dELS""",65077,"""dELS_0""","""1_2293397_G_A"""
"""1""",3080038,"""T""","""C""",0.999895,"[""MCH"", ""MCV"", … ""RBC""]",true,0.23272,31.606,"""dELS""",10826,"""dELS_1""","""1_3080038_T_C"""
"""1""",3774964,"""A""","""G""",0.999973,"[""Hb"", ""HbA1c"", … ""RBC""]",true,0.23057,95.317,"""dELS""",2138,"""dELS_2""","""1_3774964_A_G"""
"""1""",9181780,"""G""","""A""",1.0,"[""AST"", ""Mono""]",true,0.074322,35.472,"""non_coding_transcript_exon_var…",52677,"""non_coding_transcript_exon_var…","""1_9181780_G_A"""
"""1""",9295877,"""G""","""T""",0.993319,"[""DVT""]",true,0.26506,46.307,"""dELS""",1348,"""dELS_3""","""1_9295877_G_T"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""22""",45969257,"""G""","""A""",0.999905,"[""eBMD""]",true,0.0026413,9.5349,"""dELS""",3457,"""dELS_207""","""22_45969257_G_A"""
"""22""",45979731,"""C""","""T""",0.98026,"[""eBMD""]",true,0.16714,37.776,"""dELS""",2568,"""dELS_208""","""22_45979731_C_T"""
"""22""",46219479,"""G""","""A""",0.945802,"[""BW""]",true,0.12871,164.85,"""dELS_flank""",28805,"""dELS_flank_108""","""22_46219479_G_A"""
"""22""",47990921,"""C""","""T""",0.997536,"[""BMI""]",true,0.43761,53.514,"""intron_variant""",498631,"""intron_variant_124""","""22_47990921_C_T"""


In [18]:
def n_common_hits(x, y):
    """Return how many distinct variant ids in V are listed under both traits.

    Reads the notebook-level frame `V`: a row counts for a trait when that
    trait appears in the row's `trait` list, and rows are identified by their
    `id` value.

    Args:
        x: First trait name.
        y: Second trait name.

    Returns:
        Size of the intersection of the two traits' id sets.
    """
    ids_x = set(V.filter(pl.col("trait").list.contains(x))["id"])
    ids_y = set(V.filter(pl.col("trait").list.contains(y))["id"])
    shared = ids_x & ids_y
    return len(shared)

In [19]:
# Flatten every row's trait list into one array, then reduce it to the
# sorted set of distinct trait names.
all_trait_labels = np.concatenate(V["trait"])
traits = np.unique(all_trait_labels)
traits

array(['AFib', 'AG', 'AID_Combined', 'ALP', 'ALT', 'AST',
       'Age_at_Menarche', 'Age_at_Menopause', 'Alb', 'Alzheimer_LTFH',
       'ApoA', 'ApoB', 'Asthma', 'BFP', 'BMI', 'BW', 'Balding_Type4',
       'Baso', 'BrC', 'CAD', 'CRC', 'CRP', 'Ca', 'Cholelithiasis', 'DBP',
       'DVT', 'Eosino', 'FEV1FVC', 'Fibroblastic_Disorders', 'GGT',
       'Glaucoma_Combined', 'Glucose', 'HDLC', 'Hb', 'HbA1c', 'Height',
       'Ht', 'Hypothyroidism', 'IBD', 'IGF1', 'Insomnia', 'Irritability',
       'LDLC', 'LOY', 'LipoA', 'Lym', 'MAP', 'MCH', 'MCHC', 'MCP', 'MCV',
       'Migraine_Self', 'Miserableness', 'Mono', 'Mood_Swings',
       'Morning_Person', 'Neutro', 'PP', 'Plt', 'PrC', 'RBC',
       'Risk_Taking', 'SBP', 'SHBG', 'Sensitivity', 'Smoking_Ever_Never',
       'Suffer_from_Nerves', 'T2D', 'T2D_BMI', 'TBil', 'TC', 'TG', 'TP',
       'Testosterone', 'UA', 'Urea', 'VitD', 'WBC', 'WHRadjBMI',
       'Worrier', 'eBMD', 'eGFR', 'eGFRcys'], dtype='<U22')

In [20]:
# Per-trait summary: number of rows in V listing the trait (n_hits) and the
# number of distinct chromosomes those rows fall on (n_chroms).
n_hits = []
n_chroms = []
for trait in traits:
    trait_rows = V.filter(pl.col("trait").list.contains(trait))
    n_hits.append(trait_rows.height)
    n_chroms.append(trait_rows["chrom"].n_unique())

df = pl.DataFrame(
    {
        "trait": traits,
        "n_hits": n_hits,
        "n_chroms": n_chroms,
    }
)
df

trait,n_hits,n_chroms
str,i64,i64
"""AFib""",5,4
"""AG""",40,15
"""AID_Combined""",2,2
"""ALP""",41,17
"""ALT""",23,16
…,…,…
"""WHRadjBMI""",24,12
"""Worrier""",1,1
"""eBMD""",68,21
"""eGFR""",32,17


In [21]:
# Keep traits with at least 10 hits, largest first.
# Stricter alternative that was tried (more hits, minimum chromosome spread):
#df2 = df.filter(pl.col("n_hits") >= 30, pl.col("n_chroms") >= 3).sort("n_hits", descending=True)
#
# maintain_order=True makes the sort stable, so traits with equal n_hits keep
# the order they have in `df`. The greedy overlap pruning that consumes df2
# walks the rows in order, so an unspecified tie order could change which of
# two tied traits is retained.
df2 = (
    df.filter(pl.col("n_hits") >= 10)
    .sort("n_hits", descending=True, maintain_order=True)
)
df2

trait,n_hits,n_chroms
str,i64,i64
"""Height""",81,18
"""MCV""",70,20
"""Plt""",70,17
"""MCH""",68,21
"""eBMD""",68,21
…,…,…
"""BFP""",13,8
"""Baso""",13,7
"""SBP""",13,9
"""Glucose""",12,7


In [22]:
# Greedy pruning of overlapping traits. Rows of df2 are visited in order
# (df2 is sorted by n_hits, descending); each trait that is still kept is
# compared with every later trait, and a later trait is flagged for exclusion
# when the number of shared variant ids exceeds MAX_SHARED_FRACTION of the
# EARLIER trait's n_hits (the later trait's own size is not considered).
MAX_SHARED_FRACTION = 0.1

ranked_traits = df2["trait"].to_list()
ranked_n_hits = df2["n_hits"].to_list()

# Build each trait's set of variant ids once up front rather than
# re-filtering V twice for every pair inside the double loop.
hit_ids = {
    name: set(V.filter(pl.col("trait").list.contains(name))["id"])
    for name in ranked_traits
}

# Boolean mask aligned with the rows of df2; True means "drop this trait".
exclude = np.full(len(df2), False)

for i, trait1 in enumerate(ranked_traits):
    if exclude[i]:
        continue  # an excluded trait never excludes others
    n1 = ranked_n_hits[i]
    for j in range(i + 1, len(ranked_traits)):
        if exclude[j]:
            continue
        n = len(hit_ids[trait1] & hit_ids[ranked_traits[j]])
        if n > n1 * MAX_SHARED_FRACTION:
            exclude[j] = True
                

In [23]:
# Keep only the rows of df2 that were not flagged in the `exclude` mask.
keep_mask = np.logical_not(exclude)
df3 = df2.filter(keep_mask)
df3

trait,n_hits,n_chroms
str,i64,i64
"""Height""",81,18
"""MCV""",70,20
"""Plt""",70,17
"""eBMD""",68,21
"""Mono""",66,19
…,…,…
"""LOY""",16,10
"""AST""",15,10
"""DVT""",15,11
"""BFP""",13,8


In [24]:
# Persist the surviving trait names, one per line, without a header row.
independent_traits = df3.select("trait")
independent_traits.write_csv(
    "../../config/gwas/independent_traits_filtered.csv",
    include_header=False,
)