# Visualizing Gene Variant Data

## Setup


In [18]:
import polars as pl
import ipywidgets as widgets
import os
import plotly_express as px

In [19]:
data_dir = "../data/processed/"

# Create a dropdown widget to select the gene
gene_options = [os.path.basename(f).replace("-pop-vars.parquet", "") for f in os.listdir(data_dir) if f.endswith("-pop-vars.parquet")]
gene_dropdown = widgets.Dropdown(
    options=gene_options,
    description="Select Gene:",
)

def load_gene(gene_id, verbose=False):
    vars_path = os.path.join(data_dir, f"{gene_id}-variants.parquet")
    pops_path = os.path.join(data_dir, f"{gene_id}-pop-vars.parquet")
    df_vars = pl.read_parquet(vars_path)
    df_pops = pl.read_parquet(pops_path)
    if verbose:
        display(df_vars.head(), df_pops.head())
    return df_vars, df_pops

out = widgets.interactive_output(load_gene, {"gene_id": gene_dropdown, "verbose": widgets.fixed(True)})

display(gene_dropdown, out)


Dropdown(description='Select Gene:', options=('HLA-A', 'CYP2D6', 'HLA-B'), value='HLA-A')

Output()

## Variant Position in Gene

In [None]:
df_vars, df_pops = load_gene(gene_dropdown.value)

print(f"Loaded {df_vars.height} variants for gene {gene_dropdown.value}.")

fig = px.histogram(
    df_vars.to_pandas(),
    x="pos",
    nbins=100,
    title=f"Variant Position Distribution for {gene_dropdown.value}",
    labels={"pos": "Genomic Position"}
)
fig.update_yaxes(title_text="Number of Variants")
fig.show()

Loaded 4666 variants for gene HLA-B.


## Variant Effects

In [49]:
display(df_vars.get_column("consequence").value_counts().sort("count", descending=True).head(10))
display(df_vars.get_column("consequence").value_counts().sort("count", descending=True).tail(10))



consequence,count
str,u32
"""intron_variant""",2496
"""non_coding_transcript_exon_var…",917
"""5_prime_UTR_variant""",416
"""missense_variant""",274
"""3_prime_UTR_variant""",168
"""synonymous_variant""",104
"""frameshift_variant""",97
"""splice_polypyrimidine_tract_va…",48
"""splice_region_variant""",44
"""inframe_deletion""",21


consequence,count
str,u32
"""stop_gained""",16
"""splice_donor_variant""",14
"""splice_donor_region_variant""",11
"""mature_miRNA_variant""",10
"""splice_acceptor_variant""",10
"""inframe_insertion""",10
"""protein_altering_variant""",3
"""splice_donor_5th_base_variant""",3
"""stop_lost""",2
"""start_lost""",2


## Clinical Significance of Variants

In [65]:
display(
    df_vars.get_column("clinical_significance")
    .value_counts()
    .sort("count", descending=True)
    .head(10)
)

# Filter variants with clinical significance
df_vars.filter(
    pl.col("clinical_significance") != [],
)

clinical_significance,count
list[str],u32
[],4657
"[""likely benign""]",6
"[""benign""]",2
"[""not provided""]",1


id,chr,pos,ref,alt,consequence,clinical_significance
str,str,i64,str,str,str,list[str]
"""rs1582538797""","""6""",31354498,"""C""","""A""","""synonymous_variant""","[""likely benign""]"
"""rs753723124""","""6""",31355144,"""A""","""G""","""synonymous_variant""","[""likely benign""]"
"""rs1065502""","""6""",31355378,"""C""","""T""","""synonymous_variant""","[""likely benign""]"
"""rs766450595""","""6""",31356246,"""CC""","""-""","""frameshift_variant""","[""benign""]"
"""rs1168937188""","""6""",31356687,"""C""","""T""","""splice_donor_variant""","[""not provided""]"
"""rs576010607""","""6""",31356717,"""A""","""-""","""frameshift_variant""","[""likely benign""]"
"""rs66473235""","""6""",31356720,"""TT""","""T""","""frameshift_variant""","[""likely benign""]"
"""rs750527298""","""6""",31356748,"""CC""","""-""","""frameshift_variant""","[""benign""]"
"""rs9266183""","""6""",31356870,"""T""","""C""","""missense_variant""","[""likely benign""]"


## Minor Allele Frequency (MAF) Distribution

In [70]:
# Remove very rare or common variants
df_pops = df_pops.filter(
    (pl.col("MAF") >= 0.01) & (pl.col("MAF") <= 0.99)
)

px.histogram(
    df_pops.to_pandas(),
    x="MAF",
    title=f"Allele Counts by Population for {gene_dropdown.value}",
)

In [73]:
px.box(
    df_pops.to_pandas(),
    y="MAF",
    x="population",
    title=f"MAF Distribution by Population for {gene_dropdown.value}",
)