# Variant regulatory consequence analysis

This notebook introduces a new dataset, `Epiraction`, which contains the regulatory consequences for variants.


## Epiraction dataset ingestion


In [75]:
from __future__ import annotations

import pandas as pd
from gentropy.common.session import Session
from pyspark.sql import Column
from pyspark.sql import functions as f

from manuscript_methods.datasets import LeadVariantEffect


In [21]:
# Spark session shared by every cell below; the driver is configured with 120G of memory.
session = Session(extended_spark_conf={"spark.driver.memory": "120G"})
# Location of the Epiraction interval dataset (read as parquet below).
dataset_path = "../../data/epiraction_v1.6"

# Location of the lead-variant dataset passed to `LeadVariantEffect.from_parquet`.
lead_variant_maf = "../../data/lead-maf-vep"
# Tab-separated Sequence Ontology (SO) term table, loaded with pandas below.
so_terms_path = "../../data/so_terms.tsv"


In [3]:
# Bare expression: shows `session.spark` (the handle used for `.read` / `.table` below) as the cell output.
session.spark


In [30]:
# Epiraction intervals, read from parquet through the Spark session.
epirack = session.spark.read.parquet(dataset_path)
# Lead variants wrapped in the project's LeadVariantEffect class (its filter methods are chained later).
lve = LeadVariantEffect.from_parquet(session=session, path=lead_variant_maf)
# SO term lookup table as a pandas DataFrame (tab-separated input).
so_terms = pd.read_csv(so_terms_path, sep="\t")


In [None]:
# Normalise SO identifiers from the colon form to the underscore form
# (SO:0000001 -> SO_0000001) - presumably the form used by the consequence ids
# they are matched against later; verify against the variant data.
so_terms = so_terms.assign(featureId=so_terms["featureId"].str.replace("SO:", "SO_", regex=False))


In [None]:
def map_so_terms(so_term_col: Column, so_terms: pd.DataFrame) -> Column:
    """Translate Sequence Ontology (SO) ids into their term labels without a join.

    The lookup table is unrolled into one chained ``CASE WHEN`` expression (one branch
    per row of `so_terms`), so no join against the Spark data is needed.

    Note:
        Callers that hold a list of SO ids ordered from most to least severe can pick
        the first element to obtain the most severe consequence.

    Args:
        so_term_col (Column): Column holding a single SO id (e.g. ``SO_0000001``); it is
            compared for equality against the first column of `so_terms`.
        so_terms (pd.DataFrame): Lookup table whose first column is the SO id and whose
            second column is the value to return. Further columns are ignored.

    Returns:
        Column: The second-column value of the matching row, or NULL when no id matches.
    """
    # Seed with a never-true branch so every real mapping can be appended with `.when`;
    # there is no `.otherwise`, hence unmatched ids evaluate to NULL.
    expr = f.when(f.lit(False), None)

    # Take the first two columns by position. `itertuples` yields plain tuples, which
    # avoids integer `[]` lookups on label-indexed row Series (deprecated in pandas).
    for so_id, so_term in so_terms.iloc[:, :2].itertuples(index=False, name=None):
        expr = expr.when(f.lit(so_id) == so_term_col, f.lit(so_term))

    return expr


In [None]:
# Prefilter lead variants with the LeadVariantEffect filters, then flatten the nested
# fields that the interval join and the later inspection cells rely on.
lve_dataset = (
    lve.maf_filter()
    .effect_size_filter()
    .replicated()
    .select(
        f.col("variantId"),
        f.col("variant.chromosome").alias("chromosome"),
        f.col("variant.start").alias("start"),
        f.col("variant.end").alias("end"),
        f.col("leadVariantConsequence").getField("type").alias("variantPositionEffect"),
        f.col("leadVariantConsequence")
        .getField("transcriptConsequence")
        .getField("consequenceScore")
        .alias("consequenceScore"),
        f.col("leadVariantConsequence")
        .getField("transcriptConsequence")
        .getField("variantFunctionalConsequenceIds")
        .alias("consequenceIds"),
        f.col("studyStatistics").getField("studyType").alias("studyType"),
        f.col("majorLdPopulationMaf").getField("value").alias("MAF"),
        f.col("rescaledStatistics").getField("estimatedBeta").alias("estimatedBeta"),
        f.col("rescaledStatistics").getField("estimatedSE").alias("estimatedSE"),
    )
    .withColumn(
        "consequenceOntologyTerms",
        # Only the first consequence id is kept (taken as the most severe one), so map
        # just that element instead of running the whole CASE chain over every element
        # of the array and discarding all but the first result.
        map_so_terms(f.col("consequenceIds").getItem(0), so_terms),
    )
)
# Filter based on score distribution - see 05.0 notebook
epirack_dataset = (
    epirack.filter(f.col("score") > 0.05)
    # Keep one row per interval and interval type.
    .dropDuplicates(["chromosome", "start", "end", "intervalType"])
    .select(
        # Strip only a leading "chr" so the names line up with the variant chromosomes.
        f.regexp_replace(f.col("chromosome"), "^chr", "").alias("chromosome"),
        # NOTE(review): both bounds are shifted by one; combined with the strict `<` on
        # `end` in the join cell this presumably converts 0-based half-open intervals
        # to the variant coordinate system - confirm against the Epiraction format.
        (f.col("start") + 1).alias("start"),
        (f.col("end") + 1).alias("end"),
        "intervalType",
    )
)


25/06/25 16:28:53 WARN CacheManager: Asked to cache already cached data.
25/06/25 16:28:53 WARN CacheManager: Asked to cache already cached data.


Initial number of variants: 2720012
Following number of variants: 88109
Number of variants removed: 2631903
Percentage of variants removed: 96.76%


In [98]:
# Persist both inputs of the interval join as tables bucketed by chromosome (50 buckets)
# and sorted by start within each bucket; existing tables are overwritten.
for table_name, frame in (
    ("lve_lead_variants", lve_dataset),
    ("epirack_intervals", epirack_dataset),
):
    frame.write.mode("overwrite").bucketBy(50, "chromosome").sortBy("start").saveAsTable(table_name)


                                                                                

In [99]:
# Read the bucketed tables back so the join runs on the persisted layout.
lve_dataset_table = session.spark.table("lve_lead_variants")
epirack_dataset_table = session.spark.table("epirack_intervals")

# A variant is matched to an interval on the same chromosome when
# interval.start <= variant.start and variant.end < interval.end.
variant_in_interval = (
    (lve_dataset_table["chromosome"] == epirack_dataset_table["chromosome"])
    & (lve_dataset_table["start"] >= epirack_dataset_table["start"])
    & (lve_dataset_table["end"] < epirack_dataset_table["end"])
)

# Columns carried over from the variant side, in output order.
variant_columns = [
    lve_dataset_table[column_name]
    for column_name in (
        "variantId",
        "chromosome",
        "start",
        "end",
        "variantPositionEffect",
        "consequenceScore",
        "consequenceOntologyTerms",
        "studyType",
        "MAF",
        "estimatedBeta",
        "estimatedSE",
    )
]

# Left join: variants without any matching interval are kept with a NULL intervalType;
# variants matching several intervals yield one row per match.
nc_dataset = lve_dataset_table.join(
    epirack_dataset_table,
    on=variant_in_interval,
    how="left",
).select(*variant_columns, epirack_dataset_table["intervalType"])


In [100]:
# Persist the variant/interval join result as parquet, replacing any previous output.
# Note: the following cells keep using the in-memory `nc_dataset` rather than reading this output back.
nc_dataset.write.mode("overwrite").parquet("../../data/nc_dataset")


                                                                                

In [101]:
from pyspark.sql import Window

# One partition per genomic position (all rows the interval join produced for it).
# The ordering column is itself a partition key, so every row in a partition is a peer
# and the aggregates below still cover the whole partition.
w = Window.partitionBy("chromosome", "start", "end").orderBy("start")

# 1 when the row's interval is of the given type, 0 otherwise, NULL when there is no interval.
is_enhancer_hit = (f.col("intervalType") == "enhancer").cast("int")
is_promoter_hit = (f.col("intervalType") == "promoter").cast("int")

nc_dedup = (
    nc_dataset
    # max() is NULL when intervalType is NULL on every row of the partition (variant
    # without any overlapping interval); coalesce to 0 so the flags are strictly
    # True/False and behave correctly under negation as well.
    .withColumn("inEnhancerRegion", f.coalesce(f.max(is_enhancer_hit).over(w), f.lit(0)) == 1)
    .withColumn("inPromoterRegion", f.coalesce(f.max(is_promoter_hit).over(w), f.lit(0)) == 1)
    # NOTE(review): keeps one arbitrary row per position. Rows that share a position but
    # differ in variantId, studyType or effect size are collapsed too - confirm intended.
    .dropDuplicates(["chromosome", "start", "end"])
    .cache()
)


In [106]:
# Intronic lead variants located in an enhancer interval, restricted to
# estimatedBeta > 1 and an "unknown" variantPositionEffect.
is_intronic_enhancer_hit = (
    (f.col("consequenceOntologyTerms") == "intron_variant")
    & f.col("inEnhancerRegion")
    & (f.col("estimatedBeta") > 1)
    & (f.col("variantPositionEffect") == "unknown")
)

intronic_enhancers = nc_dedup.filter(is_intronic_enhancer_hit).select(
    "variantId",
    "chromosome",
    "start",
    "end",
    "variantPositionEffect",
    "consequenceScore",
    "consequenceOntologyTerms",
    "studyType",
    "MAF",
    "estimatedBeta",
    "estimatedSE",
    "inEnhancerRegion",
)
intronic_enhancers.show()


+----------------+----------+---------+---------+---------------------+----------------+------------------------+---------+--------------------+------------------+--------------------+----------------+
|       variantId|chromosome|    start|      end|variantPositionEffect|consequenceScore|consequenceOntologyTerms|studyType|                 MAF|     estimatedBeta|         estimatedSE|inEnhancerRegion|
+----------------+----------+---------+---------+---------------------+----------------+------------------------+---------+--------------------+------------------+--------------------+----------------+
| 10_68496075_G_T|        10| 68496075| 68496075|              unknown|             0.1|          intron_variant|     gwas| 0.03204850584668684| 1.997161877193193|  0.3268992930940192|            true|
|11_121575382_G_A|        11|121575382|121575382|              unknown|             0.1|          intron_variant|     gwas|1.882175795219273...| 2.413611310688524| 0.33543218335325353|        

In [None]:
# Inspect raw chrX enhancer intervals starting at or after position 138708418,
# highest score first.
chrx_enhancer_from_position = (
    (f.col("intervalType") == "enhancer")
    & (f.col("chromosome") == "chrX")
    & (f.col("start") >= 138708418)
)
epirack.where(chrx_enhancer_from_position).orderBy(f.col("score").desc()).show(truncate=False)




+----------+---------+---------+---------------+----------------------------+------------+------------+------------------------------------------------------------------------------------------------------------------------+------------+--------+
|chromosome|start    |end      |geneId         |biosampleName               |intervalType|score       |resourceScore                                                                                                           |datasourceId|pmid    |
+----------+---------+---------+---------------+----------------------------+------------+------------+------------------------------------------------------------------------------------------------------------------------+------------+--------+
|chrX      |145817529|145818329|ENSG00000185985|Muscle.Skeletal             |promoter    |0.9806451114|[{H3K27ac, 1.4808}, {Open, 7.1693}, {Cofactor, 0.9308}, {CTCF, 0.7901}, {HiC_contacts, 1.0}, {abc_tissue, 6.520721}]    |epiraction  |40027634|
|chrX      |

                                                                                

In [None]:
# Out-of-gene variants that matched at least one Epiraction interval.
out_of_gene_in_interval = (f.col("variantPositionEffect") == "out-of-gene-effect") & f.col(
    "intervalType"
).isNotNull()
nc_dataset.where(out_of_gene_in_interval).show()


+----------+---------+---------+---------------------+----------------+------------------------+----------+--------------------+-------------------+-------------------+------------+
|chromosome|    start|      end|variantPositionEffect|consequenceScore|consequenceOntologyTerms| studyType|                 MAF|      estimatedBeta|        estimatedSE|intervalType|
+----------+---------+---------+---------------------+----------------+------------------------+----------+--------------------+-------------------+-------------------+------------+
|         2|218156235|218156235|   out-of-gene-effect|             0.0|    upstream_gene_var...|trans-pqtl|0.011732533521524348| 0.2439420560557209|0.03636328703736627|    enhancer|
|         2| 86930667| 86930667|   out-of-gene-effect|             0.0|    upstream_gene_var...|      eqtl| 0.08980134763485674|-0.9595888210139163|0.11139149209893032|    enhancer|
|         2| 43195651| 43195652|   out-of-gene-effect|             0.1|          intron_va