# Quantify the input of functional genomics data in feature groups

This notebook builds several combinations of input datasets in order to quantify how much each functional genomics feature group contributes to the Locus-to-Gene (L2G) model.

The following combinations are analysed:
1. GWAS credible sets only
2. bulk eQTL + GWAS
3. sceQTL + GWAS
4. eQTL + sceQTL + GWAS
5. pQTL + GWAS
6. tuQTL + GWAS
7. sQTL + GWAS

Steps for the analysis

* Build the credible sets for each study-type combination
* Build feature matrix for each combination
* Run model training with default parameters
* Run model prediction with default parameters
* Plot model metrics

In [None]:
from dataclasses import dataclass
from pathlib import Path
from typing import Literal

from gentropy.common.session import Session
from gentropy.dataset.study_locus import StudyLocus
from gentropy.l2g import LocusToGeneFeatureMatrixStep, LocusToGeneStep
from loguru import logger
from pyspark.sql import functions as f


In [None]:
# Create the gentropy session used by every step below; the extra Spark
# configuration requests 60G of driver memory.
# NOTE(review): 60G looks sized for the original workstation — adjust to the local machine.
session = Session(extended_spark_conf={"spark.driver.memory": "60G"})


25/04/07 12:48:29 WARN Utils: Your hostname, mindos resolves to a loopback address: 127.0.1.1; using 192.168.0.100 instead (on interface eno1)
25/04/07 12:48:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/07 12:48:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [None]:
# Data
# Base locations are spelled once so that retargeting the notebook to another
# machine or release only needs a single edit per prefix.
vep_analysis_dir = "../variant_effect_prediction"
platform_data_dir = "/home/mindos/data/ot-platform"
platform_release_dir = f"{platform_data_dir}/2503-testrun-3"

# Inputs read from the variant_effect_prediction directory (relative to this notebook).
variant_index_path = f"{vep_analysis_dir}/variant"
credible_set_path = f"{vep_analysis_dir}/credible_set"
study_index_path = f"{vep_analysis_dir}/study"

# Inputs read from the platform release directory; the colocalisation path is a glob.
target_index_path = f"{platform_release_dir}/target"
colocalisation_path = f"{platform_release_dir}/colocalisation*"

# Gold standard file, stored next to (not inside) the release directory.
gold_standard_path = f"{platform_data_dir}/gold_standard.json"


In [9]:
# Colocalisation features: one value per (study, locus, gene), aggregated over a
# specific QTL type (e = eQTL, p = pQTL, s = sQTL).
coloc_features = [
    # maximum CLPP
    "eQtlColocClppMaximum",
    "pQtlColocClppMaximum",
    "sQtlColocClppMaximum",
    # maximum H4
    "eQtlColocH4Maximum",
    "pQtlColocH4Maximum",
    "sQtlColocH4Maximum",
    # maximum CLPP relative to the mean in the vicinity
    "eQtlColocClppMaximumNeighbourhood",
    "pQtlColocClppMaximumNeighbourhood",
    "sQtlColocClppMaximumNeighbourhood",
    # maximum H4 relative to the mean in the vicinity
    "eQtlColocH4MaximumNeighbourhood",
    "pQtlColocH4MaximumNeighbourhood",
    "sQtlColocH4MaximumNeighbourhood",
]

distance_features = [
    # distance to gene footprint
    "distanceSentinelFootprint",
    "distanceSentinelFootprintNeighbourhood",
    "distanceFootprintMean",
    "distanceFootprintMeanNeighbourhood",
    # distance to gene tss
    "distanceTssMean",
    "distanceTssMeanNeighbourhood",
    "distanceSentinelTss",
    "distanceSentinelTssNeighbourhood",
]

vep_features = [
    "vepMaximum",
    "vepMaximumNeighbourhood",
    "vepMean",
    "vepMeanNeighbourhood",
]

other_features = [
    "geneCount500kb",
    "proteinGeneCount500kb",
    "credibleSetConfidence",
]

# Full feature list, in the same order as before: coloc, distance, vep, other.
features_list = [
    *coloc_features,
    *distance_features,
    *vep_features,
    *other_features,
]

# Hyperparameters handed unchanged to both the training and the prediction step.
hyperparameters = dict(
    n_estimators=100,
    max_depth=3,
    ccp_alpha=0,
    learning_rate=0.1,
    min_samples_leaf=1,
    min_samples_split=5,
    subsample=0.7,
)

# Threshold passed as `l2g_threshold` to the L2G steps.
l2g_threshold = 0.05


In [None]:
# Show the session's `spark` attribute as the cell output (sanity check that the session is up).
session.spark


In [None]:
@dataclass
class L2GParamSet:
    """Parameters for one L2G run restricted to a subset of study types.

    The input paths are shared between runs; `run_id` and `study_types` define
    the run. All run-specific outputs (filtered credible sets, feature matrix,
    model, predictions) are placed under `temp_path`, in one sub-folder per
    artifact kind, with `run_id` as the file-name prefix.
    """

    run_id: str
    variant_index_path: str
    colocalisation_path: str
    study_index_path: str
    target_index_path: str
    credible_set_path: str
    gold_standard_path: str
    l2g_threshold: float
    explain_predictions: bool
    # Study types kept in the credible sets of this run. Declared as a
    # variable-length tuple; a list is accepted at runtime as well.
    study_types: tuple[Literal["gwas", "pqtl", "sqtl", "eqtl", "tuqtl", "sceqtl"], ...]
    temp_path: str = "."

    def _artifact_path(self, folder: str, suffix: str) -> str:
        """Return `<temp_path>/<folder>/<run_id><suffix>` as a string."""
        # The file name is built first: `/` binds tighter than `+`, so appending the
        # suffix after the path join would try to add a str to a Path and fail.
        return str(Path(self.temp_path) / folder / f"{self.run_id}{suffix}")

    def to_params(self) -> dict[str, object]:
        """Convert parameters to a dictionary.

        Calling this method has a side effect: it triggers `limit_cs`, which
        writes the filtered credible sets to disk.

        Returns:
            dict[str, object]: Input paths, run-specific output paths, the L2G
                threshold and the explain-predictions flag, keyed by name.
                `credible_set_path` points to the filtered credible sets.
        """
        return {
            "run_id": self.run_id,
            "variant_index_path": self.variant_index_path,
            "colocalisation_path": self.colocalisation_path,
            "study_index_path": self.study_index_path,
            "target_index_path": self.target_index_path,
            "credible_set_path": self.limit_cs(),
            "feature_matrix_path": self._artifact_path("feature_matrix", "_feature_matrix"),
            "model_path": self._artifact_path("l2g_model", "_l2g_model.skops"),
            "gold_standard_path": self.gold_standard_path,
            "l2g_threshold": self.l2g_threshold,
            "predictions_path": self._artifact_path("l2g_predictions", "_l2g_predictions"),
            "explain_predictions": self.explain_predictions,
        }

    def limit_cs(self) -> str:
        """Write the credible sets restricted to `study_types` and return their path.

        Reads `credible_set_path` through the notebook-level `session`, keeps the
        rows whose `studyType` is one of `study_types` and writes them as parquet
        (overwriting any previous output of the same run).

        Returns:
            str: Path of the filtered credible set dataset.
        """
        cs_path = self._artifact_path("credible_set", "_credible_set")
        # `isin` only unpacks lists and sets, so the study types are passed as a list.
        filtered_cs = StudyLocus.from_parquet(session, self.credible_set_path).df.filter(
            f.col("studyType").isin(list(self.study_types))
        )
        filtered_cs.write.mode("overwrite").parquet(cs_path)
        return cs_path


# Run id -> study types kept in the credible sets of that run.
runs = {
    "gwas_only": ["gwas"],
    "gwas_vs_bulk_eqtl": ["gwas", "eqtl"],
    "gwas_vs_sceqtl": ["gwas", "sceqtl"],
    "gwas_vs_eqtl": ["gwas", "eqtl", "sceqtl"],
    "gwas_vs_pqtl": ["gwas", "pqtl"],
    "gwas_vs_tuqtl": ["gwas", "tuqtl"],
    "gwas_vs_sqtl": ["gwas", "sqtl"],
}

# Run id -> parameter dictionary for that run.
# Each value is the plain dict returned by `to_params()`; wrapping it in `{...}`
# would build a set containing a dict, which fails because dicts are unhashable.
param_grid = {
    run_id: L2GParamSet(
        run_id=run_id,
        variant_index_path=variant_index_path,
        colocalisation_path=colocalisation_path,
        study_index_path=study_index_path,
        target_index_path=target_index_path,
        credible_set_path=credible_set_path,
        gold_standard_path=gold_standard_path,
        explain_predictions=True,
        l2g_threshold=l2g_threshold,
        study_types=study_types,
    ).to_params()
    for run_id, study_types in runs.items()
}


In [None]:
# For every combination: build the feature matrix, train a model, then predict.
# `.items()` is required — iterating the dict directly yields only the run ids.
for combination, params in param_grid.items():
    logger.info(f"Running combination {combination}")
    wandb_run_name = f"2504-gentropy-manuscript-{combination}"

    logger.info("Building feature matrix")
    LocusToGeneFeatureMatrixStep(
        session,
        features_list=features_list,
        # Use the credible sets prepared for this run, not the run id itself.
        credible_set_path=params["credible_set_path"],
        variant_index_path=params["variant_index_path"],
        colocalisation_path=params["colocalisation_path"],
        study_index_path=params["study_index_path"],
        target_index_path=params["target_index_path"],
        feature_matrix_path=params["feature_matrix_path"],
    )

    logger.info("Running training")
    LocusToGeneStep(
        session,
        run_mode="train",
        hyperparameters=hyperparameters,
        download_from_hub=False,
        cross_validate=False,
        # NOTE(review): keyword spelled `wandb_run_name` (was `wandb_run_Name`) — confirm
        # against the installed gentropy version.
        wandb_run_name=wandb_run_name,
        credible_set_path=params["credible_set_path"],
        feature_matrix_path=params["feature_matrix_path"],
        model_path=params["model_path"],
        features_list=features_list,
        gold_standard_curation_path=params["gold_standard_path"],
        l2g_threshold=params["l2g_threshold"],
        hf_hub_repo_id=None,
    )

    logger.info("Running predictions")
    LocusToGeneStep(
        session,
        run_mode="predict",
        hyperparameters=hyperparameters,
        download_from_hub=False,
        cross_validate=True,
        wandb_run_name=wandb_run_name,
        credible_set_path=params["credible_set_path"],
        feature_matrix_path=params["feature_matrix_path"],
        model_path=params["model_path"],
        features_list=features_list,
        gold_standard_curation_path=params["gold_standard_path"],
        variant_index_path=params["variant_index_path"],
        predictions_path=params["predictions_path"],
        l2g_threshold=params["l2g_threshold"],
        explain_predictions=params["explain_predictions"],
        hf_hub_repo_id=None,
    )
