# Locus To Gene model benchmarking

## Configuration

In [7]:
# Root directory of the run; the three dataset paths below are derived from it
# so that pointing the notebook at another run is a one-line change.
data_dir = "/home/mindos/data/2503-tstrun-1"

feature_matrix_path = f"{data_dir}/l2g_feature_matrix"
# NOTE(review): "creadible_set" looks like a misspelling of "credible_set". It is
# kept unchanged because it has to match the directory name on disk - confirm.
credible_set_path = f"{data_dir}/creadible_set"
gold_standard_curation_path = f"{data_dir}/gold_standard"

# Repository id forwarded to the step as `hf_hub_repo_id`.
hf_hub_repo_id = "PROJECT-DEFIANT/LocusToGene"

In [8]:
# Feature columns handed to the step as `features_list`. Each family is written
# as one unpacked tuple; the resulting list keeps the original order.
features_list = [
    # Colocalisation, per QTL type (e / p / s): maximum CLPP for each
    # (study, locus, gene), aggregated over that QTL type.
    *("eQtlColocClppMaximum", "pQtlColocClppMaximum", "sQtlColocClppMaximum"),
    # Same aggregation, using the maximum H4 instead of CLPP.
    *("eQtlColocH4Maximum", "pQtlColocH4Maximum", "sQtlColocH4Maximum"),
    # Maximum CLPP expressed in relation to the mean in the vicinity.
    *(
        "eQtlColocClppMaximumNeighbourhood",
        "pQtlColocClppMaximumNeighbourhood",
        "sQtlColocClppMaximumNeighbourhood",
    ),
    # Maximum H4 expressed in relation to the mean in the vicinity.
    *(
        "eQtlColocH4MaximumNeighbourhood",
        "pQtlColocH4MaximumNeighbourhood",
        "sQtlColocH4MaximumNeighbourhood",
    ),
    # Distance to the gene footprint.
    *(
        "distanceSentinelFootprint",
        "distanceSentinelFootprintNeighbourhood",
        "distanceFootprintMean",
        "distanceFootprintMeanNeighbourhood",
    ),
    # Distance to the gene TSS.
    *(
        "distanceTssMean",
        "distanceTssMeanNeighbourhood",
        "distanceSentinelTss",
        "distanceSentinelTssNeighbourhood",
    ),
    # VEP-derived features.
    *(
        "vepMaximum",
        "vepMaximumNeighbourhood",
        "vepMean",
        "vepMeanNeighbourhood",
    ),
    # Remaining features that do not belong to a family above.
    *("geneCount500kb", "proteinGeneCount500kb", "credibleSetConfidence"),
]

In [9]:
# Run options.
# NOTE(review): the LocusToGeneStep call visible in this notebook passes the
# literals cross_validate=False, download_from_hub=False and wandb_run_name=None
# instead of the three variables below - confirm which values are intended.
cross_validate = True
download_from_hub = True
wandb_run_name = "szsz-test"

# Commit message forwarded to the step as `hf_model_commit_message`.
hf_model_commit_message = "chore: update model"

# Model hyperparameters, forwarded unchanged as `hyperparameters`.
hyperparameters = {
    # ensemble size and learning rate
    "n_estimators": 100,
    "max_depth": 10,
    "ccp_alpha": 0,
    "learning_rate": 0.1,
    # per-node sample constraints and sampling
    "min_samples_leaf": 5,
    "min_samples_split": 5,
    "subsample": 1,
}

## Imports

In [10]:
from pyspark.sql import DataFrame, Window
from pyspark.sql import functions as f
from pyspark.sql import types as t

from gentropy.common.session import Session
from gentropy.l2g import LocusToGeneStep


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



In [11]:
# Create the gentropy Session with default arguments; it is passed to
# LocusToGeneStep as `session`. The recorded output of this cell shows Spark
# start-up warnings, so this presumably starts a local Spark session - confirm.
session = Session()

25/02/21 13:12:15 WARN Utils: Your hostname, mindos resolves to a loopback address: 127.0.1.1; using 192.168.0.100 instead (on interface eno1)
25/02/21 13:12:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/21 13:12:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/21 13:12:16 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [13]:
# Run the Locus-to-Gene step in "train" mode. Keyword arguments are grouped by
# role; every value is a plain name or literal, so their order does not matter.
# The recorded output of this cell ends in a RetryError asking for
# `gcloud auth application-default login`, so the run presumably needs valid
# Google application-default credentials - confirm before re-running.
LocusToGeneStep(
    session=session,
    run_mode="train",
    # input datasets
    feature_matrix_path=feature_matrix_path,
    credible_set_path=credible_set_path,
    gold_standard_curation_path=gold_standard_curation_path,
    # model definition
    features_list=features_list,
    hyperparameters=hyperparameters,
    # NOTE(review): these three are literals, not the `cross_validate`,
    # `download_from_hub` and `wandb_run_name` variables set in the
    # configuration section - confirm the override is intentional.
    cross_validate=False,
    download_from_hub=False,
    wandb_run_name=None,
    # model hub
    hf_hub_repo_id=hf_hub_repo_id,
    hf_model_commit_message=hf_model_commit_message,
)

25/02/21 13:15:43 ERROR Executor: Exception in task 5.0 in stage 6.0 (TID 63)
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.util.ThreadUtils$.parmap(ThreadUtils.scala:387)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.readParquetFootersInParallel(ParquetFileFormat.scala:443)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$mergeSchemasInParallel$1(ParquetFileFormat.scala:493)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat$.$anonfun$mergeSchemasInParallel$1$adapted(ParquetFileFormat.scala:485)
	at org.apache.spark.sql.execution.datasources.SchemaMergeUtils$.$anonfun$mergeSchemasInParallel$2(SchemaMergeUtils.scala:80)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:858)
	at org.apache.spark

+---------------+---------------+------------+--------------------+-----------------------+------------------+
|         geneId|goldStandardSet|     studyId|        studyLocusId|traitFromSourceMappedId|         variantId|
+---------------+---------------+------------+--------------------+-----------------------+------------------+
|ENSG00000198551|       negative|GCST90302016|4bf6b3cc013862a90...|            EFO_0004611|   19_11113815_A_G|
|ENSG00000099203|       negative|GCST90302016|4bf6b3cc013862a90...|            EFO_0020946|   19_11113815_A_G|
|ENSG00000196666|       negative|GCST90038594|60142de5d934b14f5...|            EFO_0004735|   11_47508395_C_A|
|ENSG00000198382|       negative|GCST90092899|9cd5ec4404c5af4ff...|            EFO_0004612|11_75741441_A_ACTC|
|ENSG00000141084|       negative|GCST90092903|bdcad4e12fa3ba5ef...|            EFO_0004612|   16_67991092_G_A|
|ENSG00000121940|       negative|GCST90132639|d7c8a7fc7afe3d164...|            EFO_0004529|   1_109275684_G_T|
|

ERROR:grpc._plugin_wrapping:AuthMetadataPluginCallback "<google.auth.transport.grpc.AuthMetadataPlugin object at 0x7da2099f6950>" raised exception!
Traceback (most recent call last):
  File "/home/mindos/Projects/OpenTargets/gentropy/.venv/lib/python3.11/site-packages/grpc/_plugin_wrapping.py", line 105, in __call__
    self._metadata_plugin(
  File "/home/mindos/Projects/OpenTargets/gentropy/.venv/lib/python3.11/site-packages/google/auth/transport/grpc.py", line 95, in __call__
    callback(self._get_authorization_headers(context), None)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mindos/Projects/OpenTargets/gentropy/.venv/lib/python3.11/site-packages/google/auth/transport/grpc.py", line 81, in _get_authorization_headers
    self._credentials.before_request(
  File "/home/mindos/Projects/OpenTargets/gentropy/.venv/lib/python3.11/site-packages/google/auth/credentials.py", line 239, in before_request
    self._blocking_refresh(request)
  File "/home/mindos/Projec

RetryError: Deadline of 60.0s exceeded while calling target function, last exception: 503 Getting metadata from plugin failed with error: Reauthentication is needed. Please run `gcloud auth application-default login` to reauthenticate.