# UKBB MVP FinnGen summary statistics preparation

## Setup

In [1]:
from gentropy import Session
from pyspark.sql import types as t
from pyspark.sql import functions as f


In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for pandas UDF instead of specifying pandas UDF type which will be deprecated in the future releases. See SPARK-28264 for more details.



In [2]:
# Create the gentropy session against YARN with write_mode set to "overwrite".
# `spark.master.memory` is not a Spark configuration property (Spark accepts unknown
# `spark.*` keys without complaint, so the 16G request had no effect). The memory
# request is expressed with `spark.driver.memory` instead.
# NOTE(review): driver memory only takes effect if it is set before the driver JVM
# starts, so restart the kernel before re-running this cell — confirm on the cluster.
session = Session(
    spark_uri="yarn",
    extended_spark_conf={"spark.driver.memory": "16G"},
    write_mode="overwrite",
)

25/10/06 16:55:30 WARN SparkConf: The configuration key 'spark.yarn.executor.failuresValidityInterval' has been deprecated as of Spark 3.5 and may be removed in the future. Please use the new key 'spark.executor.failuresValidityInterval' instead.
25/10/06 16:55:30 WARN SparkConf: The configuration key 'spark.yarn.executor.failuresValidityInterval' has been deprecated as of Spark 3.5 and may be removed in the future. Please use the new key 'spark.executor.failuresValidityInterval' instead.
25/10/06 16:55:30 WARN SparkConf: The configuration key 'spark.yarn.executor.failuresValidityInterval' has been deprecated as of Spark 3.5 and may be removed in the future. Please use the new key 'spark.executor.failuresValidityInterval' instead.
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/06 16:55:31 WARN SparkConf: The configuration key 'spark.yarn.executor.failuresValidityInterval' has been deprecated as of 

In [3]:
# --- Input locations ---------------------------------------------------------
# Manifest (TSV) and gzipped summary statistics of the FinnGen R12 / MVP / UKBB
# meta-analysis, read from the public FinnGen bucket.
source_manifest_path = (
    "gs://finngen-public-data-r12/meta_analysis/mvp_ukbb/FinnGen_R12_MVP_UKBB_manifest.tsv"
)
source_summary_statistics_glob = "gs://finngen-public-data-r12/meta_analysis/mvp_ukbb/summary_stats/*.gz"
# Manual trait-to-EFO curation table, pinned to the 25.09 tag of the curation repo.
efo_curation_path = (
    "https://raw.githubusercontent.com/opentargets/curation/refs/tags/25.09/mappings/disease/manual_string.tsv"
)
# gnomAD v4.1 variant index.
gnomad_variant_index_path = "gs://gnomad_data_2/v4.1/variant_index/"

# --- Output locations --------------------------------------------------------
# Everything produced by this notebook lands under gs://finngen_ukb_mvp_meta_data/.
study_index_output_path = "gs://finngen_ukb_mvp_meta_data/study_index/"
raw_summary_statistics_output_path = "gs://finngen_ukb_mvp_meta_data/raw_summary_statistics/"
harmonised_summary_statistics_output_path = "gs://finngen_ukb_mvp_meta_data/harmonised_summary_statistics/"


### Building StudyIndex

In [4]:
from gentropy.finngen_ukb_mvp_meta import FinngenUkbbMvpMetaIngestionStep

In [5]:
# Keyword arguments for the ingestion step: the shared session plus every input and
# output location defined in the Inputs/Outputs configuration cell.
ingestion_step_kwargs = {
    "session": session,
    "source_manifest_path": source_manifest_path,
    "source_summary_statistics_glob": source_summary_statistics_glob,
    "efo_curation_path": efo_curation_path,
    "gnomad_variant_index_path": gnomad_variant_index_path,
    "study_index_output_path": study_index_output_path,
    "raw_summary_statistics_output_path": raw_summary_statistics_output_path,
    "harmonised_summary_statistics_output_path": harmonised_summary_statistics_output_path,
}

# Constructing the step is the whole call; it is left as the cell's last expression
# so its repr is still displayed.
# NOTE(review): presumably the step runs on construction and writes to the three
# output paths — confirm against the step implementation.
FinngenUkbbMvpMetaIngestionStep(**ingestion_step_kwargs)

25/10/06 13:54:10 WARN SparkConf: The configuration key 'spark.yarn.executor.failuresValidityInterval' has been deprecated as of Spark 3.5 and may be removed in the future. Please use the new key 'spark.executor.failuresValidityInterval' instead.
25/10/06 13:54:10 WARN SparkConf: The configuration key 'spark.yarn.executor.failuresValidityInterval' has been deprecated as of Spark 3.5 and may be removed in the future. Please use the new key 'spark.executor.failuresValidityInterval' instead.
                                                                                

<gentropy.finngen_ukb_mvp_meta.FinngenUkbbMvpMetaIngestionStep at 0x7f3745af12d0>

#### Reading StudyIndex

In [9]:
from gentropy import StudyIndex

# Load the study index from the path that was handed to the ingestion step as its
# study index output, then print a preview of its rows (default `show()` settings).
si = StudyIndex.from_parquet(session, study_index_output_path)
si.df.show()

+--------------------+--------------------+---------+--------------------+------------------------+----------+------+---------------------+-----------+--------+----------------+----------------------+---------------+------------------+----------------------------------+--------------------+--------------------+------+---------+--------+--------------------+---------------------+--------------------+------------------+---------------+-------------+--------------------+-----------+---------+---------------+
|             studyId|           projectId|studyType|     traitFromSource|traitFromSourceMappedIds|diseaseIds|geneId|biosampleFromSourceId|biosampleId|pubmedId|publicationTitle|publicationFirstAuthor|publicationDate|publicationJournal|backgroundTraitFromSourceMappedIds|backgroundDiseaseIds|   initialSampleSize|nCases|nControls|nSamples|             cohorts|ldPopulationStructure|    discoverySamples|replicationSamples|qualityControls|analysisFlags|summarystatsLocation|hasSumstats|condi

#### nSamples validation

In [20]:
from gentropy.datasource.finngen_meta import FinnGenMetaManifest

# Read the source manifest and display the single phenotype used to cross-check
# the per-cohort case/control counts.
finngen_manifest = FinnGenMetaManifest.from_path(session, source_manifest_path)

# NOTE(review): this reads the private `_df` attribute, while the other cells use a
# public `.df` accessor — confirm whether FinnGenMetaManifest exposes one.
is_av_block = f.col("fg_phenotype") == "I9_AVBLOCK"
finngen_manifest._df.filter(is_av_block).show()

+------------+---------------+------------+-------------+--------+----------+---------------+------------------+---------------+------------------+---------------+------------------+--------------------+
|ukbb_n_cases|ukbb_n_controls|fg_phenotype|fg_n_controls|    name|fg_n_cases|MVP_AMR_n_cases|MVP_AFR_n_controls|MVP_EUR_n_cases|MVP_EUR_n_controls|MVP_AFR_n_cases|MVP_AMR_n_controls|         path_bucket|
+------------+---------------+------------+-------------+--------+----------+---------------+------------------+---------------+------------------+---------------+------------------+--------------------+
|        3088|         384657|  I9_AVBLOCK|       375343|AV-block|      7850|            817|            115972|          15010|            431314|           2893|             49621|gs://finngen-publ...|
+------------+---------------+------------+-------------+--------+----------+---------------+------------------+---------------+------------------+---------------+------------------+--

#### Disease validation

In [15]:
# Disease index used for the disease validation below; the path points at the
# output of the 25.09-testrun-1 pipeline run.
disease_index_path = "gs://open-targets-pipeline-runs/il/25.09-testrun-1/output/disease/"

In [16]:
# Build a two-column lookup (diseaseId, efo) in which `efo` takes the value of the
# current disease id as well as each of its obsolete terms.
current_id = f.col("id")
obsolete_terms = f.col("obsoleteTerms")

# Current id plus obsolete terms; evaluates to null when `obsoleteTerms` is null,
# in which case explode_outer still emits one row (with a null `efo`).
ids_with_obsolete = f.when(
    obsolete_terms.isNotNull(),
    f.array_union(f.array("id"), obsolete_terms),
)

disease_terms = session.spark.read.parquet(disease_index_path).select(
    current_id.alias("diseaseId"),
    f.explode_outer(ids_with_obsolete).alias("efo"),
)

# Rows that came out of the null branch fall back to the disease id itself.
disease_index = disease_terms.withColumn(
    "efo", f.coalesce(f.col("efo"), f.col("diseaseId"))
)

                                                                                

In [32]:
# Validate the study index against the disease lookup and keep only studies that
# ended up with at least one quality-control flag.
flagged_studies = si.validate_disease(disease_index).df.filter(
    f.size("qualityControls") > 0
)

# Show the trait/disease mapping columns for ten of the flagged studies.
flagged_preview_columns = [
    "studyId",
    "traitFromSource",
    "traitFromSourceMappedIds",
    "diseaseIds",
    "qualityControls",
]
flagged_studies.limit(10).select(*flagged_preview_columns).show(truncate=False)

25/10/06 12:16:27 WARN CacheManager: Asked to cache already cached data.


+-------------------------------------------------------------+------------------------------------------------------+------------------------+----------+-----------------------------------+
|studyId                                                      |traitFromSource                                       |traitFromSourceMappedIds|diseaseIds|qualityControls                    |
+-------------------------------------------------------------+------------------------------------------------------+------------------------+----------+-----------------------------------+
|FINNGEN_R12_UKB_MVP_META_L12_OTHPIGMENTATION                 |Other disorders of pigmentation                       |[EFO_1000755]           |[]        |[No valid disease identifier found]|
|FINNGEN_R12_UKB_MVP_META_M13_FATIGFRACT                      |Fatigue fracture of vertebra                          |[HP_0041166]            |[]        |[No valid disease identifier found]|
|FINNGEN_R12_UKB_MVP_META_M13_FINGERDEFORM   

In [27]:
# Look up one of the mapped identifiers reported as invalid directly in the disease
# lookup, to check whether it is present at all.
missing_efo = "EFO_1000755"
disease_index.filter(f.col("efo") == missing_efo).show()

[Stage 89:>                                                         (0 + 1) / 1]

+---------+---+
|diseaseId|efo|
+---------+---+
+---------+---+



                                                                                

### Preparing variant annotations for flipping

In [4]:
from gentropy.dataset.variant_direction import VariantDirection
from gentropy.dataset.variant_index import VariantIndex

In [5]:
# Load the variant index from the gnomAD path set in the configuration cell.
vi = VariantIndex.from_parquet(session, gnomad_variant_index_path)

                                                                                

In [6]:
# Derive the variant direction dataset from the variant index.
# NOTE(review): judging by the preview that follows, each original variant appears
# once per direction/strand combination — confirm against VariantDirection.
vd = VariantDirection.from_variant_index(variant_index=vi)

In [7]:
# Preview the variant direction rows (default `show()` settings).
vd.df.show()

                                                                                

+----------+-------------------+----+-------------------+---------+------+-------------+
|chromosome|  originalVariantId|type|          variantId|direction|strand|isPalindromic|
+----------+-------------------+----+-------------------+---------+------+-------------+
|         4|     4_64274529_A_G|   1|     4_64274529_A_G|        1|     1|        false|
|         4|     4_64274529_A_G|   1|     4_64274529_G_A|       -1|     1|        false|
|         4|     4_64274529_A_G|   1|     4_64274529_T_C|        1|    -1|        false|
|         4|     4_64274529_A_G|   1|     4_64274529_C_T|       -1|    -1|        false|
|         4|     4_64274530_G_C|   1|     4_64274530_G_C|        1|     1|         true|
|         4|     4_64274530_G_C|   1|     4_64274530_C_G|       -1|     1|         true|
|         4|     4_64274530_G_C|   1|     4_64274530_C_G|        1|    -1|         true|
|         4|     4_64274530_G_C|   1|     4_64274530_G_C|       -1|    -1|         true|
|         4|     4_64

## Munge summary statistics

In [10]:
from gentropy.common.stats import pvalue_from_neglogpval

In [8]:
# Read the raw summary statistics parquet from the path given to the ingestion
# step as its raw summary statistics output.
sst = session.spark.read.parquet(raw_summary_statistics_output_path)

                                                                                

In [16]:
# List every column of the raw summary statistics, then collect the ones whose
# name contains "_pval".
cols = sst.columns
print(cols)
pval_cols = [name for name in cols if "_pval" in name]

# Source column -> name shown in the preview, in display order.
renamed_columns = {
    "#CHR": "chromosome",
    "POS": "position",
    "REF": "referenceAllele",
    "ALT": "alternateAllele",
    "all_inv_var_meta_beta": "beta",
    "all_inv_var_meta_sebeta": "standardError",
}

# studyId first, then the renamed fields, then the meta-analysis p-value and
# -log10(p) columns under their original names, then the "_pval" columns.
preview_columns = [
    f.col("studyId"),
    *(f.col(source).alias(target) for source, target in renamed_columns.items()),
    f.col("all_inv_var_meta_p"),
    f.col("all_inv_var_meta_mlogp"),
    *pval_cols,
]
sst.select(*preview_columns).show(truncate=False)

['#CHR', 'POS', 'REF', 'ALT', 'SNP', 'fg_beta', 'fg_sebeta', 'fg_pval', 'fg_af_alt', 'fg_af_alt_cases', 'fg_af_alt_controls', 'MVP_EUR_beta', 'MVP_EUR_sebeta', 'MVP_EUR_pval', 'MVP_EUR_af_alt', 'MVP_EUR_r2', 'MVP_AFR_beta', 'MVP_AFR_sebeta', 'MVP_AFR_pval', 'MVP_AFR_af_alt', 'MVP_AFR_r2', 'MVP_HIS_beta', 'MVP_HIS_sebeta', 'MVP_HIS_pval', 'MVP_HIS_af_alt', 'MVP_HIS_r2', 'all_meta_N', 'all_inv_var_meta_beta', 'all_inv_var_meta_sebeta', 'all_inv_var_meta_p', 'all_inv_var_meta_mlogp', 'all_inv_var_het_p', 'leave_fg_N', 'leave_fg_inv_var_meta_beta', 'leave_fg_inv_var_meta_sebeta', 'leave_fg_inv_var_meta_p', 'leave_fg_inv_var_meta_mlogp', 'leave_fg_inv_var_meta_het_p', 'leave_MVP_EUR_N', 'leave_MVP_EUR_inv_var_meta_beta', 'leave_MVP_EUR_inv_var_meta_sebeta', 'leave_MVP_EUR_inv_var_meta_p', 'leave_MVP_EUR_inv_var_meta_mlogp', 'leave_MVP_EUR_inv_var_meta_het_p', 'leave_MVP_AFR_N', 'leave_MVP_AFR_inv_var_meta_beta', 'leave_MVP_AFR_inv_var_meta_sebeta', 'leave_MVP_AFR_inv_var_meta_p', 'leave_MVP

[Stage 10:>                                                         (0 + 1) / 1]

+--------------------------------------+----------+--------+---------------+---------------+--------+-------------+------------------+----------------------+--------+------------+------------+------------+
|studyId                               |chromosome|position|referenceAllele|alternateAllele|beta    |standardError|all_inv_var_meta_p|all_inv_var_meta_mlogp|fg_pval |MVP_EUR_pval|MVP_AFR_pval|MVP_HIS_pval|
+--------------------------------------+----------+--------+---------------+---------------+--------+-------------+------------------+----------------------+--------+------------+------------+------------+
|FINNGEN_R12_UKB_MVP_META_I9_HYPTENSESS|1         |11063   |T              |G              |1.54e+00|7.25e-01     |4.747e-05         |1                     |NA      |NA          |NA          |NA          |
|FINNGEN_R12_UKB_MVP_META_I9_HYPTENSESS|1         |13259   |G              |A              |1.92e-01|7.06e-01     |0.0002696         |1                     |NA      |NA        

                                                                                