# Run L2G for FinnGen fine mapping results


In [None]:
!pip3 install wandb # TODO: check toml

In [1]:
import pyspark.sql.functions as f
from pyspark.sql.types import DoubleType, IntegerType, StringType

from xgboost.spark import SparkXGBClassifier

from otg.common.session import Session
from otg.config import LocusToGeneConfig
from otg.dataset.colocalisation import Colocalisation
from otg.dataset.l2g.feature_matrix import L2GFeatureMatrix
from otg.dataset.l2g.gold_standard import L2GGoldStandard
from otg.dataset.l2g.predictions import L2GPredictions
from otg.dataset.study_index import StudyIndex
from otg.dataset.study_locus import StudyLocus
from otg.dataset.study_locus_overlap import StudyLocusOverlap
from otg.dataset.v2g import V2G
from otg.method.locus_to_gene import LocusToGeneModel, LocusToGeneTrainer

In [None]:
session = Session()

## Read data and initialise respective datasets

In [3]:
finngen_assocs_path = "gs://ot-team/dsuveges/finngen/2023.10.06_PICSed"
v2g_path = "gs://genetics_etl_python_playground/XX.XX/output/python_etl/parquet/v2g" # old v2g
gold_standards_path = "gs://genetics_etl_python_playground/input/l2g/gold_standard/curation.json"
overlaps_path = "gs://genetics_etl_python_playground/output/python_etl/parquet/XX.XX/study_locus_overlap"
interactions_path = "gs://genetics_etl_python_playground/input/l2g/interaction" # 23.09 data
# paths to data that is not used (related to coloc feature factory)
studies_path = "gs://genetics_etl_python_playground/output/python_etl/parquet/XX.XX/catalog_study_index"
coloc_path = "gs://genetics_etl_python_playground/XX.XX/output/python_etl/parquet/coloc_ecaviar_all"

In [4]:
sl = StudyLocus.from_parquet(session, finngen_assocs_path)
sl.df.show(1, False, True)

[Stage 0:>                                                          (0 + 1) / 1]

-RECORD 0---------------------------------------------------------------
 studyLocusId                     | 5242723067793949472                 
 variantId                        | 20_35437976_G_A                     
 chromosome                       | 20                                  
 position                         | 35437976                            
 studyId                          | FINNGEN_R9_HEIGHT_IRN               
 beta                             | -0.0551669                          
 oddsRatio                        | null                                
 oddsRatioConfidenceIntervalLower | null                                
 oddsRatioConfidenceIntervalUpper | null                                
 betaConfidenceIntervalLower      | -0.057027749999999995               
 betaConfidenceIntervalUpper      | -0.05330605                         
 pValueMantissa                   | 3.811                               
 pValueExponent                   | -193           

                                                                                

In [6]:
# v2g data is old, i need to slightly convert it to conform to the schema

v2g_df = (
    session.spark.read.parquet(v2g_path)
    .withColumn("resourceScore", f.col("resourceScore").cast(DoubleType()))
    .withColumn("position", f.col("resourceScore").cast(IntegerType()))
    .withColumn("chromosome", f.col("resourceScore").cast(StringType()))
)

v2g = V2G(_df=v2g_df, _schema=V2G.get_schema())
v2g.df.show(1, False, True)



-RECORD 0-----------------------------------------
 geneId                         | ENSG00000279973 
 variantId                      | 22_10633788_G_A 
 distance                       | 432630          
 datatypeId                     | distance        
 datasourceId                   | canonical_tss   
 score                          | 0.136           
 pmid                           | null            
 biofeature                     | null            
 resourceScore                  | null            
 position                       | null            
 label                          | null            
 variantFunctionalConsequenceId | null            
 isHighQualityPlof              | null            
 chromosome                     | null            
only showing top 1 row



                                                                                

In [8]:
## PARSING GOLD STANDARDS

study_locus_overlap = StudyLocusOverlap.from_parquet(
                session, overlaps_path
            )
interactions = session.spark.read.parquet(interactions_path)
gs_curation = session.spark.read.json(gold_standards_path)

gold_standards = L2GGoldStandard.from_curation(
                gold_standard_curation=gs_curation,
                v2g=v2g,
                study_locus_overlap=study_locus_overlap,
                interactions=interactions,
            )

gold_standards.df.show(1, False, True)



-RECORD 0-------------------------------
 studyLocusId    | -4063386906864236882 
 geneId          | ENSG00000100344      
 goldStandardSet | positive             
only showing top 1 row



                                                                                

In [16]:
## CREATING FEATURE MATRIX

studies = StudyIndex.from_parquet(session, studies_path)
coloc = Colocalisation.from_parquet(session, coloc_path)

fm = L2GFeatureMatrix.generate_features(
                study_locus=sl,
                study_index=studies,
                variant_gene=v2g,
                colocalisation=coloc,
            )

fm.df.filter(~f.col("distanceTssMean").isNull()).show(1, False, True)

                                                                                

root
 |-- studyLocusId: long (nullable = true)
 |-- geneId: string (nullable = true)
 |-- distanceTssMean: float (nullable = true)
 |-- distanceTssMinimum: float (nullable = true)





-RECORD 0----------------------------------
 studyLocusId       | -6495574247332038438 
 geneId             | ENSG00000100249      
 distanceTssMean    | 253151.0             
 distanceTssMinimum | 253151.0             
only showing top 1 row



                                                                                

## Join gold standard with feature matrix

Gold standard: Curated pairs of study/locus/gene pairs.
Feature matrix: Our study locus associations enriched with functional genomics features

We build a feature matrix based on all associations in OTG.
This is joined with the Gold Standard, which is supposed to be a subset of the associations dataset.

In [17]:
data = L2GFeatureMatrix(
                _df=gold_standards.df.join(
                    fm.df, on=["studyLocusId", "geneId"], how="inner"
                ).transform(L2GFeatureMatrix.fill_na),
                _schema=L2GFeatureMatrix.get_schema(),
            )

data.df.show(1, False, True)

[Stage 224:>                                                        (0 + 1) / 1]

(0 rows)



                                                                                

In [18]:
data.df.printSchema()

root
 |-- studyLocusId: long (nullable = false)
 |-- geneId: string (nullable = true)
 |-- goldStandardSet: string (nullable = false)
 |-- distanceTssMean: float (nullable = false)
 |-- distanceTssMinimum: float (nullable = false)



## Train XGBoost