# Variance Explained

The goal of this notebook is to add a `varianceExplained` column, calculated as
`chi2.cdf(pValue, df=1) / nSamples`


In [None]:
import polars as pl
from scipy.stats import chi2


### Load the dataset from previous notebook

In [4]:
# Read all parquet files matching the glob into one polars DataFrame.
lead_variant_maf_dataset = pl.read_parquet("lead-maf-vep/*.parquet")


In [5]:
# (rows, columns) of the loaded dataset.
lead_variant_maf_dataset.shape


(2622098, 19)

In [4]:
# Preview the first rows to inspect the columns and their dtypes.
lead_variant_maf_dataset.head()


studyId,studyLocusId,variantId,beta,zScore,pValueMantissa,pValueExponent,standardError,finemappingMethod,studyType,credibleSetSize,nCases,nControls,nSamples,majorPopulation,majorPopulationMAF,majorPopulationAlleleFrequency,vepEffect
str,str,str,f64,f64,f32,i32,f64,str,str,i64,i32,i32,i32,struct[2],f64,list[struct[2]],struct[3]
"""gtex_txrev_prostate_ensg000001…","""9cf8bb4583ab369310742a50dee370…","""13_45342674_C_T""",-1.21864,,4.021,-7,0.232593,"""SuSie""","""tuqtl""",30,,,218,"{""nfe"",0.0}",0.000559,"[{""nfe_adj"",0.000559}]","{""intron_variant"",0.1,null}"
"""schmiedel_2018_tx_th2_memory_e…","""998d6b254b25f87d2b500e587ad60b…","""13_45342825_C_T""",1.6927,,4.536,-11,0.219494,"""SuSie""","""eqtl""",31,,,89,"{""nfe"",0.0}",0.112651,"[{""nfe_adj"",0.112651}]","{""intron_variant"",0.1,null}"
"""gtex_txrev_uterus_ensg00000170…","""4666d5e4fb13431c5937e45720730e…","""13_45343039_A_T""",0.935377,,1.71,-7,0.167887,"""SuSie""","""tuqtl""",22,,,129,"{""nfe"",0.0}",0.112192,"[{""nfe_adj"",0.112192}]","{""intron_variant"",0.1,null}"
"""gtex_txrev_muscle_ensg00000170…","""26633dba587823b0add355f3e67881…","""13_45343765_C_T""",0.521832,,2.387,-4,0.141282,"""SuSie""","""tuqtl""",8,,,702,"{""nfe"",0.0}",0.020081,"[{""nfe_adj"",0.020081}]","{""intron_variant"",0.1,null}"
"""phlips_exon_ipsc_ensg000001709…","""bece5b71efb8e16685c1318cac9864…","""13_45344247_T_TTTTTA""",-0.67395,,8.559,-6,0.139961,"""SuSie""","""eqtl""",1,,,83,"{""nfe"",0.0}",0.369299,"[{""nfe_adj"",0.630701}]","{""intron_variant"",0.1,null}"


In [6]:
# NOTE: varianceExplained is normalised by nSamples, so rows whose nSamples is
# missing (null) are dropped before the calculation.
#
# Columns added:
#   pValue            - p-value rebuilt from its mantissa/exponent parts
#   cdf               - chi2.cdf(pValue, df=1)
#   varianceExplained - cdf / nSamples
df = (
    lead_variant_maf_dataset.filter(pl.col("nSamples").is_not_null())
    .with_columns(
        # p-value = mantissa * 10^exponent (NOT (mantissa * 10)^exponent).
        # A float base keeps negative integer exponents well defined; very
        # small p-values (exponent below roughly -300) underflow to 0.0.
        (
            pl.col("pValueMantissa").cast(pl.Float64)
            * pl.lit(10.0).pow(pl.col("pValueExponent"))
        ).alias("pValue"),
    )
    .with_columns(
        # Apply the chi-square CDF (1 degree of freedom) to the p-value itself.
        # NOTE(review): the chi-square statistic matching a p-value is usually
        # obtained with chi2.isf(p, df=1); chi2.cdf is kept here because it is
        # the formula this notebook documents - confirm which one is intended.
        pl.col("pValue")
        .map_elements(lambda p: float(chi2.cdf(p, df=1)), return_dtype=pl.Float64)
        .alias("cdf"),
    )
    .with_columns(
        (pl.col("cdf") / pl.col("nSamples")).alias("varianceExplained"),
    )
)


In [None]:
# Persist the dataset, including the added pValue, cdf and varianceExplained
# columns, as a single parquet file.
df.write_parquet("lead-maf-vep-ve.parquet")


In [None]:
# Row-count difference between the filtered and the original dataset
# (negative means rows were dropped by the nSamples filter).
df.height - lead_variant_maf_dataset.height


-1050

Exactly 1050 rows have no `nSamples`, so `varianceExplained` cannot be calculated for them and they are dropped.