In [None]:
import os
import sys
import subprocess
import hail as hl
from pyspark.sql import SparkSession

In [None]:
# Build a Hive-enabled Spark session (re-using a running one if present) and
# start Hail on that session's SparkContext.
builder = SparkSession.builder.enableHiveSupport()
spark = builder.getOrCreate()
hl.init(sc=spark.sparkContext)

# Select GRCh38 as Hail's default reference genome for the rest of the notebook.
# NOTE(review): the loadings archive read below has "grch37" in its file name —
# confirm GRCh38 is really the build of the loadings and of the BGEN subsets.
hl.default_reference("GRCh38")

print("Hail version:", hl.__version__)

In [None]:
!tar -xzf /mnt/projects/dcm_pgs/gnomad.v3.1.pca_loadings_grch37.ht.tar.gz

In [None]:
# Read the extracted loadings table from local disk and key it by variant
# (locus + alleles).
loadings = (
    hl.read_table("file:///opt/notebooks/gnomad.v3.1.pca_loadings_grch37.ht")
    .key_by("locus", "alleles")
)

# Quick sanity check: print the schema, then the first three rows.
loadings.describe()
loadings.show(3)

In [None]:
# BGEN files: one pre-subset file per autosome, all sharing one sample file.
# NOTE(review): hl.import_bgen requires an index for every BGEN (built with
# hl.index_bgen); nothing in this notebook creates one — confirm they exist.
# NOTE(review): unlike the loadings path, these two paths carry no "file://"
# scheme — confirm they resolve on the cluster's default filesystem.
bgen_dir = "/mnt/project/dcm_pgs/pca_variants"
sample_file = "/Bulk/Imputation/UKB imputation from genotype/ukb22828_c1_b0_v3.sample"

mts = []
for c in range(1, 23):
    # Directory and file name need an explicit "/" between them; plain string
    # concatenation produced ".../pca_variantsdcm_pca_chr1_subset.bgen".
    mt_chr = hl.import_bgen(
        f"{bgen_dir}/dcm_pca_chr{c}_subset.bgen",
        sample_file=sample_file,
        entry_fields=["dosage"]
    )
    mts.append(mt_chr)

# Stack all 22 per-chromosome matrix tables row-wise in one n-ary union rather
# than a chain of 21 nested binary unions.
mt = hl.MatrixTable.union_rows(*mts)

In [None]:
# Annotate rows with loadings and filter to intersection.
#
# The join is done on locus only. `loadings` is keyed by (locus, alleles), and
# indexing a Hail table returns just its non-key fields, so an exact-key join
# would leave `mt.l` without an `alleles` field and could never match a site
# whose two alleles are listed in the opposite order. Re-keying by locus keeps
# the loadings' own alleles available as `mt.l.alleles`, which the
# allele-alignment cell below compares against `mt.alleles`.
# NOTE(review): if the loadings table holds several rows at one locus, the
# lookup returns only one of them — confirm the table is biallelic-only.
loadings_by_locus = loadings.key_by("locus")
mt = mt.annotate_rows(l = loadings_by_locus[mt.locus])
mt = mt.filter_rows(hl.is_defined(mt.l))

In [None]:
# IMPORTANT: `mt.dosage` has to be the dosage of the ALT allele, mt.alleles[1].

# Compare this dataset's allele order with the allele order stored on the
# loadings annotation: identical, or exactly reversed.
same = mt.alleles == mt.l.alleles
flip = mt.alleles == mt.l.alleles[::-1]

# HWE normalisation terms from the loadings' allele frequency `pca_af`:
# mean 2p and standard deviation sqrt(2p(1 - p)).
# NOTE(review): these expressions are tied to `mt` as it exists on this line;
# `mt` is re-assigned twice further down in this cell.
p = mt.l.pca_af
mu = 2.0 * p
sigma = hl.sqrt(2.0 * p * (1.0 - p))

# Same order     -> keep the dosage as is.
# Reversed order -> count the other allele instead (2 - dosage).
# Anything else  -> missing.
dos = hl.float64(mt.dosage)
dos_aligned = hl.if_else(
    same,
    dos,
    hl.if_else(
        flip,
        2.0 - dos,
        hl.missing(hl.tfloat64),
    ),
)

# Store the aligned dosage as entry field `dos`, then drop every entry for
# which it is missing.
mt = mt.annotate_entries(dos=dos_aligned)
mt = mt.filter_entries(hl.is_defined(mt.dos))

In [None]:
# Standardise the aligned dosage (mt.dos, not mt.dosage):
#     x = (dos - 2p) / sqrt(2p(1 - p))
# The frequency terms are rebuilt from the *current* `mt`. Hail refuses to
# combine expressions that originate from different MatrixTable objects, and
# `mt` has been re-assigned since `mu` / `sigma` were first computed, so using
# the earlier expressions together with `mt.dos` raises a source-mismatch error.
p = mt.l.pca_af
mu = 2.0 * p
sigma = hl.sqrt(2.0 * p * (1.0 - p))
x = (mt.dos - mu) / sigma

In [None]:
# Project: each sample's score vector is the sum, over variants, of the
# standardised dosage times that variant's loadings vector.
# NOTE(review): Hail's hl.experimental.pc_project additionally scales by
# 1/sqrt(n_variants) and excludes sites whose af is 0 or 1 — confirm whether
# this projection needs to match it.
mt = mt.annotate_entries(x = x)

mt = mt.annotate_entries(contrib = mt.x * mt.l.loadings)
mt = mt.annotate_cols(scores = hl.agg.array_sum(mt.contrib))

# Select from the columns table itself: `mt.scores` is an expression on the
# MatrixTable, and using it inside a select on the separate Table returned by
# mt.cols() fails with a source mismatch.
cols_ht = mt.cols()
pcs = cols_ht.select("scores")

# take() returns plain Python values, so `.loadings` is a Python list here and
# its size comes from len(); lists have no .length() method.
K = len(loadings.take(1)[0].loadings)

# Spread the scores array into one column per component (PC1..PCK).
pcs = pcs.annotate(**{f"PC{i+1}": pcs.scores[i] for i in range(K)}).drop("scores")


In [None]:
# Write the projected PCs as a block-gzipped TSV, then copy that file to the
# project directory on the local filesystem.
# NOTE(review): the export path has no scheme, so where it lands depends on the
# cluster's default filesystem; also confirm that /mnt/project is writable from
# this environment.
exported_path = "ukb_gnomad_projected_pcs.tsv.bgz"
project_copy = "file:///mnt/project/dcm_pgs/ukb_gnomad_projected_pcs.tsv.bgz"

pcs.export(exported_path)
hl.hadoop_copy(exported_path, project_copy)