# PCA of the gnomAD data

In [2]:
import hail as hl
# Start Hail with an explicit Spark configuration: the driver memory limit
# and the directory Spark uses for its local files.
spark_settings = {
    'spark.driver.memory': '100g',
    'spark.local.dir': '/home/olavur/tmp',
}
hl.init(spark_conf=spark_settings)

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-7889d4ff4c-6wxtc:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/gnomad_genome_genotypes/hail-20210323-1224-0.2.61-3c86d3ba497a.log


In [3]:
# Root directory of this experiment; the filtered gnomAD MatrixTable is
# written to / read from a path below it.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'
# Directory holding external (non-FarGen) resources; the gnomAD input
# MatrixTable is read from a path below it.
RESOURCES_DIR = '/non-fargen/resources'

## Load gnomAD data

In [27]:
# Open the gnomAD MatrixTable stored under the resources directory.
gnomad_mt = hl.read_matrix_table(
    RESOURCES_DIR + '/gnomAD/gnomad.genomes.v3.1.hgdp_1kg_subset_dense_EXOME.mt'
)

In [6]:
# count() returns (number of rows, number of columns), i.e. variants and samples.
n_variants, n_samples = gnomad_mt.count()
print(f'Number of variants: {n_variants}')
print(f'Number of samples: {n_samples}')

Number of variants: 7094228
Number of samples: 3942


### Filter poor quality variants

Filter the variants based on the AC0 and RF filters, described on the [gnomAD website](https://gnomad.broadinstitute.org/faq#whats-the-difference-between-gnomad-v2-and-v3) as follows:

* AC0: The allele count is zero after filtering out low-confidence genotypes (GQ < 20; DP < 10; and AB < 0.2 for het calls)
* RF (gnomAD v2 only): Failed random forest filtering thresholds of 0.055 for exome SNVs, 0.206 for exome indels, 0.263 for genome SNVs, and 0.222 for genome indels


In [28]:
# Keep only variants that carry NEITHER the 'RF' nor the 'AC0' filter flag.
# The two negated conditions must be AND-ed: OR-ing them would retain every
# variant that fails just one of the two filters and only drop variants
# that fail both.
# NOTE(review): RF is described as a gnomAD v2-only filter while this is
# v3.1 data, so the 'RF' test is presumably a no-op here -- confirm which
# flags `filters` can actually contain.
no_rf_flag = ~gnomad_mt.filters.contains('RF')
no_ac0_flag = ~gnomad_mt.filters.contains('AC0')
gnomad_mt = gnomad_mt.filter_rows(no_rf_flag & no_ac0_flag)

## Population filters

Use only biallelic variants.

In [29]:
# Retain a row only when its `alleles` array has exactly two entries.
is_biallelic = hl.len(gnomad_mt.alleles) == 2
gnomad_mt = gnomad_mt.filter_rows(is_biallelic)

Remove variants failing the HWE test, i.e. those with $p<10^{-6}$ (variants with $p>10^{-6}$ are kept).

**FIXME:** the HWE threshold should be such that genotyping artifacts are removed, but population structure is kept.

In [30]:
# Compute the per-variant Hardy-Weinberg test from the genotype calls and
# store it as the row field `hwe`.
hwe_test = hl.agg.hardy_weinberg_test(gnomad_mt.GT)
gnomad_mt = gnomad_mt.annotate_rows(hwe=hwe_test)

# Keep variants whose HWE p-value is above the 1e-6 cut-off.
passes_hwe = gnomad_mt.hwe.p_value > 1e-6
gnomad_mt = gnomad_mt.filter_rows(passes_hwe)

Remove variants with minor allele frequency below 0.01.

In [31]:
# Frequency cut-off applied symmetrically at both ends of the [0, 1] range.
maf_filter = 0.01
# First entry of the AF values found under `gnomad_freq`.
# NOTE(review): presumably this is the overall (not per-population) allele
# frequency -- confirm against the dataset's frequency metadata.
AF_exprs = gnomad_mt.gnomad_freq.AF[0]

# Keep variants whose frequency lies strictly between the two bounds.
above_lower_bound = AF_exprs > maf_filter
below_upper_bound = AF_exprs < (1 - maf_filter)
gnomad_mt = gnomad_mt.filter_rows(above_lower_bound & below_upper_bound)

Write the filtered data to disk. This caches all expressions above so that we may run multiple expressions on the data faster.

In [None]:
# Deliberately disabled: flip the condition to re-write the filtered
# MatrixTable to disk and continue from the written copy.
if False:
    filtered_mt_path = BASE_DIR + '/data/resources/gnomAD/gnomad.genomes.v3.1.hgdp_1kg_subset_dense_EXOME_FILTERED.mt'
    gnomad_mt = gnomad_mt.checkpoint(filtered_mt_path)

In [35]:
# Load the previously written, filtered MatrixTable from the experiment directory.
gnomad_mt = hl.read_matrix_table(
    BASE_DIR + '/data/resources/gnomAD/gnomad.genomes.v3.1.hgdp_1kg_subset_dense_EXOME_FILTERED.mt'
)

In [36]:
# Report the size of the filtered dataset (rows = variants, columns = samples).
n_variants, n_samples = gnomad_mt.count()
print(f'Number of variants: {n_variants}')
print(f'Number of samples: {n_samples}')

Number of variants: 406173
Number of samples: 3942


## LD pruning

In [37]:
# LD-prune on the genotype calls with an r^2 threshold of 0.2 inside a
# 500,000 bp window; the result is a table of the variants that are kept.
pruned_variants_ht = hl.ld_prune(
    gnomad_mt.GT,
    r2=0.2,
    bp_window_size=500000,
)

2021-03-23 14:38:47 Hail: INFO: ld_prune: running local pruning stage with max queue size of 64777 variants
2021-03-23 15:18:17 Hail: INFO: wrote table with 245632 rows in 115375 partitions to /tmp/rRHLi2DBOLOOLdoScrX38A
    Total size: 13.18 MiB
    * Rows: 13.18 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  155 rows (5.13 KiB)


KeyboardInterrupt: 

In [None]:
# Keep only the rows whose key is present in the LD-pruned variant table.
gnomad_indep_mt = gnomad_mt.filter_rows(
    hl.is_defined(pruned_variants_ht[gnomad_mt.row_key])
)

In [None]:
# Report the size of the LD-pruned dataset (rows = variants, columns = samples).
n_variants, n_samples = gnomad_indep_mt.count()
print(f'Number of variants: {n_variants}')
print(f'Number of samples: {n_samples}')

## PCA

In [None]:
# HWE-normalized PCA on the LD-pruned genotype calls, requesting two components.
pca_result = hl.hwe_normalized_pca(gnomad_indep_mt.GT, k=2)
eigenvalues, scores, loadings = pca_result

In [None]:
# Plot the first two principal components.
# The PCA coordinates live in the `scores` table returned by
# hl.hwe_normalized_pca (field `scores`, index 0 = PC1, index 1 = PC2);
# no object named `mt` exists in this notebook.
# Hail's own plot helpers are used for display so that no extra import is
# needed: output_notebook() routes Bokeh output to the notebook cell and
# show() renders the figure.
hl.plot.output_notebook()
p = hl.plot.scatter(scores.scores[0],
                    scores.scores[1],
                    title='PCA', xlabel='PC1', ylabel='PC2',
                    width=800, height=600)
hl.plot.show(p)