# Principal Component Analysis

In [1]:
import hail as hl
# Start Hail with a large Spark driver heap and a user-owned temporary directory.
spark_settings = {'spark.driver.memory': '100g'}
hl.init(spark_conf=spark_settings, tmp_dir='/home/olavur/tmp')

2021-09-28 10:47:21 WARN  NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2021-09-28 10:47:21 WARN  Hail:37 - This Hail JAR was compiled for Spark 2.4.5, running with Spark 2.4.1.
  Compatibility is not guaranteed.
2021-09-28 10:47:22 WARN  Utils:66 - Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
2021-09-28 10:47:22 WARN  Utils:66 - Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
2021-09-28 10:47:22 WARN  Utils:66 - Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
2021-09-28 10:47:22 WARN  Utils:66 - Service 'SparkUI' could not bind on port 4043. Attempting port 4044.


Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-848846b477-8tkk6:4044
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/qc/hail-20210928-1047-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
# Route Bokeh output to the notebook so that show() renders figures inline.
output_notebook()

In [3]:
# Root of this experiment; the FarGen matrix table and all checkpoints are read/written under it.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'
# Location of shared reference resources; the gnomAD subset is read from here.
RESOURCES_DIR = '/data/other/resources'

## Load gnomAD data

**FIXME:** Load the `HIGH_QUALITY` file (gnomad.genomes.v3.1.hgdp_1kg_subset_dense_EXOME_HIGH_QUALITY.mt) when it's done. The cell below already reads this file, so confirm that it is complete and then remove this note.

In [4]:
# gnomAD v3.1 HGDP + 1KG subset (the EXOME_HIGH_QUALITY variant, per its file name).
gnomad_path = RESOURCES_DIR + '/gnomAD/hgdp_1kg_subset_dense/gnomad.genomes.v3.1.hgdp_1kg_subset_dense_EXOME_HIGH_QUALITY.mt'
gnomad_mt = hl.read_matrix_table(gnomad_path)

In [5]:
# count() returns (number of rows, number of columns), i.e. (variants, samples).
n_variants, n_samples = gnomad_mt.count()
print('Number of variants:', n_variants)
print('Number of samples:', n_samples)

Number of variants: 7094228
Number of samples: 3942


## Load FarGen exome data

Load the filtered, high-quality variants.

In [6]:
# FarGen exome matrix table containing the filtered, high-quality variants.
fargen_path = BASE_DIR + '/data/mt/high_quality_variants.mt/'
fargen_mt = hl.read_matrix_table(fargen_path)

In [7]:
# count() returns (number of rows, number of columns), i.e. (variants, samples).
n_variants, n_samples = fargen_mt.count()
print('Number of variants:', n_variants)
print('Number of samples:', n_samples)

Number of variants: 86300
Number of samples: 468


## Annotate population

The samples in the gnomAD data are annotated with population. Save these in a `pop` field.

In [8]:
# Copy the nested population label to a top-level `pop` column field.
inferred_pop = gnomad_mt.population_inference.pop
gnomad_mt = gnomad_mt.annotate_cols(pop=inferred_pop)

Count the number of samples in each population.

In [9]:
# Tally samples per population label; the bare name on the last line displays the result.
pop_counts = gnomad_mt.aggregate_cols(hl.agg.counter(gnomad_mt.pop))
pop_counts

{'nfe': 675,
 'fin': 92,
 'mid': 126,
 'oth': 47,
 'afr': 896,
 'eas': 801,
 'sas': 764,
 'amr': 541}

Make a `pop` parameter in the FarGen data as well.

In [10]:
# Tag every FarGen sample with the constant population label 'fae',
# mirroring the `pop` field created on the gnomAD samples.
fargen_mt = fargen_mt.annotate_cols(pop = 'fae')

## Merge datasets

Select only the `GT` and `pop` fields from both matrix tables, as these are the only ones we'll need.

In [11]:
# Reduce both tables to the same minimal schema: the GT entry field, the pop
# column field, and no row fields beyond the row key.
fargen_mt = fargen_mt.select_entries('GT').select_cols('pop').select_rows()
gnomad_mt = gnomad_mt.select_entries('GT').select_cols('pop').select_rows()

**NOTE:** Variants that are unique to either dataset are lost in this merge, because `union_cols` keeps only the sites that are present in both datasets. As such, the variance between populations can be under-estimated, as sites that are fixed in the Faroese population will not be included. However, the remaining sites should contain enough information to summarize "coarse grained" population structure.

In [12]:
# Concatenate the samples (columns) of both datasets. With its default inner
# row join, union_cols keeps only variants whose row key is present in both.
mt = fargen_mt.union_cols(gnomad_mt)

## Population filters

Discard multi-allelic sites.

In [13]:
# A site with exactly two alleles (reference + one alternate) is biallelic.
is_biallelic = hl.len(mt.alleles) == 2
mt = mt.filter_rows(is_biallelic)

**NOTE:** The HWE filter should remove genotyping errors, but not HWE deviations due to population structure. The threshold of $p > 10^{-9}$ used here is quite low, so the filter is quite lenient.

Remove variants failing the HWE test, i.e. keep only variants with $p > 10^{-9}$.

In [14]:
# Keep only sites whose Hardy-Weinberg test p-value exceeds this threshold.
hwe_p_min = 1e-9
mt = mt.annotate_rows(hwe=hl.agg.hardy_weinberg_test(mt.GT))
passes_hwe = mt.hwe.p_value > hwe_p_min
mt = mt.filter_rows(passes_hwe)

Calculate allele frequencies.

In [15]:
# AN: number of called alleles at a site, i.e. the ploidy summed over all
#     non-missing genotype calls. For diploid calls this is twice the number
#     of samples, and less when some calls are missing.
# AC: number of alternate alleles among those calls.
AN_exprs = hl.agg.sum(mt.GT.ploidy)
AC_exprs = hl.agg.sum(mt.GT.n_alt_alleles())
mt = mt.annotate_rows(AN=AN_exprs, AC=AC_exprs)

# AF: alternate allele frequency.
mt = mt.annotate_rows(AF=mt.AC / mt.AN)

**TODO:** I can increase the MAF threshold if still a lot of variants remain.

Remove variants with minor allele frequency under 0.05.

In [16]:
# AF is the alternate allele frequency, so the minor allele is common when
# AF lies strictly between maf_filter and 1 - maf_filter.
maf_filter = 0.05
af_above_min = mt.AF > maf_filter
af_below_max = mt.AF < (1 - maf_filter)
mt = mt.filter_rows(af_above_min & af_below_max)

## Filter indels

Remove all indels from the dataset.

**NOTE:** this code only works because there are only diallelic sites. If there were multi-allelic sites, I would have to check all allele pairs.

In [17]:
# Only the first two alleles are compared, which is sufficient because
# multi-allelic sites were discarded earlier.
ref_allele = mt.alleles[0]
alt_allele = mt.alleles[1]
mt = mt.filter_rows(hl.is_snp(ref_allele, alt_allele))

### Write to file

In [18]:
# Manual cache toggle: set to True to recompute the filtered table and
# overwrite the checkpoint; False re-reads the previously written checkpoint.
common_mt_path = BASE_DIR + '/data/resources/gnomAD/fargen_gnomad_union_common_hwe_diallelic.mt'
recompute_checkpoint = False
if recompute_checkpoint:
    mt = mt.checkpoint(common_mt_path, overwrite=True)
else:
    mt = hl.read_matrix_table(common_mt_path)

In [19]:
# Size of the merged, filtered dataset: (variants, samples).
n_variants, n_samples = mt.count()
print('Number of variants:', n_variants)
print('Number of samples:', n_samples)

Number of variants: 124299
Number of samples: 4410


**FIXME:** write the dataset to file to speed up the remaining analysis.

## LD pruning

**TODO:** tune r2 parameter

In [None]:
# LD-pruning parameters: r2 threshold and window size in base pairs.
ld_r2_max = 0.2
ld_window_bp = 500000
pruned_variant_table = hl.ld_prune(mt.GT, r2=ld_r2_max, bp_window_size=ld_window_bp)

# Keep only the variants whose row key appears in the pruned table.
is_independent = hl.is_defined(pruned_variant_table[mt.row_key])
mt = mt.filter_rows(is_independent)

2021-09-28 10:47:36 Hail: INFO: ld_prune: running local pruning stage with max queue size of 58204 variants

In [None]:
# Manual cache toggle: True recomputes the LD-pruned table and overwrites the
# checkpoint; set to False to re-read the previously written checkpoint.
indep_mt_path = BASE_DIR + '/data/resources/gnomAD/fargen_gnomad_union_common_hwe_diallelic_indep.mt'
recompute_checkpoint = True
if recompute_checkpoint:
    mt = mt.checkpoint(indep_mt_path, overwrite=True)
else:
    mt = hl.read_matrix_table(indep_mt_path)

In [None]:
# Size of the dataset after LD pruning: (variants, samples).
n_variants, n_samples = mt.count()
print('Number of variants:', n_variants)
print('Number of samples:', n_samples)

## PCA

In [None]:
# Number of principal components to compute.
n_pcs = 5
# NOTE(review): compute_loadings is left at its default, so `loadings` is
# presumably None here - confirm against the Hail documentation.
eigenvalues, scores, loadings = hl.hwe_normalized_pca(mt.GT, k=n_pcs)

In [None]:
# Look up each sample's row in the PCA scores table by sample ID and attach
# its score array as a column field.
pca_row = scores[mt.s]
mt = mt.annotate_cols(scores=pca_row.scores)

In [None]:
# Manual cache toggle: True writes the PCA-annotated table and overwrites the
# checkpoint; set to False to re-read the previously written checkpoint.
pca_mt_path = BASE_DIR + '/data/resources/gnomAD/fargen_gnomad_union_common_hwe_diallelic_indep_pca.mt'
recompute_checkpoint = True
if recompute_checkpoint:
    mt = mt.checkpoint(pca_mt_path, overwrite=True)
else:
    mt = hl.read_matrix_table(pca_mt_path)

In [None]:
# PC1 vs PC2, coloured by population, with the sample ID shown on hover.
pc_x, pc_y = mt.scores[0], mt.scores[1]
p = hl.plot.scatter(
    pc_x,
    pc_y,
    label=mt.pop,
    hover_fields={'Sample': mt.s},
    size=1,
    title='PCA',
    xlabel='PC1',
    ylabel='PC2',
)
p.plot_width, p.plot_height = 800, 600
show(p)

In [None]:
# PC1 vs PC3, coloured by population, with the sample ID shown on hover.
pc_x, pc_y = mt.scores[0], mt.scores[2]
p = hl.plot.scatter(
    pc_x,
    pc_y,
    label=mt.pop,
    hover_fields={'Sample': mt.s},
    size=1,
    title='PCA',
    xlabel='PC1',
    ylabel='PC3',
)
p.plot_width, p.plot_height = 800, 600
show(p)

In [None]:
# PC2 vs PC3, coloured by population, with the sample ID shown on hover.
pc_x, pc_y = mt.scores[1], mt.scores[2]
p = hl.plot.scatter(
    pc_x,
    pc_y,
    label=mt.pop,
    hover_fields={'Sample': mt.s},
    size=1,
    title='PCA',
    xlabel='PC2',
    ylabel='PC3',
)
p.plot_width, p.plot_height = 800, 600
show(p)