# Principal Component Analysis

In [None]:
import hail as hl
# Start Hail with a 100 GB Spark driver and a user-local temporary directory.
hl.init(
    tmp_dir='/home/olavur/tmp',
    spark_conf={'spark.driver.memory': '100g'},
)

In [None]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
# Configure Bokeh so that subsequent show() calls render inline in the notebook.
output_notebook()

In [None]:
# Shared reference resources (the gnomAD matrix table is read from here).
RESOURCES_DIR = '/data/other/resources'
# Root of this experiment: FarGen input data and the checkpoints written below.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'

## Load gnomAD data

**FIXME:** Load the `HIGH_QUALITY` file (gnomad.genomes.v3.1.hgdp_1kg_subset_dense_EXOME_HIGH_QUALITY.mt) when it's done.

In [None]:
# gnomAD v3.1 HGDP + 1KG dense subset; per its file name, already restricted
# to exome regions and high-quality variants.
gnomad_mt = hl.read_matrix_table(
    RESOURCES_DIR
    + '/gnomAD/hgdp_1kg_subset_dense/gnomad.genomes.v3.1.hgdp_1kg_subset_dense_EXOME_HIGH_QUALITY.mt'
)

In [None]:
# count() yields (rows, columns), i.e. (variants, samples).
n_variants, n_samples = gnomad_mt.count()
print('Number of variants: ', n_variants, sep='')
print('Number of samples: ', n_samples, sep='')

## Load FarGen exome data

Load the filtered, high-quality variants.

In [None]:
# FarGen exome matrix table from this experiment's data directory.
fargen_mt = hl.read_matrix_table(
    BASE_DIR + '/data/mt/high_quality_variants.mt/'
)

In [None]:
# count() yields (rows, columns), i.e. (variants, samples).
n_variants, n_samples = fargen_mt.count()
print('Number of variants: ', n_variants, sep='')
print('Number of samples: ', n_samples, sep='')

## Annotate population

The samples in the gnomAD data are annotated with population. Save these in a `pop` field.

In [None]:
# Copy the nested population label into a top-level `pop` column field.
gnomad_mt = gnomad_mt.annotate_cols(
    pop=gnomad_mt.population_inference.pop,
)

Count the number of samples in each population.

In [None]:
# Samples per population label; left as the cell's final expression so the
# notebook displays the result.
gnomad_mt.aggregate_cols(
    hl.agg.counter(gnomad_mt.pop)
)

Make a `pop` parameter in the FarGen data as well.

In [None]:
# Every FarGen sample gets the same population label, 'fae'.
fargen_mt = fargen_mt.annotate_cols(
    pop='fae',
)

## Merge datasets

Select only the `GT` and `pop` fields from both matrix tables, as these are the only ones we'll need.

In [None]:
# Reduce both matrix tables to the same minimal schema: the GT entry field,
# the pop column field, and no row fields beyond the row key.
fargen_mt = (
    fargen_mt
    .select_entries('GT')
    .select_cols('pop')
    .select_rows()
)
gnomad_mt = (
    gnomad_mt
    .select_entries('GT')
    .select_cols('pop')
    .select_rows()
)

**NOTE:** Variants that are unique to either dataset are lost in this merge, because only sites present in both datasets are kept. As such, the variance between populations can be under-estimated, as sites that are fixed in the Faroese population will not be included. However, the remaining sites should contain enough information to summarize "coarse grained" population structure.

In [None]:
# Combine the samples (columns) of both datasets into a single matrix table.
mt = fargen_mt.union_cols(
    gnomad_mt,
)

## Population filters

Discard multi-allelic sites.

In [None]:
# Keep rows with exactly two alleles (reference plus one alternate).
mt = mt.filter_rows(
    hl.len(mt.alleles) == 2,
    keep=True,
)

**NOTE:** The HWE filter should remove genotyping errors, but not HWE deviations due to population structure. The threshold of $p > 10^{-9}$ used here is quite low, so the filter is quite lenient.

Remove variants failing the HWE test, i.e. keep only variants with $p > 10^{-9}$.

In [None]:
# Per-variant Hardy-Weinberg test, aggregated over every sample's genotype.
mt = mt.annotate_rows(
    hwe=hl.agg.hardy_weinberg_test(mt.GT),
)
# Keep only the variants whose HWE p-value exceeds 1e-9.
mt = mt.filter_rows(
    mt.hwe.p_value > 1e-9,
    keep=True,
)

Calculate allele frequencies.

In [None]:
# The number of alleles at the site is the sum of the ploidy at each site.
# This number should be twice the number of samples.
# If there are missing genotype calls, the number of alleles will be less.
# AN: total number of called alleles at the site, i.e. the summed ploidy of
# the genotype calls. For diploid calls this is twice the number of samples,
# and it is lower when some genotype calls are missing.
AN_exprs = hl.agg.sum(mt.GT.ploidy)
# AC: number of alternate alleles observed at the site.
AC_exprs = hl.agg.sum(mt.GT.n_alt_alleles())
mt = mt.annotate_rows(AN=AN_exprs, AC=AC_exprs)

# AF: alternate allele frequency.
mt = mt.annotate_rows(AF=mt.AC / mt.AN)

**TODO:** I can increase the MAF threshold if a lot of variants still remain.

Remove variants with minor allele frequency under 0.05.

In [None]:
# Minimum minor allele frequency; AF must lie strictly between this value
# and its complement for the variant to be kept.
maf_filter = 0.05
mt = mt.filter_rows(
    (mt.AF < (1 - maf_filter)) & (mt.AF > maf_filter),
    keep=True,
)

## Filter indels

Remove all indels from the dataset.

**NOTE:** this code only works because there are only diallelic sites. If there were multi-allelic sites, I would have to check all allele pairs.

In [None]:
# Keep only rows whose reference/alternate allele pair is a SNP.
mt = mt.filter_rows(
    hl.is_snp(mt.alleles[0], mt.alleles[1]),
    keep=True,
)

### Write to file

In [None]:
# Manual toggle: leave as True to compute and write the filtered dataset,
# set to False to reload the previously written copy instead.
if True:
    mt = mt.checkpoint(
        BASE_DIR + '/data/resources/gnomAD/fargen_gnomad_union_common_hwe_diallelic.mt',
        overwrite=True,
    )
else:
    mt = hl.read_matrix_table(
        BASE_DIR + '/data/resources/gnomAD/fargen_gnomad_union_common_hwe_diallelic.mt'
    )

In [None]:
# count() yields (rows, columns), i.e. (variants, samples).
n_variants, n_samples = mt.count()
print('Number of variants: ', n_variants, sep='')
print('Number of samples: ', n_samples, sep='')

**FIXME:** write the dataset to file to speed up the remaining analysis.

## LD pruning

**TODO:** tune r2 parameter

In [None]:
# Table of variants retained by LD pruning (r2 threshold 0.2, 500 kb window).
pruned_variant_table = hl.ld_prune(
    mt.GT,
    r2=0.2,
    bp_window_size=500_000,
)
# Keep only the rows whose key is present in the pruned table.
mt = mt.filter_rows(
    hl.is_defined(pruned_variant_table[mt.row_key]),
    keep=True,
)

In [None]:
# Manual toggle: leave as True to compute and write the LD-pruned dataset,
# set to False to reload the previously written copy instead.
if True:
    mt = mt.checkpoint(
        BASE_DIR + '/data/resources/gnomAD/fargen_gnomad_union_common_hwe_diallelic_indep.mt',
        overwrite=True,
    )
else:
    mt = hl.read_matrix_table(
        BASE_DIR + '/data/resources/gnomAD/fargen_gnomad_union_common_hwe_diallelic_indep.mt'
    )

In [None]:
# count() yields (rows, columns), i.e. (variants, samples).
n_variants, n_samples = mt.count()
print('Number of variants: ', n_variants, sep='')
print('Number of samples: ', n_samples, sep='')

## PCA

In [None]:
# HWE-normalized PCA on the genotypes, keeping the first five components.
# Loadings are not requested, so only the eigenvalues and per-sample scores
# carry results.
eigenvalues, scores, loadings = hl.hwe_normalized_pca(
    mt.GT,
    k=5,
    compute_loadings=False,
)

In [None]:
# Look up each sample's PCA score array by sample ID and attach it as the
# `scores` column field.
mt = mt.annotate_cols(
    scores=scores[mt.s]['scores'],
)

In [None]:
# Manual toggle: leave as True to write the dataset with PCA scores attached,
# set to False to reload the previously written copy instead.
if True:
    mt = mt.checkpoint(
        BASE_DIR + '/data/resources/gnomAD/fargen_gnomad_union_common_hwe_diallelic_indep_pca.mt',
        overwrite=True,
    )
else:
    mt = hl.read_matrix_table(
        BASE_DIR + '/data/resources/gnomAD/fargen_gnomad_union_common_hwe_diallelic_indep_pca.mt'
    )

In [None]:
# First against second principal component, one point per sample, coloured
# by population label.
p = hl.plot.scatter(
    x=mt.scores[0],
    y=mt.scores[1],
    label=mt.pop,
    title='PCA',
    xlabel='PC1',
    ylabel='PC2',
    size=1,
    hover_fields={'Sample': mt.s},
)
p.plot_width = 800
p.plot_height = 600
show(p)

In [None]:
# First against third principal component, one point per sample, coloured by
# population label. scores[2] is the third component, so the y axis is
# labelled PC3 (it was previously mislabelled PC2).
p = hl.plot.scatter(mt.scores[0],
                    mt.scores[2],
                    hover_fields={'Sample': mt.s},
                    size=1,
                    label=mt.pop,
                    title='PCA', xlabel='PC1', ylabel='PC3')
p.plot_width = 800
p.plot_height = 600
show(p)

In [None]:
# Second against third principal component, one point per sample, coloured by
# population label. scores[1] and scores[2] are the second and third
# components, so the axes are labelled PC2 and PC3 (they were previously
# mislabelled PC1 and PC2).
p = hl.plot.scatter(mt.scores[1],
                    mt.scores[2],
                    hover_fields={'Sample': mt.s},
                    size=1,
                    label=mt.pop,
                    title='PCA', xlabel='PC2', ylabel='PC3')
p.plot_width = 800
p.plot_height = 600
show(p)