# Principal Component Analysis

In [None]:
import hail as hl
# Spark settings for the Hail session: driver memory and the local scratch directory.
spark_settings = {
    'spark.driver.memory': '100g',
    'spark.local.dir': '/home/olavur/tmp',
}
hl.init(spark_conf=spark_settings)

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-7889d4ff4c-6wxtc:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/qc/hail-20210324-1015-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
# Configure Bokeh so that the show() calls further down render inline in the notebook.
output_notebook()

In [3]:
# Root directory of this analysis; the FarGen matrix table and the intermediate
# checkpoints are read from paths beneath it.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'
# Root directory of external resources; the gnomAD matrix table is read from beneath it.
RESOURCES_DIR = '/non-fargen/resources'

## Load gnomAD data

**FIXME:** Load the `HIGH_QUALITY` file (gnomad.genomes.v3.1.hgdp_1kg_subset_dense_EXOME_HIGH_QUALITY.mt) when it's done.

In [4]:
# Location of the gnomAD matrix table under the resources directory.
gnomad_mt_path = RESOURCES_DIR + '/gnomAD/gnomad.genomes.v3.1.hgdp_1kg_subset_dense_EXOME_HIGH_QUALITY.mt'
gnomad_mt = hl.read_matrix_table(gnomad_mt_path)

In [5]:
# count() yields (rows, columns); rows are unpacked as variants and columns as samples.
n_variants, n_samples = gnomad_mt.count()
print(f'Number of variants: {n_variants}')
print(f'Number of samples: {n_samples}')

Number of variants: 7094228
Number of samples: 3942


## Load FarGen exome data

Load filtered, high-quality variants.

In [6]:
# Location of the filtered FarGen matrix table under the analysis directory.
fargen_mt_path = BASE_DIR + '/data/mt/high_quality_variants.mt/'
fargen_mt = hl.read_matrix_table(fargen_mt_path)

In [7]:
# count() yields (rows, columns); rows are unpacked as variants and columns as samples.
n_variants, n_samples = fargen_mt.count()
print(f'Number of variants: {n_variants}')
print(f'Number of samples: {n_samples}')

Number of variants: 1332013
Number of samples: 474


## Annotate population

The samples in the gnomAD data are annotated with population. Save these in a `pop` field.

In [8]:
# Copy the nested population label up into a top-level column field named `pop`.
inferred_pop = gnomad_mt.population_inference.pop
gnomad_mt = gnomad_mt.annotate_cols(pop=inferred_pop)

Count the number of samples in each population.

In [9]:
# Tally samples per population label; the bare last expression is shown as the cell output.
samples_per_pop = hl.agg.counter(gnomad_mt.pop)
gnomad_mt.aggregate_cols(samples_per_pop)

{'nfe': 675,
 'fin': 92,
 'mid': 126,
 'oth': 47,
 'afr': 896,
 'eas': 801,
 'sas': 764,
 'amr': 541}

Make a `pop` parameter in the FarGen data as well.

In [10]:
# Every FarGen sample receives the same population label.
fargen_pop_label = 'fae'
fargen_mt = fargen_mt.annotate_cols(pop=fargen_pop_label)

## Merge datasets

Select only the `GT` and `pop` fields from both matrix tables, as these are the only ones we'll need.

In [11]:
# Reduce both datasets to the same minimal schema before merging: the GT entry
# field, the pop column field, and no row fields beyond the row key.
fargen_mt = (
    fargen_mt
    .select_entries('GT')
    .select_cols('pop')
    .select_rows()
)
gnomad_mt = (
    gnomad_mt
    .select_entries('GT')
    .select_cols('pop')
    .select_rows()
)

**NOTE:** Variants that are unique to either dataset are lost in this merge, because only sites present in both datasets are kept. As such, the variance between populations can be under-estimated, as sites that are fixed in the Faroese population will not be included. However, the shared sites should contain enough information to summarize "coarse grained" population structure.

In [12]:
# Append the gnomAD samples (columns) to the FarGen samples in a single matrix table.
# Variants that are not present in both datasets are dropped by this merge.
mt = fargen_mt.union_cols(gnomad_mt)

In [21]:
# Optional sanity check of the merged dimensions, left disabled.
# NOTE(review): presumably commented out because counting forces evaluation of the
# full merge — confirm, and delete this cell if it is no longer needed.
#n_variants, n_samples = mt.count()
#print('Number of variants: ' + str(n_variants))
#print('Number of samples: ' + str(n_samples))

## Population filters

Discard multi-allelic sites.

In [13]:
# A site is kept only when it has exactly two alleles (reference plus one alternate).
is_biallelic = hl.len(mt.alleles) == 2
mt = mt.filter_rows(is_biallelic)

**NOTE:** The HWE filter should remove genotyping errors, but not HWE deviations due to population structure. The p-value threshold of $10^{-6}$ used here is quite low, so the filter is quite lenient.

Remove variants failing the HWE test, i.e. those with $p \le 10^{-6}$; variants with $p > 10^{-6}$ are kept.

In [14]:
# Per-variant Hardy-Weinberg test aggregated over the genotype calls, stored as `hwe`.
hwe_result = hl.agg.hardy_weinberg_test(mt.GT)
mt = mt.annotate_rows(hwe=hwe_result)

# Keep only the variants whose HWE p-value is above the threshold.
passes_hwe = mt.hwe.p_value > 1e-6
mt = mt.filter_rows(passes_hwe)

Calculate allele frequencies.

In [15]:
# The number of alleles at the site is the sum of the ploidy at each site.
# This number should be twice the number of samples.
# If there are missing genotype calls, the number of alleles will be less.
# AN: number of called alleles at the site, i.e. the ploidy of each genotype call
#     summed over the samples. With no missing calls this is twice the number of
#     samples; missing genotype calls make it smaller.
# AC: number of alternate alleles at the site, summed over the samples.
mt = mt.annotate_rows(
    AN=hl.agg.sum(mt.GT.ploidy),
    AC=hl.agg.sum(mt.GT.n_alt_alleles()),
)

# AF: alternate allele frequency among the called alleles.
mt = mt.annotate_rows(AF=mt.AC / mt.AN)

**TODO:** I can increase the MAF threshold if a lot of variants still remain.

Remove variants with minor allele frequency under 0.05.

In [17]:
maf_filter = 0.05

# AF is the alternate allele frequency, and the minor allele can be either the
# reference or the alternate one, so the frequency is bounded on both sides.
above_lower_bound = mt.AF > maf_filter
below_upper_bound = mt.AF < (1 - maf_filter)
mt = mt.filter_rows(above_lower_bound & below_upper_bound)

### Write to file

In [18]:
# Checkpoint the merged and filtered matrix table so that later cells can reload it
# instead of recomputing the whole pipeline.
#
# The write is guarded by an existence check rather than a hard-coded `if False`:
# the following cell reads this path unconditionally, so on a fresh run (no
# checkpoint on disk yet) the file must be produced here. When the checkpoint
# already exists, nothing is written, which also avoids the error that writing
# to an existing path without `overwrite=True` would raise.
union_mt_path = BASE_DIR + '/data/resources/gnomAD/fargen_gnomad_union_common_hwe_diallelic.mt'
if not hl.hadoop_exists(union_mt_path):
    mt.write(union_mt_path)

2021-03-24 12:28:31 Hail: INFO: wrote matrix table with 111195 rows and 4416 columns in 115455 partitions to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/data/resources/gnomAD/fargen_gnomad_union_common_hwe_diallelic.mt
    Total size: 145.26 MiB
    * Rows/entries: 145.24 MiB
    * Columns: 18.41 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  66 rows (91.28 KiB)


In [23]:
# Manual toggle: when enabled, continue from the on-disk checkpoint rather than
# from the lazily evaluated pipeline built in the cells above.
reload_checkpoint = True
if reload_checkpoint:
    checkpoint_path = BASE_DIR + '/data/resources/gnomAD/fargen_gnomad_union_common_hwe_diallelic.mt'
    mt = hl.read_matrix_table(checkpoint_path)

In [24]:
# count() yields (rows, columns); rows are unpacked as variants and columns as samples.
n_variants, n_samples = mt.count()
print(f'Number of variants: {n_variants}')
print(f'Number of samples: {n_samples}')

Number of variants: 111195
Number of samples: 4416


**FIXME:** write the dataset to file to speed up the remaining analysis.

## LD pruning

**TODO:** tune r2 parameter

In [25]:
# LD-prune on the genotype calls with an r2 threshold of 0.2 and a 500 kb window;
# the result is a table of the variants to keep.
pruned_variant_table = hl.ld_prune(mt.GT, r2=0.2, bp_window_size=500000)

# Retain only the rows whose key is present in the pruned table.
is_retained = hl.is_defined(pruned_variant_table[mt.row_key])
mt = mt.filter_rows(is_retained)

2021-03-24 12:32:03 Hail: INFO: ld_prune: running local pruning stage with max queue size of 58154 variants
2021-03-24 13:12:42 Hail: INFO: wrote table with 55517 rows in 115455 partitions to /tmp/tch5ma5qXVBZoiUvmqdz02
    Total size: 5.73 MiB
    * Rows: 5.73 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  14 rows (581.00 B)
2021-03-24 13:32:15 Hail: INFO: Wrote all 28 blocks of 55517 x 4416 matrix with block size 4096.
2021-03-24 13:58:06 Hail: INFO: wrote table with 40776 rows in 27 partitions to /tmp/kkx6G9vxhaNNeKyOvb34qn
    Total size: 1.25 MiB
    * Rows: 552.71 KiB
    * Globals: 728.28 KiB
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  3540 rows (47.50 KiB)


In [26]:
# Checkpoint the LD-pruned matrix table so that the PCA cells can reload it
# instead of re-running the pruning.
#
# The write is guarded by an existence check rather than a hard-coded `if False`:
# the following cell reads this path unconditionally, so on a fresh run (no
# checkpoint on disk yet) the file must be produced here. When the checkpoint
# already exists, nothing is written, which also avoids the error that writing
# to an existing path without `overwrite=True` would raise.
pruned_mt_path = BASE_DIR + '/data/resources/gnomAD/fargen_gnomad_union_common_hwe_diallelic_indep.mt'
if not hl.hadoop_exists(pruned_mt_path):
    mt.write(pruned_mt_path)

2021-03-24 14:53:32 Hail: INFO: wrote matrix table with 36796 rows and 4416 columns in 115455 partitions to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/data/resources/gnomAD/fargen_gnomad_union_common_hwe_diallelic_indep.mt
    Total size: 55.71 MiB
    * Rows/entries: 55.69 MiB
    * Columns: 18.41 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  13 rows (21.78 KiB)


In [27]:
# Manual toggle: when enabled, continue from the on-disk LD-pruned checkpoint
# rather than from the lazily evaluated pruning result.
reload_pruned_checkpoint = True
if reload_pruned_checkpoint:
    pruned_checkpoint_path = BASE_DIR + '/data/resources/gnomAD/fargen_gnomad_union_common_hwe_diallelic_indep.mt'
    mt = hl.read_matrix_table(pruned_checkpoint_path)

In [28]:
# count() yields (rows, columns); rows are unpacked as variants and columns as samples.
n_variants, n_samples = mt.count()
print(f'Number of variants: {n_variants}')
print(f'Number of samples: {n_samples}')

Number of variants: 36796
Number of samples: 4416


## PCA

In [None]:
# Number of principal components to compute.
n_components = 5

# PCA on the HWE-normalized genotype calls.
# NOTE(review): loadings are presumably None unless compute_loadings=True is passed — confirm.
eigenvalues, scores, loadings = hl.hwe_normalized_pca(mt.GT, k=n_components)

2021-03-24 15:02:46 Hail: INFO: hwe_normalized_pca: running PCA using 36796 variants.
2021-03-24 15:09:53 Hail: INFO: pca: running PCA with 5 components...


In [None]:
# Look up each sample's PCA scores by sample ID and attach them as a column field.
sample_scores = scores[mt.s].scores
mt = mt.annotate_cols(scores=sample_scores)

In [41]:
# PC1 against PC2, labelled by population; hovering over a point shows its sample ID.
pc1 = mt.scores[0]
pc2 = mt.scores[1]
p = hl.plot.scatter(
    pc1,
    pc2,
    label=mt.pop,
    title='PCA',
    xlabel='PC1',
    ylabel='PC2',
    size=1,
    hover_fields={'Sample': mt.s},
)
p.plot_width, p.plot_height = 800, 600
show(p)

In [42]:
# PC1 against PC3, labelled by population; hovering over a point shows its sample ID.
# scores[2] is the third component, so the y-axis is labelled PC3 (it previously
# carried a copy-pasted 'PC2' label).
p = hl.plot.scatter(mt.scores[0],
                    mt.scores[2],
                    hover_fields={'Sample': mt.s},
                    size=1,
                    label=mt.pop,
                    title='PCA', xlabel='PC1', ylabel='PC3')
p.plot_width = 800
p.plot_height = 600
show(p)

In [43]:
# PC2 against PC3, labelled by population; hovering over a point shows its sample ID.
# scores[1] and scores[2] are the second and third components, so the axes are
# labelled PC2 and PC3 (they previously carried copy-pasted 'PC1'/'PC2' labels).
p = hl.plot.scatter(mt.scores[1],
                    mt.scores[2],
                    hover_fields={'Sample': mt.s},
                    size=1,
                    label=mt.pop,
                    title='PCA', xlabel='PC2', ylabel='PC3')
p.plot_width = 800
p.plot_height = 600
show(p)