# Principal Component Analysis

In [1]:
import hail as hl
hl.init(spark_conf={'spark.driver.memory': '10g', 'spark.local.dir': '/home/olavur/tmp'})

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-7889d4ff4c-6wxtc:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/qc/hail-20210310-1448-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
output_notebook()

In [3]:
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'

## Load 1000 Genomes data

In [4]:
hl.utils.get_1kg(BASE_DIR + '/data/1kG_subset/')

2021-03-10 14:48:06 Hail: INFO: 1KG files found


In [5]:
kg_mt = hl.read_matrix_table(BASE_DIR + '/data/1kG_subset/1kg.mt')

In [6]:
n_variants, n_samples = kg_mt.count()
print('Number of variants: ' + str(n_variants))
print('Number of samples: ' + str(n_samples))

Number of variants: 10879
Number of samples: 284


Convert the dataset from reference genom GRCh37 to GRCh38.

In [7]:
# Create a liftover from GRCh37 to GRCh38.
rg37 = hl.get_reference('GRCh37')
rg38 = hl.get_reference('GRCh38')
rg37.add_liftover(BASE_DIR + '/data/resources/liftover/grch37_to_grch38.over.chain.gz', rg38)

In [8]:
# Define the locus in GRCh38.
kg_mt = kg_mt.annotate_rows(new_locus=hl.liftover(kg_mt.locus, 'GRCh38'))
# Remove sites where the new locus isn't defined.
kg_mt = kg_mt.filter_rows(hl.is_defined(kg_mt.new_locus))  
# Replace the loci by the new loci, and key the rows by locus and alleles.
# NOTE: the FarGen exome dataset is keyed by both the locus and the alleles, and it is important that the 1kG
# dataset is keyed by the same fields.
kg_mt = kg_mt.key_rows_by(locus=kg_mt.new_locus, alleles=kg_mt.alleles)  

**TODO:** filter variants based on depth/GQ?

## Load FarGen exome data

Load filtered, high-quality, variants.

In [9]:
fargen_mt = hl.read_matrix_table(BASE_DIR + '/data/mt/high_quality_variants.mt/')

In [10]:
n_variants, n_samples = fargen_mt.count()
print('Number of variants: ' + str(n_variants))
print('Number of samples: ' + str(n_samples))

Number of variants: 1194405
Number of samples: 474


## Drop unconcordant fields

We cannot merge the datasets if there are row/column/entry fields that are not in both datasets, so we simply drop all these.

**NOTE:** Many of these fields are actually in both datasets, I'm not sure why Hail doesn't accept that they are the same, perhaps the *order* must be the same as well.

**NOTE:** I could have used `mt.select('GT', 'alleles', 's')` (or something similar) for the same effect here. Perhaps the `select()` operator even puts the fields in the correct order, such that Hail recognizes that for example the `AD` field in both datasets are the same.

In [12]:
# Drop row fields.
fargen_mt = fargen_mt.drop('MIN_DP', 'PGT', 'PID', 'PP', 'PS', 'RGQ', 'SB', 'AD', 'DP', 'GQ', 'PL')
kg_mt = kg_mt.drop('AD', 'DP', 'GQ', 'PL')

# Drop column fields.
fargen_mt = fargen_mt.drop('sample_qc', 'high_hom_het')

## Merge datasets

In [13]:
mt = fargen_mt.union_cols(kg_mt)

In [14]:
n_variants, n_samples = mt.count()
print('Number of variants: ' + str(n_variants))
print('Number of samples: ' + str(n_samples))

2021-03-10 14:48:11 Hail: INFO: Ordering unsorted dataset with network shuffle


Number of variants: 1930
Number of samples: 758


## Population filters

Remove variants failing HWE test with $p > 10^{-6}$. Also discard multi-allelic sites.

In [15]:
mt = mt.filter_rows(hl.len(mt.alleles) == 2)
mt = mt.annotate_rows(hwe=hl.agg.hardy_weinberg_test(mt.GT))
mt = mt.filter_rows(mt.hwe.p_value > 1e-6)

In [16]:
n_variants, n_samples = mt.count()
print('Number of variants: ' + str(n_variants))
print('Number of samples: ' + str(n_samples))

2021-03-10 14:48:15 Hail: INFO: Ordering unsorted dataset with network shuffle


Number of variants: 1793
Number of samples: 758


Remove variants with minor allele frequency under 0.01.

In [17]:
maf_filter = 0.01
mt = mt.filter_rows(mt.info.AF[0] > maf_filter)
mt = mt.filter_rows(mt.info.AF[0] < (1 - maf_filter))

In [18]:
n_variants, n_samples = mt.count()
print('Number of variants: ' + str(n_variants))
print('Number of samples: ' + str(n_samples))

2021-03-10 14:48:23 Hail: INFO: Ordering unsorted dataset with network shuffle


Number of variants: 1585
Number of samples: 758


## LD pruning

In [19]:
pruned_variant_table = hl.ld_prune(mt.GT, r2=0.2, bp_window_size=500000)
mt = mt.filter_rows(hl.is_defined(pruned_variant_table[mt.row_key]))

2021-03-10 14:48:30 Hail: INFO: ld_prune: running local pruning stage with max queue size of 279621 variants
2021-03-10 14:48:30 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-03-10 14:48:38 Hail: INFO: wrote table with 1528 rows in 111 partitions to /tmp/4CN2YUxa8ruQhhqTaMPoex
    Total size: 63.63 KiB
    * Rows: 63.62 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  35 rows (1.31 KiB)
2021-03-10 14:48:39 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-03-10 14:50:08 Hail: INFO: Wrote all 1 blocks of 1528 x 758 matrix with block size 4096.
2021-03-10 14:50:15 Hail: INFO: wrote table with 0 rows in 1 partition to /tmp/9L1dijnHHJrR2C9QYP9Cy5
    Total size: 21.06 KiB
    * Rows: 21.00 B
    * Globals: 21.04 KiB
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  0 rows (21.00 B)


In [20]:
n_variants, n_samples = mt.count()
print('Number of variants: ' + str(n_variants))
print('Number of samples: ' + str(n_samples))

2021-03-10 14:50:17 Hail: INFO: Ordering unsorted dataset with network shuffle


Number of variants: 1528
Number of samples: 758


## PCA

In [21]:
eigenvalues, scores, loadings = hl.hwe_normalized_pca(mt.GT, k=2)

2021-03-10 14:50:25 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-03-10 14:50:31 Hail: INFO: hwe_normalized_pca: running PCA using 1528 variants.
2021-03-10 14:50:32 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-03-10 14:50:38 Hail: INFO: pca: running PCA with 2 components...


In [22]:
mt = mt.annotate_cols(scores = scores[mt.s].scores)

In [23]:
p = hl.plot.scatter(mt.scores[0],
                    mt.scores[1],
                    hover_fields={'Sample': mt.s},
                    title='PCA', xlabel='PC1', ylabel='PC2')
p.plot_width = 800
p.plot_height = 600
show(p)