# Principal Component Analysis

In [1]:
import hail as hl
# Start Hail with a 10 GB Spark driver and an explicit temporary directory.
spark_settings = {'spark.driver.memory': '10g'}
hl.init(spark_conf=spark_settings, tmp_dir='/home/olavur/tmp')

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-848846b477-48ks9:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/qc/hail-20210512-1305-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
# Direct Bokeh output to the notebook so that show() renders figures inline.
output_notebook()

In [36]:
import pandas as pd

## Load FarGen data

Use LD-pruned, diallelic sites.

In [3]:
# Root of the experiment directory; every data path in this notebook is built from it.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'

# Matrix table of LD-pruned, diallelic sites used as input to the PCA.
mt_path = BASE_DIR + '/data/mt/ld_pruned_diallelic_common.mt'
mt = hl.read_matrix_table(mt_path)

In [4]:
# count() yields (number of rows, number of columns), i.e. (variants, samples).
variant_sample_counts = mt.count()
n_variants = variant_sample_counts[0]
n_samples = variant_sample_counts[1]
print('Number of variants: ' + str(n_variants))
print('Number of samples: ' + str(n_samples))

Number of variants: 134452
Number of samples: 474


## Annotate birthplace region

In [7]:
# Lookup table from RIN to sample name, keyed by RIN so it can be joined on below.
rin_samplename_path = BASE_DIR + '/data/metadata/birthplace/fargen_rin_samplename.csv'
fargen_rin_ht = hl.import_table(rin_samplename_path, delimiter=',').key_by('rin')

2021-05-12 13:08:22 Hail: INFO: Reading table without type imputation
  Loading field 'rin' as type str (not specified)
  Loading field 'sample' as type str (not specified)


In [8]:
# Confirm the schema: 'rin' and 'sample' are both strings and the table is keyed by 'rin'.
fargen_rin_ht.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'rin': str 
    'sample': str 
----------------------------------------
Key: ['rin']
----------------------------------------


In [28]:
# Table mapping each individual to a birthplace region code.
rin_region_path = BASE_DIR + '/data/metadata/birthplace/rin_region.csv'
rin_region_raw_ht = hl.import_table(rin_region_path, delimiter=',')

# transmute drops the source fields: "ind" is renamed to "rin", and "region"
# (imported as a string) is converted to a float64 field called "birthplace".
# The result is keyed by "rin".
rin_birthplace_ht = rin_region_raw_ht.transmute(
    rin=rin_region_raw_ht.ind,
    birthplace=hl.float64(rin_region_raw_ht.region),
).key_by('rin')

2021-05-12 13:19:27 Hail: INFO: Reading table without type imputation
  Loading field 'ind' as type str (not specified)
  Loading field 'region' as type str (not specified)


In [29]:
# Confirm that 'ind' became 'rin', 'region' became a float64 'birthplace', and the key is 'rin'.
rin_birthplace_ht.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'rin': str 
    'birthplace': float64 
----------------------------------------
Key: ['rin']
----------------------------------------


In [30]:
# Annotate the table with the birthplace by the samplenames.
samplename_birthplace_ht = rin_birthplace_ht.annotate(samplename=fargen_rin_ht[rin_birthplace_ht.rin].sample)
samplename_birthplace_ht = samplename_birthplace_ht.key_by(samplename_birthplace_ht.samplename)

In [31]:
# Confirm the table is now keyed by 'samplename' and still carries 'rin' and 'birthplace'.
samplename_birthplace_ht.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'rin': str 
    'birthplace': float64 
    'samplename': str 
----------------------------------------
Key: ['samplename']
----------------------------------------


In [34]:
# Attach each sample's birthplace region to the matrix table columns by joining
# on the sample ID. No earlier visible cell performs this annotation, so on a
# fresh kernel `mt.birthplace` would not exist when the plots below reference it.
# Samples without an entry in the birthplace table get a missing value.
mt = mt.annotate_cols(birthplace=samplename_birthplace_ht[mt.s].birthplace)

# Distribution of samples across birthplace regions.
p = hl.plot.histogram(mt.birthplace, title='Birthplace region')
show(p)

2021-05-12 13:19:52 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-05-12 13:19:52 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-05-12 13:19:54 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-05-12 13:19:54 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-05-12 13:19:57 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-05-12 13:19:57 Hail: INFO: Ordering unsorted dataset with network shuffle


## Compute PCA

In [61]:
# Number of principal components to compute; the scatter plot below uses PC1 and PC2.
N_PCS = 2

# PCA on HWE-normalized genotypes.
# NOTE(review): loadings are not requested here (compute_loadings is left at its
# default), so `loadings` is presumably None -- confirm against the Hail docs.
eigenvalues, scores, loadings = hl.hwe_normalized_pca(mt.GT, k=N_PCS)

2021-05-12 13:28:23 Hail: INFO: hwe_normalized_pca: running PCA using 134452 variants.
2021-05-12 13:28:25 Hail: INFO: pca: running PCA with 2 components...


In [62]:
# Join the per-sample PC scores onto the matrix table columns by sample ID.
pc_scores_by_sample = scores[mt.s].scores
mt = mt.annotate_cols(scores=pc_scores_by_sample)

In [64]:
# Scatter PC1 against PC2, with points labelled by birthplace region.
# The region is cast to a string before being passed as the label.
pc1 = mt.scores[0]
pc2 = mt.scores[1]
region_label = hl.str(mt.birthplace)

p = hl.plot.scatter(pc1, pc2, label=region_label, title='PCA', xlabel='PC1', ylabel='PC2')
p.plot_width = 800
p.plot_height = 600
show(p)

2021-05-12 13:29:48 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-05-12 13:29:48 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-05-12 13:29:49 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-05-12 13:29:49 Hail: INFO: Ordering unsorted dataset with network shuffle
