# PCA of allele frequency

In [1]:
import hail as hl
# Start Hail with explicit Spark settings: driver memory and the directory Spark uses for
# local scratch files.
spark_settings = {
    'spark.driver.memory': '10g',
    'spark.local.dir': '/home/olavur/tmp',
}
hl.init(spark_conf=spark_settings)

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-7889d4ff4c-6wxtc:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/gnomad_exome_sites/hail-20210409-1236-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
# Direct Bokeh output to the notebook so that `show(...)` renders plots inline.
output_notebook()

In [2]:
# Root directory of the experiment; the data paths read in this notebook are built from it.
# NOTE(review): this is an absolute, machine-specific path — adjust it when running elsewhere.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'

## Load gnomAD exome sites data

In [4]:
# Open the gnomAD exome sites Hail table (r2.1.1, GRCh38) stored under BASE_DIR.
gnomad_ht = hl.read_table(
    f'{BASE_DIR}/data/resources/gnomAD/gnomad_exome_sites/gnomad.exomes.r2.1.1.sites.GRCh38.ht'
)

In [5]:
# Count the rows (variants) of the gnomAD sites table and report the total.
n_variants = gnomad_ht.count()
print(f'Number of variants: {n_variants}')

Number of variants: 17204631


## Populations to use

In [5]:
# Bring the table's `freq_index_dict` into Python: `.collect()` returns a list, of which the
# first element is kept. The result is used below to translate group names (e.g.
# 'gnomad_afr') into positions in `freq`.
# NOTE(review): presumably `freq_index_dict` is a global field of the table — confirm against
# the table schema.
gnomad_freq_index_dict = gnomad_ht.freq_index_dict.collect()[0]

In [6]:
# gnomAD population keys (as they appear in the frequency index dictionary) paired with
# more intuitive names. Keeping each key next to its label prevents the two lists from
# drifting out of step.
pop_labels = [
    ('gnomad_afr', 'African'),
    ('gnomad_sas', 'South Asian'),  # 'sas' is gnomAD's South Asian group, not South-East Asian
    ('gnomad_amr', 'American'),
    ('gnomad_eas', 'East Asian'),
    ('gnomad_nfe', 'Non-Finnish European'),
    ('gnomad_fin', 'Finnish'),
    ('gnomad_nfe_nwe', 'North-Western European'),
    ('gnomad_nfe_seu', 'Southern European'),
]

# Names of the populations in the index dictionary.
pop_list = [key for key, name in pop_labels]

# Get the indexes of the populations.
pop_index_list = [gnomad_freq_index_dict[pop] for pop in pop_list]

# More intuitive names for the populations.
pop_name_list = [name for key, name in pop_labels]

# Readable population name -> index into the `freq` array.
pop_dict = dict(zip(pop_name_list, pop_index_list))

In [8]:
# Display the mapping from readable population name to index in the `freq` array.
pop_dict

{'African': 6,
 'South-East Asian': 8,
 'American': 9,
 'East Asian': 7,
 'Non-Finnish European': 2,
 'Finnish': 3,
 'North-Western European': 34,
 'Southern European': 33}

Annotate the table with one column for each population allele frequency.

In [9]:
#list(filter(lambda x: x[0][:6] == 'gnomad', gnomad_freq_index_dict.items()))

## PCA with gnomAD only

First we need to make a matrix with the populations as columns. In order to do that we need to use the `to_matrix_table_row_major()` method, and to use this method we need to annotate each row with the allele frequency in each population.

In [7]:
# Add one row field per population in `pop_list`, named after the population key and holding
# that population's allele frequency (the entry of `freq.AF` at the population's index).
per_pop_af = {
    pop: gnomad_ht.freq.AF[gnomad_freq_index_dict[pop]]
    for pop in pop_list
}
gnomad_ht = gnomad_ht.annotate(**per_pop_af)

Now we can convert this to a matrix, where the entries are keyed by locus, alleles and populations.

In [8]:
# Pivot the per-population row fields into matrix columns: one column per population
# (column field 'pop'), with the allele frequency stored in the entry field 'AF'.
af_mt = gnomad_ht.to_matrix_table_row_major(
    columns=pop_list,
    entry_field_name='AF',
    col_field_name='pop',
)

In [13]:
# Strip the matrix table down to its keys: no non-key row fields and no global fields remain
# (see the schema printed by `describe()` further down).
af_mt = af_mt.select_rows().select_globals()

In [None]:
# Show the `AF` entry expression of the matrix table.
af_mt.AF

In [14]:
# Print the schema (global, column, row and entry fields, plus keys) of the matrix table.
af_mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    'pop': str
----------------------------------------
Row fields:
    'locus': locus<GRCh38>
    'alleles': array<str>
----------------------------------------
Entry fields:
    'AF': float64
----------------------------------------
Column key: ['pop']
Row key: ['locus', 'alleles']
----------------------------------------


We shall filter out variants that have a minor allele frequency below 0.01 in the *total population*.

In [12]:
# Minimum minor allele frequency a variant must have in the total gnomAD population.
maf_filter = 0.01

# `af_mt` only carries its row key at this point (the earlier `select_rows()` dropped
# `freq`), so the overall allele frequency is looked up in the gnomAD sites table by
# row key (locus, alleles) instead of being read from `af_mt.freq`.
total_af = gnomad_ht[af_mt.row_key].freq.AF[gnomad_freq_index_dict['gnomad']]

# Keep variants whose overall AF lies strictly between maf_filter and 1 - maf_filter,
# i.e. whose minor allele frequency exceeds the threshold.
af_mt = af_mt.filter_rows((total_af > maf_filter) & (total_af < (1 - maf_filter)))

In [13]:
# `count()` on a matrix table yields (number of rows, number of columns); report both.
n_variants, n_samples = af_mt.count()
for label, total in (('variants', n_variants), ('samples', n_samples)):
    print(f'Number of {label}: {total}')

Number of variants: 183627
Number of samples: 8


**FIXME:** LD pruning still needs to be done. LD information is available for download; alternatively, LD could be calculated from HGDP + 1kG.

**TODO:** Take variants that are >10cM or >50cM apart.

In [14]:
# Run the PCA on the allele frequencies themselves. `hl.is_defined(af_mt.AF)` only encodes
# whether an AF value is present (True/False), so it would decompose missingness rather
# than allele frequency.
# `hl.pca` does not center the data and rejects missing entries, so each variant is centered
# on its mean AF across populations, and missing entries are set to 0 after centering
# (equivalent to imputing the variant's mean AF).
af_centered_mt = af_mt.annotate_rows(mean_AF=hl.agg.mean(af_mt.AF))
centered_af = hl.or_else(af_centered_mt.AF - af_centered_mt.mean_AF, 0.0)

# Two principal components; `scores` is keyed by the column key (`pop`).
eigenvalues, scores, loadings = hl.pca(centered_af, k=2)

2021-03-18 08:29:15 Hail: INFO: pca: running PCA with 2 components...


In [15]:
# Look up each population's row in the PCA scores table and store its score array as the
# column field `scores`.
scores_by_pop = scores[af_mt.pop]
af_mt = af_mt.annotate_cols(scores=scores_by_pop.scores)

In [16]:
# Scatter plot of the first two principal components, one point per population, labelled
# and hover-annotated with the population key.
pc1 = af_mt.scores[0]
pc2 = af_mt.scores[1]
p = hl.plot.scatter(
    pc1,
    pc2,
    label=af_mt.pop,
    hover_fields={'Sample': af_mt.pop},
    title='PCA',
    xlabel='PC1',
    ylabel='PC2',
)

# Resize the figure to 800 x 600 before rendering it.
p.plot_width, p.plot_height = 800, 600
show(p)

## Load FarGen data annotated with gnomAD data

In [17]:
# Open the FarGen matrix table (annotated with gnomAD data) stored under BASE_DIR.
fargen_mt = hl.read_matrix_table(f'{BASE_DIR}/data/mt/hq_gnomad_annotated.mt')