# Allele frequencies in FarGen and gnomAD data

In [1]:
import hail as hl
# Start Hail, passing Spark a 10g driver-memory setting and a dedicated local scratch directory.
# NOTE(review): the scratch directory is an absolute, user-specific path — adjust it when running elsewhere.
hl.init(spark_conf={'spark.driver.memory': '10g', 'spark.local.dir': '/home/olavur/tmp'})

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-7889d4ff4c-6wxtc:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/gnomad_exome_sites/hail-20210318-0820-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
# Send Bokeh output to the notebook so that show() renders figures inline.
output_notebook()

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Root directory of the experiment; the matrix-table path used below is built from it.
# NOTE(review): absolute, user-specific path — adjust it when running elsewhere.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'

## Load FarGen data annotated with gnomAD data

In [5]:
# Read the FarGen matrix table stored under BASE_DIR.
# Presumably this table already carries the `gnomad` row annotation used below — confirm against the upstream pipeline.
fargen_mt = hl.read_matrix_table(BASE_DIR + '/data/mt/hq_gnomad_annotated.mt')

Annotate the FarGen data with gnomAD concordance.

In [6]:
# Add a boolean row field: True where the row's `gnomad` annotation is defined, False where it is missing.
fargen_mt = fargen_mt.annotate_rows(in_gnomad=hl.is_defined(fargen_mt.gnomad))

Count number of FarGen sites that are and are not present in gnomAD.

In [7]:
# Tally the rows by the value of their in_gnomad flag.
fargen_mt.aggregate_rows(hl.agg.counter(fargen_mt.in_gnomad))

{False: 1137429, True: 194584}

## Site-frequency spectrum

Let $f_i$ be the *site frequency* of bin $i$ ($i = 1, \dots, n$), i.e. the fraction of sites whose *allele frequency* falls in the range covered by bin $i$. We are going to compute the folded frequencies $f^*_i = f_i + f_{n-i+1}$ for $i = 1, \dots, n/2$.

Example: if we have 10 bins, then $f^*_1 = f_1 + f_{10}$.

Calculate a histogram of allele frequencies.

In [8]:
n_bins = 100

# Folding pairs each bin with a mirror bin, so the bin count has to be even.
assert n_bins % 2 == 0, 'Number of bins must be an even number.'

# Histogram of the first entry of info.AF over the rows, using n_bins bins between 0 and 1.
hist_struct = fargen_mt.aggregate_rows(
    hl.agg.hist(fargen_mt.info.AF[0], 0, 1, n_bins)
)

Get the allele frequencies.

In [9]:
# Allele-frequency values that delimit the histogram bins computed above.
# NOTE(review): per the Hail docs there is one more edge than there are bins — confirm if the exact length matters downstream.
allele_freq = hist_struct.bin_edges

Get site frequencies.

In [10]:
# Per-bin site counts: how many sites have an allele frequency that falls into each bin.
site_counts = np.array(hist_struct.bin_freq)

# Divide by the total number of binned sites to turn the counts into site frequencies.
n_sites = site_counts.sum()
site_freq = site_counts / n_sites

Compute the folded site frequency.

In [11]:
# Add each bin of the lower half to its mirror bin in the upper half
# (first with last, second with second-to-last, and so on).
half = n_bins // 2
upper_half_reversed = site_freq[half:][::-1]
folded_site_freq = site_freq[:half] + upper_half_reversed

Make a Hail table out of the results.

In [12]:
# One record per folded bin: an allele-frequency bin edge and the folded site frequency.
# zip() stops at the shorter input, so only as many edges as there are folded bins are used.
ffs_table = [
    {'af': edge, 'ff': folded}
    for edge, folded in zip(allele_freq, folded_site_freq)
]

# Turn the records into a Hail table: one row per bin with fields `af` and `ff`.
ht_ffs = hl.Table.parallelize(
    hl.literal(ffs_table, 'array<struct{af:float32,ff:float32}>')
)

Plot the folded site-frequency spectrum (FFS).

In [13]:
# x holds allele-frequency bin edges (not allele counts) and y holds the folded
# fraction of sites per bin, so the axes are labelled accordingly.
p = hl.plot.scatter(ht_ffs.af, ht_ffs.ff,
                    xlabel='Minor allele frequency', ylabel='Fraction of sites',
                    title='Site frequency spectrum (folded)',
                    collect_all=True)
p.plot_width = 800
p.plot_height = 400
show(p)

### FFS function

Write the code above in a function so we can re-use it.

In [14]:
def ffs(n_bins, mt, af_exprs):
    """Compute the folded site-frequency spectrum (FFS) over the rows of ``mt``.

    The allele frequencies are histogrammed into ``n_bins`` bins between 0 and 1.
    Each bin count is divided by the total number of binned sites, and bin ``i``
    (0-based) is then added to its mirror bin ``n_bins - 1 - i``.

    Args:
        n_bins: Number of histogram bins; must be even so that the bins pair up.
        mt: Hail MatrixTable whose rows are aggregated.
        af_exprs: Numeric row expression on ``mt`` giving the allele frequency
            of each row, e.g. ``mt.info.AF[0]``.

    Returns:
        A Hail Table with ``n_bins // 2`` rows and two float32 fields: ``af``
        (the first ``n_bins // 2`` histogram bin edges) and ``ff`` (the folded
        site frequency of the corresponding bin).

    Raises:
        AssertionError: If ``n_bins`` is odd.
        ValueError: If the histogram contains no sites, in which case the site
            frequencies are undefined.
    """
    # Make sure the number of bins is an even number.
    assert n_bins % 2 == 0, 'Number of bins must be an even number.'
    hist_struct = mt.aggregate_rows(hl.agg.hist(af_exprs, 0, 1, n_bins))

    # The site count is the number of sites whose allele frequency falls in each bin.
    site_counts = np.array(hist_struct.bin_freq)

    # Fail loudly instead of silently filling the table with NaN (0 / 0).
    n_sites = site_counts.sum()
    if n_sites == 0:
        raise ValueError('The allele-frequency histogram contains no sites; '
                         'cannot compute site frequencies.')
    site_freq = site_counts / n_sites

    # Fold: add the lower half to the reversed upper half.
    half = n_bins // 2
    folded_site_freq = site_freq[:half] + site_freq[half:][::-1]

    # Keep exactly one bin edge per folded bin (there are more edges than folded bins).
    allele_freq = hist_struct.bin_edges[:half]

    # Plain Python floats keep the literal independent of NumPy scalar types.
    ffs_table = [
        {'af': float(af), 'ff': float(ff)}
        for af, ff in zip(allele_freq, folded_site_freq)
    ]

    # Each row is a bin holding the allele frequency and the folded site frequency.
    ht_ffs = hl.Table.parallelize(hl.literal(ffs_table, 'array<struct{af:float32,ff:float32}>'))

    return ht_ffs

Make the same FFS as above just to check that it works.

In [15]:
# Same inputs as the step-by-step computation above: 100 bins over the first entry of info.AF.
ffs_ht = ffs(100, fargen_mt, fargen_mt.info.AF[0])

In [16]:
# x holds allele-frequency bin edges (not allele counts) and y holds the folded
# fraction of sites per bin, so the axes are labelled accordingly.
p = hl.plot.scatter(ffs_ht.af, ffs_ht.ff,
                    xlabel='Minor allele frequency', ylabel='Fraction of sites',
                    title='Site frequency spectrum (folded)',
                    collect_all=True)
p.plot_width = 800
p.plot_height = 400
show(p)

### FFS stratified by gnomAD concordance

Calculate the FFS with 100 bins for each of three strata:

* All variants
* Variants present in gnomAD
* Variants *not* present in gnomAD

In [17]:
# For every stratum: compute the 100-bin FFS and tag each row of the result with the stratum name.

# Stratum 1: every variant.
all_ffs_ht = ffs(100, fargen_mt, fargen_mt.info.AF[0]).annotate(strata='all')

# Stratum 2: only the variants that are present in gnomAD.
fargen_gnomad_mt = fargen_mt.filter_rows(fargen_mt.in_gnomad)
gnomad_ffs_ht = ffs(
    100, fargen_gnomad_mt, fargen_gnomad_mt.info.AF[0]
).annotate(strata='gnomad')

# Stratum 3: only the variants that are absent from gnomAD.
fargen_non_gnomad_mt = fargen_mt.filter_rows(~fargen_mt.in_gnomad)
non_gnomad_ffs_ht = ffs(
    100, fargen_non_gnomad_mt, fargen_non_gnomad_mt.info.AF[0]
).annotate(strata='non_gnomad')

Make a union of all three FFS tables, so that we can plot them together.

In [18]:
# Stack the three per-stratum tables into one; the `strata` field tells their rows apart.
# Note: this rebinds ffs_ht, which previously held the unstratified FFS.
ffs_ht = all_ffs_ht.union(gnomad_ffs_ht).union(non_gnomad_ffs_ht)

In [19]:
# x holds allele-frequency bin edges (not allele counts) and y holds the folded
# fraction of sites per bin, so the axes are labelled accordingly.
# Points are grouped by stratum through the `label` argument.
p = hl.plot.scatter(ffs_ht.af, ffs_ht.ff, label=ffs_ht.strata,
                    xlabel='Minor allele frequency', ylabel='Fraction of sites',
                    title='Site frequency spectrum (folded)',
                    collect_all=True)
p.plot_width = 800
p.plot_height = 400
show(p)

In [20]:
# Same stratified plot as the linear one, but with a logarithmic y scale; the title
# says so to keep the two figures distinguishable.
# x holds allele-frequency bin edges (not allele counts) and y holds the folded
# fraction of sites per bin, so the axes are labelled accordingly.
p = hl.plot.scatter(ffs_ht.af, ffs_ht.ff, label=ffs_ht.strata,
                    xlabel='Minor allele frequency', ylabel='Fraction of sites',
                    title='Site frequency spectrum (folded, log scale)',
                    collect_all=True)
p.plot_width = 800
p.plot_height = 400
# NOTE(review): a bin whose folded frequency is exactly 0 has no position on a log axis.
p.y_scale = LogScale()
show(p)