# Site-Frequency Spectrum

In [1]:
import hail as hl
# Start Hail with 100 GB of Spark driver memory and a dedicated temporary directory.
spark_settings = {'spark.driver.memory': '100g'}
hl.init(spark_conf=spark_settings, tmp_dir='/home/olavur/tmp')

2021-10-07 12:27:31 WARN  NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2021-10-07 12:27:32 WARN  Hail:37 - This Hail JAR was compiled for Spark 2.4.5, running with Spark 2.4.1.
  Compatibility is not guaranteed.
2021-10-07 12:27:32 WARN  Utils:66 - Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-6676655f87-9xllv:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/qc/hail-20211007-1227-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
# Configure Bokeh so that show() renders figures inline in the notebook.
output_notebook()

## Read data

In [3]:
# Root directory of the experiment; input data paths are built relative to it.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the matrix table of high-quality variants from the experiment directory.
mt_path = BASE_DIR + '/data/mt/high_quality_variants.mt/'
mt = hl.read_matrix_table(mt_path)

In [6]:
# Matrix-table dimensions: number of rows (variants) and columns (samples).
n_variants, n_samples = mt.count()
summary = 'Number of variants: {n}\nNumber of samples: {m}'.format(n=n_variants, m=n_samples)
print(summary)

Number of variants: 148305
Number of samples: 469


## Folded Site Frequency Spectrum (FFS)

Let $f_i$ be the *site frequency* of bin $i$, where bin $i$ corresponds to some range of *allele frequencies* and the $n$ bins are numbered $1, \dots, n$. We are going to compute the folded frequencies $f^*_i = f_i + f_{n-i+1}$ for $i = 1, \dots, n/2$.

Example: if we have 10 bins, then $f^*_1 = f_1 + f_{10}$.

The function defined below computes the FFS.

In [7]:
def ffs(mt, ac_exprs, n_alleles):
    """Compute the folded site-frequency spectrum (FFS) over the rows of ``mt``.

    The values of ``ac_exprs`` are histogrammed into ``n_alleles`` equal-width
    bins spanning 0 to ``n_alleles``, so every bin is one allele count wide.
    The bin counts are normalised to site frequencies, and the spectrum is
    folded by adding bin ``i`` to bin ``n_alleles - 1 - i`` (0-indexed).

    Parameters
    ----------
    mt
        Hail matrix table whose rows are aggregated.
    ac_exprs
        Row-indexed numeric expression giving the allele count of each site.
    n_alleles : int
        Number of histogram bins and upper edge of the histogram. Expected to
        be even, so that both halves of the spectrum have the same length.

    Returns
    -------
    hl.Table
        One row per folded bin with fields ``ac`` (int32, lower edge of the
        bin) and ``ff`` (float32, folded site frequency). The bin for an
        allele count of 0 is left out.
    """
    hist_struct = mt.aggregate_rows(hl.agg.hist(ac_exprs, 0, n_alleles, n_alleles))

    # The lower edge of each bin serves as the allele count label of that bin.
    allele_counts = [int(edge) for edge in hist_struct.bin_edges]

    # The site count is the number of sites observed in each bin;
    # dividing by the total turns the counts into site frequencies.
    site_counts = np.array(hist_struct.bin_freq)
    site_freq = site_counts / site_counts.sum()

    # Fold: the lower half plus the reversed upper half of the spectrum.
    half = n_alleles // 2
    folded_site_freq = site_freq[:half] + site_freq[:half-1:-1]

    # One record per folded bin. zip() stops at the shorter sequence, so only
    # the first `half` allele counts are used. The bin for 0 alternate alleles
    # is skipped; the test is on the per-bin count `ac`, not on the whole list.
    # float() converts NumPy scalars to plain Python floats for hl.literal.
    ffs_rows = [
        {'ac': ac, 'ff': float(ff)}
        for ac, ff in zip(allele_counts, folded_site_freq)
        if ac != 0
    ]

    # Make a table where each row is a bin, holding the allele count and the folded site-frequency.
    return hl.Table.parallelize(hl.literal(ffs_rows, 'array<struct{ac:int32,ff:float32}>'))

In [8]:
# Every sample contributes two alleles per site, so a site carries 2 * n_samples alleles.
n_samples = mt.count_cols()
n_alleles = n_samples * 2

# Restrict to autosomal and pseudoautosomal loci; other regions will not have 2 * n_samples alleles.
in_auto_or_par = mt.locus.in_autosome_or_par()
auto_mt = mt.filter_rows(in_auto_or_par)

# Entry 1 of variant_qc.AC is used as the per-site allele count (presumably the first alternate allele).
alt_allele_count = auto_mt.variant_qc.AC[1]
ffs_ht = ffs(auto_mt, alt_allele_count, n_alleles)



In [9]:
# Scatter plot of the folded site frequency against the allele count.
axis_labels = {'xlabel': 'Allele counts', 'ylabel': 'Frequency in population'}
p = hl.plot.scatter(ffs_ht.ac, ffs_ht.ff, **axis_labels)

# Wide, short figure: 800 x 400 pixels.
p.plot_width, p.plot_height = 800, 400
show(p)