# Folded Site-Frequency Spectrum

In [1]:
import hail as hl
# Start Hail with a 100 GB Spark driver heap and a dedicated temporary directory.
hl.init(
    spark_conf={'spark.driver.memory': '100g'},
    tmp_dir='/home/olavur/tmp',
)

2021-10-20 11:43:17 WARN  NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2021-10-20 11:43:18 WARN  Hail:37 - This Hail JAR was compiled for Spark 2.4.5, running with Spark 2.4.1.
  Compatibility is not guaranteed.
2021-10-20 11:43:18 WARN  Utils:66 - Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-6676655f87-9xllv:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/main/hail-20211020-1143-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
# Configure Bokeh so that `show(...)` renders figures inline in the notebook.
output_notebook()

## Read data

In [3]:
# Root directory of the experiment; data paths below are built relative to it.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'

In [4]:
import pandas as pd
import numpy as np

In [28]:
# Load the variant matrix table (variants as rows, samples as columns).
mt = hl.read_matrix_table(BASE_DIR + '/data/mt/variants.mt/')

In [29]:
# `count()` returns (number of rows, number of columns) = (variants, samples).
n_variants, n_samples = mt.count()
count_report = 'Number of variants: {n}\nNumber of samples: {m}'
print(count_report.format(n=n_variants, m=n_samples))

Number of variants: 911929
Number of samples: 473


## Folded Site Frequency Spectrum (FSFS)

Let $f_i$ be the *site frequency* of bin $i$, i.e. the fraction of sites whose *allele frequency* falls in the range covered by bin $i$, for $i = 1, \dots, n$. We are going to compute the folded frequencies $f^*_i = f_i + f_{n-i+1}$ for $i = 1, \dots, n/2$.

Example: if we have 10 bins, then $f^*_1 = f_1 + f_{10}$.

The function defined below computes the FSFS.

In [6]:
def fsfs(n_bins, ht, af_exprs):
    """Compute the folded site-frequency spectrum (FSFS) of an allele-frequency expression.

    The values of ``af_exprs`` are histogrammed into ``n_bins`` equal-width bins
    on [0, 1]. Bin counts are normalised to site frequencies, and each bin in
    the lower half is added to its mirror bin in the upper half (first with
    last, second with second-to-last, ...), giving ``n_bins / 2`` folded bins.

    Parameters
    ----------
    n_bins : int
        Number of histogram bins before folding. Must be even.
    ht : hail Table
        Table that ``af_exprs`` is derived from and that is aggregated over.
    af_exprs : hail numeric expression
        Allele frequency of each row of ``ht``.

    Returns
    -------
    hail Table
        One row per folded bin with fields ``af`` (lower edge of the bin, as an
        allele frequency) and ``ff`` (folded site frequency of the bin). If the
        histogram contains no sites, every ``ff`` is 0 instead of NaN.

    Raises
    ------
    AssertionError
        If ``n_bins`` is odd.
    """
    # Folding pairs every lower-half bin with an upper-half bin, so the bin count must be even.
    assert n_bins % 2 == 0, 'Number of bins must be an even number.'
    hist_struct = ht.aggregate(hl.agg.hist(af_exprs, 0, 1, n_bins))

    half = n_bins // 2

    # `bin_edges` has one more entry than there are bins; each folded bin is
    # labelled with the lower edge of its lower-half bin.
    bin_lower_edges = hist_struct.bin_edges[:half]

    # The site count is the number of sites whose allele frequency falls in each bin.
    site_counts = np.asarray(hist_struct.bin_freq, dtype=float)

    # Normalise to site frequencies. An empty histogram (e.g. a stratum without
    # any variants) would otherwise produce 0/0 = NaN and a NumPy warning.
    n_sites = site_counts.sum()
    if n_sites > 0:
        site_freq = site_counts / n_sites
    else:
        site_freq = np.zeros_like(site_counts)

    # Fold: add the reversed upper half onto the lower half.
    folded_site_freq = site_freq[:half] + site_freq[::-1][:half]

    # One record per folded bin; plain Python floats keep the literal independent of NumPy scalar types.
    fsfs_rows = [
        {'af': float(bin_af), 'ff': float(bin_ff)}
        for bin_af, bin_ff in zip(bin_lower_edges, folded_site_freq)
    ]

    # Make a table where each row is a bin, holding the allele frequency and the folded site frequency.
    ht_fsfs = hl.Table.parallelize(hl.literal(fsfs_rows, 'array<struct{af:float32,ff:float32}>'))

    return ht_fsfs

In [30]:
# Add per-variant QC statistics; the `variant_qc` row field is used below.
mt = hl.variant_qc(mt)

# Work on the row (variant) table only.
rows_ht = mt.rows()

# FSFS over all variants, using the AF entry at index 1
# (presumably the first alternate allele — confirm for multi-allelic sites).
fsfs_ht = fsfs(n_bins=50, ht=rows_ht, af_exprs=rows_ht.variant_qc.AF[1])



In [31]:
# Scatter plot of the folded spectrum: x is the allele-frequency bin (`af`),
# y is the folded site frequency of that bin (`ff`).
# The x-axis label previously read 'Allele counts', but the plotted values are frequencies.
p = hl.plot.scatter(fsfs_ht.af, fsfs_ht.ff,
                    xlabel='Allele frequency', ylabel='Frequency in population')
p.plot_width = 800
p.plot_height = 400
show(p)

## FSFS stratified by variant effect

In [32]:
# Get the variant effect and impact from the ANN annotation strings.
# Each ANN entry is split on the literal '|' character; fields 1 and 2 are kept
# as effect and impact respectively (presumably the SnpEff ANN layout — confirm).
# The delimiter is a regular expression, hence the escaped pipe; the raw string
# avoids Python's invalid-escape-sequence warning that the plain '\|' literal triggers.
rows_ht = rows_ht.annotate(effect=rows_ht.info.ANN.map(lambda x: x.split(r'\|')[1]),
                           impact=rows_ht.info.ANN.map(lambda x: x.split(r'\|')[2]))
# Use only the first ANN entry (transcript); its value is split on '&' into a list of terms.
rows_ht = rows_ht.annotate(impact1=rows_ht.impact[0].split('&'), effect1=rows_ht.effect[0].split('&'))

In [35]:
n_bins = 50

# Compute one FSFS per variant effect and stack them into a single table,
# tagging every row with the effect it belongs to in the `strata` field.
fsfs_ht = None
for effect in ['synonymous_variant', 'missense_variant', 'intron_variant']:
    # Keep the variants whose first-transcript effect list contains this effect.
    effect_ht = rows_ht.filter(rows_ht.effect1.contains(effect))
    effect_fsfs_ht = fsfs(n_bins, effect_ht, effect_ht.variant_qc.AF[1]).annotate(strata=effect)

    # The first stratum starts the table; later strata are appended to it.
    if fsfs_ht is None:
        fsfs_ht = effect_fsfs_ht
    else:
        fsfs_ht = fsfs_ht.union(effect_fsfs_ht)



In [36]:
# Scatter plot of the folded spectra, one colour per variant effect (`strata`).
# x is the allele-frequency bin (`af`), y the folded site frequency (`ff`).
# The x-axis label previously read 'Allele counts', but the plotted values are frequencies.
p = hl.plot.scatter(fsfs_ht.af, fsfs_ht.ff, label=fsfs_ht.strata,
                    xlabel='Allele frequency', ylabel='Frequency in population',
                    collect_all=True)
p.plot_width = 800
p.plot_height = 400
# Logarithmic y-axis.
p.y_scale = LogScale()
show(p)

## FSFS stratified by variant impact

In [37]:
n_bins = 50

# Compute one FSFS per variant impact class and stack them into a single table,
# tagging every row with the impact it belongs to in the `strata` field.
fsfs_ht = None
for impact in ['HIGH', 'MODERATE', 'MODIFIER', 'LOW']:
    # Keep the variants whose first-transcript impact list contains this impact.
    impact_ht = rows_ht.filter(rows_ht.impact1.contains(impact))
    impact_fsfs_ht = fsfs(n_bins, impact_ht, impact_ht.variant_qc.AF[1]).annotate(strata=impact)

    # The first stratum starts the table; later strata are appended to it.
    if fsfs_ht is None:
        fsfs_ht = impact_fsfs_ht
    else:
        fsfs_ht = fsfs_ht.union(impact_fsfs_ht)



In [38]:
# Scatter plot of the folded spectra, one colour per variant impact class (`strata`).
# x is the allele-frequency bin (`af`), y the folded site frequency (`ff`).
# The x-axis label previously read 'Allele counts', but the plotted values are frequencies.
p = hl.plot.scatter(fsfs_ht.af, fsfs_ht.ff, label=fsfs_ht.strata,
                    xlabel='Allele frequency', ylabel='Frequency in population',
                    collect_all=True)
p.plot_width = 800
p.plot_height = 400
# Logarithmic y-axis.
p.y_scale = LogScale()
show(p)