# Variant distributions

In [1]:
import hail as hl
# Initialise Hail, giving the Spark driver 100g of memory and pointing
# temporary files at a dedicated directory.
hl.init(
    spark_conf={'spark.driver.memory': '100g'},
    tmp_dir='/home/olavur/tmp',
)

2021-10-07 12:17:27 WARN  NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2021-10-07 12:17:27 WARN  Hail:37 - This Hail JAR was compiled for Spark 2.4.5, running with Spark 2.4.1.
  Compatibility is not guaranteed.


Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-6676655f87-9xllv:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/qc/hail-20211007-1217-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
# Configure Bokeh to render plots inline in the notebook output.
output_notebook()

## Read data

In [3]:
# Root directory of this analysis; the merged sites table is read from below it.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'
# Directory for shared resources (not referenced by the cells visible in this notebook).
RESOURCES_DIR = '/data/other/resources'

In [4]:
import pandas as pd
import numpy as np

Merged FarGen and gnomAD sites. This dataset only contains the population frequency of variants and SnpEff annotations. See the notebooks in the `gnomad_exome_sites` folder.

In [9]:
# Location of the merged FarGen/gnomAD sites table, relative to the analysis root.
union_sites_path = BASE_DIR + '/data/resources/gnomad_exome_sites/fargen_gnomad_union_annotated.ht'
ht = hl.read_table(union_sites_path)

In [6]:
# Number of rows (sites) in the merged table.
n_variants = ht.count()
print(f'Number of variants: {n_variants}')

Number of variants: 15692611


In [10]:
# Print the table schema: global fields, row fields and key.
ht.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'rsid': str 
    'qual': float64 
    'filters': set<str> 
    'info': struct {
        AF_gnomad_all: float64, 
        AF_afr: float64, 
        AF_sas: float64, 
        AF_amr: float64, 
        AF_eas: float64, 
        AF_nfe: float64, 
        AF_fin: float64, 
        AF_nfe_nwe: float64, 
        AF_nfe_seu: float64, 
        AC_gnomad_all: int32, 
        AC_afr: int32, 
        AC_sas: int32, 
        AC_amr: int32, 
        AC_eas: int32, 
        AC_nfe: int32, 
        AC_fin: int32, 
        AC_nfe_nwe: int32, 
        AC_nfe_seu: int32, 
        AF_fae: float64, 
        AC_fae: int32, 
        ANN: array<str>, 
        LOF: array<str>, 
        NMD: array<str>
    } 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------


## Count variants

Annotate variants with variant effect, impact and gene. These annotations are obtained from SnpEff.

If multiple transcripts overlap a site, the first in the list is arbitrarily chosen.

**NOTE:** is it possible to use [split_multi()](https://hail.is/docs/0.2/methods/genetics.html#hail.methods.split_multi) to count overlapping transcripts separately?

In [11]:
# Get variant effect, impact and gene name from the SnpEff ANN strings.
# Each ANN entry is a '|'-delimited string and there is one entry per overlapping
# transcript, so every annotation below is an array with one element per transcript.
#
# Hail's `split` interprets the delimiter as a regular expression, so the pipe must
# be escaped. A raw string is used because '\|' in a normal string literal is an
# invalid Python escape sequence (it only works by being passed through, with a warning).
ht = ht.annotate(effect=ht.info.ANN.map(lambda x: x.split(r'\|')[1]),
                 impact=ht.info.ANN.map(lambda x: x.split(r'\|')[2]),
                 gene=ht.info.ANN.map(lambda x: x.split(r'\|')[3]))
# Use only the first transcript.
ht = ht.annotate(impact1=ht.impact[0], effect1=ht.effect[0], gene1=ht.gene[0])

In [12]:
def variant_counts(ht):
    """Summarise the size and SNP/indel composition of a sites table.

    Only the first alternate allele (``alleles[1]``) is compared against the
    reference allele (``alleles[0]``).

    Returns a one-column pandas DataFrame (column name ``''``) indexed by
    ``n_variants`` (number of rows), ``snps`` and ``indels`` (both expressed
    as a fraction of the rows).
    """
    ref_allele, alt_allele = ht.alleles[0], ht.alleles[1]
    total = hl.agg.count()

    summary = ht.aggregate(hl.struct(
        n_variants=total,
        snps=hl.agg.count_where(hl.is_snp(ref_allele, alt_allele)) / total,
        indels=hl.agg.count_where(hl.is_indel(ref_allele, alt_allele)) / total))

    return pd.DataFrame(summary.values(), index=summary.keys(), columns=[''])

def impact_counts(ht):
    """Compute the fraction of rows in each SnpEff impact category.

    The category is read from the ``impact1`` row field of `ht`.

    Returns a one-column pandas DataFrame (column name ``''``) indexed by
    ``LOW``, ``MODIFIER``, ``MODERATE`` and ``HIGH``.
    """
    total = hl.agg.count()
    fractions = {
        category: hl.agg.count_where(ht.impact1 == category) / total
        for category in ('LOW', 'MODIFIER', 'MODERATE', 'HIGH')
    }

    summary = ht.aggregate(hl.struct(**fractions))

    return pd.DataFrame(summary.values(), index=summary.keys(), columns=[''])

def effect_counts(ht):
    """Compute the fraction of rows having each of a few selected variant effects.

    The effect is read from the ``effect1`` row field of `ht`. Only a handful
    of the more interesting effect types are reported; many more exist.

    Returns a one-column pandas DataFrame (column name ``''``) indexed by
    effect name.
    """
    selected_effects = (
        'synonymous_variant',
        'missense_variant',
        'frameshift_variant',
        'stop_gained',
    )
    total = hl.agg.count()
    fractions = {
        effect: hl.agg.count_where(ht.effect1 == effect) / total
        for effect in selected_effects
    }

    summary = ht.aggregate(hl.struct(**fractions))

    return pd.DataFrame(summary.values(), index=summary.keys(), columns=[''])

In [13]:
# FarGen subset: sites whose alternate allele frequency in FarGen is not zero.
fae_ht = ht.filter(ht.info.AF_fae != 0)

# Summarise the subset: variant types, then effects, then impacts.
fae_variant_counts_pd, fae_effect_counts_pd, fae_impact_counts_pd = (
    summarise(fae_ht) for summarise in (variant_counts, effect_counts, impact_counts)
)



In [14]:
# gnomAD subset: sites whose alternate allele frequency in gnomAD is not zero.
gnomad_ht = ht.filter(ht.info.AF_gnomad_all != 0)

# Summarise the subset: variant types, then effects, then impacts.
gnomad_variant_counts_pd, gnomad_effect_counts_pd, gnomad_impact_counts_pd = (
    summarise(gnomad_ht) for summarise in (variant_counts, effect_counts, impact_counts)
)



In [15]:
# Enriched subset: sites below 1% frequency in gnomAD but above 5% in FarGen.
is_rare_in_gnomad = ht.info.AF_gnomad_all < 0.01
is_common_in_fargen = ht.info.AF_fae > 0.05
enriched_ht = ht.filter(is_rare_in_gnomad & is_common_in_fargen)

# Summarise the subset: variant types, then effects, then impacts.
enriched_variant_counts_pd, enriched_effect_counts_pd, enriched_impact_counts_pd = (
    summarise(enriched_ht) for summarise in (variant_counts, effect_counts, impact_counts)
)



In [16]:
# One frame per metric family, with FarGen, gnomAD and the enriched subset
# side by side as columns.
variant_counts_pd, effect_counts_pd, impact_counts_pd = (
    pd.concat(list(frames), axis=1)
    for frames in (
        (fae_variant_counts_pd, gnomad_variant_counts_pd, enriched_variant_counts_pd),
        (fae_effect_counts_pd, gnomad_effect_counts_pd, enriched_effect_counts_pd),
        (fae_impact_counts_pd, gnomad_impact_counts_pd, enriched_impact_counts_pd),
    )
)

# Stack the metric families into one table: variants, then effects, then impacts.
counts_pd = pd.concat([variant_counts_pd, effect_counts_pd, impact_counts_pd])

# Label the columns by dataset.
counts_pd.columns = ['FarGen', 'gnomAD', 'Enriched']

In [17]:
def _format_float(value):
    """Render a float in general ('%g') notation for DataFrame display."""
    return '%g' % value


pd.set_option('display.float_format', _format_float)

In [18]:
# Display the summary table: one row per metric, one column per dataset.
# All rows except n_variants are fractions of the variants in that dataset.
counts_pd

Unnamed: 0,FarGen,gnomAD,Enriched
n_variants,59352.0,14946000.0,283.0
snps,0.902733,0.937682,0.64311
indels,0.0972672,0.0623182,0.35689
synonymous_variant,0.234247,0.166956,0.155477
missense_variant,0.218106,0.345944,0.212014
frameshift_variant,0.00434695,0.0156443,0.0918728
stop_gained,0.00205553,0.0104211,0.00706714
LOW,0.295255,0.217375,0.233216
MODIFIER,0.454205,0.36904,0.402827
MODERATE,0.232191,0.36583,0.233216


## Folded Site Frequency Spectrum (FFS)

Let $f_i$ be the *site frequency* of bin $i$, where bin $i$ corresponds to some range of *allele frequencies*, and let $n$ be the (even) number of bins. We are going to compute the folded frequencies $f^*_i = f_i + f_{n+1-i}$ for $i = 1, \dots, n/2$.

Example: if we have 10 bins, then $f^*_1 = f_1 + f_{10}$.

The function below computes the FFS.

In [19]:
def ffs(n_bins, ht, af_exprs):
    """Compute a folded site frequency spectrum as a Hail table.

    The allele frequencies `af_exprs` (an expression on `ht`) are histogrammed
    into `n_bins` bins between 0 and 1. The bin counts are normalised to site
    frequencies, and each bin in the lower half is added to its mirror bin in
    the upper half (first with last, second with second to last, and so on),
    which leaves ``n_bins / 2`` folded bins.

    Returns a Hail table with one row per folded bin, where `af` is the
    corresponding histogram bin edge and `ff` is the folded site frequency
    (both stored as float32).

    Raises AssertionError if `n_bins` is odd.
    """
    # Folding pairs the bins up, so their number has to be even.
    assert n_bins % 2 == 0, 'Number of bins must be an even number.'

    histogram = ht.aggregate(hl.agg.hist(af_exprs, 0, 1, n_bins))

    # Normalise the per-bin site counts to site frequencies.
    counts = np.array(histogram.bin_freq)
    freqs = counts / sum(counts)

    # Fold: the lower half plus the upper half taken in reverse order.
    half = n_bins // 2
    folded = freqs[:half] + freqs[half:][::-1]

    # There are more bin edges than folded bins; zip stops after the folded
    # bins, so each row gets the edge at the same position as its bin.
    rows = [{'af': edge, 'ff': freq} for edge, freq in zip(histogram.bin_edges, folded)]

    # One table row per folded bin.
    return hl.Table.parallelize(hl.literal(rows, 'array<struct{af:float32,ff:float32}>'))

We compute a simple FFS and plot it. Note that singletons have a frequency of about 0.65.

In [20]:
# Folded spectrum of the FarGen allele frequencies over the whole table, using 100 bins.
fae_ffs_ht = ffs(n_bins=100, ht=ht, af_exprs=ht.info.AF_fae)

In [21]:
# Plot the folded spectrum. The x values are allele-frequency bin edges (not allele
# counts) and the y values are folded site frequencies, so the axes are labelled
# accordingly. The figure size is passed straight to hl.plot.scatter.
p = hl.plot.scatter(fae_ffs_ht.af, fae_ffs_ht.ff,
                    xlabel='Allele frequency', ylabel='Folded site frequency',
                    width=800, height=400)
show(p)

## FFS stratified by population

In [22]:
# Number of allele-frequency bins used for both strata.
n_bins = 200

# FarGen stratum: sites with a non-zero FarGen allele frequency.
fae_ht = ht.filter(ht.info.AF_fae != 0)
fae_ffs_ht = ffs(n_bins, fae_ht, fae_ht.info.AF_fae).annotate(strata='FarGen')

# gnomAD stratum: sites with a non-zero gnomAD allele frequency.
gnomad_ht = ht.filter(ht.info.AF_gnomad_all != 0)
gnomad_ffs_ht = ffs(n_bins, gnomad_ht, gnomad_ht.info.AF_gnomad_all).annotate(strata='gnomAD')

# Stack both spectra into one table; the `strata` field tells them apart.
ffs_ht = fae_ffs_ht.union(gnomad_ffs_ht)

In [23]:
# Plot both folded spectra, coloured by stratum. The x values are allele-frequency
# bin edges (not allele counts) and the y values are folded site frequencies, so the
# axes are labelled accordingly. collect_all=True plots every point rather than a
# downsampled subset.
p = hl.plot.scatter(ffs_ht.af, ffs_ht.ff, label=ffs_ht.strata,
                    xlabel='Allele frequency', ylabel='Folded site frequency',
                    width=800, height=400,
                    collect_all=True)
# Show the site frequencies on a logarithmic y axis.
p.y_scale = LogScale()
show(p)

## Frequency correlation

In [24]:
# Scatter of FarGen against gnomAD allele frequency for every site,
# coloured by the impact of the first transcript annotation.
p = hl.plot.scatter(
    ht.info.AF_fae,
    ht.info.AF_gnomad_all,
    xlabel='FarGen AF',
    ylabel='gnomAD AF',
    size=1,
    label=ht.impact1,
)
p.plot_height = 400
p.plot_width = 800
show(p)

