# Variant distributions

In [1]:
import hail as hl
hl.init(spark_conf={'spark.driver.memory': '100g'}, tmp_dir='/home/olavur/tmp')

2021-10-20 10:31:26 WARN  NativeCodeLoader:62 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2021-10-20 10:31:27 WARN  Hail:37 - This Hail JAR was compiled for Spark 2.4.5, running with Spark 2.4.1.
  Compatibility is not guaranteed.


Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-6676655f87-9xllv:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/main/hail-20211020-1031-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
output_notebook()

## Read data

In [3]:
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'
RESOURCES_DIR = '/data/other/resources'

In [4]:
import pandas as pd
import numpy as np

Merged FarGen and gnomAD sites. This dataset only contains the population frequency of variants and SnpEff annotations. See the notebooks in the `gnomad_exome_sites` folder.

In [5]:
ht = hl.read_table(BASE_DIR + '/data/resources/gnomad_exome_sites/fargen_gnomad_union_annotated.ht')

In [6]:
n_variants = ht.count()
print('Number of variants: ' + str(n_variants))

Number of variants: 15692611


In [7]:
ht.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh38> 
    'alleles': array<str> 
    'rsid': str 
    'qual': float64 
    'filters': set<str> 
    'info': struct {
        AF_gnomad_all: float64, 
        AF_afr: float64, 
        AF_sas: float64, 
        AF_amr: float64, 
        AF_eas: float64, 
        AF_nfe: float64, 
        AF_fin: float64, 
        AF_nfe_nwe: float64, 
        AF_nfe_seu: float64, 
        AC_gnomad_all: int32, 
        AC_afr: int32, 
        AC_sas: int32, 
        AC_amr: int32, 
        AC_eas: int32, 
        AC_nfe: int32, 
        AC_fin: int32, 
        AC_nfe_nwe: int32, 
        AC_nfe_seu: int32, 
        AF_fae: float64, 
        AC_fae: int32, 
        ANN: array<str>, 
        LOF: array<str>, 
        NMD: array<str>
    } 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------


## Count variants

Annotate variants with variant effect, impact and gene. These annotations are obtained from SnpEff.

If multiple transcripts overlap a site, the first in the list is arbitrarily chosen.

**NOTE:** is it possible to use [split_multi()](https://hail.is/docs/0.2/methods/genetics.html#hail.methods.split_multi) to count overlapping transcripts separately?

In [7]:
# Get variant impact, effect and gene name.
# Split overlapping transcripts into a list.
ht = ht.annotate(effect=ht.info.ANN.map(lambda x: x.split('\|')[1]),
                      impact=ht.info.ANN.map(lambda x: x.split('\|')[2]),
                      gene=ht.info.ANN.map(lambda x: x.split('\|')[3]))
# Use only the first transcript.
ht = ht.annotate(impact1=ht.impact[0], effect1=ht.effect[0], gene1=ht.gene[0])

In [9]:
def variant_counts(ht):
    # Count number of variants, SNPs and indels. Only first allele in alternate allele list is considered.
    variant_counts_struct = ht.aggregate(hl.struct(
        n_variants = hl.agg.count(),
        snps = hl.agg.count_where(hl.is_snp(ht.alleles[0], ht.alleles[1])) / hl.agg.count(),
        indels = hl.agg.count_where(hl.is_indel(ht.alleles[0], ht.alleles[1])) / hl.agg.count()))
    
    variant_counts_pd = pd.DataFrame(variant_counts_struct.values(), index=variant_counts_struct.keys(), columns=[''])
    return variant_counts_pd

def impact_counts(ht):
    # Count variant impact types.
    impact_counts_struct = ht.aggregate(
        hl.struct(
            LOW = hl.agg.count_where(ht.impact1 == 'LOW') / hl.agg.count(),
            MODIFIER = hl.agg.count_where(ht.impact1 == 'MODIFIER') / hl.agg.count(),
            MODERATE = hl.agg.count_where(ht.impact1 == 'MODERATE') / hl.agg.count(),
            HIGH = hl.agg.count_where(ht.impact1 == 'HIGH') / hl.agg.count()))
    
    impact_counts_pd = pd.DataFrame(impact_counts_struct.values(), index=impact_counts_struct.keys(), columns=[''])
    return impact_counts_pd

def effect_counts(ht):
    # Count variant effect types.
    # There are many more effect types, these are some of the more interesting ones.
    effect_counts_struct = ht.aggregate(
        hl.struct(
            synonymous_variant = hl.agg.count_where(ht.effect1 == 'synonymous_variant') / hl.agg.count(),
            missense_variant = hl.agg.count_where(ht.effect1 == 'missense_variant') / hl.agg.count(),
            frameshift_variant = hl.agg.count_where(ht.effect1 == 'frameshift_variant') / hl.agg.count(),              
            stop_gained = hl.agg.count_where(ht.effect1 == 'stop_gained') / hl.agg.count()))
    
    effect_counts_pd = pd.DataFrame(effect_counts_struct.values(), index=effect_counts_struct.keys(), columns=[''])
    return effect_counts_pd

In [13]:
# Variants with non-zero alternate allele frequency in FarGen.
fae_ht = ht.filter(ht.info.AF_fae != 0)
fae_variant_counts_pd = variant_counts(fae_ht)
fae_effect_counts_pd = effect_counts(fae_ht)
fae_impact_counts_pd = impact_counts(fae_ht)



In [20]:
# FIXME: Tesing, uncomment.
# Variants with non-zero alternate allele frequency in gnomAD.
gnomad_all_ht = ht.filter(ht.info.AF_gnomad_all != 0)
gnomad_all_variant_counts_pd = variant_counts(gnomad_all_ht)
gnomad_all_effect_counts_pd = effect_counts(gnomad_all_ht)
gnomad_all_impact_counts_pd = impact_counts(gnomad_all_ht)



In [34]:
# Variants with non-zero alternate allele frequency in gnomAD.
gnomad_nfe_nwe_ht = ht.filter((ht.info.AF_nfe_nwe != 0) & (ht.info.AC_nfe_nwe > 1))
gnomad_nfe_nwe_variant_counts_pd = variant_counts(gnomad_nfe_nwe_ht)
gnomad_nfe_nwe_effect_counts_pd = effect_counts(gnomad_nfe_nwe_ht)
gnomad_nfe_nwe_impact_counts_pd = impact_counts(gnomad_nfe_nwe_ht)



In [30]:
# Variants that are enriched in FarGen: less than 1% in gnomAD and more than 5% in FarGen.
enriched_ht = ht.filter((ht.info.AF_nfe_nwe < 0.05) & (ht.info.AF_fae > 0.01))
enriched_variant_counts_pd = variant_counts(enriched_ht)
enriched_effect_counts_pd = effect_counts(enriched_ht)
enriched_impact_counts_pd = impact_counts(enriched_ht)



In [35]:
# Concatenate columns, FarGen, gnomAD and enriched.
variant_counts_pd = pd.concat([fae_variant_counts_pd, gnomad_all_variant_counts_pd, gnomad_nfe_nwe_variant_counts_pd, enriched_variant_counts_pd], axis=1)
effect_counts_pd = pd.concat([fae_effect_counts_pd, gnomad_all_effect_counts_pd, gnomad_nfe_nwe_effect_counts_pd, enriched_effect_counts_pd], axis=1)
impact_counts_pd = pd.concat([fae_impact_counts_pd, gnomad_all_impact_counts_pd, gnomad_nfe_nwe_impact_counts_pd, enriched_impact_counts_pd], axis=1)

# Concatenate rows, variants, effects and impact.
counts_pd = pd.concat([variant_counts_pd, effect_counts_pd, impact_counts_pd])

# Set column names.
counts_pd.columns = ['FarGen', 'gnomAD all', 'gnomAD NFE NWE', 'Enriched']

In [36]:
pd.set_option('display.float_format', lambda x: '%g' % x)

In [37]:
counts_pd

Unnamed: 0,FarGen,gnomAD all,gnomAD NFE NWE,Enriched
n_variants,59352.0,14946000.0,1654000.0,11124.0
snps,0.902733,0.937682,0.926923,0.857605
indels,0.0972672,0.0623182,0.073076,0.142395
synonymous_variant,0.234247,0.166956,0.185015,0.217188
missense_variant,0.218106,0.345944,0.319183,0.261057
frameshift_variant,0.00434695,0.0156443,0.0100272,0.0106077
stop_gained,0.00205553,0.0104211,0.00730469,0.00368572
LOW,0.295255,0.217375,0.237523,0.280744
MODIFIER,0.454205,0.36904,0.389407,0.412981
MODERATE,0.232191,0.36583,0.337796,0.278497


## Folded Site Frequency Spectrum (FSFS)

Let $f_i$ be the *site frequency* of bin $i$, where bin $i$ corresponds to some range of *allele frequencies*. We are going to compute the folded frequencies $f^*_i = f_i + f_{n-i}$.

Example: if we have 10 bins, then $f^*_1 = f_1 + f_{10}$.

Below a function that computes the FSFS is defined.

In [64]:
def fsfs(n_bins, ht, af_exprs):
    # Make sure the number of bins is an even number.
    assert n_bins % 2 == 0, 'Number of bins must be an even number.'
    hist_struct = ht.aggregate(hl.agg.hist(af_exprs, 0, 1, n_bins))

    # Allele frequency.
    allele_freq = hist_struct.bin_edges

    # The site count is the number of times frequencies in each bin is observed.
    site_counts = np.array(hist_struct.bin_freq)

    # Calculate site frequencies.
    n_sites = sum(site_counts)
    site_freq = site_counts / n_sites

    # Calculate folded site-frequencies.
    half = int(n_bins/2)
    folded_site_freq = site_freq[:half] + site_freq[:half-1:-1]

    # Make a Hail table with the allele counts and site frequencies.
    fsfs_table = []
    for ac, fc in zip(allele_freq, folded_site_freq):
        row = {'af': ac, 'ff': fc}
        fsfs_table.append(row)

    # Make a table where each row is a bin, and each row contains the allele frequency and the folded site-frequency.
    ht_fsfs = hl.Table.parallelize(hl.literal(fsfs_table, 'array<struct{af:float32,ff:float32}>'))

    return ht_fsfs

We compute a simple FFS and plot it. Note that singletons have a frequency of about 0.65.

In [67]:
fae_ht = ht.filter(ht.info.AC_fae != 0)
fae_fsfs_ht = fsfs(50, fae_ht, fae_ht.info.AF_fae)



In [68]:
p = hl.plot.scatter(fae_fsfs_ht.af, fae_fsfs_ht.ff,
                    xlabel='Allele counts', ylabel='Frequency in population')
p.plot_width = 800
p.plot_height = 400
show(p)

## FSFS strafified by population

In [69]:
n_bins = 50

fae_ht = ht.filter(ht.info.AC_fae != 0)
fae_fsfs_ht = ffs(n_bins, fae_ht, fae_ht.info.AF_fae)
fae_fsfs_ht = fae_fsfs_ht.annotate(strata='FarGen')

gnomad_ht = ht.filter(ht.info.AC_gnomad_all != 0)
gnomad_fsfs_ht = fsfs(n_bins, gnomad_ht, gnomad_ht.info.AF_gnomad_all)
gnomad_fsfs_ht = gnomad_fsfs_ht.annotate(strata='gnomAD')


gnomad_nfe_nwe_ht = ht.filter(ht.info.AC_nfe_nwe != 0)
gnomad_nfe_nwe_fsfs_ht = fsfs(n_bins, gnomad_nfe_nwe_ht, gnomad_nfe_nwe_ht.info.AF_nfe_nwe)
gnomad_nfe_nwe_fsfs_ht = gnomad_nfe_nwe_fsfs_ht.annotate(strata='NFE NWE')

fsfs_ht = fae_fsfs_ht.union(gnomad_fsfs_ht)
fsfs_ht = fsfs_ht.union(gnomad_nfe_nwe_fsfs_ht)



In [70]:
p = hl.plot.scatter(fsfs_ht.af, fsfs_ht.ff, label=fsfs_ht.strata,
                    xlabel='Allele counts', ylabel='Frequency in population',
                    collect_all=True)
p.plot_width = 800
p.plot_height = 400
p.y_scale = LogScale()
show(p)

## FSFS stratified by variant effect

In [6]:
# Get variant impact, effect and gene name.
# Split overlapping transcripts into a list.
ht = ht.annotate(effect=ht.info.ANN.map(lambda x: x.split('\|')[1]),
                      impact=ht.info.ANN.map(lambda x: x.split('\|')[2]))
# Use only the first transcript.
ht = ht.annotate(impact1=ht.impact[0].split('&'), effect1=ht.effect[0].split('&'))

In [65]:
n_bins = 50
first = True
for effect in ['synonymous_variant', 'missense_variant', 'intron_variant']:
    temp_ht = ht.filter((ht.effect1.contains(effect)) & (ht.info.AC_fae != 0))
    temp_fsfs_ht = fsfs(n_bins, temp_ht, temp_ht.info.AF_fae)
    temp_fsfs_ht = temp_fsfs_ht.annotate(strata=effect)
    
    if first:
        fsfs_ht = temp_fsfs_ht
        first = False
    else:
        fsfs_ht = fsfs_ht.union(temp_fsfs_ht)



In [66]:
p = hl.plot.scatter(fsfs_ht.af, fsfs_ht.ff, label=fsfs_ht.strata,
                    xlabel='Allele counts', ylabel='Frequency in population',
                    collect_all=True)
p.plot_width = 800
p.plot_height = 400
p.y_scale = LogScale()
show(p)

## Count all effect types

Only for the first transcript in the list.

In [57]:
%%time

temp_ht = ht.filter(ht.info.AC_fae != 0)
effect1_list = temp_ht.effect1.collect()



CPU times: user 433 ms, sys: 290 ms, total: 722 ms
Wall time: 11.9 s


In [58]:
len(effect1_list)

59352

In [59]:
effect1_small = effect1_list[:100000]

In [60]:
effect1_list_flat = [item for sublist in effect1_list for item in sublist]

In [61]:
%%time

effect_count = dict()
for effect in effect1_list_flat:
    count = effect_count.get(effect)
    if count is None:
        effect_count[effect] = 1
    else:
        effect_count[effect] += 1

CPU times: user 28.9 ms, sys: 146 µs, total: 29 ms
Wall time: 28.8 ms


In [62]:
sorted(effect_count.items(), key=lambda x: x[1], reverse=True)

[('intron_variant', 21755),
 ('synonymous_variant', 14228),
 ('missense_variant', 13230),
 ('splice_region_variant', 3365),
 ('upstream_gene_variant', 2042),
 ('3_prime_UTR_variant', 2017),
 ('downstream_gene_variant', 1850),
 ('5_prime_UTR_variant', 954),
 ('intergenic_region', 678),
 ('sequence_feature', 617),
 ('non_coding_transcript_exon_variant', 500),
 ('structural_interaction_variant', 493),
 ('frameshift_variant', 277),
 ('5_prime_UTR_premature_start_codon_gain_variant', 168),
 ('disruptive_inframe_deletion', 143),
 ('stop_gained', 129),
 ('conservative_inframe_deletion', 85),
 ('splice_acceptor_variant', 82),
 ('disruptive_inframe_insertion', 48),
 ('splice_donor_variant', 47),
 ('conservative_inframe_insertion', 40),
 ('start_lost', 38),
 ('stop_lost', 21),
 ('stop_retained_variant', 18),
 ('protein_protein_contact', 16),
 ('gene_fusion', 1)]

## Frequency correlation

In [None]:
temp_ht = ht.filter((ht.effect1 == 'synonymous_variant') | (ht.effect1 == 'missense_variant'))
p = hl.plot.scatter(temp_ht.info.AF_fae, temp_ht.info.AF_gnomad_all, xlabel='FarGen AF', ylabel='gnomAD AF', size=1, label=temp_ht.effect1)
p.plot_width = 800
p.plot_height = 400
show(p)