# Variant distributions

In [1]:
import hail as hl
# Start the Hail session: give the Spark driver 100g of memory and
# write Hail's temporary files to a dedicated directory.
hl.init(
    tmp_dir='/home/olavur/tmp',
    spark_conf={'spark.driver.memory': '100g'},
)

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-848846b477-48ks9:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/qc/hail-20210617-1010-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
# Tell Bokeh to render its plots inline in the notebook's cell output.
output_notebook()

## Read data

In [3]:
# Root directory of this experiment; the matrix table read below is located
# relative to it.
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'
# Presumably the directory holding shared reference resources -- it is not
# used in the cells visible here, so confirm against the rest of the notebook.
RESOURCES_DIR = '/data/other/resources'

In [52]:
import pandas as pd

Read high quality variants.

In [4]:
# Location of the high quality variants matrix table, relative to the
# experiment directory.
high_quality_mt_path = BASE_DIR + '/data/mt/high_quality_variants.mt/'
mt = hl.read_matrix_table(high_quality_mt_path)

In [5]:
# count() returns the matrix table dimensions as (rows, columns),
# i.e. (variants, samples).
n_variants, n_samples = mt.count()
print(f'Number of variants: {n_variants}')
print(f'Number of samples: {n_samples}')

Number of variants: 1146382
Number of samples: 468


## Count variants

Annotate variants with variant effect, impact and gene. These annotations are obtained from SnpEff.

If multiple transcripts overlap a site, the first in the list is arbitrarily chosen.

For multi-allelic sites, only one of the alternate alleles is considered.

**NOTE:** use [split_multi()](https://hail.is/docs/0.2/methods/genetics.html#hail.methods.split_multi) to count multi-allelic sites.

**NOTE:** Is it possible to use [split_multi()](https://hail.is/docs/0.2/methods/genetics.html#hail.methods.split_multi) to count overlapping transcripts separately?

In [12]:
# Get variant effect, impact and gene name from the SnpEff ANN annotations.
# info.ANN holds one '|'-delimited string per annotation (e.g. per overlapping
# transcript); fields 1, 2 and 3 of each string are the effect, the impact and
# the gene name respectively.
#
# Hail's split() takes a regular expression, so the pipe has to be escaped.
# A raw string is used because '\|' is not a valid escape sequence in a normal
# Python string literal and triggers an invalid-escape warning.
# Each annotation is split once and the fields are picked from the result.
ann_fields = mt.info.ANN.map(lambda ann: ann.split(r'\|'))
mt = mt.annotate_rows(effect=ann_fields.map(lambda fields: fields[1]),
                      impact=ann_fields.map(lambda fields: fields[2]),
                      gene=ann_fields.map(lambda fields: fields[3]))
# Use only the first transcript.
mt = mt.annotate_rows(impact1=mt.impact[0], effect1=mt.effect[0], gene1=mt.gene[0])

In [58]:
# Count variants by type, by impact and by effect.
#
# All counts are computed in ONE aggregation, i.e. a single pass over the rows
# of the matrix table, instead of one full pass per group of counts.

# Impact categories to count (compared against the first transcript's impact).
impact_levels = ('LOW', 'MODIFIER', 'MODERATE', 'HIGH')

# Effect types to count (compared against the first transcript's effect).
# There are many more effect types, these are some of the more interesting ones.
effect_types = ('synonymous_variant', 'missense_variant',
                'frameshift_variant', 'stop_lost')

row_counts = mt.aggregate_rows(hl.struct(
    # Number of variants, SNPs and indels. Only first allele in alternate
    # allele list is considered.
    variants=hl.struct(
        n_variants=hl.agg.count(),
        n_snps=hl.agg.count_where(hl.is_snp(mt.alleles[0], mt.alleles[1])),
        n_indels=hl.agg.count_where(hl.is_indel(mt.alleles[0], mt.alleles[1]))),
    # One counter per impact level, in the order listed above.
    impact=hl.struct(**{level: hl.agg.count_where(mt.impact1 == level)
                        for level in impact_levels}),
    # One counter per effect type, in the order listed above.
    effect=hl.struct(**{effect: hl.agg.count_where(mt.effect1 == effect)
                        for effect in effect_types})))

# Expose each group of counts under the name the following cells expect.
variant_counts_struct = row_counts.variants
impact_counts_struct = row_counts.impact
effect_counts_struct = row_counts.effect

In [78]:
# Show the variant / SNP / indel counts as a one-column table,
# one row per count, labelled by the count's name.
pd.DataFrame(
    list(variant_counts_struct.values()),
    index=list(variant_counts_struct.keys()),
    columns=['Count'],
)

Unnamed: 0,Count
n_variants,1146382
n_snps,429044
n_indels,717314


In [79]:
# Show the number of variants per impact level as a one-column table,
# one row per impact level.
pd.DataFrame(
    list(impact_counts_struct.values()),
    index=list(impact_counts_struct.keys()),
    columns=['Count'],
)

Unnamed: 0,Count
LOW,75420
MODIFIER,916641
MODERATE,54417
HIGH,99904


In [80]:
# Show the number of variants per selected effect type as a one-column table,
# one row per effect type.
pd.DataFrame(
    list(effect_counts_struct.values()),
    index=list(effect_counts_struct.keys()),
    columns=['Count'],
)

Unnamed: 0,Count
synonymous_variant,37396
missense_variant,44908
frameshift_variant,89744
stop_lost,49
