In [1]:
import hail as hl
# Start the Hail session; the banner printed below reports the Spark and Hail
# versions in use and the path of the log file.
hl.init()

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-7889d4ff4c-z7fmq:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/hail-20210224-1345-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
# Configure Bokeh to render figures inline in the notebook, so that the
# show(p) calls in later cells display their plots in place.
output_notebook()

In [3]:
# Location of the variant matrix table on disk.
variants_mt_path = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis/data/mt/variants.mt'
mt = hl.read_matrix_table(variants_mt_path)

In [4]:
# Annotate the matrix table with QC statistics in one pass: variant_qc is
# applied first (adds the `variant_qc` field), then sample_qc (adds the
# `sample_qc` field). Both fields are used by the plots further down.
mt = hl.sample_qc(hl.variant_qc(mt))

In [5]:
# Print the matrix table schema; the `sample_qc` struct should now appear
# among the column fields.
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'sample_qc': struct {
        dp_stats: struct {
            mean: float64, 
            stdev: float64, 
            min: float64, 
            max: float64
        }, 
        gq_stats: struct {
            mean: float64, 
            stdev: float64, 
            min: float64, 
            max: float64
        }, 
        call_rate: float64, 
        n_called: int64, 
        n_not_called: int64, 
        n_filtered: int64, 
        n_hom_ref: int64, 
        n_het: int64, 
        n_hom_var: int64, 
        n_non_ref: int64, 
        n_singleton: int64, 
        n_snp: int64, 
        n_insertion: int64, 
        n_deletion: int64, 
        n_transition: int64, 
        n_transversion: int64, 
        n_star: int64, 
        r_ti_tv: float64, 
        r_het_hom_var: float64, 
        r_insertion_deletion: float64
    }
----------------------------

## VQSR filter

In [6]:
# Flatten the `filters` collection into a single comma-separated string so
# that each distinct combination of filters can be tallied as one key.
#
# The dtype guard keeps this cell safe to re-run: after the first execution
# `filters` is already a string, and passing a string to hl.delimit (which
# expects a collection of strings) would raise a type error.
if mt.filters.dtype != hl.tstr:
    mt = mt.transmute_rows(filters=hl.delimit(mt.filters, ','))

In [7]:
# Tally the rows (variants) by the value of their `filters` field.
rows_per_filter = hl.agg.counter(mt.filters)
mt.aggregate_rows(rows_per_filter)

{'': 572780,
 'VQSRTrancheINDEL99.90to100.00': 48589,
 'VQSRTrancheINDEL99.00to99.90': 82523,
 'VQSRTrancheSNP99.90to100.00': 137182,
 'VQSRTrancheSNP99.00to99.90': 107475}

## Variant QC

In [8]:
# Histogram of the mean depth (DP) of each variant, taken from variant_qc.
variant_mean_dp = mt.variant_qc.dp_stats.mean
p = hl.plot.histogram(
    variant_mean_dp,
    range=(0, 100),
    legend='Mean DP per variant histogram',
)
show(p)

In [9]:
# Histogram of the mean genotype quality (GQ) of each variant, from variant_qc.
variant_mean_gq = mt.variant_qc.gq_stats.mean
p = hl.plot.histogram(
    variant_mean_gq,
    range=(0, 100),
    legend='Mean GQ per variant histogram',
)
show(p)

## Sample QC

### Depth

In [11]:
# Histogram of the mean depth (DP) of each sample, taken from sample_qc.
sample_mean_dp = mt.sample_qc.dp_stats.mean
p = hl.plot.histogram(
    sample_mean_dp,
    range=(0, 100),
    legend='Mean DP per sample histogram',
)
show(p)

### Call rate

In [23]:
# Histogram of the per-sample call rate. The plotted range is restricted to
# the narrow interval [0.9998, 1].
sample_call_rate = mt.sample_qc.call_rate
p = hl.plot.histogram(
    sample_call_rate,
    range=(0.9998, 1),
    legend='Call Rate',
)
show(p)

### Genotype quality

In [13]:
# Histogram of the mean genotype quality (GQ) of each sample, from sample_qc.
sample_mean_gq = mt.sample_qc.gq_stats.mean
p = hl.plot.histogram(
    sample_mean_gq,
    range=(10, 100),
    legend='Mean Sample GQ',
)
show(p)

### Het/hom rate

In [14]:
# Histogram of the per-sample het / hom-var ratio computed by sample_qc.
sample_het_hom_ratio = mt.sample_qc.r_het_hom_var
p = hl.plot.histogram(
    sample_het_hom_ratio,
    range=(0.5, 2.5),
    legend='Het/hom rate',
)
show(p)

### DP-call rate scatter

In [15]:
# Scatter of per-sample mean depth (x) against per-sample call rate (y).
sample_stats = mt.sample_qc
p = hl.plot.scatter(
    sample_stats.dp_stats.mean,
    sample_stats.call_rate,
    xlabel='Mean DP',
    ylabel='Call Rate',
)
show(p)

### Ti/tv rate vs het/hom rate

In [16]:
# Scatter of the per-sample ti/tv ratio (x) against the het / hom-var ratio (y).
sample_stats = mt.sample_qc
p = hl.plot.scatter(
    sample_stats.r_ti_tv,
    sample_stats.r_het_hom_var,
    xlabel='ti/tv rate',
    ylabel='het/hom rate',
)
show(p)

## Sites where all samples are hom.alt.

In [17]:
# Keep only the rows whose first allele-count entry (AC[0], the reference
# allele) is zero, then report how many such sites there are.
# `mt_hom_alt` and `n_all_hom_alt` are reused by later cells.
has_no_ref_alleles = mt.variant_qc.AC[0] == 0
mt_hom_alt = mt.filter_rows(has_no_ref_alleles)
n_all_hom_alt = mt_hom_alt.count_rows()
print(n_all_hom_alt)

9953


Note that there are no sites where all samples are hom.ref., because these have been filtered out.

In [18]:
# Count the rows whose second allele-count entry (AC[1], the first alternate
# allele) is zero.
has_no_alt_alleles = mt.variant_qc.AC[1] == 0
mt_hom_ref = mt.filter_rows(has_no_alt_alleles)
n_all_hom_ref = mt_hom_ref.count_rows()
print(n_all_hom_ref)

0


In [19]:
# Histogram of per-variant mean depth (DP), restricted to the rows kept in
# `mt_hom_alt`.
hom_alt_mean_dp = mt_hom_alt.variant_qc.dp_stats.mean
p = hl.plot.histogram(
    hom_alt_mean_dp,
    range=(0, 100),
    legend='Mean DP per variant histogram',
)
show(p)

It looks like these variants have poor depth, but the shape of the distribution is similar to that of all the variants (above). The genotype quality (next plot) looks much better.

In [20]:
# Histogram of per-variant mean genotype quality (GQ), restricted to the rows
# kept in `mt_hom_alt`.
hom_alt_mean_gq = mt_hom_alt.variant_qc.gq_stats.mean
p = hl.plot.histogram(
    hom_alt_mean_gq,
    range=(0, 100),
    legend='Mean GQ per variant histogram',
)
show(p)

Are any of the variants novel, i.e. lacking an rsID?

In [21]:
# A variant is counted as "novel" here when its rsid field is missing.
lacks_rsid = hl.is_missing(mt_hom_alt.rsid)
n_novel = mt_hom_alt.aggregate_rows(hl.agg.count_where(lacks_rsid))
summary = '{n_novel} out of the {n_all_hom_alt} variants are novel'.format(
    n_novel=n_novel,
    n_all_hom_alt=n_all_hom_alt,
)
print(summary)

388 out of the 9953 variants are novel


It would be interesting to see what the allele frequencies of these variants are in other populations.