# Gene statistic aggregation

In [1]:
import hail as hl
# Start the Hail session; the startup banner and the log-file location are printed below.
hl.init()

Running on Apache Spark version 2.4.6
SparkUI available at http://hms-beagle-5466c684ff-2l8nm:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.58-3f304aae6ce2
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/hail-20201209-1053-0.2.58-3f304aae6ce2.log


In [2]:
from bokeh.io import show, output_notebook
# NOTE(review): gridplot is not used in the cells visible here - confirm whether it is still needed.
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
# Direct Bokeh output to the notebook so that `show(...)` renders figures inline.
output_notebook()

In [3]:
# Path of the filtered MatrixTable analysed in this notebook.
FILTERED_MT_PATH = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis/data/mt/filtered.mt'
mt = hl.read_matrix_table(FILTERED_MT_PATH)

Annotate each row (variant site) with the gene name taken from the first entry of its `ANN` annotation (i.e. the first listed transcript).

In [4]:
# Each `ANN` entry is a '|'-delimited string; field 3 (0-based) is used as the gene name.
# Hail's `split` takes a regex delimiter, so the pipe must be escaped. A raw string
# is used because '\|' in a normal Python string literal is an invalid escape sequence
# (it only works by accident and triggers a warning); r'\|' has the same runtime value.
gene_names = mt.info.ANN.map(lambda ann: ann.split(r'\|')[3])
# `gene` keeps the gene name of every ANN entry; `gene1` is the name from the first entry.
mt = mt.annotate_rows(gene=gene_names, gene1=gene_names[0])

Annotate each row (variant site) with the variant impact taken from the first entry of its `ANN` annotation (i.e. the first listed transcript).

In [5]:
# Each `ANN` entry is a '|'-delimited string; field 2 (0-based) is used as the variant impact.
# Hail's `split` takes a regex delimiter, so the pipe must be escaped. A raw string
# is used because '\|' in a normal Python string literal is an invalid escape sequence
# (it only works by accident and triggers a warning); r'\|' has the same runtime value.
impacts = mt.info.ANN.map(lambda ann: ann.split(r'\|')[2])
# `impact` keeps the impact of every ANN entry; `impact1` is the impact from the first entry.
mt = mt.annotate_rows(impact=impacts, impact1=impacts[0])

Count the number of heterozygous genotypes per site.

In [6]:
# Row-wise aggregation over the entries: how many genotype calls at this site are heterozygous.
het_call_count = hl.agg.count_where(mt.GT.is_het())
mt = mt.annotate_rows(n_het=het_call_count)

Group sites (rows) by gene name and collect stats:

* Number of high impact variants
* Number of sites
* Average number of heterozygous genotypes per site (`avg_het`)

In [7]:
# Work on the rows table only; the per-site annotations added above are all row fields.
rows = mt.rows()
sites_by_gene = rows.group_by(rows.gene1)
# One output row per `gene1` value:
#   n_high_impact - number of sites whose first-entry impact is 'HIGH'
#   n_sites       - number of sites in the group
#   avg_het       - mean of the per-site heterozygous genotype count
gene_stats = sites_by_gene.aggregate(
    n_high_impact=hl.agg.count_where(rows.impact1 == 'HIGH'),
    n_sites=hl.agg.count(),
    avg_het=hl.agg.mean(rows.n_het),
)

Make a histogram of the number of sites per gene.

In [8]:
# Distribution of the per-gene site count, restricted to the 0-200 range in 100 bins.
sites_hist = hl.plot.histogram(
    gene_stats.n_sites,
    title='Histogram of number of sites per gene',
    bins=100,
    range=(0, 200),
)
sites_hist.plot_height = 500
sites_hist.plot_width = 800
show(sites_hist)

2020-12-09 10:53:47 Hail: INFO: Ordering unsorted dataset with network shuffle


In [9]:
# Distribution of the per-gene `avg_het` value, restricted to the 0-50 range in 100 bins.
avg_het_hist = hl.plot.histogram(
    gene_stats.avg_het,
    title='Histogram of average gene heterozygosity',
    bins=100,
    range=(0, 50),
)
avg_het_hist.plot_height = 500
avg_het_hist.plot_width = 800
show(avg_het_hist)

2020-12-09 10:53:55 Hail: INFO: Ordering unsorted dataset with network shuffle


In [10]:
# Distribution of the per-gene high impact variant count, restricted to the 0-20 range in 20 bins.
high_impact_hist = hl.plot.histogram(
    gene_stats.n_high_impact,
    title='Histogram of number of high impact variants per gene',
    bins=20,
    range=(0, 20),
)
high_impact_hist.plot_height = 500
high_impact_hist.plot_width = 800
show(high_impact_hist)

2020-12-09 10:54:01 Hail: INFO: Ordering unsorted dataset with network shuffle


Make a scatterplot, with one point per gene, of the average number of heterozygous genotypes per site against the number of high impact variants.

In [11]:
# One point per gene: x = number of high impact variants, y = `avg_het`.
# The gene name is passed as a hover field.
hover_info = {'Gene': gene_stats.gene1}
p = hl.plot.scatter(
    gene_stats.n_high_impact,
    gene_stats.avg_het,
    hover_fields=hover_info,
)

2020-12-09 10:54:07 Hail: INFO: Ordering unsorted dataset with network shuffle


In [12]:
# Axis labels. The y value is `avg_het`, i.e. the mean over a gene's sites of the
# per-site heterozygous genotype count, so it is labelled as a mean rather than a raw count.
p.xaxis.axis_label = '# high impact variants'
p.yaxis.axis_label = 'Mean # heterozygous genotypes per site'
# Figure size.
p.plot_width = 800
p.plot_height = 500
# Switch both axes to a log scale.
# NOTE(review): the scales are replaced after the figure has been created; Bokeh's documented
# route to log axes is the axis type at figure creation, so the tick marks may stay linear, and
# genes with 0 high impact variants cannot be placed on a log x axis - confirm the rendering.
p.y_scale = LogScale()
p.x_scale = LogScale()
show(p)