# Pasing statistics

In [1]:
import hail as hl
hl.init(spark_conf={'spark.driver.memory': '10g'}, tmp_dir='/home/olavur/tmp')

Running on Apache Spark version 2.4.1
SparkUI available at http://hms-beagle-848846b477-48ks9:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.61-3c86d3ba497a
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/qc/hail-20210614-1032-0.2.61-3c86d3ba497a.log


In [2]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
from bokeh.models.scales import LogScale
output_notebook()

In [3]:
import pandas as pd

## Import data

In [4]:
BASE_DIR = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis'

In [63]:
mt = hl.read_matrix_table(BASE_DIR + '/data/mt/high_quality_variants.mt/')

In [64]:
n_variants, n_samples = mt.count()
print('Number of variants: ' + str(n_variants))
print('Number of samples: ' + str(n_samples))

Number of variants: 1146382
Number of samples: 468


## Phasing statistics

### Phased heterozygotes

Calculate number of phased heterozygotes per sample.

**NOTE:** we only look at heterozygotes, because in principle, the phase of any homozygote variants are known trivially. Therefore, if we include homozygous variants, in principle our phase block stretches from the first to the last homozygote on the chromosome.

In [90]:
# Get all heterozygotes.
het_mt = mt.filter_entries(mt.GT.is_het())

# For each sample, count number of phased heterozygotes and number of heterozygotes.
het_mt = het_mt.annotate_cols(n_phased_hets=hl.agg.count_where(het_mt.GT.phased), n_hets = hl.agg.count_where(het_mt.GT.is_het()))

# Calculate fraction of phased heterozygotes.
het_mt = het_mt.annotate_cols(phased_hets_fraction = het_mt.n_phased_hets / het_mt.n_hets)

The plot below shows the distribution of the fraction of phased heterozygotes for all samples. The majority of the samples have a fraction somewhere between 0.6 and 0.7, while a few have as low as 0.4, and a single sample has about 0.2.

In [91]:
p = hl.plot.histogram(het_mt.phased_hets_fraction, title='Histogram of fraction of phased heterozygotes per sample')
p.plot_width = 800
p.plot_height = 500
show(p)

For each sample, calculate the fraction of heterozygotes per gene.

First we annotate each variant with gene name. Note that since there may be multiple transcripts overlapping any variant site, there may be multiple gene names. So we just arbitrarily pick the first gene in the list.

In [114]:
# Get the gene name from variant annotation.
# The annotation field is an array with one element for each transcript for the particular site.
# The various information in the annotation is separated by a pipe ("|").
het_mt = het_mt.annotate_rows(gene = het_mt.info.ANN.map(lambda x: x.split('\|')[3]))

# We will only look at one of the genes, so we arbitrarily pick the first in the list.
het_mt = het_mt.annotate_rows(gene1 = het_mt.gene[0])

In [80]:
# Get the entries as a Hail table indexed by locus, alleles and sample.
entries = het_mt.entries()

# Group entries by sample and gene name.
gene_groups_ht = entries.group_by(sample_group=entries.s, gene_group=entries.gene1)

# For each sample and gene group, calculate the number of phased heterozygotes, the number of heterozygotes, and the fraction of phased heterozygotes.
gene_stats_ht = gene_groups_ht.aggregate(n_phased_hets=hl.agg.count_where(entries.GT.phased), n_hets = hl.agg.count_where(entries.GT.is_het()))

gene_stats_ht = gene_stats_ht.annotate(phased_hets_fraction = gene_stats_ht.n_phased_hets / gene_stats_ht.n_hets)

In [83]:
# FIXME: checkpoint only for testing, remove.
# Cache all operations by making a checkpoint.
gene_stats_ht = gene_stats_ht.checkpoint('/home/olavur/tmp/gene_phasing_stats.ht', overwrite=True)

2021-06-14 11:44:28 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-06-14 11:46:04 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-06-14 11:46:42 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-06-14 11:47:03 Hail: INFO: wrote table with 8212659 rows in 37 partitions to /home/olavur/tmp/gene_phasing_stats.ht
    Total size: 121.50 MiB
    * Rows: 121.50 MiB
    * Globals: 11.00 B
    * Smallest partition: 148496 rows (2.19 MiB)
    * Largest partition:  413807 rows (6.13 MiB)


For each sample, count number of fully phased genes, that is, where the fraction of phased heterozygotes is 1.

In [85]:
phased_genes_ht = (gene_stats_ht.group_by(sample_group = gene_stats_ht.sample_group)
    .aggregate(phased_genes = hl.agg.count_where(gene_stats_ht.phased_hets_fraction  == 1.0)))

In [88]:
p = hl.plot.histogram(phased_genes_ht.phased_genes, title='Histogram of number of fully phased genes per sample')
p.plot_width = 800
p.plot_height = 500
show(p)

2021-06-14 11:52:27 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-06-14 11:52:28 Hail: INFO: Ordering unsorted dataset with network shuffle


### Phase block lengths

Calculate the lengths of the phase blocks. Phase blocks are defined by the `PS` 'phase set' tag on the genotypes. The `PS` tag is an integer equal to the position of the first variant in the phase block.

We will obtain the start and end positions of each phase block, defined as the positions of the first and last variants in each phase set. We will use these to calculate the phase block lengths. We will also calculate the number of variants in each phase block.

In [61]:
# Get the entries as a Hail table indexed by locus, alleles and sample.
entries = het_mt.entries()

# Group entries by phase set, chromosome, and sample.
ps_groups_ht = entries.group_by(PS_group=entries.PS, chrom_group=entries.locus.contig, sample_group=entries.s)

# For each sample and chromosome, obtain the start and end positions of each phase set, and count the number of variants in the phase sets.
ps_stats_ht = ps_groups_ht.aggregate(ps_start=hl.agg.min(entries.locus.position), ps_stop=hl.agg.max(entries.locus.position), ps_count=hl.agg.count())

# Calculate the lengths of the phase sets.
ps_stats_ht = ps_stats_ht.annotate(ps_length = ps_stats_ht.ps_stop - ps_stats_ht.ps_start)

In [44]:
# FIXME: checkpoint only for testing, remove.
# Cache all operations by making a checkpoint.
ps_stats_ht = ps_stats_ht.checkpoint('/home/olavur/tmp/phasing_stats.ht', overwrite=True)

2021-06-14 10:57:20 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-06-14 10:57:36 Hail: INFO: wrote table with 4017621 rows in 37 partitions to /home/olavur/tmp/phasing_stats.ht
    Total size: 71.18 MiB
    * Rows: 71.18 MiB
    * Globals: 11.00 B
    * Smallest partition: 76453 rows (1.32 MiB)
    * Largest partition:  160761 rows (2.81 MiB)


Calculate phase block length summary statistics for each sample.

In [45]:
ps_len_stats_ht = (ps_stats_ht.group_by(ps_stats_ht.sample_group)
    .aggregate(stats = hl.agg.stats(ps_stats_ht.ps_length)))

In [112]:
p = hl.plot.scatter(ps_len_stats_ht.stats.mean, ps_len_stats_ht.stats.stdev, xlabel='Mean length', ylabel='St.dev. of length', title='Phase block lengths')
p.plot_width = 800
p.plot_height = 500
show(p)

2021-06-14 13:39:29 Hail: INFO: Ordering unsorted dataset with network shuffle


In [108]:
p = hl.plot.histogram(ps_len_stats_ht.stats.mean)
show(p)

2021-06-14 13:31:10 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-06-14 13:31:30 Hail: INFO: Ordering unsorted dataset with network shuffle


In [109]:
p = hl.plot.histogram(ps_len_stats_ht.stats.stdev)
show(p)

2021-06-14 13:35:39 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-06-14 13:35:59 Hail: INFO: Ordering unsorted dataset with network shuffle
