# Average number of pairwise differences

In [1]:
import hail as hl
# Start Hail with its default settings; the Spark UI address and the log file
# location are reported in the output below.
hl.init()

Running on Apache Spark version 2.4.6
SparkUI available at http://hms-beagle-5466c684ff-2l8nm:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.58-3f304aae6ce2
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/hail-20201207-1144-0.2.58-3f304aae6ce2.log


In [2]:
from bokeh.io import show, output_notebook
# NOTE(review): gridplot is not used by any cell shown in this notebook.
from bokeh.layouts import gridplot
# Send Bokeh output to the notebook so that show() renders plots inline.
output_notebook()

In [3]:
# Load the filtered MatrixTable from local disk.
# NOTE(review): this is an absolute, machine-specific path.
filtered_mt_path = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis/data/mt/filtered.mt'
mt = hl.read_matrix_table(filtered_mt_path)

Calculate the allele dosage.

## Dosage difference

The difference between two samples at a site is the absolute difference in their number of alternate alleles (dosage).

Difference matrix (rows and columns are the dosages of the two samples):

|   | 0 | 1 | 2 |
|---|---|---|---|
| 0 | 0 | 1 | 2 |
| 1 | 1 | 0 | 1 |
| 2 | 2 | 1 | 0 |


In [4]:
# Dosage = number of non-reference alleles in the call (0, 1 or 2 for a diploid call).
# n_alt_alleles() is used instead of GT[0] + GT[1] because summing the allele
# indices over-counts at multi-allelic sites (a 1/2 call would give 3, not 2).
# For bi-allelic calls both expressions give the same value.
mt = mt.annotate_entries(DS=mt.GT.n_alt_alleles())

In [5]:
# Random subset of sites: each row is kept with probability 1e-5.
# A fixed seed makes the subset identical on every Restart & Run All.
# NOTE(review): mt_small is not used by any later cell shown in this notebook.
mt_small = mt.sample_rows(0.00001, seed=0)

In [6]:
# One row per site, with every sample's entry collected into the `entries` array.
ht = mt.localize_entries("entries", "cols")

# Enumerate each unordered pair (hi, lo) with lo < hi exactly once and take the
# absolute dosage difference of the two samples.
entry_positions = hl.range(0, hl.len(ht.entries))
dosage_gaps = entry_positions.flatmap(
    lambda hi: hl.range(0, hi).map(
        lambda lo: hl.abs(ht.entries[hi].DS - ht.entries[lo].DS)))
ht = ht.annotate(sum_of_pairwise_diffs=hl.sum(dosage_gaps))

# Normalise by the number of unordered pairs, n * (n - 1) / 2.
entry_count = hl.len(ht.entries)
pair_count = entry_count * (entry_count - 1) / 2
ht = ht.annotate(avg_pairwise_diff=ht.sum_of_pairwise_diffs / pair_count)

In [7]:
# Peek at the per-site average pairwise difference for the first five rows.
ht.avg_pairwise_diff.take(5)

[0.0416666679084301,
 0.11968085169792175,
 0.1906028389930725,
 0.5939716100692749,
 0.08156028389930725]

## Nucleotide diversity

In [8]:
def get_mt_allele(mt, allele):
    '''
    Reduce each call to a single allele and tag the samples accordingly.

    Input:
    mt:      MatrixTable with a GT call entry field and a column field `s`.
    allele:  Position within the call to keep (0 or 1 for a diploid call).

    Returns:
    MatrixTable in which GT holds `mt.GT[allele]` and the column key `s` has
    "_<allele>" appended.
    '''
    suffix = '_' + str(allele)
    single_allele_mt = mt.transmute_entries(GT=mt.GT[allele])
    return single_allele_mt.key_cols_by(s=single_allele_mt.s + suffix)

def explode_alleles(mt):
    '''
    Turn every sample into two columns, one per allele of its call.

    Input:
    mt:      MatrixTable with a diploid GT call field.

    Returns:
    MatrixTable with twice as many columns: the columns holding the allele at
    call position 0 (sample names suffixed "_0") followed by the columns
    holding the allele at call position 1 (suffixed "_1").
    '''
    # Build one single-allele table per call position, then join them column-wise.
    first_allele_mt, second_allele_mt = (get_mt_allele(mt, position) for position in (0, 1))
    return first_allele_mt.union_cols(second_allele_mt)

In [9]:
# Build the table with two columns per sample (one for each allele of the call).
mt_new = explode_alleles(mt)

In [10]:
# One row per site, with every allele column's entry collected into `entries`.
ht = mt_new.localize_entries("entries", "cols")

# After explode_alleles each entry's GT is a single allele index, which is a
# category rather than a quantity: two alleles either differ (1) or match (0).
# Counting inequality instead of abs(index difference) keeps a pair such as
# allele 0 vs allele 2 from being counted as two differences at multi-allelic
# sites; for bi-allelic sites the result is unchanged.
ht = ht.annotate(sum_of_pairwise_diffs = hl.sum(
        hl.range(0, hl.len(ht.entries))
            .flatmap(
                lambda i: hl.range(0, i).map(
                    lambda j: hl.int32(ht.entries[i].GT != ht.entries[j].GT)))))
# Normalise by the number of unordered pairs, n * (n - 1) / 2.
ht = ht.annotate(avg_pairwise_diff=ht.sum_of_pairwise_diffs/(hl.len(ht.entries) * (hl.len(ht.entries) - 1) / 2))

In [11]:
%%time
# Mean over all sites of the per-site *sum* of pairwise differences
# (the un-normalised total, not avg_pairwise_diff).
ht.aggregate(hl.agg.mean(ht.sum_of_pairwise_diffs))

CPU times: user 12.7 ms, sys: 3.56 ms, total: 16.2 ms
Wall time: 21.5 s


271.69273460335575

## Average pairwise differences per gene

In [12]:
# Extract a gene name from every annotation string in info.ANN: the string is
# split on '|' and the element at index 3 is kept.
# The delimiter is a regex, so the pipe is escaped; the raw string carries that
# backslash explicitly instead of relying on Python passing the invalid escape
# sequence '\|' through unchanged (which triggers a warning).
ht = ht.annotate(gene=ht.info.ANN.map(lambda x: x.split(r'\|')[3]))
# Use the gene of the first annotation as the gene of the site.
ht = ht.annotate(gene1=ht.gene[0])

In [13]:
# Average the per-site values within each gene.
sites_by_gene = ht.group_by(ht.gene1)
gene_stats = sites_by_gene.aggregate(pi=hl.agg.mean(ht.avg_pairwise_diff))

In [14]:
%%time
# Summary statistics of the per-gene pi from the brute-force pairwise computation.
gene_stats.pi.summarize()

2020-12-07 11:45:52 Hail: INFO: Ordering unsorted dataset with network shuffle


0,1
Non-missing,20200 (100.00%)
Missing,0
Minimum,0.00
Maximum,0.97
Mean,0.05
Std Dev,0.05


CPU times: user 29.4 ms, sys: 4.79 ms, total: 34.2 ms
Wall time: 27.4 s


In [15]:
# Distribution of per-gene pi, binned into 100 bins over [0, 0.26].
hist_options = dict(range=(0, 0.26), bins=100,
                    title='Histogram of average pairwise differences per gene')
p = hl.plot.histogram(gene_stats.pi, **hist_options)
p.plot_width, p.plot_height = 800, 500
show(p)

2020-12-07 11:46:18 Hail: INFO: Ordering unsorted dataset with network shuffle


## A less stupid way of computing nucleotide diversity

We need the number of alleles which is $2 N$ where $N$ is the number of samples.

In [16]:
# Two alleles per sample (column).
sample_count = mt.count_cols()
n_alleles = 2 * sample_count
# Per-site alternate-allele count, read from the first element of info.AC.
# NOTE(review): assumes info.AC matches the samples in this table and that no
# genotypes are missing - confirm.
mt = mt.annotate_rows(n_alt=mt.info.AC[0])

For a **single biallelic site**, the number of pairwise differences between sequences is $m (n - m)$, where $n = 2N$ is the number of alleles and $m$ is the count of one of the two alleles (e.g. the alternate allele). The average number of pairwise differences is then obtained by dividing this by the total number of comparisons:

$$
\pi = \frac{m (n - m)}{n (n - 1) / 2}
$$

In [17]:
# Closed form per site: (alt count * non-alt count) pairs differ, out of all
# unordered pairs of alleles.
allele_pair_count = n_alleles * (n_alleles - 1) / 2
mt = mt.annotate_rows(pi=mt.n_alt * (n_alleles - mt.n_alt) / allele_pair_count)

Annotate the `MatrixTable` with gene names. Where there are multiple transcripts, the first one is chosen.

In [18]:
# Extract a gene name from every annotation string in info.ANN: the string is
# split on '|' and the element at index 3 is kept.
# The delimiter is a regex, so the pipe is escaped; the raw string carries that
# backslash explicitly instead of relying on Python passing the invalid escape
# sequence '\|' through unchanged (which triggers a warning).
mt = mt.annotate_rows(gene=mt.info.ANN.map(lambda x: x.split(r'\|')[3]))
# Use the gene of the first annotation as the gene of the site.
mt = mt.annotate_rows(gene1=mt.gene[0])

Group the sites by gene, and compute the average pairwise differences for each gene.

In [19]:
# Work on the row (site) table only; the entries are no longer needed.
rows = mt.rows()
rows_by_gene = rows.group_by(rows.gene1)
# Per gene: mean site-level pi, number of sites, and mean alternate-allele count.
gene_stats = rows_by_gene.aggregate(
    pi=hl.agg.mean(rows.pi),
    n_sites=hl.agg.count(),
    avg_ac=hl.agg.mean(rows.info.AC[0]),
)

In [20]:
# Summary statistics of the per-gene pi from the closed-form computation.
gene_stats.pi.summarize()

2020-12-07 11:46:23 Hail: INFO: Ordering unsorted dataset with network shuffle


0,1
Non-missing,20200 (100.00%)
Missing,0
Minimum,0.00
Maximum,0.51
Mean,0.05
Std Dev,0.05


In [21]:
# Distribution of per-gene pi (closed form), binned into 100 bins over [0, 0.30].
pi_range = (0, 0.30)
plot_title = 'Histogram of average pairwise differences per gene'
p = hl.plot.histogram(gene_stats.pi, range=pi_range, bins=100, title=plot_title)
p.plot_width, p.plot_height = 800, 500
show(p)

2020-12-07 11:46:27 Hail: INFO: Ordering unsorted dataset with network shuffle


In [22]:
# Keep only the genes whose pi lies strictly inside (0.0205, 0.021).
# NOTE(review): presumably isolates a spike seen in the histogram - confirm.
pi_in_window = (gene_stats.pi > 0.0205) & (gene_stats.pi < 0.021)
gene_stats_filtered = gene_stats.filter(pi_in_window)

In [26]:
# Scratch arithmetic; its execution count (26) shows it was run after the cells below.
1/50

0.02

In [23]:
# pi of the genes that fall inside the filter window.
gene_stats_filtered.pi.summarize()

2020-12-07 11:46:30 Hail: INFO: Ordering unsorted dataset with network shuffle


0,1
Non-missing,6002 (100.00%)
Missing,0
Minimum,0.02
Maximum,0.02
Mean,0.02
Std Dev,0.00


In [24]:
# Number of sites per gene among the filtered genes.
gene_stats_filtered.n_sites.summarize()

2020-12-07 11:46:34 Hail: INFO: Ordering unsorted dataset with network shuffle


0,1
Non-missing,6002 (100.00%)
Missing,0
Minimum,1
Maximum,28
Mean,3.62
Std Dev,3.09


In [25]:
# Mean alternate-allele count per gene among the filtered genes.
gene_stats_filtered.avg_ac.summarize()

2020-12-07 11:46:37 Hail: INFO: Ordering unsorted dataset with network shuffle


0,1
Non-missing,6002 (100.00%)
Missing,0
Minimum,1.00
Maximum,95.00
Mean,1.05
Std Dev,1.95
