# Sample QC

In [54]:
# TODO: filter on allelic balance (allelic depth)

In [2]:
import hail as hl
# Start Hail with its default configuration; the session banner and log path are printed below.
hl.init()

Running on Apache Spark version 2.4.6
SparkUI available at http://hms-beagle-5466c684ff-2l8nm:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.58-3f304aae6ce2
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/hail-20201217-0953-0.2.58-3f304aae6ce2.log


In [3]:
import numpy as np
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
# Send Bokeh output to the notebook; every figure below is displayed with show().
output_notebook()

Load high quality variants.

In [4]:
# Matrix table holding the high quality variants used throughout this notebook.
filtered_mt_path = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis/data/mt/filtered.mt'
mt = hl.read_matrix_table(filtered_mt_path)

In [5]:
# Report the dimensions of the loaded matrix table: rows are variants, columns are samples.
n_sites, n_samples = mt.count()
print(f'{n_sites} variants and {n_samples} samples.')

185771 variants and 48 samples.


## Filter variants

In [8]:
# Keep only variants whose first INFO/AF entry lies strictly between
# maf_filter and 1 - maf_filter, then show the remaining dimensions.
maf_filter = 0.05
first_af = mt.info.AF[0]
mt = mt.filter_rows((first_af > maf_filter) & (first_af < (1 - maf_filter)))
mt.count()

(25712, 48)

In [6]:
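# Disabled: Hardy-Weinberg equilibrium filter.
# NOTE: 10e-8 equals 1e-7; write 1e-8 if that is the intended threshold.
#mt = mt.annotate_rows(hwe=hl.agg.hardy_weinberg_test(mt.GT))
#mt = mt.filter_rows(mt.hwe.p_value > 10e-8)
#mt.count()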

In [7]:
#n_samples = mt.count_cols()
#mt = mt.filter_rows(mt.info.AC[0] > 1)
#mt = mt.filter_rows(mt.info.AC[0] < 2 * n_samples - 1)
#mt.count()

In [9]:
# Sanity check: the smallest remaining AF should sit just above maf_filter.
lowest_af_expr = hl.agg.min(mt.info.AF[0])
mt.aggregate_rows(lowest_af_expr)

0.052

In [10]:
# Sanity check: the largest remaining AF should sit just below 1 - maf_filter.
highest_af_expr = hl.agg.max(mt.info.AF[0])
mt.aggregate_rows(highest_af_expr)

0.948

## Distributions

In [11]:
# Attach per-sample QC metrics as the `sample_qc` column field read by the plots below.
mt = hl.sample_qc(mt, name='sample_qc')

In [12]:
# Histogram of the per-sample mean DP; the plotted range is fixed to 0-100.
mean_dp = mt.sample_qc.dp_stats.mean
dp_hist = hl.plot.histogram(mean_dp, range=(0, 100), legend='Mean DP per sample histogram')
show(dp_hist)

In [13]:
# Histogram of the per-sample mean GQ; the plotted range is fixed to 10-100.
mean_gq = mt.sample_qc.gq_stats.mean
gq_hist = hl.plot.histogram(mean_gq, range=(10, 100), legend='Mean Sample GQ')
show(gq_hist)

In [14]:
# Histogram of the per-sample het / hom-var ratio; the plotted range is fixed to 2-22.
het_hom_ratio = mt.sample_qc.r_het_hom_var
het_hom_hist = hl.plot.histogram(het_hom_ratio, range=(2, 22), legend='Het/hom rate')
show(het_hom_hist)

In [15]:
# Histogram of the per-sample Ti/Tv ratio; the plotted range is fixed to 0.4-1.2.
ti_tv_ratio = mt.sample_qc.r_ti_tv
ti_tv_hist = hl.plot.histogram(ti_tv_ratio, range=(0.4, 1.2), legend='Ti/Tv rate')
show(ti_tv_hist)

In [16]:
# Smallest and largest per-sample singleton counts; used as the histogram range in the next cell.
singleton_count = mt.sample_qc.n_singleton
min_singl = mt.aggregate_cols(hl.agg.min(singleton_count))
max_singl = mt.aggregate_cols(hl.agg.max(singleton_count))

In [17]:
# Histogram of singleton counts per sample, spanning the observed min..max.
singleton_hist = hl.plot.histogram(
    mt.sample_qc.n_singleton,
    range=(min_singl, max_singl),
    legend='Number of singletons per sample',
)
show(singleton_hist)

## Inbreeding

In [18]:
# Attach per-variant QC metrics as the `variant_qc` row field (its AF is used for inbreeding below).
mt = hl.variant_qc(mt, name='variant_qc')

In [19]:
# Restrict to sites with exactly two alleles.
allele_count = hl.len(mt.alleles)
mt_diallelic = mt.filter_rows(allele_count == 2)

In [20]:
# Per-sample inbreeding aggregate stored in the `IB` column field; the frequency
# passed in is the variant_qc AF of the allele at index 1.
second_allele_af = mt_diallelic.variant_qc.AF[1]
mt_diallelic = mt_diallelic.annotate_cols(
    IB=hl.agg.inbreeding(mt_diallelic.GT, second_allele_af)
)
#mt_diallelic.IB.show(width=100)

In [21]:
# Histogram of the per-sample F statistic, spanning the observed min..max.
f_stat = mt_diallelic.IB.f_stat
min_f = mt_diallelic.aggregate_cols(hl.agg.min(f_stat))
max_f = mt_diallelic.aggregate_cols(hl.agg.max(f_stat))
f_hist = hl.plot.histogram(f_stat, range=(min_f, max_f), legend='F (inbreeding)')
show(f_hist)

In [22]:
# F statistic against singleton count, one point per sample; hovering shows the sample ID.
f_vs_singletons = hl.plot.scatter(
    mt_diallelic.sample_qc.n_singleton,
    mt_diallelic.IB.f_stat,
    xlabel='# Singletons',
    ylabel='F',
    hover_fields={'Sample': mt_diallelic.s},
)
f_vs_singletons.plot_width = 700
f_vs_singletons.plot_height = 500
show(f_vs_singletons)

## Relatedness

Use diallelic LD pruned variants.

In [23]:
# Diallelic, LD-pruned variants used for all relatedness estimates below.
ld_pruned_mt_path = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis/data/mt/ld_pruned_diallelic.mt/'
mt_indep = hl.read_matrix_table(ld_pruned_mt_path)
mt_indep.count()

(54738, 48)

In [41]:
# Same MAF window as applied to `mt`: the first INFO/AF entry must lie
# strictly between maf_filter and 1 - maf_filter.
maf_filter = 0.05
indep_first_af = mt_indep.info.AF[0]
mt_indep = mt_indep.filter_rows(
    (indep_first_af > maf_filter) & (indep_first_af < (1 - maf_filter))
)
mt_indep.count()

(10837, 48)

In [42]:
# Sanity check: the smallest remaining AF should sit just above maf_filter.
indep_lowest_af_expr = hl.agg.min(mt_indep.info.AF[0])
mt_indep.aggregate_rows(indep_lowest_af_expr)

0.052

In [43]:
# Sanity check: the largest remaining AF should sit just below 1 - maf_filter.
indep_highest_af_expr = hl.agg.max(mt_indep.info.AF[0])
mt_indep.aggregate_rows(indep_highest_af_expr)

0.948

In [44]:
# PC-Relate estimates on the LD-pruned calls: 10 principal components and a
# minimum individual-specific MAF of 0.01.
rel = hl.pc_relate(mt_indep.GT, min_individual_maf=0.01, k=10)

2020-12-17 10:04:16 Hail: INFO: hwe_normalized_pca: running PCA using 10837 variants.
2020-12-17 10:04:17 Hail: INFO: pca: running PCA with 10 components...
2020-12-17 10:04:20 Hail: INFO: Wrote all 3 blocks of 10837 x 48 matrix with block size 4096.


In [45]:
# Histogram of the PC-Relate kinship coefficients; the plotted range is fixed to -0.05..0.06.
kin_hist = hl.plot.histogram(rel.kin, range=(-0.05, 0.06), legend='Kinship coefficient')
show(kin_hist)

2020-12-17 10:04:20 Hail: INFO: wrote matrix with 11 rows and 10837 columns as 3 blocks of size 4096 to /tmp/pcrelate-write-read-SLbgM069jRaUr3aUGBK65u.bm
2020-12-17 10:04:21 Hail: INFO: wrote matrix with 10837 rows and 48 columns as 3 blocks of size 4096 to /tmp/pcrelate-write-read-yhjYPmkYFfyBtOVtG67Dyc.bm
2020-12-17 10:04:21 Hail: INFO: wrote matrix with 10837 rows and 48 columns as 3 blocks of size 4096 to /tmp/pcrelate-write-read-XrlNyhz2pREywwBKmpoX0r.bm
2020-12-17 10:04:21 Hail: INFO: wrote matrix with 48 rows and 48 columns as 1 block of size 4096 to /tmp/pcrelate-write-read-2SFXbOdqm7QQmjfs4HNKRh.bm
2020-12-17 10:04:21 Hail: INFO: wrote matrix with 48 rows and 48 columns as 1 block of size 4096 to /tmp/pcrelate-write-read-vkpLDlOm3qnEJz2sl2HlpR.bm
2020-12-17 10:04:21 Hail: INFO: wrote matrix with 48 rows and 48 columns as 1 block of size 4096 to /tmp/pcrelate-write-read-jsXG9OaHKJT69O6FxopIVp.bm
2020-12-17 10:04:21 Hail: INFO: wrote matrix with 48 rows and 48 columns as 1 bloc

In [46]:
# PC-Relate ibd0 against ibd1, one point per row of `rel`.
# The axes are labelled after the plotted fields so the figure is readable on
# its own (they were previously set to empty strings).
p = hl.plot.scatter(rel.ibd0, rel.ibd1,
                    xlabel='IBD0', ylabel='IBD1')
p.plot_width = 700
p.plot_height = 500
show(p)

2020-12-17 10:04:22 Hail: INFO: wrote matrix with 11 rows and 10837 columns as 3 blocks of size 4096 to /tmp/pcrelate-write-read-G0loKsJ9mpZ9FqNRTkQCaP.bm
2020-12-17 10:04:22 Hail: INFO: wrote matrix with 10837 rows and 48 columns as 3 blocks of size 4096 to /tmp/pcrelate-write-read-BoXhoaa0v3Nln1NBz3XZRy.bm
2020-12-17 10:04:23 Hail: INFO: wrote matrix with 10837 rows and 48 columns as 3 blocks of size 4096 to /tmp/pcrelate-write-read-JJdekuMAMBFGB9D0I2TmHD.bm
2020-12-17 10:04:23 Hail: INFO: wrote matrix with 48 rows and 48 columns as 1 block of size 4096 to /tmp/pcrelate-write-read-mZW1FQBnc0Y8FyhjgiPjNH.bm
2020-12-17 10:04:23 Hail: INFO: wrote matrix with 48 rows and 48 columns as 1 block of size 4096 to /tmp/pcrelate-write-read-WtOh8feBUs3FP35Vqy43Zg.bm
2020-12-17 10:04:23 Hail: INFO: wrote matrix with 48 rows and 48 columns as 1 block of size 4096 to /tmp/pcrelate-write-read-CtDhJ1BLrNkDY9Aa3DGWU8.bm
2020-12-17 10:04:23 Hail: INFO: wrote matrix with 48 rows and 48 columns as 1 bloc

### IBD

In [47]:
# Pairwise IBD estimates on the LD-pruned dataset; the row count is the number of pairs in the table.
ht_ibd = hl.identity_by_descent(dataset=mt_indep)
ht_ibd.count()

2020-12-17 10:04:40 Hail: INFO: Coerced sorted dataset


1128

In [48]:
# Print the schema of the IBD table (key fields and the `ibd` struct).
ht_ibd.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'i': str 
    'j': str 
    'ibd': struct {
        Z0: float64, 
        Z1: float64, 
        Z2: float64, 
        PI_HAT: float64
    } 
    'ibs0': int64 
    'ibs1': int64 
    'ibs2': int64 
----------------------------------------
Key: ['i', 'j']
----------------------------------------


In [49]:
# Summary statistics of PI_HAT across all pairs in the IBD table.
pi_hat = ht_ibd.ibd.PI_HAT
pi_hat.summarize()

2020-12-17 10:04:48 Hail: INFO: Coerced sorted dataset


0,1
Non-missing,1128 (100.00%)
Missing,0
Minimum,0.35
Maximum,0.74
Mean,0.48
Std Dev,0.04


In [50]:
# Histogram of PI_HAT over all pairs.
# The range is taken from the data rather than hard-coded: the previous fixed
# bounds (0.35, 0.73) came from the rounded summary and left out the largest
# values (the summary reports a maximum of 0.74).
min_pi_hat = ht_ibd.aggregate(hl.agg.min(ht_ibd.ibd.PI_HAT))
max_pi_hat = ht_ibd.aggregate(hl.agg.max(ht_ibd.ibd.PI_HAT))
p = hl.plot.histogram(ht_ibd.ibd.PI_HAT, range=(min_pi_hat, max_pi_hat), legend='pi_hat')
show(p)

2020-12-17 10:04:56 Hail: INFO: Coerced sorted dataset


In [51]:
# Pairs whose PI_HAT is exactly 0.5.
# NOTE(review): this is an exact floating-point comparison; confirm that is
# intended rather than a tolerance-based match.
is_half_pi_hat = ht_ibd.ibd.PI_HAT == 0.5
ht_ibd0_5 = ht_ibd.filter(is_half_pi_hat)

In [52]:
# Number of pairs with PI_HAT exactly equal to 0.5.
ht_ibd0_5.count()

2020-12-17 10:05:04 Hail: INFO: Coerced sorted dataset


877

### King

In [33]:
# KING kinship estimates for every sample pair, from the LD-pruned calls.
# NOTE(review): the saved output was produced at In [33], before the MAF filter
# cell (In [41]); its log reports 54738 variants rather than 10837. Re-run this
# cell after the filter to make it comparable with the pc_relate / IBD results.
mt_king = hl.king(call_expr=mt_indep.GT)
mt_king.count()

2020-12-17 09:57:00 Hail: INFO: Wrote all 14 blocks of 54738 x 48 matrix with block size 4096.
2020-12-17 09:57:03 Hail: INFO: Wrote all 14 blocks of 54738 x 48 matrix with block size 4096.
2020-12-17 09:57:06 Hail: INFO: Wrote all 14 blocks of 54738 x 48 matrix with block size 4096.
2020-12-17 09:57:09 Hail: INFO: Wrote all 14 blocks of 54738 x 48 matrix with block size 4096.
2020-12-17 09:57:09 Hail: INFO: wrote matrix with 48 rows and 48 columns as 1 block of size 4096 to /tmp/nYTwpQz0aQZnboLK9b4LCQ
2020-12-17 09:57:10 Hail: INFO: wrote matrix with 48 rows and 48 columns as 1 block of size 4096 to /tmp/zVXcSJYNR1TSpeNnvXpH5l
2020-12-17 09:57:10 Hail: INFO: wrote matrix with 48 rows and 48 columns as 1 block of size 4096 to /tmp/M5QQ7fuOaDZbNXFWaKAtbA
2020-12-17 09:57:10 Hail: INFO: wrote matrix with 48 rows and 48 columns as 1 block of size 4096 to /tmp/EQHJWI8XsZKhP1HRZRY6tX
2020-12-17 09:57:10 Hail: INFO: wrote matrix with 48 rows and 48 columns as 1 block of size 4096 to /tmp/huj

(48, 48)

In [34]:
# Summary statistics of the KING phi entries.
king_phi = mt_king.phi
king_phi.summarize()

0,1
Non-missing,2304 (100.00%)
Missing,0
Minimum,-0.73
Maximum,0.50
Mean,0.04
Std Dev,0.14


In [35]:
# Histogram of the KING phi entries; the plotted range is fixed to -0.74..0.5.
phi_hist = hl.plot.histogram(mt_king.phi, range=(-0.74, 0.5), legend='phi')
show(phi_hist)

## GRM

In [36]:
# Genetic relatedness matrix from the LD-pruned calls, pulled into a local NumPy array.
# NOTE(review): the saved output was produced at In [36], before the MAF filter
# cell (In [41]); its log reports 54738 variants rather than 10837.
bm_grm = hl.genetic_relatedness_matrix(call_expr=mt_indep.GT)
grm = bm_grm.to_numpy()

2020-12-17 09:57:17 Hail: INFO: Wrote all 14 blocks of 54738 x 48 matrix with block size 4096.


In [37]:
# Shape of the GRM array (the saved run shows one row and one column per sample).
grm.shape

(48, 48)

In [38]:
# Row-wise mean of the GRM, and the extremes of those means (used as the histogram range below).
# NOTE(review): the mean runs over the whole row, diagonal element included;
# confirm whether self-relatedness should be excluded.
avg_rel = grm.mean(axis=1)
rel_min = avg_rel.min()
rel_max = avg_rel.max()

In [39]:
# Make a one-column Hail table holding the row means of the GRM (`avg_rel`),
# so they can be passed to hl.plot.histogram.
# The iteration variable is deliberately not named `rel`: that name holds the
# pc_relate table created in the Relatedness section, and rebinding it here
# would break re-running the cells that plot from it.
table = [{'rel': sample_avg} for sample_avg in avg_rel]

ht = hl.Table.parallelize(hl.literal(table, 'array<struct{rel:float32}>'))

In [40]:
# Histogram of the row means of the GRM, spanning the observed min..max in 10 bins.
# A legend is supplied so this figure is labelled like the other histograms in the notebook.
p = hl.plot.histogram(ht.rel, range=(rel_min, rel_max), bins=10,
                      legend='Mean GRM relatedness per sample')
show(p)