# Relatedness pruning

In [1]:
# Hail supplies the matrix-table data structures and genetics methods used in every cell below.
import hail as hl
# Start the Hail session; this must run before any other Hail call.
hl.init()

Running on Apache Spark version 2.4.6
SparkUI available at http://hms-beagle-5466c684ff-d8mgh:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.58-3f304aae6ce2
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/hail-20201120-1410-0.2.58-3f304aae6ce2.log


In [2]:
# Bokeh is used to display the figures produced by hl.plot further down.
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
# Direct Bokeh output to the notebook so that show() renders figures inline.
output_notebook()

In [3]:
# Load the variant matrix table; the path is relative, so it is resolved
# against the directory the notebook is run from.
mt = hl.read_matrix_table('data/mt/variants.mt')

## LD pruning

Filter out multi-allelic sites and prune variants in linkage disequilibrium.

In [4]:
# Step 1: restrict to sites with exactly two alleles before LD pruning.
is_biallelic = hl.len(mt.alleles) == 2
mt_biallelic = mt.filter_rows(is_biallelic)

# Step 2: LD-prune the bi-allelic genotypes (r^2 threshold 0.2, 500 kb window).
pruned_variant_table = hl.ld_prune(
    mt_biallelic.GT,
    r2=0.2,
    bp_window_size=500000,
)

# Step 3: keep only the rows of the original table whose key is present in
# the pruned variant table.
# NOTE(review): the lookup is against `mt`, not `mt_biallelic`; multi-allelic
# rows are presumably dropped because their keys are absent from the pruned
# table - confirm.
survived_pruning = hl.is_defined(pruned_variant_table[mt.row_key])
mt_filtered = mt.filter_rows(survived_pruning)

2020-11-20 14:21:57 Hail: INFO: ld_prune: running local pruning stage with max queue size of 1082402 variants
2020-11-20 14:22:28 Hail: INFO: wrote table with 168790 rows in 5 partitions to /tmp/nhiyxOOVTjLcQD7I5Ev5G3
    Total size: 3.42 MiB
    * Rows: 3.42 MiB
    * Globals: 11.00 B
    * Smallest partition: 31926 rows (659.68 KiB)
    * Largest partition:  36679 rows (764.25 KiB)
2020-11-20 14:23:45 Hail: INFO: Wrote all 42 blocks of 168790 x 48 matrix with block size 4096.
2020-11-20 14:44:16 Hail: INFO: wrote table with 203 rows in 83 partitions to /tmp/jOEGBIq5Y4kTythq9GeNDM
    Total size: 1.33 MiB
    * Rows: 5.00 KiB
    * Globals: 1.33 MiB
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  32 rows (388.00 B)


## Identity-by-descent calculation

In [7]:
# Estimate pairwise identity-by-descent using only the LD-pruned variants.
ht_ibd = hl.identity_by_descent(mt_filtered)

In [12]:
# Number of rows in the IBD table. The LD-pruning log reports a 168790 x 48
# matrix; if 48 is the sample count, 1128 = 48 * 47 / 2, i.e. presumably one
# row per unordered sample pair.
ht_ibd.count()

2020-11-20 15:00:16 Hail: INFO: Coerced sorted dataset
2020-11-20 15:02:05 Hail: INFO: Coerced sorted dataset


1128

In [14]:
# Distribution of PI_HAT over the rows of the IBD table, binned into
# 100 bins on the interval [0, 1].
# The histogram is named after the field it summarises (ibd.PI_HAT), and the
# aggregator is accessed through the public `hl.agg` namespace.
pi_hat_hist = ht_ibd.aggregate(hl.agg.hist(ht_ibd.ibd.PI_HAT, 0, 1, 100))
# NOTE(review): the title says "kinship" while the plotted quantity is
# PI_HAT - confirm the wording is what you want on the figure.
p = hl.plot.histogram(pi_hat_hist, legend='PI_HAT', title='Histogram of kinship')
show(p)

2020-11-20 15:12:08 Hail: INFO: Coerced sorted dataset


## Relatedness pruning

In [None]:
# Alternative relatedness pruning based on PC-Relate kinship estimates
# instead of the IBD PI_HAT values used in the next cell.
# Every output carries a `pc_rel_` prefix so this cell no longer shares the
# names `pairs`, `related_samples_to_remove` and `result` with the IBD-based
# pruning, which previously overwrote them silently.
# NOTE(review): this runs on the unpruned `mt`, whereas the IBD-based pruning
# works on `mt_filtered` - confirm that is intended.
# NOTE(review): kinship and PI_HAT are presumably on different scales, so the
# shared 0.125 cut-off need not select the same pairs - verify.
pc_rel = hl.pc_relate(mt.GT, 0.001, k=2, statistics='kin')
pc_rel_pairs = pc_rel.filter(pc_rel['kin'] > 0.125)
# keep=False returns the samples to drop rather than the ones to retain.
pc_rel_samples_to_remove = hl.maximal_independent_set(
    pc_rel_pairs.i, pc_rel_pairs.j, keep=False)
pc_rel_result = mt.filter_cols(
    hl.is_defined(pc_rel_samples_to_remove[mt.col_key]), keep=False)

In [20]:
# Treat a sample pair as related when its PI_HAT exceeds 0.125.
is_related_pair = ht_ibd.ibd.PI_HAT > 0.125
pairs = ht_ibd.filter(is_related_pair)

# With keep=False the call yields the samples that have to be dropped
# (the ones left out of the maximal independent set).
related_samples_to_remove = hl.maximal_independent_set(
    pairs.i,
    pairs.j,
    keep=False,
)

# Remove the flagged samples from the LD-pruned matrix table.
flagged_for_removal = hl.is_defined(related_samples_to_remove[mt_filtered.col_key])
result = mt_filtered.filter_cols(flagged_for_removal, keep=False)

2020-11-20 15:16:42 Hail: INFO: Coerced sorted dataset
2020-11-20 15:16:43 Hail: INFO: wrote table with 430 rows in 1 partition to /tmp/JKAtJBTiE5mooQc59bf16u
    Total size: 2.97 KiB
    * Rows: 2.96 KiB
    * Globals: 11.00 B
    * Smallest partition: 430 rows (2.96 KiB)
    * Largest partition:  430 rows (2.96 KiB)


In [21]:
# Dimensions of the matrix table after LD pruning and removal of related
# samples; presumably (variants, samples).
result.count()

2020-11-20 15:16:57 Hail: INFO: Ordering unsorted dataset with network shuffle


(168618, 20)