# Linkage Disequilibrium

In [1]:
import hail as hl
# Initialise Hail with its default settings.
hl.init()

Running on Apache Spark version 2.4.6
SparkUI available at http://hms-beagle-5466c684ff-2l8nm:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.58-3f304aae6ce2
LOGGING: writing to /home/olavur/experiments/2020-11-13_fargen1_exome_analysis/fargen-1-exome/notebooks/hail-20201217-0900-0.2.58-3f304aae6ce2.log


In [2]:
import numpy as np
from bokeh.io import show, output_notebook, output_file
from bokeh.plotting import figure, ColumnDataSource
from bokeh.layouts import gridplot
from bokeh.models import FuncTickFormatter, ColorBar, LinearColorMapper, BasicTicker
# Direct Bokeh output to the notebook so that show() renders plots inline.
output_notebook()

## Function for calculating and plotting the LD matrix

In [3]:
def ld_matrix_plot(mt, annotate, radius=2e6):
    """Compute the LD matrix of ``mt`` and return it as a Bokeh heatmap.

    Parameters
    ----------
    mt : hail.MatrixTable
        Matrix table with a ``GT`` entry field and a ``locus`` row field.
        Every row (site) of ``mt`` becomes one row/column of the LD matrix.
    annotate : dict
        Maps a tooltip label to a row-indexed Hail expression of ``mt``
        (e.g. ``{'pos': mt.locus.position}``). Each expression is collected
        and shown in the hover tooltip for the site on the x-axis. The label
        is used as a data-source column name and referenced as ``@label``.
    radius : float, optional
        Window radius handed to ``hl.ld_matrix``; LD is only computed for
        pairs of sites within this distance of each other. Defaults to
        ``2e6``, the value that used to be hard-coded.

    Returns
    -------
    The Bokeh figure containing the LD heatmap, a colour bar, and axes
    labelled with the genomic position of the sites. The caller is
    responsible for sizing and showing it.
    """
    # Calculate the (windowed) LD matrix and convert it to a numpy array.
    ld = hl.ld_matrix(mt.GT.n_alt_alleles(), mt.locus, radius=radius).to_numpy()

    # Number of sites; the LD matrix is nn x nn.
    nn = ld.shape[0]

    # Data source for the Bokeh image plot.
    # The image spans [0, nn] on both axes, so that the cell of site i covers
    # exactly [i, i + 1) and floor(coordinate) is the matrix index. (Using
    # nn - 1 here would squeeze nn cells into nn - 1 axis units.)
    data = dict(
        image=[ld],
        x=[0],
        y=[0],
        dw=[nn],
        dh=[nn],
    )

    # Tooltips to show on mouse hover.
    tooltips = [
        ("(x,y)", "($x, $y)"),
        ("rho", "@image"),
    ]

    for name, exprs in annotate.items():
        values = exprs.collect()
        # Repeat the per-site values on every image row, so that the value
        # looked up under the cursor is the one of the x-axis site.
        data[name] = [np.array([values] * nn)]
        # Add the tooltip.
        tooltips.append((name, '@' + name))

    source = ColumnDataSource(data=data)

    p = figure(tooltips=tooltips)

    # Add a colour bar; nanmin/nanmax ignore NaN entries of the LD matrix.
    color_mapper = LinearColorMapper(palette="Viridis256", low=np.nanmin(ld), high=np.nanmax(ld))
    color_bar = ColorBar(color_mapper=color_mapper, label_standoff=5, border_line_color=None, location=(0, 0))
    p.add_layout(color_bar, 'right')

    # Provide the image, the annotations, and other parameters.
    p.image(source=source, image='image', x='x', y='y', dw='dw', dh='dh', color_mapper=color_mapper)

    # Remove the padding at the sides of the plot.
    p.x_range.range_padding = p.y_range.range_padding = 0

    # Use the genomic position of the site as the axis tick label.
    # Ticks are axis coordinates, which are not guaranteed to be integers
    # (e.g. after zooming) nor to lie inside [0, nn). Map each tick to the
    # site whose cell contains it and leave out-of-range ticks unlabelled,
    # instead of indexing the position array with the raw tick value.
    pos = mt.locus.position.collect()
    pos_tick_format = FuncTickFormatter(args={'pos': pos}, code="""
        var idx = Math.floor(tick);
        if (idx < 0 || idx >= pos.length) {
            return "";
        }
        return "" + pos[idx];
    """)
    p.xaxis.formatter = pos_tick_format
    p.yaxis.formatter = pos_tick_format

    return p

## Load data

These data are filtered to contain only high quality VQSR variants.

In [4]:
# Location of the pre-filtered MatrixTable on disk.
filtered_mt_path = '/home/olavur/experiments/2020-11-13_fargen1_exome_analysis/data/mt/filtered.mt'
mt = hl.read_matrix_table(filtered_mt_path)
# Report (number of rows, number of columns), i.e. (sites, samples).
mt.count()

(185771, 48)

## Filter data

Keep only diallelic sites.

In [5]:
# A site is kept when it lists exactly two alleles.
is_diallelic = hl.len(mt.alleles) == 2
mt = mt.filter_rows(is_diallelic)
mt.count()

(176834, 48)

Remove invariable sites, as we cannot calculate LD between such sites. In this context, invariable means that dosage (number of alternate alleles) is constant over all samples in a single site.

First we count the sites to be removed.

In [6]:
# Number of heterozygous calls per site. This row field is also what the
# filtering step uses to drop sites where every sample is heterozygous.
mt = mt.annotate_rows(n_het=hl.agg.sum(mt.GT.is_het()))

# count_cols() launches its own job, so evaluate it once and reuse the value.
n_samples = mt.count_cols()

# Count the three kinds of invariant site in a single pass over the rows,
# rather than running one aggregation per kind.
# NOTE(review): info.AC is read from the INFO field rather than recomputed
# from GT; presumably it matches the samples in this matrix table -- confirm.
invariant_counts = mt.aggregate_rows(hl.struct(
    all_0=hl.agg.count_where(mt.info.AC[0] == 0),              # every call hom.ref.
    all_1=hl.agg.count_where(mt.n_het == n_samples),           # every call het.
    all_2=hl.agg.count_where(mt.info.AC[0] == 2 * n_samples),  # every call hom.alt.
))
all_0 = invariant_counts.all_0
all_1 = invariant_counts.all_1
all_2 = invariant_counts.all_2

print('Dosage\tSite count')
for dosage, site_count in enumerate((all_0, all_1, all_2)):
    print('{d}\t{count}'.format(d=dosage, count=site_count))

Dosage	Site count
0	0
1	136
2	186


There are no hom.ref. sites because these were removed in the pipeline.

Let's remove all invariant sites.

In [7]:
# Look the sample count up once; it is needed by two of the three conditions.
n_cols = mt.count_cols()
max_alt_count = 2 * n_cols

# A site is invariant when every sample carries the same dosage.
not_all_hom_ref = mt.info.AC[0] != 0              # dosage 0 everywhere
not_all_het = mt.n_het != n_cols                  # dosage 1 everywhere
not_all_hom_alt = mt.info.AC[0] != max_alt_count  # dosage 2 everywhere

# Keep only the sites that pass all three checks.
mt = mt.filter_rows(not_all_hom_ref & not_all_het & not_all_hom_alt)

mt.count()

(176512, 48)

Remove variants that fail the Hardy–Weinberg equilibrium (HWE) test.

In [8]:
# Annotate every site with the result of the Hardy-Weinberg test on its calls.
mt = mt.annotate_rows(hwe=hl.agg.hardy_weinberg_test(mt.GT))

# NOTE(review): 10e-8 is 1e-7, not 1e-8 -- confirm this is the intended cutoff.
hwe_min_p_value = 10e-8
passes_hwe = mt.hwe.p_value > hwe_min_p_value
mt = mt.filter_rows(passes_hwe)
mt.count()

(175832, 48)

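Remove singletons, i.e. sites where the alternate allele (or, symmetrically, the reference allele) is observed only once.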

In [9]:
n_samples = mt.count_cols()
alt_count = mt.info.AC[0]

# Drop sites where one of the two alleles is seen at most once:
# alt count of 1 or less, or alt count of 2 * n_samples - 1 or more.
alt_seen_more_than_once = alt_count > 1
ref_seen_more_than_once = alt_count < 2 * n_samples - 1
mt = mt.filter_rows(alt_seen_more_than_once & ref_seen_more_than_once)
mt.count()

(68473, 48)

Remove samples with low average genotype quality.

In [10]:
# Add the per-sample QC metrics as the `sample_qc` column field.
mt = hl.sample_qc(mt)

# Keep the samples whose mean genotype quality exceeds the threshold.
min_mean_gq = 30
mean_gq = mt.sample_qc.gq_stats.mean
mt = mt.filter_cols(mean_gq > min_mean_gq)
mt.count()

(68473, 47)

## LD matrix

We will compute the LD matrix for the MHC, which is located at chromosome band 6p21.3.

Chr | Start | Stop | Cytoband
----|-------|------|---------
chr6 | 30500000 | 32100000 | p21.33
chr6 | 32100000 | 33500000 | p21.32
chr6 | 33500000 | 36600000 | p21.31

Remove all variants outside this region.

In [11]:
# Region of interest; both bounds are treated as exclusive below.
contig, start, stop = 'chr6', 30500000, 36600000

# Only keep the loci that lie strictly inside the region.
on_contig = mt.locus.contig == contig
in_region = on_contig & (mt.locus.position > start) & (mt.locus.position < stop)
mt_mhc = mt.filter_rows(in_region)

n_variants = mt_mhc.count_rows()
len_mhc = stop - start

summary = 'The MHC region is {len_mhc} basepairs long and our samples contain {n_variants} in this region.'
print(summary.format(len_mhc=len_mhc, n_variants=n_variants))

The MHC region is 6100000 basepairs long and our samples contain 226 in this region.


In [12]:
# Row fields to show in the hover tooltip, keyed by tooltip label.
tooltip_annotations = {'pos': mt_mhc.locus.position, 'rsid': mt_mhc.rsid}
p = ld_matrix_plot(mt_mhc, tooltip_annotations)
# Size of the figure in pixels.
p.plot_width, p.plot_height = 700, 500

2020-12-17 09:01:21 Hail: INFO: Wrote all 1 blocks of 226 x 47 matrix with block size 4096.


In [13]:
# Display the LD heatmap built in the previous cell.
show(p)