## Setup

Sets up the environment for the Ag1000G Selection Atlas: library imports, plotting defaults, data-release handles and shared genome-plotting helpers.

In [1]:
%%HTML
<style type="text/css">
.container {
    width: 100%;
}
</style>

In [2]:
# python standard library
import sys
import os
import operator
import itertools
import collections
import functools
import glob
import csv
import datetime
import bisect
import sqlite3
import subprocess
import random
import gc
import shutil
import shelve
import contextlib
import tempfile
import math

In [3]:
# plotting setup
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.gridspec import GridSpec
import seaborn as sns
sns.set_context('paper')
sns.set_style('darkgrid')
# use seaborn defaults
rcParams = plt.rcParams
# 'savefig.jpeg_quality' was deprecated in matplotlib 3.3 and later removed.
# Assigning an unknown rcParams key raises KeyError, so only request maximum
# JPEG quality where the installed matplotlib still recognises the setting.
if 'savefig.jpeg_quality' in rcParams:
    rcParams['savefig.jpeg_quality'] = 100

In [4]:
%matplotlib inline
%config InlineBackend.figure_formats = {'retina', 'png'}

In [5]:
# general purpose third party packages
import numpy as np
nnz = np.count_nonzero
import scipy
import scipy.stats
import scipy.spatial.distance
import numexpr
import h5py
import tables
import bcolz
import dask
import dask.array as da
import pandas as pd
import IPython
from IPython.display import clear_output, display, HTML
import sklearn
import sklearn.decomposition
import sklearn.manifold
import petl as etl
etl.config.display_index_header = True
import humanize
from humanize import naturalsize, intcomma, intword
import zarr
from scipy.stats import entropy
import lmfit

In [6]:
import allel

In [7]:
# Make the shared report-base helper modules importable ahead of anything else
# on the path.
sys.path.insert(0, '../agam-report-base/src/python')
# NOTE(review): star import -- presumably the source of helpers used further
# down such as get_geneset_features and plot_genes; confirm against util.py.
from util import *
import zcache
import veff
import hapclust
# autoreload mode 1: only modules registered with %aimport (here: rockies) are
# reloaded before each cell runs.
%reload_ext autoreload
%autoreload 1
%aimport rockies

# Local mirror of the Ag1000G phase 1 data; each release module is pointed at
# its own sub-directory via init().
ag1k_dir = '../ngs.sanger.ac.uk/production/ag1000g/phase1'
from ag1k import phase1_ar3
phase1_ar3.init(os.path.join(ag1k_dir, 'AR3'))

from ag1k import phase1_ar31
phase1_ar31.init(os.path.join(ag1k_dir, 'AR3.1'))

In [8]:
# Point the selection-scan module at its release directory under ag1k_dir
# (defined in the previous cell).
from ag1k import phase1_selection
phase1_selection.init(os.path.join(ag1k_dir, 'selection.1.RC2'))

In [9]:
def _gene_rows(chrom):
    """Return the 'gene' features of one chromosome with the ID attribute unpacked."""
    features = get_geneset_features(phase1_ar3.geneset_agamp42_fn, chrom)
    return features.eq('type', 'gene').unpackdict('attributes', ['ID'])


# One table of gene features covering every entry of `chromosomes`.
tbl_genes = etl.cat(*map(_gene_rows, chromosomes))
tbl_genes

0|seqid,1|source,2|type,3|start,4|end,5|score,6|strand,7|phase,8|ID
2R,VectorBase,gene,6577,7851,.,+,.,AGAP001096
2R,VectorBase,gene,13754,15149,.,+,.,AGAP013094
2R,VectorBase,gene,18554,19549,.,+,.,AGAP001097
2R,VectorBase,gene,20368,21901,.,+,.,AGAP001098
2R,VectorBase,gene,21882,31394,.,-,.,AGAP001099


In [10]:
# Index the gene table by its ID column so a gene ID resolves to one record.
lkp_gene = etl.recordlookupone(tbl_genes, 'ID')
lkp_gene['AGAP004707']

('2L', 'VectorBase', 'gene', 2358158, 2431617, '.', '+', '.', 'AGAP004707')

In [11]:
def plot_list_genes_track(namespace, chrom, ax, gene_labels, plot=True, x_loc=None, **kwargs):
    """Draw the gene track for one chromosome and mark selected genes on it.

    Parameters
    ----------
    namespace : object
        Provides ``genome`` (indexable by chromosome name, values support
        ``len``) and ``geneset_agamp42_fn``; both are passed to ``plot_genes``.
    chrom : str
        Chromosome to draw.
    ax : matplotlib axes
        Axes to draw on.
    gene_labels : mapping or None
        Maps gene ID to the text used to annotate that gene. IDs are resolved
        through the notebook-level ``lkp_gene`` lookup, so an unknown ID raises
        ``KeyError``. Genes located on a different chromosome are skipped.
        ``None`` is treated as "no labels".
    plot : bool, optional
        If true, draw every gene on the chromosome with ``plot_genes``.
    x_loc : iterable of (chrom, pos) pairs, optional
        Additional positions to flag with a filled upward triangle.
    **kwargs
        Accepted and ignored (for example the ``clip_patch`` keyword that
        ``fig_linear_genome`` passes to its plotting callback).
    """
    if gene_labels is None:
        gene_labels = {}

    # Same axis styling for every chromosome: no left spine, no y ticks/label.
    sns.despine(ax=ax, left=True, offset=5)
    if plot:
        plot_genes(namespace.genome, namespace.geneset_agamp42_fn,
                   chrom, ax=ax, height=.2, label=False,
                   barh_kwargs=dict(lw=0.1, alpha=.2))
    ax.set_yticks([])
    ax.set_ylabel('')
    if chrom == 'X':
        # Only the X panel carries the axis captions; the label coordinates put
        # them beyond the right edge of the axes (x = 1.1 in axes units).
        ax.set_xlabel('Position (Mbp)', ha='left')
        ax.xaxis.set_label_coords(1.1, -.87, transform=ax.transAxes)
        ax.set_ylabel('Genes', ha='left', va='center', rotation=0)
        ax.yaxis.set_label_coords(1.1, 0.5, transform=ax.transAxes)

    # Mark each requested gene at its midpoint: '+' strand genes near the top
    # with a downward triangle, all others near the bottom with an upward one.
    for gid, label in gene_labels.items():
        rec = lkp_gene[gid]
        if rec.seqid != chrom:
            continue
        x = (rec.start + rec.end) / 2
        if rec.strand == '+':
            y, marker, yt = .9, 'v', -1
        else:
            y, marker, yt = .1, '^', -4
        ax.plot([x], [y], marker=marker, mfc='w', mec='k')
        ax.annotate(label, xy=(x, y), xytext=(5, yt), textcoords='offset points',
                    fontsize=7, fontstyle='italic')

    # Extra loci are drawn as filled triangles on the bottom row.
    if x_loc is not None:
        for _chrom, _pos in x_loc:
            if _chrom == chrom:
                ax.plot([_pos], [.1], marker='^', mfc='k', mec='k')

    # Ticks every 10 Mbp, labelled in Mbp.
    chrom_len = len(namespace.genome[chrom])
    xticks = np.arange(0, chrom_len, 10000000)
    if chrom in {'3R', '2R'}:
        # Last tick dropped on the right arms -- presumably so its label does
        # not run into the neighbouring panel; TODO confirm.
        xticks = xticks[:-1]
    ax.set_xticks(xticks)
    ax.set_xticklabels(xticks // 1000000)
    ax.set_xlim(0, chrom_len)
    ax.set_ylim(0, 1)

In [12]:
def fig_linear_genome(plotf, genome, chromosomes=None, fig=None,
                      bottom=0, height=1, width_factor=1.08, chrom_pad=0.035,
                      clip_patch_kwargs=None, **kwargs):
    """Lay chromosomes out left to right in one figure and plot onto each.

    One axes is created per chromosome, with a width proportional to the
    chromosome's length. For each axes a polygonal clip patch (in axes
    coordinates) is built and handed to ``plotf``.

    Parameters
    ----------
    plotf : callable
        Called as ``plotf(chrom=..., ax=..., clip_patch=..., **kwargs)`` once
        per chromosome.
    genome : mapping
        Maps chromosome name to a sequence supporting ``len``.
    chromosomes : sequence of str, optional
        Chromosomes to draw, in order. Defaults to 2R, 2L, 3R, 3L, X.
    fig : matplotlib figure, optional
        Figure to add axes to; a new 8x1 inch figure is created if omitted.
    bottom, height : float
        Vertical placement of every axes, in figure coordinates.
    width_factor : float
        Divisor applied to the total genome size when computing axes widths.
    chrom_pad : float
        Horizontal gap inserted after 2L and after 3L.
    clip_patch_kwargs : dict, optional
        Extra keyword arguments for the clip ``PathPatch``. Defaults of
        edgecolor 'k', facecolor 'none' and lw 1 are filled in on a copy; the
        caller's dict is left untouched.
    **kwargs
        Passed through to ``plotf``.

    Returns
    -------
    dict
        Maps chromosome name to the axes created for it.
    """
    from matplotlib.patches import PathPatch
    from matplotlib.path import Path

    if chromosomes is None:
        chromosomes = ['2R', '2L', '3R', '3L', 'X']
    genome_size = sum(len(genome[chrom]) for chrom in chromosomes)

    if fig is None:
        fig = plt.figure(figsize=(8, 1))

    # Copy before applying defaults so a dict reused by the caller is not
    # silently modified.
    clip_patch_kwargs = dict(clip_patch_kwargs or {})
    clip_patch_kwargs.setdefault('edgecolor', 'k')
    clip_patch_kwargs.setdefault('facecolor', 'none')
    clip_patch_kwargs.setdefault('lw', 1)

    # Every outline below is a closed polygon with the same drawing codes.
    codes = [Path.MOVETO] + [Path.LINETO] * 5 + [Path.CLOSEPOLY]

    axs = dict()
    left = 0
    for chrom in chromosomes:

        # width needed for this chromosome, as a fraction of the figure
        width = len(genome[chrom]) / (genome_size * width_factor)

        # create axes with a fully transparent background; set_axis_bgcolor
        # only exists on old matplotlib releases, set_facecolor replaces it
        ax = fig.add_axes([left, bottom, width, height])
        if hasattr(ax, 'set_facecolor'):
            ax.set_facecolor((1, 1, 1, 0))
        else:
            ax.set_axis_bgcolor((1, 1, 1, 0))
        axs[chrom] = ax

        # clip outline: tapered on the right for 2R/3R/X, on the left otherwise
        if chrom in {'2R', '3R'}:
            verts = [(0.01, 0.02), (0.9, 0.02), (1.01, 0.3), (1.01, 0.7), (0.9, .98), (0.01, .98), (0.01, 0.02)]
        elif chrom == "X":
            verts = [(0.01, 0.02), (0.9, 0.02), (0.99, 0.3), (0.99, 0.7), (0.9, .98), (0.01, .98), (0.01, 0.02)]
        else:
            verts = [(0.1, 0.02), (.99, 0.02), (.99, .98), (.1, .98), (-0.01, .7), (-0.01, .3), (0.1, 0.02)]
        clip_patch = PathPatch(Path(verts, codes), transform=ax.transAxes, **clip_patch_kwargs)

        # do the plotting
        plotf(chrom=chrom, ax=ax, clip_patch=clip_patch, **kwargs)

        # advance to the next panel, leaving a gap after each left arm
        left += width
        if chrom in {'2L', '3L'}:
            left += chrom_pad

    return axs

In [13]:
# Chromosome arms in plotting order: the four autosomal arms, then X.
autosomes = ('2R', '2L', '3R', '3L')
chromosomes = autosomes + ('X',)


class GenomeFigure(object):
    """Figure with one subplot per chromosome, arranged two per row.

    Subplots in the left column have their x range right-aligned on the
    chromosome end and keep the left spine; subplots in the right column start
    at 0 and keep the right spine. All subplots span the same number of bases
    (the length of the longest chromosome), so x scales are comparable.

    Attributes set by the constructor: ``chromosomes`` (names, in order),
    ``fig`` (the figure) and ``ax`` (dict mapping chromosome name to axes).
    """

    def __init__(self, genome, *args, **kwargs):
        """Create the figure and one styled axes per chromosome.

        Parameters
        ----------
        genome : mapping
            Maps chromosome name to a sequence supporting ``len``.
        chromosomes : sequence of str, keyword only, optional
            Chromosomes to include. Defaults to 2R, 2L, 3R, 3L, X.
        *args, **kwargs
            Everything else is passed to ``plt.figure``.
        """
        self.chromosomes = kwargs.pop('chromosomes', ['2R', '2L', '3R', '3L', 'X'])
        # len() avoids copying each whole sequence into an array just to
        # measure it, and matches how the other genome helpers measure length.
        sizes = {chrom: len(genome[chrom]) for chrom in self.chromosomes}
        maxchrsize = max(sizes.values())
        # Keep the original three-row grid, growing it only when more than six
        # chromosomes are requested.
        nrows = max(3, (len(self.chromosomes) + 1) // 2)
        fig = plt.figure(*args, **kwargs)
        self.fig = fig
        self.ax = dict()
        for i, chrom in enumerate(self.chromosomes):
            ax = fig.add_subplot(nrows, 2, i + 1)
            self.ax[chrom] = ax
            size = sizes[chrom]
            if i % 2 == 1:
                # right-hand column: starts at 0, y axis on the right
                sns.despine(ax=ax, offset=10, top=True, left=True, right=False)
                ax.set_xlim(0, maxchrsize)
                ax.yaxis.tick_right()
                ax.yaxis.set_label_position('right')
            else:
                # left-hand column: ends at the chromosome end, y axis on the left
                ax.set_xlim(size - maxchrsize, size)
                ax.yaxis.tick_left()
                sns.despine(ax=ax, offset=10, top=True, left=False, right=True)
            # Ticks every 5 Mbp, labelled in Mbp. Labels are derived from the
            # tick positions so the two sequences always have the same length.
            ticks = list(range(0, size, 5000000))
            ax.set_xticks(ticks)
            ax.set_xticklabels([tick // 1000000 for tick in ticks])
            ax.set_title(chrom, fontweight='bold')
            ax.xaxis.tick_bottom()
        fig.tight_layout()

    def apply(self, f, **kwargs):
        """Call ``f(chrom, ax, **kwargs)`` for each chromosome's axes.

        A ``chromosomes`` keyword restricts (and orders) the chromosomes
        visited; it is not forwarded to ``f``.
        """
        chromosomes = kwargs.pop('chromosomes', self.chromosomes)
        for chrom in chromosomes:
            ax = self.ax[chrom]
            f(chrom, ax, **kwargs)
        
        
def subplots(*args, **kwargs):
    """Wrap ``plt.subplots`` and despine every axes it creates (offset 10).

    All arguments are forwarded to ``plt.subplots``. Returns the same
    ``(fig, ax)`` pair that ``plt.subplots`` returned; ``ax`` is a single axes
    or an array of axes depending on the requested grid.
    """
    fig, ax = plt.subplots(*args, **kwargs)
    # plt.subplots returns an ndarray of axes for multi-panel grids; despine
    # has to be applied to each axes individually in that case.
    axes = ax.ravel() if isinstance(ax, np.ndarray) else [ax]
    for each_ax in axes:
        sns.despine(ax=each_ax, offset=10)
    return fig, ax