## Setup

In [1]:
%run setup.ipynb
%matplotlib notebook
%reload_ext autoreload
%autoreload 1
%aimport hapclust

In [2]:
# setup callset - needed for variant annotations
callset_pass = phase1_ar31.callset_pass
pos_pass = allel.SortedIndex(callset_pass['2L/variants/POS'])
ann_pass = callset_pass['2L/variants/ANN'][:][['Annotation', 'HGVS_p']]

In [3]:
# setup haplotypes
callset_phased = phase1_ar31.callset_phased
genotypes_phased = allel.GenotypeDaskArray(callset_phased['2L/calldata/genotype'])
pos_phased = allel.SortedIndex(callset_phased['2L/variants/POS'])

In [4]:
pos_kdr_s = 2422651
pos_kdr_f = 2422652

In [5]:
# define region for EHH analysis
loc_region = pos_phased.locate_range(0, 4000000)
pos_phased_region = pos_phased[loc_region]
pos_phased_region

0,1,2,3,4,...,163958,163959,163960,163961,163962
44688,44691,44732,44736,44756,...,3997372,3997373,3997378,3997381,3997386


In [6]:
# locate the intersection - needed to tie in annotations
loc1, _ = pos_pass.locate_intersection(pos_phased_region)
np.count_nonzero(loc1)

163963

In [7]:
ann_phased_region = ann_pass[loc1]
ann_phased_region

array([(b'intergenic_region', b'.'), (b'intergenic_region', b'.'),
       (b'intergenic_region', b'.'), ...,
       (b'downstream_gene_variant', b'.'),
       (b'downstream_gene_variant', b'.'),
       (b'downstream_gene_variant', b'.')], 
      dtype=[('Annotation', 'S34'), ('HGVS_p', 'S14')])

In [8]:
collections.Counter(ann_phased_region['Annotation'])

Counter({b'3_prime_UTR_variant': 2941,
         b'5_prime_UTR_premature_start_codon_': 306,
         b'5_prime_UTR_variant': 1677,
         b'downstream_gene_variant': 18539,
         b'initiator_codon_variant': 3,
         b'intergenic_region': 54849,
         b'intragenic_variant': 48,
         b'intron_variant': 32362,
         b'missense_variant': 5805,
         b'missense_variant&splice_region_var': 70,
         b'splice_acceptor_variant&intron_var': 24,
         b'splice_donor_variant&intron_varian': 27,
         b'splice_region_variant': 36,
         b'splice_region_variant&intron_varia': 649,
         b'splice_region_variant&stop_retaine': 5,
         b'splice_region_variant&synonymous_v': 87,
         b'start_lost': 9,
         b'stop_gained': 37,
         b'stop_lost&splice_region_variant': 4,
         b'stop_retained_variant': 5,
         b'synonymous_variant': 8636,
         b'upstream_gene_variant': 37844})

In [9]:
# Load haplotypes for the region; the last 16 haplotype columns are dropped
# to exclude the cross parents.
haps_phased_region = (
    genotypes_phased[loc_region]
    .to_haplotypes()[:, :-16]
    .compute()
)

In [10]:
ac_phased_region = haps_phased_region.count_alleles(max_allele=1)
ac_phased_region

Unnamed: 0,0,1,Unnamed: 3
0,1529,1,
1,1528,2,
2,1528,2,
...,...,...,...
163960,1527,3,
163961,1529,1,
163962,1527,3,


In [11]:
region_vgsc = SeqFeature('2L', 2358158, 2431617, label='Vgsc')
region_vgsc

<SeqFeature 'Vgsc' 2L:2358158-2431617>

In [12]:
loc_vgsc = pos_phased_region.locate_range(region_vgsc.start, region_vgsc.end)
loc_vgsc

slice(24471, 26181, None)

In [13]:
haps_vgsc = haps_phased_region[loc_vgsc]
haps_vgsc

Unnamed: 0,0,1,2,3,4,...,1525,1526,1527,1528,1529,Unnamed: 12
0,0,0,0,0,0,...,0,0,0,0,0,
1,0,0,0,0,0,...,0,0,0,0,0,
2,0,0,0,0,0,...,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...
1707,0,0,0,0,0,...,0,0,0,0,0,
1708,0,0,0,0,0,...,0,0,0,0,0,
1709,0,0,0,0,0,...,0,0,0,0,0,


In [14]:
ac_vgsc = haps_vgsc.count_alleles(max_allele=1)
ac_vgsc

Unnamed: 0,0,1,Unnamed: 3
0,1529,1,
1,1457,73,
2,1528,2,
...,...,...,...
1707,1513,17,
1708,1525,5,
1709,1519,11,


In [15]:
ann_vgsc = ann_phased_region[loc_vgsc]

In [16]:
# Vgsc missense variants whose alternate allele is carried by more than 7 haplotypes.
loc_vgsc_missense = np.logical_and(
    ann_vgsc['Annotation'] == b'missense_variant',
    ac_vgsc[:, 1] > 7,  # column 1 holds the alternate allele count
)
np.count_nonzero(loc_vgsc_missense)

16

In [17]:
haps_vgsc_missense = haps_vgsc[loc_vgsc_missense]

In [18]:
# Build display labels from the HGVS protein annotation, dropping the first two
# characters of each value (presumably the 'p.' prefix - verify against the ANN field).
hgvs_p_missense = ann_vgsc[loc_vgsc_missense]['HGVS_p'].astype('U')
lbl_vgsc_missense = [hgvs[2:] for hgvs in hgvs_p_missense]
lbl_vgsc_missense

['Arg254Lys',
 'Asp466His',
 'Thr791Met',
 'Leu995Ser',
 'Leu995Phe',
 'Ala1125Val',
 'Ile1527Thr',
 'Glu1597Gly',
 'Ala1746Ser',
 'Val1853Ile',
 'Ile1868Thr',
 'Pro1874Ser',
 'Pro1874Leu',
 'Phe1920Ser',
 'Ala1934Val',
 'Ile1940Thr']

In [19]:
# NOTE: dead code, kept for reference only. The next cell calls
# hapclust.plot_haplotypes(ax, h, mut_labels) instead of this local definition.
# def plot_haplotypes(ax, h, mut_labels):
#     mycol = ['w', 'k'] 
#     cake = mpl.colors.ListedColormap(mycol, name='mymap', N=3)
#     ax.pcolormesh(np.asarray(h[::-1]), cmap=cake, vmin=0, vmax=2)
#     ax.set_yticks(np.arange(h.shape[0])+.5)
#     ax.set_yticklabels(mut_labels[::-1], family='monospace')
#     ax.hlines(np.arange(h.shape[0]+1), 0, h.shape[1], color='k', lw=.5)
#     ax.set_xlim(0, h.shape[1])
#     ax.set_ylim(0, h.shape[0])


In [20]:
fig, ax = plt.subplots()
hapclust.plot_haplotypes(ax, haps_vgsc_missense, lbl_vgsc_missense)

<IPython.core.display.Javascript object>

In [21]:
# Variant effect classes admitted to the EHH analysis; these should be mostly neutral.
loc_type_neutral = np.isin(
    ann_phased_region['Annotation'],
    [
        b'intergenic_region',
        b'intron_variant',
        b'downstream_gene_variant',
        b'upstream_gene_variant',
        b'synonymous_variant',
        b'3_prime_UTR_variant',
        b'5_prime_UTR_variant',
    ],
)
np.count_nonzero(loc_type_neutral), loc_type_neutral.shape

(156848, (163963,))

In [22]:
# Classify variants by the count of their rarer allele: exactly 1 is a singleton
# (excluded from the EHH analysis), more than 1 is a non-singleton. Variants with
# a rarer-allele count of 0 fall into neither mask.
ac_minor = ac_phased_region.min(axis=1)
loc_sgl = ac_minor == 1
loc_nosgl = ac_minor > 1
np.count_nonzero(loc_sgl), np.count_nonzero(loc_nosgl), loc_nosgl.shape

(52221, 111611, (163963,))

In [23]:
# these are the variants to use for EHH
loc_ehh = loc_type_neutral & loc_nosgl
np.count_nonzero(loc_ehh), loc_ehh.shape

(107531, (163963,))

In [24]:
# these are the variants to use for mutational distance
#loc_mut = loc_sgl
# include non-neutral mutations
loc_mut = loc_sgl | ~loc_type_neutral
np.count_nonzero(loc_mut), loc_mut.shape

(56311, (163963,))

In [25]:
haps_mut = haps_phased_region[loc_mut]
haps_mut

Unnamed: 0,0,1,2,3,4,...,1525,1526,1527,1528,1529,Unnamed: 12
0,0,0,0,0,0,...,0,0,0,0,0,
1,0,0,0,0,0,...,0,0,0,0,0,
2,0,0,0,0,0,...,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...
56308,0,0,0,0,0,...,0,0,0,0,0,
56309,0,0,0,0,0,...,0,0,0,0,0,
56310,0,0,0,0,0,...,0,0,0,0,0,


In [26]:
haps_ehh = haps_phased_region[loc_ehh]
haps_ehh

Unnamed: 0,0,1,2,3,4,...,1525,1526,1527,1528,1529,Unnamed: 12
0,0,0,0,0,0,...,0,0,0,0,0,
1,0,0,0,0,0,...,0,0,0,0,0,
2,1,1,1,1,1,...,1,1,1,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...
107528,0,0,0,0,0,...,0,0,0,0,0,
107529,0,0,0,0,0,...,0,0,0,0,0,
107530,0,0,0,0,0,...,0,0,0,0,0,


In [27]:
pos_ehh = pos_phased_region[loc_ehh]
pos_ehh

0,1,2,3,4,...,107526,107527,107528,107529,107530
44691,44732,44756,44767,44872,...,3997366,3997367,3997373,3997378,3997386


In [28]:
pos_mut = pos_phased_region[loc_mut]
pos_mut

0,1,2,3,4,...,56306,56307,56308,56309,56310
44688,44736,44887,44904,44910,...,3997357,3997362,3997371,3997372,3997381


In [29]:
pos_mut.locate_key(pos_kdr_s)

10210

In [30]:
pos_mut.locate_key(pos_kdr_f)

10211

In [31]:
# Haplotype metadata (used for population labels), with the 'colony' rows removed.
df_haplotypes = phase1_ar31.df_haplotypes.loc[lambda df: df.population != 'colony']
df_haplotypes.head()

Unnamed: 0_level_0,label,ox_code,population,label_aug,country,region,sex,m_s,kt_2la,kt_2rb
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,AB0085-Ca,AB0085-C,BFS,"AB0085-Ca [Burkina Faso, Pala, S, F]",Burkina Faso,Pala,F,S,2.0,2.0
1,AB0085-Cb,AB0085-C,BFS,"AB0085-Cb [Burkina Faso, Pala, S, F]",Burkina Faso,Pala,F,S,2.0,2.0
2,AB0087-Ca,AB0087-C,BFM,"AB0087-Ca [Burkina Faso, Bana, M, F]",Burkina Faso,Bana,F,M,2.0,1.0
3,AB0087-Cb,AB0087-C,BFM,"AB0087-Cb [Burkina Faso, Bana, M, F]",Burkina Faso,Bana,F,M,2.0,1.0
4,AB0088-Ca,AB0088-C,BFM,"AB0088-Ca [Burkina Faso, Bana, M, F]",Burkina Faso,Bana,F,M,2.0,0.0


In [32]:
core_pos = pos_kdr_f

In [33]:
dist_ehh_right, dist_ehh_left, haps_ehh_right, haps_ehh_left = hapclust.split_flanks(
    haps_ehh, pos_ehh, core_pos
)

In [34]:
dist_ehh_right

array([     45,     255,     258, ..., 1574721, 1574726, 1574734], dtype=int32)

In [35]:
dist_ehh_left

array([    108,     250,     270, ..., 2377896, 2377920, 2377961], dtype=int32)

In [36]:
dist_ehh_right.shape, dist_ehh_left.shape

((91777,), (15754,))

In [37]:
dist_ehh_right.min(), dist_ehh_left.min()

(45, 108)

In [38]:
dist_mut_right, dist_mut_left, haps_mut_right, haps_mut_left = hapclust.split_flanks(
    haps_mut, pos_mut, core_pos
)

In [39]:
dist_mut_right

array([      0,      82,     353, ..., 1574719, 1574720, 1574729], dtype=int32)

In [40]:
dist_mut_left

array([      1,     166,     505, ..., 2377765, 2377916, 2377964], dtype=int32)

In [41]:
dist_mut_right.shape, dist_mut_left.shape

((46100,), (10211,))

In [42]:
dist_mut_right.min(), dist_mut_left.min()

(0, 1)

In [43]:
haps_ehh_left.shape, haps_ehh_right.shape, haps_mut_left.shape, haps_mut_right.shape

((15754, 1530), (91777, 1530), (10211, 1530), (46100, 1530))

## Shared haplotype length

In [44]:
idx_sorted_right, nspl_right, nspd_right, muts_right = hapclust.neighbour_haplotype_sharing(
    haps_ehh_right, haps_mut_right, dist_ehh_right, dist_mut_right
)

In [45]:
idx_sorted_left, nspl_left, nspd_left, muts_left = hapclust.neighbour_haplotype_sharing(
    haps_ehh_left, haps_mut_left, dist_ehh_left, dist_mut_left
)

In [46]:
nspl_right.min(), nspl_right.max()

(0, 76267)

In [47]:
nspl_left.min(), nspl_left.max()

(0, 15754)

In [48]:
nspd_right.min(), nspd_right.max()

(255, 1437049)

In [49]:
nspd_left.min(), nspd_left.max()

(250, 2377961)

In [50]:
muts_right.min(), muts_right.max()

(0, 14)

In [51]:
muts_left.min(), muts_left.max()

(0, 23)

In [52]:
# 1 cM/Mb convert to M/bp
1 / (1e2 * 1e6)

1e-08

In [53]:
# assume constant recombination rate
rr = 1e-8
rr

1e-08

In [54]:
# mutation rate used in the t_hat estimates below
# NOTE(review): presumably per bp per generation, to match the units of rr - confirm and cite the source
mu = 3.5e-9
mu

3.5e-09

In [55]:
# Population label and plot colour for each haplotype, following the ordering
# returned by hapclust.neighbour_haplotype_sharing for the right flank.
pops_right = df_haplotypes.population[idx_sorted_right]
pop_colors_right = [phase1_ar3.pop_colors[pop] for pop in pops_right]

fig = plt.figure(figsize=(11, 6))
hapclust.fig_neighbour_haplotype_sharing(
    nspd=nspd_right,
    muts=muts_right,
    # reorder the missense haplotype display to match the sorted haplotypes
    haps_display=haps_vgsc_missense[:, idx_sorted_right],
    haps_display_vlbl=lbl_vgsc_missense,
    pop_colors=pop_colors_right,
    nspd_cut=2e4,
    nspd_ylim=(1e3, 3e6),
    that_ylim=(2e1, 5e5),
    mu=mu,
    rr=rr,
    fig=fig,
)
fig.suptitle('Right flank');

<IPython.core.display.Javascript object>

In [56]:
# Population label and plot colour for each haplotype, following the ordering
# returned by hapclust.neighbour_haplotype_sharing for the left flank.
pops_left = df_haplotypes.population[idx_sorted_left]
pop_colors_left = [phase1_ar3.pop_colors[p] for p in pops_left]

fig = plt.figure(figsize=(11, 6))
hapclust.fig_neighbour_haplotype_sharing(nspd=nspd_left, 
                                         muts=muts_left, 
                                         # reorder the missense haplotype display to match
                                         haps_display=haps_vgsc_missense[:, idx_sorted_left],
                                         haps_display_vlbl=lbl_vgsc_missense,
                                         pop_colors=pop_colors_left,
                                         nspd_cut=2e4,
                                         nspd_ylim=(1e3, 3e6),
                                         that_ylim=(2e1, 5e5),
                                         mu=mu, rr=rr,
                                         fig=fig)
# trailing ';' suppresses the Text repr, matching the right-flank cell
fig.suptitle('Left flank');

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x7fcb6ffc6898>

In [57]:
nspl_left

array([14900, 10152,  8213, ...,  5231,     0,   212], dtype=int32)

In [58]:
# nspd_left = pos_ehh_left[-1] - np.asarray(pos_ehh_left)[(pos_ehh_left.shape[0] - 1 - nspl_left - 1).clip(min=0)]
nspd_left.min(), nspd_left.max()

(250, 2377961)

In [59]:
# Scatter of t_hat against shared haplotype length for neighbouring haplotypes,
# one series per flank. t_hat = (1 + mutations) / (2 * length * (rr + mu)).
fig = plt.figure()
ax = fig.add_subplot(111)

x = nspd_right
t_hat = (1 + muts_right) / (2 * (nspd_right * rr + nspd_right * mu))
ax.plot(x, t_hat, marker='o', mfc='none', mec='g', markersize=3, linestyle=' ', label='Downstream')

x = nspd_left
t_hat = (1 + muts_left) / (2 * (nspd_left * rr + nspd_left * mu))
ax.plot(x, t_hat, marker='o', mfc='none', mec='b', markersize=3, linestyle=' ', label='Upstream')

ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Shared haplotype length (bp)')
# raw string: '\h' is not a valid escape sequence in a normal string literal
ax.set_ylabel(r'$\hat{t}$', rotation=0, ha='right')
ax.legend();


<IPython.core.display.Javascript object>

In [60]:
# Distribution of t_hat for neighbouring haplotypes, one step histogram per flank.
fig = plt.figure()
ax = fig.add_subplot(111)
t_hat = (1 + muts_left) / (2 * (nspd_left * rr + nspd_left * mu))
# keep only entries with a positive shared length
ax.hist(t_hat[(nspd_left > 0)], bins=np.logspace(1, 6, 80), histtype='step', lw=2, label='Upstream')
t_hat = (1 + muts_right) / (2 * (nspd_right * rr + nspd_right * mu))
ax.hist(t_hat[(nspd_right > 0)], bins=np.logspace(1, 6, 80), histtype='step', lw=2, label='Downstream')
ax.set_xscale('log')
# raw string: '\h' is not a valid escape sequence in a normal string literal
ax.set_xlabel(r'$\hat{t}$')
ax.set_ylabel('Frequency')
ax.legend();


<IPython.core.display.Javascript object>

## Tree

In [61]:
pspl_right, pspd_right, pmuts_right = hapclust.pairwise_haplotype_sharing(
    haps_ehh_right, haps_mut_right, dist_ehh_right, dist_mut_right)

In [62]:
pspl_left, pspd_left, pmuts_left = hapclust.pairwise_haplotype_sharing(
    haps_ehh_left, haps_mut_left, dist_ehh_left, dist_mut_left)

In [63]:
pspl_right.shape, pspl_left.shape

((1169685,), (1169685,))

In [64]:
pspd_right.min(), pspd_right.max()

(255, 1437049)

In [65]:
pspd_left.min(), pspd_left.max()

(250, 2377961)

In [66]:
# Pairwise shared haplotype length, counted in variants, for each flank.
fig, ax = plt.subplots()
ax.hist(pspl_right, bins=np.logspace(1, 5, 50), histtype='step', lw=2, label='Right flank')
ax.hist(pspl_left, bins=np.logspace(1, 5, 50), histtype='step', lw=2, label='Left flank')
ax.set_xscale('log')
ax.set_xlabel('Pairwise shared haplotype length (no. variants)')
ax.set_ylabel('Frequency')
ax.legend();

<IPython.core.display.Javascript object>

In [67]:
# Pairwise shared haplotype length, as physical distance, for each flank.
fig, ax = plt.subplots()
ax.hist(pspd_right, bins=np.logspace(1, 7, 50), histtype='step', lw=2, label='Right flank')
ax.hist(pspd_left, bins=np.logspace(1, 7, 50), histtype='step', lw=2, label='Left flank')
ax.set_xscale('log')
ax.set_xlabel('Pairwise shared haplotype length (bp)')
ax.set_ylabel('Frequency')
ax.legend();

<IPython.core.display.Javascript object>

In [68]:
pspl_right.max(), nspl_right.max()

(76267, 76267)

In [69]:
pspd_right.max(), nspd_right.max()

(1437049, 1437049)

In [70]:
pspl_left.max(), nspl_left.max()

(15754, 15754)

In [71]:
pspd_left.max(), nspd_left.max()

(2377961, 2377961)

In [72]:
pmuts_right.shape, pmuts_left.shape

((1169685,), (1169685,))

In [73]:
muts_right.max(), muts_left.max()

(14, 23)

In [74]:
pmuts_right.max(), pmuts_left.max()

(16, 23)

In [75]:
np.bincount(pmuts_right)

array([480796, 637876,  38795,   8835,   2206,    739,    272,    120,
           23,     11,      1,      2,      0,      3,      4,      1,
            1])

In [76]:
np.bincount(pmuts_left)

array([637057, 510780,  15903,   3582,   1237,    552,    283,     89,
           43,     59,     32,     18,     13,     28,      7,      1,
            0,      0,      0,      0,      0,      0,      0,      1])

In [77]:
# Stacked distribution of the pairwise mutation counts for the two flanks.
fig, ax = plt.subplots()
ax.hist([pmuts_right, pmuts_left], stacked=True, bins=np.arange(25),
        label=['Right flank', 'Left flank'])
ax.set_xlabel('Pairwise mutation count')
ax.set_ylabel('Frequency')
ax.legend();

<IPython.core.display.Javascript object>

In [78]:
# Pairwise t_hat for the right flank: (1 + mutations) / (2 * length * (rr + mu)).
pt_hat_right = (1 + pmuts_right) / (2 * (pspd_right * rr + pspd_right * mu))
fig, ax = plt.subplots()
ax.hist(pt_hat_right, bins=np.logspace(1, 6, 100))
ax.set_xscale('log')
# raw string: '\h' is not a valid escape sequence in a normal string literal
ax.set_xlabel(r'$\hat{t}$')
ax.set_ylabel('Frequency')
ax.set_title('Right flank');

<IPython.core.display.Javascript object>

In [79]:
# Pairwise t_hat for the left flank: (1 + mutations) / (2 * length * (rr + mu)).
pt_hat_left = (1 + pmuts_left) / (2 * (pspd_left * rr + pspd_left * mu))
fig, ax = plt.subplots()
ax.hist(pt_hat_left, bins=np.logspace(1, 6, 100))
ax.set_xscale('log')
# raw string: '\h' is not a valid escape sequence in a normal string literal
ax.set_xlabel(r'$\hat{t}$')
ax.set_ylabel('Frequency')
ax.set_title('Left flank');

<IPython.core.display.Javascript object>

In [80]:
# Combine the two flanks by summing shared lengths and mutation counts per pair,
# then apply the same t_hat formula as for the single-flank estimates.
pspd_both = pspd_left + pspd_right
pmuts_both = pmuts_left + pmuts_right
pt_hat_both = (1 + pmuts_both) / (2 * (pspd_both * rr + pspd_both * mu))
fig, ax = plt.subplots()
ax.hist(pt_hat_both, bins=np.logspace(1, 6, 100))
ax.set_xscale('log')
# raw string: '\h' is not a valid escape sequence in a normal string literal
ax.set_xlabel(r'$\hat{t}$')
ax.set_ylabel('Frequency')
ax.set_title('Both flanks');

<IPython.core.display.Javascript object>

In [81]:
def plot_dendrogram(dist, cut_height=1e3, yscale='log', ylim=(10, 1e6), linkage_method='average'):
    """Cluster haplotypes hierarchically and draw a three-panel summary figure.

    Panels, top to bottom: the dendrogram, a strip of per-haplotype population
    colours, and the Vgsc missense haplotype display. The two lower panels are
    ordered by the dendrogram leaves.

    Besides its arguments, the function reads these notebook globals:
    ``df_haplotypes``, ``phase1_ar3.pop_colors``, ``haps_vgsc_missense`` and
    ``lbl_vgsc_missense``.

    Parameters
    ----------
    dist : array_like
        Distances passed unchanged to ``scipy.cluster.hierarchy.linkage``
        (the notebook passes the pairwise t_hat vectors).
    cut_height : float, optional
        y position of the dashed horizontal reference line on the dendrogram.
    yscale : str, optional
        Scale of the dendrogram y axis, passed to ``ax.set_yscale``.
    ylim : tuple, optional
        Limits of the dendrogram y axis, passed to ``ax.set_ylim``.
    linkage_method : str, optional
        ``method`` argument for ``scipy.cluster.hierarchy.linkage``.

    """
    n_haps = len(df_haplotypes)
    z = scipy.cluster.hierarchy.linkage(dist, method=linkage_method)

    fig = plt.figure(figsize=(16, 8))
    gs = mpl.gridspec.GridSpec(nrows=3, ncols=1, height_ratios=[6, .5, 4], hspace=0)

    # --- dendrogram ---
    ax = fig.add_subplot(gs[0])
    sns.despine(ax=ax, offset=3, bottom=True, top=False)
    r = scipy.cluster.hierarchy.dendrogram(
        z, no_labels=True, count_sort=True, 
        color_threshold=0, 
        above_threshold_color='k',
        ax=ax)
    ax.set_ylim(*ylim)
    ax.set_yscale(yscale)
    # Place a tick every 200 haplotypes, plus one at the end, by rescaling
    # haplotype counts onto the x data interval of the dendrogram.
    xmin, xmax = ax.xaxis.get_data_interval()
    xticklabels = np.array(list(range(0, n_haps, 200)) + [n_haps])
    xticks = xticklabels / n_haps
    xticks = (xticks * (xmax - xmin)) + xmin
    ax.set_xticks(xticks)
    ax.set_xticklabels(xticklabels)
    ax.set_xlabel('Haplotypes')
    ax.xaxis.set_label_position('top')
    ax.axhline(cut_height, linestyle='--', color='k')
    # raw string: '\h' is not a valid escape sequence in a normal string literal
    ax.set_ylabel(r'$\hat{t}$', rotation=0, ha='right')

    # --- population colour strip, in dendrogram leaf order ---
    ax = fig.add_subplot(gs[1])
    sns.despine(ax=ax, left=True, bottom=True)
    pops = df_haplotypes.population[r['leaves']]
    pop_colors = [phase1_ar3.pop_colors[p] for p in pops]
    ax.broken_barh([(i, 1) for i in range(n_haps)], yrange=(0, 1), color=pop_colors)
    ax.set_xlim(0, n_haps)
    ax.set_yticks([])
    ax.set_ylabel('Population', rotation=0, ha='right', va='center')
    ax.set_xticks([])

    # --- missense haplotype display, in dendrogram leaf order ---
    ax = fig.add_subplot(gs[2])
    hapclust.plot_haplotypes(ax, haps_vgsc_missense[:, r['leaves']], lbl_vgsc_missense)
    ax.set_xlim(0, n_haps)
    ax.set_xlabel('Haplotypes')

    gs.tight_layout(fig, h_pad=0)

In [82]:
plot_dendrogram(pt_hat_right)

<IPython.core.display.Javascript object>

In [83]:
plot_dendrogram(pt_hat_left)

<IPython.core.display.Javascript object>

In [84]:
plot_dendrogram(pt_hat_both)

<IPython.core.display.Javascript object>

In [85]:
plot_dendrogram(pt_hat_both, yscale='linear', ylim=(-1000, 1.2e5))

<IPython.core.display.Javascript object>

## Sandbox