In [1]:
# run the shared setup notebook first; names used below without an explicit
# import in this notebook (np, allel, phase1_ar31) presumably come from it - TODO confirm
%run setup.ipynb
%matplotlib inline
# hapclust is brought in via %aimport rather than a plain import: with
# "%autoreload 1" only modules registered through %aimport are reloaded before
# each cell runs, so edits to hapclust are picked up without a kernel restart
%reload_ext autoreload
%autoreload 1
%aimport hapclust

In [7]:
# obtain data from unphased callset - only needed for variant annotations
callset_pass = phase1_ar31.callset_pass
# variant positions stored under the '2L' group, wrapped as a SortedIndex so they
# can later be intersected with the phased positions
pos_pass = allel.SortedIndex(callset_pass['2L/variants/POS'])
# read the ANN dataset (the [:] presumably loads it fully into memory - TODO confirm)
# and keep only the 'Annotation' and 'HGVS_p' fields
ann_pass = callset_pass['2L/variants/ANN'][:][['Annotation', 'HGVS_p']]

In [8]:
# setup haplotype data
callset_phased = phase1_ar31.callset_phased
# wrap the phased genotype calls for '2L' as a dask-backed array; nothing is
# computed here, the data are only materialised by a later .compute() call
genotypes_phased = allel.GenotypeDaskArray(callset_phased['2L/calldata/genotype'])
# positions of the phased variants, as a SortedIndex to allow range/intersection lookups
pos_phased = allel.SortedIndex(callset_phased['2L/variants/POS'])

In [9]:
# genomic positions of the two kdr variants of interest; judging by the names these
# are the 'S' and 'F' alleles at adjacent positions on 2L - TODO confirm and cite source
pos_kdr_s = 2422651
pos_kdr_f = 2422652

In [10]:
# define region we're going to analyse
# (phased variants whose positions lie between 0 and 4,000,000)
loc_region = pos_phased.locate_range(0, 4000000)
# positions of the phased variants inside that region
pos_phased_region = pos_phased[loc_region]
# display the index to check its size and the first/last positions
pos_phased_region

0,1,2,3,4,...,163958,163959,163960,163961,163962
44688,44691,44732,44736,44756,...,3997372,3997373,3997378,3997381,3997386


In [11]:
# locate the intersection with unphased callset - needed to tie in annotations
# loc1 marks the entries of pos_pass that also occur in pos_phased_region; the
# second return value (the matching marker over pos_phased_region) is discarded
loc1, _ = pos_pass.locate_intersection(pos_phased_region)
# number of shared positions - this should equal the number of phased variants in
# the region, otherwise ann_pass[loc1] will not line up with pos_phased_region
np.count_nonzero(loc1)

163963

In [12]:
# annotations for the phased variants in the region, selected from the unphased
# callset annotations with the intersection marker loc1
ann_phased_region = ann_pass[loc1]
# display to check the fields and dtype
ann_phased_region

array([(b'intergenic_region', b'.'), (b'intergenic_region', b'.'),
       (b'intergenic_region', b'.'), ...,
       (b'downstream_gene_variant', b'.'),
       (b'downstream_gene_variant', b'.'),
       (b'downstream_gene_variant', b'.')], 
      dtype=[('Annotation', 'S34'), ('HGVS_p', 'S14')])

In [13]:
# exclude cross parents
# (restrict genotypes to the region, convert to haplotypes, drop the last 16
# haplotype columns, then load the result into memory with .compute(); the 16 is
# presumably 8 cross parents x 2 haplotypes stored at the end of the callset -
# TODO confirm)
haps_phased_region = genotypes_phased[loc_region].to_haplotypes()[:, :-16].compute()

In [14]:
# perform allele count - needed to locate singletons
# (max_allele=1 limits the counts to alleles 0 and 1, i.e. one reference and one
# alternate count per variant)
ac_phased_region = haps_phased_region.count_alleles(max_allele=1)

In [15]:
# define types of variants to include in EHH analysis - should be mostly neutral
# (build one boolean mask per accepted annotation term and OR them all together)
loc_type_neutral = np.logical_or.reduce([
    ann_phased_region['Annotation'] == effect
    for effect in (
        b'intergenic_region',
        b'intron_variant',
        b'downstream_gene_variant',
        b'upstream_gene_variant',
        b'synonymous_variant',
        b'3_prime_UTR_variant',
        b'5_prime_UTR_variant',
    )
])
# number of variants accepted, and the mask shape as a sanity check
np.count_nonzero(loc_type_neutral), loc_type_neutral.shape

(156848, (163963,))

In [16]:
# locate singletons - will exclude from EHH analysis
# the minimum over the two allele counts is the minor allele count at each variant
loc_sgl = ac_phased_region.min(axis=1) == 1
# strictly greater than 1, so variants whose minor allele count is 0 are dropped
# here too, not only singletons (hence the two counts below do not add up to the
# total number of variants)
loc_nosgl = ac_phased_region.min(axis=1) > 1
np.count_nonzero(loc_sgl), np.count_nonzero(loc_nosgl), loc_nosgl.shape

(52221, 111611, (163963,))

In [17]:
# these are the variants to use for EHH
# (annotation is in the neutral set AND minor allele count is above 1)
loc_ehh = loc_type_neutral & loc_nosgl
# number of variants retained, and the mask shape as a sanity check
np.count_nonzero(loc_ehh), loc_ehh.shape

(107531, (163963,))

In [18]:
# apply the same EHH mask to haplotypes and positions so the two stay aligned
haps_ehh = haps_phased_region[loc_ehh]
pos_ehh = pos_phased_region[loc_ehh]

In [19]:
# position the flanks are split around; here the kdr 'f' position defined earlier
core_pos = pos_kdr_f

In [20]:
# split the EHH dataset
# hapclust.split_flanks is defined outside this notebook; judging by the names it
# returns distances and haplotypes for the flanks on either side of core_pos -
# TODO confirm the row ordering/orientation of each flank against the hapclust source
dist_ehh_right, dist_ehh_left, haps_ehh_right, haps_ehh_left = hapclust.split_flanks(
    haps_ehh, pos_ehh, core_pos
)

In [22]:
def reconstruct_ancestral_haplotype(haps_flank):
    """Reconstruct an ancestral haplotype along one flank by majority rule.

    Variants are visited in row order. A cluster of "ancestral" haplotypes is
    maintained, starting with every haplotype. At each variant the allele
    carried by the majority of the haplotypes still in the cluster is called
    as ancestral; if both alleles are present within the cluster (a
    bifurcation), haplotypes carrying the other (derived) allele are removed
    from the cluster. Calling stops once fewer than two haplotypes remain.

    Parameters
    ----------
    haps_flank : array_like, int, shape (n_variants, n_haplotypes)
        Biallelic haplotype calls coded 0 (reference) and 1 (alternate).
        Negative values are treated as missing: they are ignored when
        counting alleles and never cause a haplotype to leave the cluster.

    Returns
    -------
    hap_anc : ndarray, int8, shape (n_variants,)
        Ancestral allele called at each variant. Variants that were not
        reached because the cluster had shrunk below two haplotypes are -1.

    Notes
    -----
    Ties, including the case where no haplotype in the cluster has a call,
    are resolved in favour of the reference allele (0).

    """

    # work on a plain ndarray so that row access and boolean masking are cheap
    haps_flank = np.asarray(haps_flank)

    # setup intermediates and outputs
    n_variants, n_haplotypes = haps_flank.shape
    clust_anc = np.ones(n_haplotypes, dtype=bool)
    hap_anc = np.full(n_variants, dtype='i1', fill_value=-1)
    # size of the ancestral cluster; defined up front so the stopping rule also
    # works before any bifurcation has been seen
    n_anc = n_haplotypes

    # iterate over variants
    for i in range(n_variants):

        # a majority call needs at least two haplotypes in the cluster
        if n_anc < 2:
            break

        # count alleles within the ancestral cluster (missing calls are ignored)
        alleles = haps_flank[i]
        n_ref = np.count_nonzero(clust_anc & (alleles == 0))
        n_alt = np.count_nonzero(clust_anc & (alleles == 1))

        # find ancestral allele
        if n_ref >= n_alt:
            # if tie, assume reference
            anc_allele, der_allele = 0, 1
        else:
            anc_allele, der_allele = 1, 0

        # patch into ancestral haplotype (hap_anc is 1-D: one call per variant)
        hap_anc[i] = anc_allele

        # detect bifurcation
        if n_ref > 0 and n_alt > 0:
            # split cluster: drop every haplotype carrying the derived allele
            clust_anc &= alleles != der_allele
            n_anc = int(np.count_nonzero(clust_anc))

    return hap_anc


In [26]:
# load the haplogroup dictionary from a pickle file on disk
# NOTE(review): pickle.load can execute arbitrary code from the file - only load
# files produced by a trusted step of this analysis
import pickle
with open('../data/clust_dict.pickle', 'rb') as handle:
    vgsc_haplogroups = pickle.load(handle) 
# list the haplogroup labels available as keys
sorted(vgsc_haplogroups.keys())

['F1', 'F2', 'F3', 'F4', 'F5', 'L1', 'L2', 'S1', 'S2', 'S3', 'S4']

In [None]:
# TODO merge in PR which fixes haplogroup names

In [28]:
# select from the right flank the haplotype columns belonging to haplogroup F1
# (vgsc_haplogroups['F1'] presumably holds haplotype column indices; it is turned
# into a list before being passed to take - TODO confirm)
haps_f1_right = haps_ehh_right.take(list(vgsc_haplogroups['F1']), axis=1)
# display to check the shape of the selection
haps_f1_right

Unnamed: 0,0,1,2,3,4,...,37,38,39,40,41,Unnamed: 12
0,0,0,0,0,0,...,0,0,0,0,0,
1,0,0,0,0,0,...,0,0,0,0,0,
2,1,1,1,1,1,...,1,1,1,1,1,
...,...,...,...,...,...,...,...,...,...,...,...,...
91774,0,0,0,0,0,...,0,0,0,0,0,
91775,0,0,0,0,0,...,0,0,0,0,0,
91776,0,0,0,0,0,...,0,0,0,0,0,
