## add mvncall multiallelics/1570 to the haplotype genotype array

In [1]:
%run setup.ipynb

### get phased haplotypes for 2L centromere proximal region

In [2]:
# Phased callset from phase1_ar31: wrap the 2L variant positions and genotype calls.
# The genotypes are a dask-backed array; slices are materialised with .compute() further down.
callset_phased = phase1_ar31.callset_phased
pos_phased = allel.SortedIndex(callset_phased['2L/variants/POS'])
genotypes_phased = allel.GenotypeDaskArray(callset_phased['2L/calldata/genotype'])

In [3]:
# sanity check: the genotype array and the position index should cover the same number of variants
genotypes_phased.shape, pos_phased.shape

((8296600, 773, 2), (8296600,))

In [4]:
# kdr positions: the _s and _f variants sit at adjacent coordinates
pos_kdr_s, pos_kdr_f = 2422651, 2422652

In [5]:
# Restrict the analysis to positions 0 to 4,000,000 (the centromere-proximal part of 2L)
region_start, region_stop = 0, 4000000
loc_region = pos_phased.locate_range(region_start, region_stop)
pos_phased_region = pos_phased[loc_region]
pos_phased_region

0,1,2,3,4,...,163958,163959,163960,163961,163962
44688,44691,44732,44736,44756,...,3997372,3997373,3997378,3997381,3997386


In [6]:
# Cut the genotypes down to the region and drop the last 8 samples (the colony parents).
# Deliberately kept as genotype calls for now instead of calling .to_haplotypes():
# it may be easier to interleave the new positions first and convert the whole lot afterwards.
n_colony_parents = 8
gen_region_all = genotypes_phased[loc_region]
gen_phased_region = gen_region_all[:, :-n_colony_parents].compute()
gen_phased_region.shape

(163963, 765, 2)

In [7]:
# eyeball the region's genotype calls (still per-sample diploid calls, not yet haplotypes)
gen_phased_region

Unnamed: 0,0,1,2,3,4,...,760,761,762,763,764,Unnamed: 12
0,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
1,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
2,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
...,...,...,...,...,...,...,...,...,...,...,...,...
163960,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
163961,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
163962,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,


### grab the mvncalls (in npz format)

In [8]:
# Genotype calls from the mvncall .npz file (multiallelic missense sites and 1570).
# callset_nu is read again further down for the variant positions.
mvncall_npz_path = '../data/missense_multiallelics_and_1570.mvncall.200.npz'
callset_nu = np.load(mvncall_npz_path)
calldata_nu = callset_nu['calldata']
genotype_calls_nu = calldata_nu['genotype']
gen_nu = allel.GenotypeArray(genotype_calls_nu)

In [9]:
# positions of the new mvncall variants, wrapped as a sorted index
variants_nu = callset_nu['variants']
pos_nu = allel.SortedIndex(variants_nu['POS'])
pos_nu

0,1,2
2391228,2400071,2429745


# weave the genotypes and positions into the phase1 haplotype data

### Alistair's weave technique

In [10]:
# Append the new mvncall rows after the phase1 rows. Positions and genotype calls are
# appended in the same order, so row i of one array still describes row i of the other.
# (Despite its name, haps_combined holds genotype calls at this point; the haplotype
# conversion happens later.)
pos_combined = np.concatenate((pos_phased_region, pos_nu), axis=0)
haps_combined = np.concatenate((gen_phased_region, gen_nu), axis=0)

In [11]:
# indices that put the combined positions into ascending order
idx_sorted = np.argsort(pos_combined)

In [12]:
# Apply the sort order to the combined genotype rows and wrap the result as a GenotypeArray.
# NOTE: haps_combined is overwritten with its re-ordered self, so run this cell once per
# concatenation; running it again applies idx_sorted to rows that are already sorted.
gen_sorted = haps_combined[idx_sorted]
haps_combined = allel.GenotypeArray(gen_sorted)
haps_combined.shape

(163966, 765, 2)

In [13]:
# Same re-ordering for the positions, wrapped as a sorted index.
# NOTE: pos_combined is overwritten in place, so run this cell once per concatenation.
pos_sorted = pos_combined[idx_sorted]
pos_combined = allel.SortedIndex(pos_sorted)
pos_combined.shape

(163966,)

In [14]:
# Convert the woven genotype calls into a haplotype array: the 765 samples x 2 alleles
# of haps_combined show up as 1530 haplotype columns (0..1529) in the display below.
haps_arr = haps_combined.to_haplotypes()
haps_arr

Unnamed: 0,0,1,2,3,4,...,1525,1526,1527,1528,1529,Unnamed: 12
0,0,0,0,0,0,...,0,0,0,0,0,
1,0,0,0,0,0,...,0,0,0,0,0,
2,0,0,0,0,0,...,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...
163963,0,0,0,0,0,...,0,0,0,0,0,
163964,0,0,0,0,0,...,0,0,0,0,0,
163965,0,0,0,0,0,...,0,0,0,0,0,


### Kaibosely's weave technique

In [15]:
# New positions to weave in, taken from the loaded mvncall index instead of being retyped
# by hand, so the list cannot drift from the data file. For the current file this is
# [2391228, 2400071, 2429745] as plain Python ints.
nos = np.asarray(pos_nu).tolist()
# where each of them would have to go in the existing (pre-insertion) position array
np.searchsorted(pos_phased_region, nos)

array([25309, 25553, 26142])

In [16]:
# Does searchsorted account for earlier insertions when it is given several values?
# Toy check: 4 and 8 are missing from the sorted list. If insertions were accounted for,
# 8 would be reported at index 8 (after 4 has gone in) rather than index 7.
np.searchsorted(
    [0, 1, 2, 3, 5, 6, 7, 9],  # existing sorted values
    [4, 8],                    # values to place
)

array([4, 7])

In [17]:
# Nope: searchsorted reports indices relative to the array *before* any insertion.
# So handle the new positions one at a time: find the insertion index in the current
# pos_phased_region, insert the position there, and insert the matching row of gen_nu
# at the same index in gen_phased_region.
# The loop runs over however many positions are in nos (no hard-coded count).
# NOTE: pos_phased_region and gen_phased_region are overwritten, so re-running this
# cell inserts the same rows a second time.
for i, new_pos in enumerate(nos):
    print(new_pos)
    ins = np.searchsorted(pos_phased_region, new_pos)
    print(ins)
    pos_phased_region = np.insert(pos_phased_region, ins, new_pos, axis=0)
    gen_phased_region = np.insert(gen_phased_region, ins, gen_nu[i], axis=0)

2391228
25309
2400071
25554
2429745
26144


In [22]:
# number of entries where the loop-inserted positions differ from the concatenate-and-sort positions (0 = identical)
np.count_nonzero(pos_phased_region != pos_combined)

0

In [21]:
# number of allele calls where the loop-inserted genotypes differ from the concatenate-and-sort genotypes (0 = identical)
np.count_nonzero(gen_phased_region != haps_combined)

0

In [None]:
# it works but I think it is harder to verify than Alistair's method.