In [1]:
import malariagen_data
import allel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Execute the package's util.py inside the notebook namespace.
# NOTE(review): later cells call `da.concatenate` although `da` is never
# imported in this notebook; presumably this script is what defines it — confirm.
%run "../malariagen_data/util.py"

In [3]:
# Data-access object; the functions defined below read it as a notebook global.
ag3 = malariagen_data.Ag3()

In [4]:
# Inputs for the runs below: the AIM set passed to ag3.aim_sites and the
# sample set passed to ag3.snp_calls.
aims = "gamb_vs_colu"
sample_sets = "AG1000G-UG"

In [5]:
def aim_calls(aims, sample_sets, sample_query=None):
    """Attach genotype calls, recoded against the AIM alleles, to the AIM sites.

    For each contig listed in the AIM dataset's ``contigs`` attribute, the SNP
    calls at the AIM positions are loaded, their allele indices are remapped
    onto the AIM allele order, and the per-contig results are concatenated
    along the variants axis.

    Relies on the notebook global ``ag3``.

    Parameters
    ----------
    aims
        Passed to ``ag3.aim_sites(aims=...)`` to choose the marker set.
    sample_sets
        Passed unchanged to ``ag3.snp_calls(sample_sets=...)``.
    sample_query : optional
        Forwarded to ``ag3.snp_calls`` only when not None. Previously this
        argument was accepted but silently ignored.

    Returns
    -------
    The dataset from ``ag3.aim_sites``, indexed by ``variant_contig``, with an
    added ``call_genotype`` variable on dims (variants, samples, ploidy).
    """

    # Forward sample_query only when the caller supplied one, so the default
    # call to snp_calls is exactly what it was before.
    # NOTE(review): assumes the installed Ag3.snp_calls accepts a
    # `sample_query` keyword — confirm against the package version in use.
    snp_kwargs = {"sample_sets": sample_sets}
    if sample_query is not None:
        snp_kwargs["sample_query"] = sample_query

    # aim sites, indexed by contig so one contig's sites can be selected at a time
    aim_sites = ag3.aim_sites(aims=aims)
    aim_sites = aim_sites.set_index(variants='variant_contig')
    contigs = aim_sites.attrs['contigs']

    # one genotype array per contig, collected in contig order
    gt_per_contig = []
    for idx, arm in enumerate(contigs):
        print(f"collecting {arm} aim calls")

        # aim alleles and positions
        aim_arm_sites = aim_sites.sel(variants=idx)
        aim_pos = aim_arm_sites['variant_position'].values
        aim_alleles = aim_arm_sites['variant_allele'].values

        # snp alleles and positions, restricted to the aim positions
        snp_calls = ag3.snp_calls(region=arm, **snp_kwargs)
        snp_calls = snp_calls.set_index(variants='variant_position')
        snp_aim_calls = snp_calls.sel(variants=aim_pos)
        snp_alleles = snp_aim_calls['variant_allele'][:].values

        # mapping alleles: first column is the reference allele, the rest are alternates
        snp_ref = snp_alleles[:, 0]
        snp_alt = snp_alleles[:, 1:]
        mapping = allel.create_allele_mapping(snp_ref, snp_alt, aim_alleles)
        gt = allel.GenotypeDaskArray(snp_aim_calls['call_genotype'].data)
        gt_per_contig.append(gt.map_alleles(mapping).values)

    # NOTE(review): `da` is not imported in this notebook's import cell; it is
    # presumably dask.array made available by the earlier `%run` — confirm.
    # NOTE(review): concatenating in contig order assumes the variants in
    # aim_sites are themselves grouped by contig in that order — confirm.
    dask_cat = da.concatenate(gt_per_contig, axis=0)
    aim_sites['call_genotype'] = (("variants", "samples", "ploidy"), dask_cat)

    return aim_sites

In [6]:
%%time
# Build the AIM dataset with genotype calls attached; the cell is timed to show its cost.
aim_sites = aim_calls(aims=aims, sample_sets=sample_sets)

collecting 2R aim calls
collecting 2L aim calls
collecting 3R aim calls
collecting 3L aim calls
collecting X aim calls
CPU times: user 20.6 s, sys: 6.56 s, total: 27.2 s
Wall time: 42.4 s


In [7]:
# Show the concatenated genotype array that aim_calls attached to the dataset.
aim_sites['call_genotype']

Unnamed: 0,Array,Chunk
Bytes,422.82 kB,5.80 kB
Shape,"(729, 290, 2)","(58, 50, 2)"
Count,6672 Tasks,648 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 422.82 kB 5.80 kB Shape (729, 290, 2) (58, 50, 2) Count 6672 Tasks 648 Chunks Type int8 numpy.ndarray",2  290  729,

Unnamed: 0,Array,Chunk
Bytes,422.82 kB,5.80 kB
Shape,"(729, 290, 2)","(58, 50, 2)"
Count,6672 Tasks,648 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.83 kB,5.83 kB
Shape,"(729,)","(729,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 5.83 kB 5.83 kB Shape (729,) (729,) Count 2 Tasks 1 Chunks Type int64 numpy.ndarray",729  1,

Unnamed: 0,Array,Chunk
Bytes,5.83 kB,5.83 kB
Shape,"(729,)","(729,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray


In [8]:
def _contig_map_aims(idx, aim_sites):

    arm = aim_sites.attrs['contigs'][idx]
    print(f"collecting {arm} aim calls")

    # aim alleles and positions
    aim_arm_sites = aim_sites.sel(variants=idx)
    aim_pos = aim_arm_sites['variant_position'].values
    aim_alleles = aim_arm_sites['variant_allele'].values

    # snp alleles and positions
    snp_calls = ag3.snp_calls(region=arm, sample_sets=sample_sets)
    snp_calls = snp_calls.set_index(variants='variant_position')
    snp_aim_calls = snp_calls.sel(variants=aim_pos)
    snp_alleles = snp_aim_calls['variant_allele'][:].values

    # mapping alleles
    snp_ref = snp_alleles[:,0]
    snp_alt = snp_alleles[:,1:]
    mapping = allel.create_allele_mapping(snp_ref, snp_alt, aim_alleles)
    gt = allel.GenotypeDaskArray(snp_aim_calls['call_genotype'].data)
    gt_map = gt.map_alleles(mapping)
    
    return gt_map

In [9]:
def nu_aim_calls(aims, sample_sets, sample_query=None):
    
    # dask dict
    dask_dict = {}
    
    # aim_calls
    aim_sites = ag3.aim_sites(aims=aims)
    aim_sites = aim_sites.set_index(variants='variant_contig')
    num_contigs = len(aim_sites.attrs['contigs'])
    
    # get calls for each aim
    for idx in range(num_contigs):
        gt_map = _contig_map_aims(idx=idx, aim_sites=aim_sites)    
        
        dask_dict[idx] = gt_map.values

    dask_cat = da.concatenate([dask_dict[i] for i in range(num_contigs)], axis=0) 
    aim_sites['call_genotype'] = (("variants", "samples", "ploidy"), dask_cat)
        
    return aim_sites

In [10]:
# Run the refactored implementation with the same inputs as the aim_calls run above.
nu_test = nu_aim_calls(aims=aims, sample_sets=sample_sets)

collecting 2R aim calls
collecting 2L aim calls
collecting 3R aim calls
collecting 3L aim calls
collecting X aim calls


In [11]:
# Display the dataset returned by nu_aim_calls.
nu_test

Unnamed: 0,Array,Chunk
Bytes,5.83 kB,5.83 kB
Shape,"(729,)","(729,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray
"Array Chunk Bytes 5.83 kB 5.83 kB Shape (729,) (729,) Count 2 Tasks 1 Chunks Type int64 numpy.ndarray",729  1,

Unnamed: 0,Array,Chunk
Bytes,5.83 kB,5.83 kB
Shape,"(729,)","(729,)"
Count,2 Tasks,1 Chunks
Type,int64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.46 kB,1.46 kB
Shape,"(729, 2)","(729, 2)"
Count,2 Tasks,1 Chunks
Type,|S1,numpy.ndarray
"Array Chunk Bytes 1.46 kB 1.46 kB Shape (729, 2) (729, 2) Count 2 Tasks 1 Chunks Type |S1 numpy.ndarray",2  729,

Unnamed: 0,Array,Chunk
Bytes,1.46 kB,1.46 kB
Shape,"(729, 2)","(729, 2)"
Count,2 Tasks,1 Chunks
Type,|S1,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,422.82 kB,5.80 kB
Shape,"(729, 290, 2)","(58, 50, 2)"
Count,6672 Tasks,648 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 422.82 kB 5.80 kB Shape (729, 290, 2) (58, 50, 2) Count 6672 Tasks 648 Chunks Type int8 numpy.ndarray",2  290  729,

Unnamed: 0,Array,Chunk
Bytes,422.82 kB,5.80 kB
Shape,"(729, 290, 2)","(58, 50, 2)"
Count,6672 Tasks,648 Chunks
Type,int8,numpy.ndarray
