In [1]:
# Enable IPython's autoreload extension. Mode 1 reloads only the modules
# registered with %aimport, each time before a cell is executed.
%reload_ext autoreload
%autoreload 1

In [2]:
import cProfile
import pstats

In [3]:
%aimport malariagen_data.ag3
%aimport malariagen_data.veff 

In [4]:
# Entry point to the Ag3 data, rooted at the gs://vo_agam_release location,
# plus a short alias for the variant-effect module being profiled.
# NOTE(review): no explicit `import malariagen_data` appears in the cells
# shown; this presumably relies on the %aimport lines having bound the
# top-level package name -- confirm on a fresh kernel.
ag3 = malariagen_data.Ag3("gs://vo_agam_release")
veff = malariagen_data.veff

In [5]:
# Open the reference genome; it is passed to the annotator and later indexed
# by contig name (genome['2L']).
# NOTE(review): _open_genome is underscore-prefixed, i.e. not part of the
# public API by Python convention -- confirm there is no public accessor.
genome = ag3._open_genome()

In [6]:
%%time
# Build the variant-effect annotator from the reference genome and the GFF3
# gene annotations. Construction runs much faster from within the cloud, so it
# is not a target for profiling or optimisation; it is timed here purely out
# of interest.
ann = veff.Annotator(
    genome=genome,
    gff3_path="gs://vo_agam_release/reference/genome/agamp4/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3.gz",
)

CPU times: user 9.07 s, sys: 61.3 ms, total: 9.13 s
Wall time: 16.2 s


In [9]:
%%time
for effect in ann.get_effects(chrom='2L', pos=2429745, ref='A', alt='T',
                                  transcript_ids=["AGAP004707-RA"]):
       print(effect)

VariantEffect(effect='NON_SYNONYMOUS_CODING', impact='MODERATE', chrom='2L', pos=2429745, ref='A', alt='T', vlen=0, ref_start=2429745, ref_stop=2429745, gene_id='AGAP004707', gene_start=2358158, gene_stop=2431617, gene_strand='+', transcript_id='AGAP004707-RA', transcript_start=2358158, transcript_stop=2431617, transcript_strand='+', cds_id='AGAP004707-PA', cds_start=2429556, cds_stop=2429801, cds_strand='+', intron_start=None, intron_stop=None, intron_5prime_dist=None, intron_3prime_dist=None, intron_cds_5prime=None, intron_cds_3prime=None, ref_cds_start=4545, ref_cds_stop=4545, ref_intron_start=None, ref_intron_stop=None, ref_start_phase=0, ref_codon='Aat', alt_codon='Tat', codon_change='Aat/Tat', aa_pos=1516, ref_aa='N', alt_aa='Y', aa_change='N1516Y')
CPU times: user 1.21 ms, sys: 0 ns, total: 1.21 ms
Wall time: 1.22 ms


In [10]:
# Fetch the SNP site arrays for contig 2L, unpacked as pos, ref and alt.
# As the displayed repr shows, these are lazy dask arrays: nothing is loaded
# until .compute() is called (or the array is otherwise materialised).
pos, ref, alt = ag3.snp_sites(contig='2L')
pos, ref, alt

(dask.array<array, shape=(48525747,), dtype=int32, chunksize=(524288,), chunktype=numpy.ndarray>,
 dask.array<array, shape=(48525747,), dtype=|S1, chunksize=(524288,), chunktype=numpy.ndarray>,
 dask.array<array, shape=(48525747, 3), dtype=|S1, chunksize=(524288, 3), chunktype=numpy.ndarray>)

In [11]:
import allel

In [12]:
# Find the index of position 2429745 (the same site annotated in the
# single-variant example) within the site positions.
# NOTE(review): pos is a dask array here, so wrapping it in SortedIndex
# presumably materialises every position on 2L -- confirm this is acceptable.
k = allel.SortedIndex(pos).locate_key(2429745)
k

2150929

In [13]:
# Materialise a window of three consecutive sites starting at index k, so the
# profiling workload below operates on concrete in-memory values rather than
# lazy dask arrays.
pp = pos[k:k+3].compute()
rr = ref[k:k+3].compute()
aaa = alt[k:k+3].compute()

In [14]:
def pfun():
    """Profiling workload: annotate every (site, alt allele) pair in the window.

    Reads the module-level ``pp``, ``rr`` and ``aaa`` sequences and the ``ann``
    annotator. For each site, ``ann.get_effects`` is called once per alternate
    allele, restricted to transcript ``AGAP004707-RA`` on contig ``2L``. The
    yielded effects are consumed and discarded. Returns ``None``.
    """
    for site, ref_allele, alt_alleles in zip(pp, rr, aaa):
        for alt_allele in alt_alleles:
            # Alleles are decoded (bytes -> str) before being handed over.
            effects = ann.get_effects(
                chrom='2L',
                pos=site,
                ref=ref_allele.decode(),
                alt=alt_allele.decode(),
                transcript_ids=["AGAP004707-RA"],
            )
            # Drain the iterable; only the work of producing effects matters.
            for _ in effects:
                pass

In [15]:
%%time
# Time one full pass of the profiling workload before profiling it in detail.
pfun()

CPU times: user 7.31 ms, sys: 0 ns, total: 7.31 ms
Wall time: 7.27 ms


In [16]:
# Profile the workload ordered by cumulative time, which shows the call paths
# (callee time included) that account for most of the run.
cProfile.run('pfun()', sort='cumtime')

         17785 function calls (17668 primitive calls) in 0.010 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.010    0.010 {built-in method builtins.exec}
        1    0.000    0.000    0.010    0.010 <string>:1(<module>)
        1    0.000    0.000    0.010    0.010 <ipython-input-14-f882d6cad5c7>:1(pfun)
       18    0.000    0.000    0.010    0.001 veff.py:136(get_effects)
       18    0.000    0.000    0.005    0.000 veff.py:221(_get_gene_effects)
       18    0.000    0.000    0.005    0.000 veff.py:259(_get_transcript_effects)
       18    0.000    0.000    0.005    0.000 veff.py:326(_get_within_transcript_effects)
        9    0.000    0.000    0.004    0.000 veff.py:59(find)
        9    0.000    0.000    0.004    0.000 intervals.py:216(search)
        9    0.000    0.000    0.004    0.000 intervals.py:190(_search_tree)
        9    0.000    0.000    0.003    0.000 intervaltree.py

In [17]:
# Profile the workload again, this time ordered by internal time, which shows
# the individual functions where time is spent excluding their callees.
# Note this is a separate run, so totals differ from the cumulative-time run.
cProfile.run('pfun()', sort='time')

         17785 function calls (17668 primitive calls) in 0.037 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     3393    0.006    0.000    0.007    0.000 base.py:570(__getattr__)
     3753    0.004    0.000    0.004    0.000 interval.py:159(__eq__)
    126/9    0.003    0.000    0.010    0.001 node.py:309(search_point)
     1494    0.003    0.000    0.003    0.000 interval.py:173(__cmp__)
      369    0.002    0.000    0.007    0.000 {method 'add' of 'set' objects}
        9    0.002    0.000    0.003    0.000 veff.py:357(<listcomp>)
     3672    0.002    0.000    0.002    0.000 {method 'index' of 'list' objects}
       27    0.001    0.000    0.006    0.000 {built-in method builtins.sorted}
        9    0.001    0.000    0.001    0.000 veff.py:361(<listcomp>)
     1494    0.001    0.000    0.003    0.000 interval.py:204(__lt__)
        9    0.001    0.000    0.002    0.000 veff.py:335(<listcomp>)
        9    0.001    0

In [39]:
# Read the entire 2L reference sequence into memory; the displayed result is a
# NumPy array of single-byte ('|S1') elements.
x = genome['2L'][:]
x

array([b'a', b'a', b'c', ..., b'a', b'a', b'a'], dtype='|S1')

In [40]:
# Memory footprint, in bytes, of the in-memory 2L sequence loaded above.
x.nbytes

49364325