In [1]:
%reload_ext autoreload
%autoreload 

In [2]:
import numpy as np
from time import time

In [3]:
import cProfile
import pstats
import allel
import pandas as pd

In [4]:
%aimport malariagen_data.ag3

In [5]:
%aimport malariagen_data.veff 

In [6]:
veff = malariagen_data.veff

In [7]:
# Connect to the release bucket through a `simplecache::` URL; the
# cache_storage option names the cache location ("gcs_cache").
ag3 = malariagen_data.Ag3(
    "simplecache::gs://vo_agam_release",
    simplecache={"cache_storage": "gcs_cache"},
)

In [8]:
genome = ag3.open_genome()
genome

<zarr.hierarchy.Group '/'>

In [None]:
list(genome)

In [None]:
genome["3R"][0:10].tobytes().decode()

In [9]:
geneset = ag3.geneset()
geneset

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent,Name
0,2L,VectorBase,chromosome,1,49364325,,,,2L,,
1,2L,VectorBase,gene,157348,186936,,-,,AGAP004677,,
2,2L,VectorBase,mRNA,157348,181305,,-,,AGAP004677-RA,AGAP004677,
3,2L,VectorBase,three_prime_UTR,157348,157495,,-,,,AGAP004677-RA,
4,2L,VectorBase,exon,157348,157623,,-,,,AGAP004677-RA,AGAP004677-RB-E4
...,...,...,...,...,...,...,...,...,...,...,...
196140,Y_unplaced,VectorBase,five_prime_UTR,47932,48111,,+,,,AGAP029375-RA,
196141,Y_unplaced,VectorBase,exon,47932,48138,,+,,,AGAP029375-RA,AGAP029375-RA-E2
196142,Y_unplaced,VectorBase,CDS,48112,48138,,+,0.0,AGAP029375-PA,AGAP029375-RA,
196143,Y_unplaced,VectorBase,exon,48301,48385,,+,,,AGAP029375-RA,AGAP029375-RA-E3


In [None]:
geneset[geneset.Parent == 'AGAP004679-RB']

In [None]:
%%time
# One-off cost: building the annotator is slow because the GFF is loaded here.
ann = veff.Annotator(genome=genome, geneset=geneset)

In [None]:
%%time
# Print every effect reported for one SNP (2L:2422652 A>T) on one transcript.
# The loop variable `effect` stays bound afterwards; the cells that follow
# inspect its attributes.
for effect in ann.get_effects(
    chrom='2L',
    pos=2422652,
    ref='A',
    alt='T',
    transcript_ids=["AGAP004707-RD"],
):
    print(effect)

In [None]:
for a in effect:
    print(a)

In [None]:
effect.effect, effect.impact

In [None]:
effect.ref_codon, effect.alt_codon

In [None]:
effect.aa_pos, effect.ref_aa, effect.alt_aa, effect.aa_change

In [None]:
type(effect.aa_pos)

In [None]:
pos, ref, alt = ag3.snp_sites('2L')

In [None]:
import allel

In [None]:
start_idx = allel.SortedIndex(pos).locate_key(2429745)
start_idx

In [None]:
# Materialise a 100-site window starting at start_idx: positions (pp),
# reference alleles (rr) and alternate alleles (aaa). These three arrays are
# the inputs that testf() iterates over.
pp = pos[start_idx:start_idx+100].compute()
rr = ref[start_idx:start_idx+100].compute()
aaa = alt[start_idx:start_idx+100].compute()

In [None]:
def testf(n, show=False):
    # loop over sites
    for i, (p, r, aa) in enumerate(zip(pp, rr, aaa)):
        if i < n:
            # loop over alt alleles
            for a in aa:
                for effect in ann.get_effects(chrom='2L', pos=p, ref=r.decode(), alt=a.decode(),
                                              transcript_ids=["AGAP004707-RA"]):
                    if show:
                        print(effect)
    

In [None]:
%%time
testf(20)

In [None]:
%%time
x = ref.compute()

In [None]:
x.nbytes

In [None]:
cProfile.run('testf(100)', sort='cumtime')

In [None]:
cProfile.run('testf(20)', sort='time')

In [None]:
ann.get_feature("AGAP004707-RD")

In [None]:
ann.get_children("AGAP004707")

In [None]:
ann.find("2L", 2358158, 2358159)

# `ag3.snp_effects` appears to work for vgsc and some other genes, but most genes seem to fail
- The number of effects returned is less than the number of rows of positions.

#### Run veff locally and try to debug

In [None]:
genome = ag3.open_genome()
genome

In [9]:
gste2 = 'AGAP009194-RA'
site_mask = 'gamb_colu'

In [None]:
geneset[geneset.Parent == gste2]

In [None]:
%%time
# Print every effect reported for one SNP (3R:28598166 A>G) on the `gste2`
# transcript.
for effect in ann.get_effects(
    chrom='3R',
    pos=28598166,
    ref='A',
    alt='G',
    transcript_ids=[gste2],
):
    print(effect)

In [None]:
# Look up the transcript record and pull out the fields used by the cells below.
# NOTE(review): the indices presumably follow GFF column order
# (0 = seqid, 3 = start, 4 = end, 6 = strand) — confirm against get_feature.
feature = ann.get_feature(gste2)
# alternative transcript: ann.get_feature('AGAP004707-RA')
contig, start, stop, strand = (feature[idx] for idx in (0, 3, 4, 6))

In [None]:
feature

In [None]:
sites = ag3.snp_sites(contig=contig, site_mask=site_mask)

In [None]:
# NOTE: this cell rebinds `pos`, `ref` and `alt`, which an earlier cell bound
# to the arrays returned by ag3.snp_sites('2L'). Earlier cells that use those
# names will see the objects created here if they are re-run afterwards.
# sites are dask arrays, turn pos into sorted index
pos = allel.SortedIndex(sites[0].compute())
# locate transcript range
loc = pos.locate_range(start, stop)
# dask compute on the sliced arrays to speed things up
ref = sites[1][loc].compute()
alt = sites[2][loc].compute()

In [None]:
loc

In [None]:
# Assemble one row per site: position, decoded reference allele, and the list
# of decoded alternate alleles.
df_in = pd.DataFrame(
    {
        "position": np.asarray(pos[loc]),
        "ref_allele": [q.tobytes().decode() for q in np.asarray(ref)],
        # each row of alt holds several alleles as bytes: decode the whole row
        # at once, then split the string into single characters
        "alt_alleles": [list(q.tobytes().decode()) for q in list(alt)],
    }
)

In [None]:
df_in.shape[0] *3

In [None]:
#explode
df_effects = df_in.explode("alt_alleles").reset_index(drop=True)

In [None]:
df_effects.shape

In [None]:
df_effects

In [None]:
# Accumulators: one list per effect attribute, all appended to in step.
# `pos` records every queried position (one per row of df_effects) and `lpos`
# the ref_start of every effect returned, so the two can be compared in the
# cells that follow.
# NOTE: `pos` is rebound to a plain list here, replacing the SortedIndex that
# an earlier cell stored under the same name.
leffect, limpact = [], []
lref_codon, lalt_codon = [], []
laa_pos, lref_aa, lalt_aa, laa_change = [], [], [], []
lpos = []

pos = []

for row in df_effects.itertuples(index=True):
    pos.append(row.position)
    for effect in ann.get_effects(
        chrom=contig, pos=row.position, ref=row.ref_allele,
        alt=row.alt_alleles, transcript_ids=[gste2],
    ):
        leffect.append(effect.effect)
        lpos.append(effect.ref_start)
        limpact.append(effect.impact)
        # codon-level attributes
        lref_codon.append(effect.ref_codon)
        lalt_codon.append(effect.alt_codon)
        # amino-acid-level attributes
        laa_pos.append(effect.aa_pos)
        lref_aa.append(effect.ref_aa)
        lalt_aa.append(effect.alt_aa)
        laa_change.append(effect.aa_change)

In [None]:
len(pos)

In [None]:
len(lpos)

In [None]:
# Queried positions that never show up as the ref_start of a returned effect.
s = set(lpos)
missing = [queried for queried in pos if queried not in s]

In [None]:
len(missing) + len(lpos)

In [None]:
missing

In [None]:
%%time
# Print every effect reported for one SNP (3R:28597652 G>A) on the `gste2`
# transcript.
for effect in ann.get_effects(
    chrom='3R',
    pos=28597652,
    ref='G',
    alt='A',
    transcript_ids=[gste2],
):
    print(effect)

In [None]:
e = ag3.snp_single_effect('3R', pos=28598166, ref='A', alt='G', transcript=[gste2])
e

In [None]:
type(e)

In [None]:
# Query the 3R:28597652 G>A SNP through ag3.snp_single_effect and display the
# result.
e = ag3.snp_single_effect(
    '3R',
    pos=28597652,
    ref='G',
    alt='A',
    transcript=[gste2],
)
e

In [None]:
assert e.effect == 'UTR_VARIANT'

# working out test parameters

### reverse strand

In [None]:
df = ag3.snp_effects(gste2, site_mask)

In [None]:
a = df.columns[:]

In [None]:
b = ['position', 'ref_allele', 'alt_alleles', 'effect', 'impact',
       'ref_codon', 'alt_codon', 'aa_pos', 'ref_aa', 'alt_aa', 'aa_change']

In [None]:
assert isinstance(df, pd.DataFrame)

In [None]:
for e in b:
    assert e in a

In [None]:
assert df.shape == (2838, 11)

In [None]:
df[df.effect == 'FIVE_PRIME_UTR']

In [None]:
df.iloc[1451]

In [None]:
assert df.iloc[1451].aa_change == 'I114T'

In [None]:
df = ag3.snp_effects(gste2, site_mask)

In [None]:
df

### forward strand

In [None]:
gste6 = "AGAP009196-RA"

In [None]:
df = ag3.snp_effects(gste6, site_mask)

In [None]:
df.shape

In [None]:
df[900:]

### introns in 5' UTRs

In [14]:
geneset[geneset.Parent == "AGAP004679-RB"]

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,ID,Parent,Name
34,2L,VectorBase,exon,207894,207953,,+,,,AGAP004679-RB,AGAP004679-RB-E1
35,2L,VectorBase,five_prime_UTR,207894,207953,,+,,,AGAP004679-RB,
36,2L,VectorBase,five_prime_UTR,208393,208416,,+,,,AGAP004679-RB,
37,2L,VectorBase,exon,208393,208581,,+,,,AGAP004679-RB,AGAP004679-RB-E2
38,2L,VectorBase,CDS,208417,208581,,+,0.0,AGAP004679-PB,AGAP004679-RB,
39,2L,VectorBase,exon,208639,208824,,+,,,AGAP004679-RB,AGAP004679-RB-E3
40,2L,VectorBase,CDS,208639,208824,,+,0.0,AGAP004679-PB,AGAP004679-RB,
41,2L,VectorBase,exon,208897,209394,,+,,,AGAP004679-RB,AGAP004679-RB-E4
42,2L,VectorBase,CDS,208897,209394,,+,0.0,AGAP004679-PB,AGAP004679-RB,
43,2L,VectorBase,exon,209473,209816,,+,,,AGAP004679-RB,AGAP004679-RB-E5


In [11]:
# Transcript used for the "introns in 5' UTRs" check: the geneset lists two
# five_prime_UTR features for it (207894-207953 and 208393-208416) with a gap
# between them.
utrintron5 = "AGAP004679-RB"
df = ag3.snp_effects(utrintron5, site_mask)

transcript : AGAP004679-RB
chromosome : 2L 
start : 207894
stop : 210460
strand : +


In [12]:
df[178:]

Unnamed: 0,position,ref_allele,alt_alleles,effect,impact,ref_codon,alt_codon,aa_pos,ref_aa,alt_aa,aa_change
178,207953,T,C,FIVE_PRIME_UTR,LOW,,,,,,
179,207953,T,G,FIVE_PRIME_UTR,LOW,,,,,,
180,207954,G,A,INTRAGENIC,LOW,,,,,,
181,207954,G,C,INTRAGENIC,LOW,,,,,,
182,207954,G,T,INTRAGENIC,LOW,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
7681,210459,A,T,THREE_PRIME_UTR,LOW,,,,,,
7682,210459,A,G,THREE_PRIME_UTR,LOW,,,,,,
7683,210460,G,A,THREE_PRIME_UTR,LOW,,,,,,
7684,210460,G,C,THREE_PRIME_UTR,LOW,,,,,,


In [None]:
df[1490:]

### introns in 3' UTRs

In [None]:
# NOTE(review): transcript ID jotted down here, presumably another candidate
# for the 3' UTR intron check — confirm. Kept as a comment because the bare
# text is evaluated as `AGAP028431 - RA` and raises NameError.
# AGAP028431-RA

In [29]:
# Transcript used for the "introns in 3' UTRs" check.
utrintron3 = "AGAP029346-RA"
# same value as the one assigned to site_mask in an earlier cell
site_mask = 'gamb_colu'
df = ag3.snp_effects(utrintron3, site_mask)

transcript : AGAP029346-RA
chromosome : 2L 
start : 5605327
stop : 5606838
strand : -


In [None]:
# Pasted feature rows (type, start, end), presumably the three_prime_UTR
# features of AGAP029346-RA — confirm against the geneset. Kept as comments
# because the raw rows are not valid Python.
# three_prime_UTR	5605327	5605863
# three_prime_UTR	5605952	5606353

In [58]:
df[1590:]

Unnamed: 0,position,ref_allele,alt_alleles,effect,impact,ref_codon,alt_codon,aa_pos,ref_aa,alt_aa,aa_change
1590,5605863,T,A,THREE_PRIME_UTR,LOW,,,,,,
1591,5605863,T,C,THREE_PRIME_UTR,LOW,,,,,,
1592,5605863,T,G,THREE_PRIME_UTR,LOW,,,,,,
1593,5605864,C,A,INTRAGENIC,LOW,,,,,,
1594,5605864,C,T,INTRAGENIC,LOW,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
4303,5606837,G,C,FIVE_PRIME_UTR,LOW,,,,,,
4304,5606837,G,T,FIVE_PRIME_UTR,LOW,,,,,,
4305,5606838,A,C,FIVE_PRIME_UTR,LOW,,,,,,
4306,5606838,A,T,FIVE_PRIME_UTR,LOW,,,,,,


# SNP allele frequencies

In [None]:
%%time
df_meta, gt = ag3.snp_allele_frequencies("AGAP004707-RD", "gamb_colu")

In [None]:
df

In [None]:
len(gt) *3

In [None]:
# NOTE(review): 'AG1000G-UG' looks like a sample-set identifier rather than a
# country name — confirm whether this filter should use a sample-set column.
df_meta.loc[df_meta.country == 'AG1000G-UG']
