In [1]:
import malariagen_data
import numpy as np
import pandas as pd

In [2]:
# !poetry add plotly
import plotly.express as px

### setup malariagen_data and parameters

In [3]:
# simplecache allows local cacheing
ag3 = malariagen_data.Ag3("simplecache::gs://vo_agam_release/")

In [4]:
# RDL transcript
transcript = "AGAP006028-RA"
site_mask = "gamb_colu" # other options...
sample_sets = "v3_wild" # explain?

In [15]:
# snp_allele_frequencies requires a population parameter in the form of a dictionary.
# Here we want the allele frequencies from all Kenya samples and from Burkina Faso 
# An. coluzzii samples collected in 2012
populations = {
    "Kenya": "country == 'Ghana' and species == 'gambiae'",
    "Burkina Faso": "country == 'Burkina Faso' and species == 'gambiae'",
    "Mayotte": "country == 'Mayotte'",
}

### generate statistic dataframes

In [17]:
%%time
snp_effects_df = ag3.snp_effects(transcript, site_mask)

transcript : AGAP006028-RA
chromosome : 2L 
start : 25363652
stop : 25434556
strand : +
CPU times: user 52.4 s, sys: 302 ms, total: 52.7 s
Wall time: 52.5 s


In [18]:
snp_effects_df.head()

Unnamed: 0,position,ref_allele,alt_allele,effect,impact,ref_codon,alt_codon,aa_pos,ref_aa,alt_aa,aa_change
0,25363652,A,C,START_LOST,HIGH,Atg,Ctg,1.0,M,L,M1L
1,25363652,A,T,START_LOST,HIGH,Atg,Ttg,1.0,M,L,M1L
2,25363652,A,G,START_LOST,HIGH,Atg,Gtg,1.0,M,V,M1V
3,25363653,T,A,NON_SYNONYMOUS_CODING,MODERATE,aTg,aAg,1.0,M,K,M1K
4,25363653,T,C,NON_SYNONYMOUS_CODING,MODERATE,aTg,aCg,1.0,M,T,M1T


In [20]:
%%time
allele_freq_df = ag3.snp_allele_frequencies(transcript=transcript, 
                           populations=populations, 
                           site_mask=site_mask, 
                           sample_sets=sample_sets,
                           drop_invariants=False)

transcript : AGAP006028-RA
chromosome : 2L 
start : 25363652
stop : 25434556
strand : +
CPU times: user 9.66 s, sys: 2.45 s, total: 12.1 s
Wall time: 6.23 s


In [21]:
allele_freq_df.head()

Unnamed: 0,position,ref_allele,alt_allele,Kenya,Burkina Faso,Mayotte,maximum
0,25363652,A,C,0.0,0.0,0.0,0.0
1,25363652,A,T,0.0,0.0,0.0,0.0
2,25363652,A,G,0.0,0.0,0.0,0.0
3,25363653,T,A,0.0,0.0,0.0,0.0
4,25363653,T,C,0.0,0.0,0.0,0.0


### join dataframes together so we can do some filtering

In [35]:
merged_df = pd.merge(snp_effects_df, allele_freq_df, on=['position', 'ref_allele', 'alt_allele'])

In [36]:
merged_df

Unnamed: 0,position,ref_allele,alt_allele,effect,impact,ref_codon,alt_codon,aa_pos,ref_aa,alt_aa,aa_change,Kenya,Burkina Faso,Mayotte,maximum
0,25363652,A,C,START_LOST,HIGH,Atg,Ctg,1.0,M,L,M1L,0.0,0.0,0.0,0.0
1,25363652,A,T,START_LOST,HIGH,Atg,Ttg,1.0,M,L,M1L,0.0,0.0,0.0,0.0
2,25363652,A,G,START_LOST,HIGH,Atg,Gtg,1.0,M,V,M1V,0.0,0.0,0.0,0.0
3,25363653,T,A,NON_SYNONYMOUS_CODING,MODERATE,aTg,aAg,1.0,M,K,M1K,0.0,0.0,0.0,0.0
4,25363653,T,C,NON_SYNONYMOUS_CODING,MODERATE,aTg,aCg,1.0,M,T,M1T,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168829,25434555,A,T,STOP_LOST,HIGH,tAa,tTa,556.0,*,L,*556L,0.0,0.0,0.0,0.0
168830,25434555,A,G,SYNONYMOUS_CODING,LOW,tAa,tGa,556.0,*,*,*556*,0.0,0.0,0.0,0.0
168831,25434556,A,C,STOP_LOST,HIGH,taA,taC,556.0,*,Y,*556Y,0.0,0.0,0.0,0.0
168832,25434556,A,T,STOP_LOST,HIGH,taA,taT,556.0,*,Y,*556Y,0.0,0.0,0.0,0.0


### filter down to variants-of-interest

In [37]:
# #let's keep  variants that have MODERATE (e.g. NON-SYN) or HIGH (e.g. STOP-LOST) impact.
merged_df = merged_df[(merged_df.impact == "HIGH") | (merged_df.impact == "MODERATE")].copy()

In [38]:
merged_df

Unnamed: 0,position,ref_allele,alt_allele,effect,impact,ref_codon,alt_codon,aa_pos,ref_aa,alt_aa,aa_change,Kenya,Burkina Faso,Mayotte,maximum
0,25363652,A,C,START_LOST,HIGH,Atg,Ctg,1.0,M,L,M1L,0.0,0.0,0.0,0.0
1,25363652,A,T,START_LOST,HIGH,Atg,Ttg,1.0,M,L,M1L,0.0,0.0,0.0,0.0
2,25363652,A,G,START_LOST,HIGH,Atg,Gtg,1.0,M,V,M1V,0.0,0.0,0.0,0.0
3,25363653,T,A,NON_SYNONYMOUS_CODING,MODERATE,aTg,aAg,1.0,M,K,M1K,0.0,0.0,0.0,0.0
4,25363653,T,C,NON_SYNONYMOUS_CODING,MODERATE,aTg,aCg,1.0,M,T,M1T,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168827,25434553,G,T,NON_SYNONYMOUS_CODING,MODERATE,aaG,aaT,555.0,K,N,K555N,0.0,0.0,0.0,0.0
168828,25434555,A,C,STOP_LOST,HIGH,tAa,tCa,556.0,*,S,*556S,0.0,0.0,0.0,0.0
168829,25434555,A,T,STOP_LOST,HIGH,tAa,tTa,556.0,*,L,*556L,0.0,0.0,0.0,0.0
168831,25434556,A,C,STOP_LOST,HIGH,taA,taC,556.0,*,Y,*556Y,0.0,0.0,0.0,0.0


In [39]:
#next, we can remove invariant positions (where maximum allele frequency is >0)
freq_filtered_df = merged_df[merged_df.maximum > 0].copy()

In [61]:
freq_filtered_df.columns

Index(['position', 'ref_allele', 'alt_allele', 'effect', 'impact', 'ref_codon',
       'alt_codon', 'aa_pos', 'ref_aa', 'alt_aa', 'aa_change', 'Kenya',
       'Burkina Faso', 'Mayotte', 'maximum'],
      dtype='object')

In [70]:
freq_filtered_df.shape

(35, 15)

### plot

In [95]:
# first we need a plotly friendly dataframe...
plotly_cols = {
    'position' : np.tile(freq_filtered_df.position, len(populations)),
    'effect' : np.tile(freq_filtered_df.effect, len(populations)),
    'aa_change' : np.tile(freq_filtered_df.aa_change, len(populations)),
    'frequency' : [],
    'population' : []
}

for pop in freq_filtered_df[populations]:
    for freq in freq_filtered_df[pop]:
        plotly_cols['frequency'].append(freq)
        plotly_cols['population'].append(pop)
        
plotly_df = pd.DataFrame(plotly_cols)

In [97]:
fig = px.scatter(plotly_df, x="position", y="frequency", color="population", symbol='effect', hover_data=["population", "frequency", "effect", "aa_change"])
fig.show()

In [98]:
# now build in 'normal' plotly with the gene model representation
# will the the gff to get CDS, UTR