In [50]:
import malariagen_data
import numpy as np
import pandas as pd

In [60]:
# !poetry add plotly
import plotly.express as px

### setup malariagen_data and parameters

In [2]:
# The "simplecache::" prefix enables local caching of the data read from the bucket.
release_url = "simplecache::gs://vo_agam_release/"
ag3 = malariagen_data.Ag3(release_url)

In [3]:
# Analysis parameters shared by the query cells further down.

# Sample sets passed to the allele-frequency query.
# NOTE(review): "v3_wild" presumably selects the wild-caught samples of the release - confirm and document.
sample_sets = "v3_wild"

# Site mask passed to both the SNP-effect and the allele-frequency queries.
# NOTE(review): other mask names exist - list the accepted values here once confirmed.
site_mask = "gamb_colu"

# Vgsc transcript (the one MalariaGEN takes its codon numbering from).
transcript = "AGAP004707-RD"

In [34]:
# snp_allele_frequencies takes its populations as a dictionary mapping a short label
# (used as the frequency column name in the result) to a sample-metadata query.
# Here: An. gambiae samples from Ghana, and all samples from Mayotte.
# NOTE(review): the "ke" label is kept because later cells read the `ke` column, but
# its query selects Ghana, not Kenya - confirm which of the two is intended.
ghana_gambiae_query = "country == 'Ghana' and species == 'gambiae'"
mayotte_query = "country == 'Mayotte'"
populations = {"ke": ghana_gambiae_query, "my": mayotte_query}

### generate statistic dataframes

In [35]:
# Predicted effect of each SNP in the transcript (one row per position / alt allele).
snp_effects_df = ag3.snp_effects(transcript=transcript, site_mask=site_mask)

transcript : AGAP004707-RD
chromosome : 2L 
start : 2358158
stop : 2431617
strand : +


In [36]:
# Preview the first rows of the SNP-effect table.
snp_effects_df.head()

Unnamed: 0,position,ref_allele,alt_allele,effect,impact,ref_codon,alt_codon,aa_pos,ref_aa,alt_aa,aa_change
0,2358158,A,C,START_LOST,HIGH,Atg,Ctg,1.0,M,L,M1L
1,2358158,A,T,START_LOST,HIGH,Atg,Ttg,1.0,M,L,M1L
2,2358158,A,G,START_LOST,HIGH,Atg,Gtg,1.0,M,V,M1V
3,2358159,T,A,NON_SYNONYMOUS_CODING,MODERATE,aTg,aAg,1.0,M,K,M1K
4,2358159,T,C,NON_SYNONYMOUS_CODING,MODERATE,aTg,aCg,1.0,M,T,M1T


In [38]:
# Alt-allele frequency of each SNP in every population defined in `populations`.
# drop_invariants=False: presumably keeps SNPs whose frequency is zero in all
# populations (the preview rows are all 0.0) - TODO confirm against the API docs.
allele_freq_df = ag3.snp_allele_frequencies(
    transcript=transcript,
    populations=populations,
    site_mask=site_mask,
    sample_sets=sample_sets,
    drop_invariants=False,
)

transcript : AGAP004707-RD
chromosome : 2L 
start : 2358158
stop : 2431617
strand : +


In [39]:
# Preview the first rows of the allele-frequency table.
allele_freq_df.head()

Unnamed: 0,position,ref_allele,alt_allele,ke,my,maximum
0,2358158,A,C,0.0,0.0,0.0
1,2358158,A,T,0.0,0.0,0.0
2,2358158,A,G,0.0,0.0,0.0
3,2358159,T,A,0.0,0.0,0.0
4,2358159,T,C,0.0,0.0,0.0


### join dataframes together so we can do some filtering

In [40]:
# A variant is identified by its position plus its ref and alt alleles; join the
# effect annotations to the frequencies on those three columns (default inner join).
variant_key = ['position', 'ref_allele', 'alt_allele']
merged_df = snp_effects_df.merge(allele_freq_df, on=variant_key)

In [41]:
# Display the merged table (pandas truncates the view of a frame this long).
merged_df

Unnamed: 0,position,ref_allele,alt_allele,effect,impact,ref_codon,alt_codon,aa_pos,ref_aa,alt_aa,aa_change,ke,my,maximum
0,2358158,A,C,START_LOST,HIGH,Atg,Ctg,1.0,M,L,M1L,0.0,0.0,0.0
1,2358158,A,T,START_LOST,HIGH,Atg,Ttg,1.0,M,L,M1L,0.0,0.0,0.0
2,2358158,A,G,START_LOST,HIGH,Atg,Gtg,1.0,M,V,M1V,0.0,0.0,0.0
3,2358159,T,A,NON_SYNONYMOUS_CODING,MODERATE,aTg,aAg,1.0,M,K,M1K,0.0,0.0,0.0
4,2358159,T,C,NON_SYNONYMOUS_CODING,MODERATE,aTg,aCg,1.0,M,T,M1T,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132301,2431616,G,C,STOP_LOST,HIGH,tGa,tCa,2119.0,*,S,*2119S,0.0,0.0,0.0
132302,2431616,G,T,STOP_LOST,HIGH,tGa,tTa,2119.0,*,L,*2119L,0.0,0.0,0.0
132303,2431617,A,C,STOP_LOST,HIGH,tgA,tgC,2119.0,*,C,*2119C,0.0,0.0,0.0
132304,2431617,A,T,STOP_LOST,HIGH,tgA,tgT,2119.0,*,C,*2119C,0.0,0.0,0.0


### filter down to variants-of-interest

In [44]:
# Keep only the variants annotated with MODERATE impact (e.g. NON_SYNONYMOUS_CODING)
# or HIGH impact (e.g. STOP_LOST).
impacts_of_interest = ["HIGH", "MODERATE"]
has_impact_of_interest = merged_df["impact"].isin(impacts_of_interest)
impact_filtered_df = merged_df[has_impact_of_interest].copy()

In [45]:
# Preview the first rows that passed the impact filter.
impact_filtered_df.head()

Unnamed: 0,position,ref_allele,alt_allele,effect,impact,ref_codon,alt_codon,aa_pos,ref_aa,alt_aa,aa_change,ke,my,maximum
0,2358158,A,C,START_LOST,HIGH,Atg,Ctg,1.0,M,L,M1L,0.0,0.0,0.0
1,2358158,A,T,START_LOST,HIGH,Atg,Ttg,1.0,M,L,M1L,0.0,0.0,0.0
2,2358158,A,G,START_LOST,HIGH,Atg,Gtg,1.0,M,V,M1V,0.0,0.0,0.0
3,2358159,T,A,NON_SYNONYMOUS_CODING,MODERATE,aTg,aAg,1.0,M,K,M1K,0.0,0.0,0.0
4,2358159,T,C,NON_SYNONYMOUS_CODING,MODERATE,aTg,aCg,1.0,M,T,M1T,0.0,0.0,0.0


In [47]:
# Next, drop the invariant rows: keep only variants whose `maximum` allele
# frequency is greater than zero, i.e. the alt allele was seen in at least one population.
is_variant = impact_filtered_df["maximum"] > 0
freq_filtered_df = impact_filtered_df.loc[is_variant].copy()

In [48]:
# Show the variants that remain after both the impact and the frequency filter.
freq_filtered_df

Unnamed: 0,position,ref_allele,alt_allele,effect,impact,ref_codon,alt_codon,aa_pos,ref_aa,alt_aa,aa_change,ke,my,maximum
33583,2382308,T,C,NON_SYNONYMOUS_CODING,MODERATE,gTa,gCa,177.0,V,A,V177A,0.013889,0.0,0.013889
80269,2403275,C,T,SPLICE_REGION,MODERATE,,,,,,,0.013889,0.0,0.013889
92962,2416877,A,T,NON_SYNONYMOUS_CODING,MODERATE,Atg,Ttg,757.0,M,L,M757L,0.0,0.065217,0.065217
93229,2416980,C,T,NON_SYNONYMOUS_CODING,MODERATE,aCg,aTg,791.0,T,M,T791M,0.388889,0.0,0.388889
105679,2422609,C,T,NON_SYNONYMOUS_CODING,MODERATE,tCc,tTc,981.0,S,F,S981F,0.0,0.021739,0.021739
105808,2422652,A,T,NON_SYNONYMOUS_CODING,MODERATE,ttA,ttT,995.0,L,F,L995F,1.0,0.0,1.0
126700,2429745,A,T,NON_SYNONYMOUS_CODING,MODERATE,Aat,Tat,1570.0,N,Y,N1570Y,0.111111,0.0,0.111111
128732,2430424,G,T,NON_SYNONYMOUS_CODING,MODERATE,Gcc,Tcc,1746.0,A,S,A1746S,0.375,0.0,0.375
129906,2430817,G,A,NON_SYNONYMOUS_CODING,MODERATE,Gtt,Att,1853.0,V,I,V1853I,0.305556,0.0,0.305556
130045,2430863,T,C,NON_SYNONYMOUS_CODING,MODERATE,aTa,aCa,1868.0,I,T,I1868T,0.097222,0.0,0.097222


### plot

In [68]:
# Scratch check of the population labels used for the plot: one 'k' per value in
# the `ke` column followed by one 'm' per value in the `my` column.
['k'] * len(freq_filtered_df.ke) + ['m'] * len(freq_filtered_df.my)

['k',
 'k',
 'k',
 'k',
 'k',
 'k',
 'k',
 'k',
 'k',
 'k',
 'k',
 'k',
 'm',
 'm',
 'm',
 'm',
 'm',
 'm',
 'm',
 'm',
 'm',
 'm',
 'm',
 'm']

In [73]:
# plotly express wants a long-format ("tidy") table: one row per
# (variant, population) pair rather than one frequency column per population.
# The frequency columns are taken from the keys of `populations`, so this works for
# any number of populations; the population label is the full key (e.g. "ke", "my").
plotly_cols = (
    freq_filtered_df
    .melt(
        id_vars=['position', 'effect'],
        value_vars=list(populations),  # one frequency column per population label
        var_name='population',
        value_name='frequency',
    )
    # keep the column order the plotting cells were written against
    .loc[:, ['position', 'effect', 'frequency', 'population']]
    # plain dict of column -> list, so it can still be passed to pd.DataFrame(...)
    .to_dict('list')
)

In [74]:
# Turn the column -> values mapping into the DataFrame handed to plotly.
plotly_df = pd.DataFrame(data=plotly_cols)

In [75]:
# Allele frequency along the transcript: colour separates the populations and the
# marker symbol separates the predicted effects. Title and axis labels are set so
# the figure can be read on its own.
fig = px.scatter(
    plotly_df,
    x="position",
    y="frequency",
    color="population",
    symbol="effect",
    title=f"SNP allele frequencies by population ({transcript})",
    labels={
        "position": "Genomic position (bp)",
        "frequency": "Alt allele frequency",
        "population": "Population",
        "effect": "Effect",
    },
)
fig.show()