In [2]:
import malariagen_data
import numpy as np
import pandas as pd

### set up malariagen_data and parameters

In [3]:
# Open the Ag3 data resource. The "simplecache::" URL prefix enables local
# caching of files fetched from the gs:// bucket.
ag3 = malariagen_data.Ag3("simplecache::gs://vo_agam_release/")

In [8]:
# Vgsc transcript (the one we use codon numbering from in MalariaGEN)
transcript = "AGAP004707-RD"
# Site mask passed to both snp_effects and snp_allele_frequencies below.
# NOTE(review): other accepted values are presumably "arab" and
# "gamb_colu_arab" — confirm against the malariagen_data documentation.
site_mask = "gamb_colu"
# Sample sets passed to snp_allele_frequencies below.
# NOTE(review): "v3_wild" presumably selects the wild-caught sample sets of
# the Ag3.0 release (i.e. no lab crosses) — confirm and document here.
sample_sets = "v3_wild"

In [6]:
# Populations for snp_allele_frequencies, given as a dictionary mapping a
# short label to a query string that selects samples (here by country, year
# and species). Two populations are defined:
#   - ke:               every sample from Kenya
#   - bf_bana_2012_col: An. coluzzii samples from Burkina Faso collected in 2012
# NOTE(review): the second label mentions "bana" but its query has no
# location condition — confirm whether a location filter was intended.
populations = dict(
    ke="country == 'Kenya'",
    bf_bana_2012_col="country == 'Burkina Faso' and year == 2012 and species == 'coluzzii'",
)

### generate statistics

In [11]:
# Build the SNP effects table for the chosen transcript and site mask.
# Arguments are passed by keyword, mirroring the snp_allele_frequencies call.
snp_effects_df = ag3.snp_effects(
    transcript=transcript,
    site_mask=site_mask,
)

transcript : AGAP004707-RD
chromosome : 2L 
start : 2358158
stop : 2431617
strand : +


In [12]:
# Display the SNP effects table: one row per position and alternate allele,
# with the predicted effect, impact and amino-acid change.
snp_effects_df

Unnamed: 0,position,ref_allele,alt_allele,effect,impact,ref_codon,alt_codon,aa_pos,ref_aa,alt_aa,aa_change
0,2358158,A,C,START_LOST,HIGH,Atg,Ctg,1.0,M,L,M1L
1,2358158,A,T,START_LOST,HIGH,Atg,Ttg,1.0,M,L,M1L
2,2358158,A,G,START_LOST,HIGH,Atg,Gtg,1.0,M,V,M1V
3,2358159,T,A,NON_SYNONYMOUS_CODING,MODERATE,aTg,aAg,1.0,M,K,M1K
4,2358159,T,C,NON_SYNONYMOUS_CODING,MODERATE,aTg,aCg,1.0,M,T,M1T
...,...,...,...,...,...,...,...,...,...,...,...
132301,2431616,G,C,STOP_LOST,HIGH,tGa,tCa,2119.0,*,S,*2119S
132302,2431616,G,T,STOP_LOST,HIGH,tGa,tTa,2119.0,*,L,*2119L
132303,2431617,A,C,STOP_LOST,HIGH,tgA,tgC,2119.0,*,C,*2119C
132304,2431617,A,T,STOP_LOST,HIGH,tgA,tgT,2119.0,*,C,*2119C


In [9]:
# Allele frequencies per population for the same transcript and site mask.
# drop_invariants=False is passed explicitly; the saved output below has the
# same row index range as snp_effects_df and includes rows whose frequency is
# 0.0 in every population.
# NOTE(review): presumably this is so the two tables can be joined row for
# row in the next section — confirm.
allele_freq_df = ag3.snp_allele_frequencies(
    transcript=transcript,
    populations=populations,
    site_mask=site_mask,
    sample_sets=sample_sets,
    drop_invariants=False,
)

transcript : AGAP004707-RD
chromosome : 2L 
start : 2358158
stop : 2431617
strand : +


In [10]:
# Display the allele frequency table: one frequency column per key in
# `populations`, plus a `maximum` column.
allele_freq_df

Unnamed: 0,pos,ref_allele,alt_allele,ke,bf_bana_2012_col,maximum
0,2358158,A,C,0.0,0.0,0.0
1,2358158,A,T,0.0,0.0,0.0
2,2358158,A,G,0.0,0.0,0.0
3,2358159,T,A,0.0,0.0,0.0
4,2358159,T,C,0.0,0.0,0.0
...,...,...,...,...,...,...
132301,2431616,G,C,0.0,0.0,0.0
132302,2431616,G,T,0.0,0.0,0.0
132303,2431617,A,C,0.0,0.0,0.0
132304,2431617,A,T,0.0,0.0,0.0


### join dataframes together so we can do some filtering