# Let's have a look at Vgsc SNPs
- Look at gamb_colu and arab separately

### setup

In [1]:
import malariagen_data

In [2]:
# Open the Ag3 release bucket through the "simplecache" protocol prefix,
# with its cache_storage option set to "gcs_cache".
ag3 = malariagen_data.Ag3(
    "simplecache::gs://vo_agam_release/",
    simplecache={"cache_storage": "gcs_cache"},
)

In [3]:
# Transcript ID we have used previously for vgsc; it is passed as the
# `transcript=` argument to the ag3 calls in this notebook.
vgsc = "AGAP004707-RD"


### gamb_colu

In [4]:
# SNP effects for the vgsc transcript, using the gamb_colu site mask
veff_gc_df = ag3.snp_effects(
    transcript=vgsc,
    site_mask="gamb_colu",
)

In [5]:
# Load the sample metadata for the "v3_wild" sample sets, then list the
# available columns so we can see what we can group and filter on.
df_metadata = ag3.sample_metadata(sample_sets="v3_wild")
df_metadata.columns

Index(['sample_id', 'partner_sample_id', 'contributor', 'country', 'location',
       'year', 'month', 'latitude', 'longitude', 'sex_call', 'sample_set',
       'release', 'aim_fraction_colu', 'aim_fraction_arab',
       'species_gambcolu_arabiensis', 'species_gambiae_coluzzii', 'species'],
      dtype='object')

In [6]:
# list the distinct countries present in the sample metadata
# (populations are defined from these further down)
df_metadata["country"].unique()

array(['Angola', 'Burkina Faso', 'Democratic Republic of Congo',
       'Central African Republic', "Cote d'Ivoire", 'Cameroon', 'Mayotte',
       'Gabon', 'Ghana', 'Gambia, The', 'Guinea', 'Mali',
       'Equatorial Guinea', 'Guinea-Bissau', 'Kenya', 'Malawi',
       'Mozambique', 'Tanzania', 'Uganda'], dtype=object)

In [7]:
# Count rows per (country, species) pair - this is our initial population
# resolution. After .count() the 'year' column holds the number of non-null
# 'year' entries in each group, which is used here as the sample count.
# Then drop arabiensis and any population with fewer than 20 samples.
df_uni = (
    df_metadata[['country', 'species', 'year']]
    .groupby(['country', 'species'])
    .count()
    .reset_index()
    .loc[lambda counts: (counts['species'].astype(str) != "arabiensis")
                        & (counts['year'] >= 20)]
)
len(df_uni), df_uni

(25,
                          country                        species  year
 0                         Angola                       coluzzii    81
 2                   Burkina Faso                       coluzzii   135
 3                   Burkina Faso                        gambiae   157
 6                       Cameroon                       coluzzii    26
 7                       Cameroon                        gambiae   416
 9       Central African Republic                        gambiae    55
 10                 Cote d'Ivoire                       coluzzii    80
 11  Democratic Republic of Congo                        gambiae    76
 13                         Gabon                        gambiae    69
 14                   Gambia, The                       coluzzii   169
 15                   Gambia, The                        gambiae    69
 16                   Gambia, The  intermediate_gambiae_coluzzii    41
 17                         Ghana                       coluzzii    64
 

In [13]:
# Build a {"<country>_<species>": query string} dictionary, one entry per
# row of df_uni. The country is wrapped in triple single quotes so that a
# name containing an apostrophe (e.g. Cote d'Ivoire) does not end the
# quoted string early.
populations_gc = {
    f"{c}_{s}": f"country == '''{c}''' and species == '{s}'"
    for c, s in zip(df_uni.country, df_uni.species)
}

In [15]:
# show the population label -> query string mapping
populations_gc

{'Angola_coluzzii': "country == '''Angola''' and species == 'coluzzii'",
 'Burkina Faso_coluzzii': "country == '''Burkina Faso''' and species == 'coluzzii'",
 'Burkina Faso_gambiae': "country == '''Burkina Faso''' and species == 'gambiae'",
 'Cameroon_coluzzii': "country == '''Cameroon''' and species == 'coluzzii'",
 'Cameroon_gambiae': "country == '''Cameroon''' and species == 'gambiae'",
 'Central African Republic_gambiae': "country == '''Central African Republic''' and species == 'gambiae'",
 "Cote d'Ivoire_coluzzii": "country == '''Cote d'Ivoire''' and species == 'coluzzii'",
 'Democratic Republic of Congo_gambiae': "country == '''Democratic Republic of Congo''' and species == 'gambiae'",
 'Gabon_gambiae': "country == '''Gabon''' and species == 'gambiae'",
 'Gambia, The_coluzzii': "country == '''Gambia, The''' and species == 'coluzzii'",
 'Gambia, The_gambiae': "country == '''Gambia, The''' and species == 'gambiae'",
 'Gambia, The_intermediate_gambiae_coluzzii': "country == '''Gamb

In [16]:
# SNP allele frequencies across the vgsc transcript for each population
# defined in populations_gc, using the gamb_colu site mask
df_allele_freq = ag3.snp_allele_frequencies(
    transcript=vgsc,
    populations=populations_gc,
    site_mask="gamb_colu",
)

In [17]:
# show the allele frequency table
df_allele_freq

Unnamed: 0,position,ref_allele,alt_allele,Angola_coluzzii,Burkina Faso_coluzzii,Burkina Faso_gambiae,Cameroon_coluzzii,Cameroon_gambiae,Central African Republic_gambiae,Cote d'Ivoire_coluzzii,...,Guinea-Bissau_intermediate_gambiae_coluzzii,Kenya_gambiae,Kenya_intermediate_gambiae_coluzzii,Mali_coluzzii,Mali_gambiae,Mayotte_gambiae,Mozambique_gambiae,Tanzania_gambiae,Uganda_gambiae,maximum
11,2358161,A,G,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.004065
17,2358163,C,G,0.000000,0.0,0.000000,0.019231,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.019231
18,2358164,G,A,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.006579
31,2358168,A,T,0.000000,0.0,0.000000,0.000000,0.001202,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.001202
32,2358168,A,G,0.000000,0.0,0.000000,0.000000,0.001202,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.001202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132255,2431601,G,A,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.017241
132257,2431601,G,T,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.002415,0.002415
132281,2431609,G,T,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.027027,0.0,0.000000,0.027027
132292,2431613,T,C,0.006173,0.0,0.003185,0.000000,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.006173


In [None]:
# NOTE(review): `geneset` is not assigned in any cell visible in this notebook,
# so unless it is defined elsewhere this cell raises NameError on a fresh
# kernel - TODO define it (or remove this cell) before sharing.
geneset