# Let's have a look at Vgsc SNPs
- Look at gamb_colu and arab separately

### setup

In [1]:
import malariagen_data
import pandas as pd
import numpy as np
import bokeh.plotting as bkplt
import bokeh.models as bkmod
import bokeh.layouts as bklay
from bokeh.layouts import column
from bokeh.io import curdoc
import bokeh.io as bkio

In [21]:
# Uncomment to render bokeh plots inline in the notebook output.
#bkio.output_notebook()
# Alternative built-in theme, kept for quick switching.
#curdoc().theme = 'dark_minimal'
# Apply bokeh's built-in 'contrast' theme to the current document.
curdoc().theme = 'contrast'

In [3]:
# Open the Ag3 release bucket through the "simplecache::" URL prefix; the
# cache is configured with cache_storage="gcs_cache".
ag3 = malariagen_data.Ag3(
    "simplecache::gs://vo_agam_release/",
    simplecache={"cache_storage": "gcs_cache"},
)

In [4]:
# Transcript ID we have used previously for vgsc; passed as `transcript=` to
# the ag3 calls below and to plot_transcript_variation.
vgsc = "AGAP004707-RD"


### gamb_colu

In [5]:
# Get SNP effects for the vgsc transcript using the 'gamb_colu' site mask.
veff_gc_df = ag3.snp_effects(transcript=vgsc, site_mask='gamb_colu')

In [6]:
# Sample metadata for the "v3_wild" sample sets; show which columns are
# available for defining populations.
metadata_df = ag3.sample_metadata(sample_sets="v3_wild")
metadata_df.columns

Index(['sample_id', 'partner_sample_id', 'contributor', 'country', 'location',
       'year', 'month', 'latitude', 'longitude', 'sex_call', 'sample_set',
       'release', 'aim_fraction_colu', 'aim_fraction_arab',
       'species_gambcolu_arabiensis', 'species_gambiae_coluzzii', 'species'],
      dtype='object')

In [7]:
# List the countries present in the metadata - the starting point for
# defining populations.
metadata_df.country.unique()

array(['Angola', 'Burkina Faso', 'Democratic Republic of Congo',
       'Central African Republic', "Cote d'Ivoire", 'Cameroon', 'Mayotte',
       'Gabon', 'Ghana', 'Gambia, The', 'Guinea', 'Mali',
       'Equatorial Guinea', 'Guinea-Bissau', 'Kenya', 'Malawi',
       'Mozambique', 'Tanzania', 'Uganda'], dtype=object)

In [8]:
# Count samples for every country + species combination - this will be our
# initial population resolution. The count ends up in the 'year' column
# (number of non-null 'year' entries per group).
uni_df = (
    metadata_df
    .groupby(['country', 'species'])['year']
    .count()
    .reset_index()
)
# Drop arabiensis and populations with fewer than 20 samples.
uni_df = uni_df[(uni_df['species'].astype(str) != "arabiensis") & (uni_df['year'] >= 20)]
len(uni_df), uni_df

(25,
                          country                        species  year
 0                         Angola                       coluzzii    81
 2                   Burkina Faso                       coluzzii   135
 3                   Burkina Faso                        gambiae   157
 6                       Cameroon                       coluzzii    26
 7                       Cameroon                        gambiae   416
 9       Central African Republic                        gambiae    55
 10                 Cote d'Ivoire                       coluzzii    80
 11  Democratic Republic of Congo                        gambiae    76
 13                         Gabon                        gambiae    69
 14                   Gambia, The                       coluzzii   169
 15                   Gambia, The                        gambiae    69
 16                   Gambia, The  intermediate_gambiae_coluzzii    41
 17                         Ghana                       coluzzii    64
 

In [9]:
# Build {population label: pandas query string} for each retained
# country/species pair. The country is wrapped in triple quotes so that names
# containing an apostrophe (e.g. Cote d'Ivoire) still give a valid query.
populations_gc = {
    f"{country} - {species}": f"country == '''{country}''' and species == '{species}'"
    for country, species in zip(uni_df['country'], uni_df['species'])
}

In [10]:
# Inspect the generated labels and query strings.
populations_gc

{'Angola - coluzzii': "country == '''Angola''' and species == 'coluzzii'",
 'Burkina Faso - coluzzii': "country == '''Burkina Faso''' and species == 'coluzzii'",
 'Burkina Faso - gambiae': "country == '''Burkina Faso''' and species == 'gambiae'",
 'Cameroon - coluzzii': "country == '''Cameroon''' and species == 'coluzzii'",
 'Cameroon - gambiae': "country == '''Cameroon''' and species == 'gambiae'",
 'Central African Republic - gambiae': "country == '''Central African Republic''' and species == 'gambiae'",
 "Cote d'Ivoire - coluzzii": "country == '''Cote d'Ivoire''' and species == 'coluzzii'",
 'Democratic Republic of Congo - gambiae': "country == '''Democratic Republic of Congo''' and species == 'gambiae'",
 'Gabon - gambiae': "country == '''Gabon''' and species == 'gambiae'",
 'Gambia, The - coluzzii': "country == '''Gambia, The''' and species == 'coluzzii'",
 'Gambia, The - gambiae': "country == '''Gambia, The''' and species == 'gambiae'",
 'Gambia, The - intermediate_gambiae_coluzz

In [11]:
# Get allele frequencies for the vgsc transcript, one frequency column per
# population label in populations_gc (same 'gamb_colu' site mask as the
# SNP effects above).
allele_freq_df = ag3.snp_allele_frequencies(transcript=vgsc, 
                           populations=populations_gc, 
                           site_mask="gamb_colu")

In [12]:
# Attach the effect annotations to the allele frequencies: inner join on the
# variant key (position plus ref/alt allele).
merged_df = veff_gc_df.merge(
    allele_freq_df,
    on=['position', 'ref_allele', 'alt_allele'],
    how='inner',
)

In [13]:
# Preview the merged effects + frequencies table.
merged_df

Unnamed: 0,position,ref_allele,alt_allele,effect,impact,ref_codon,alt_codon,aa_pos,ref_aa,alt_aa,...,Guinea-Bissau - intermediate_gambiae_coluzzii,Kenya - gambiae,Kenya - intermediate_gambiae_coluzzii,Mali - coluzzii,Mali - gambiae,Mayotte - gambiae,Mozambique - gambiae,Tanzania - gambiae,Uganda - gambiae,maximum
0,2358161,A,G,NON_SYNONYMOUS_CODING,MODERATE,Acc,Gcc,2.0,T,A,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.004065
1,2358163,C,G,SYNONYMOUS_CODING,LOW,acC,acG,2.0,T,T,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.019231
2,2358164,G,A,NON_SYNONYMOUS_CODING,MODERATE,Gaa,Aaa,3.0,E,K,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.006579
3,2358168,A,T,NON_SYNONYMOUS_CODING,MODERATE,gAc,gTc,4.0,D,V,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.001202
4,2358168,A,G,NON_SYNONYMOUS_CODING,MODERATE,gAc,gGc,4.0,D,G,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.001202
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14117,2431601,G,A,NON_SYNONYMOUS_CODING,MODERATE,cGa,cAa,2114.0,R,Q,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.017241
14118,2431601,G,T,NON_SYNONYMOUS_CODING,MODERATE,cGa,cTa,2114.0,R,L,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.002415,0.002415
14119,2431609,G,T,NON_SYNONYMOUS_CODING,MODERATE,Gat,Tat,2117.0,D,Y,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.027027,0.0,0.000000,0.027027
14120,2431613,T,C,NON_SYNONYMOUS_CODING,MODERATE,gTc,gCc,2118.0,V,A,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.006173


In [14]:
# Keep only variants annotated with HIGH or MODERATE impact ...
filter_df = merged_df.loc[merged_df['impact'].isin(['HIGH', 'MODERATE'])].copy()
# ... and of those, only rows whose 'maximum' column is above zero
# (presumably the highest frequency across the populations - confirm).
freq_filtered_df = filter_df.loc[filter_df['maximum'] > 0].copy()

In [15]:
# Add the position scaled by 1e6 (Mbp); plot_transcript_variation uses the
# 'pos_Mbp' column as the x coordinate of the scatter.
freq_filtered_df['pos_Mbp'] = freq_filtered_df.position/1e6

In [16]:
def plot_transcript_variation(freq_df, col_dict, transcript, populations, width=1600, height=800):
    """Plot per-population allele frequencies above the transcript's gene model.

    Builds two bokeh figures that share one x range and shows them stacked in
    a column via ``bkplt.show``:

    * a scatter of frequency against position (Mbp) with one glyph series per
      population, a hover tool and a click-to-hide legend placed to the right;
    * a gene-model track for ``transcript`` (exons, introns, UTRs, CDSs).

    The gene annotations are read from the notebook-level ``ag3`` object
    (``ag3.geneset()``), so ``ag3`` must exist before this is called.

    Parameters
    ----------
    freq_df : pandas.DataFrame
        One row per variant. Must contain a ``pos_Mbp`` column and one
        frequency column per entry of ``populations``. The columns
        ``position``, ``aa_change`` and ``effect`` feed the hover tooltips
        when present.
    col_dict : mapping
        Population name -> colour for that population's glyphs.
    transcript : str
        Transcript ID; used in the title and to select rows of the geneset
        (``feature_id`` for the transcript itself, ``parent_id`` for its
        child features).
    populations : iterable of str
        Population names to plot. Only iterated, so a dict keyed by
        population name works as well.
    width, height : int
        Size in pixels of the scatter figure; the gene-model figure uses the
        same width and a fifth of the height.

    Returns
    -------
    pandas.DataFrame
        The geneset rows whose ``feature_id`` equals ``transcript``.

    Raises
    ------
    KeyError
        If a population is missing from ``col_dict`` or from ``freq_df``.
    ValueError
        From ``Series.item()`` if the geneset does not hold exactly one row
        for ``transcript``.
    """
    # Both figures offer the same x-only navigation tools.
    tools = 'xpan,xzoom_in,xzoom_out,xwheel_zoom,reset,hover'

    # SCATTER
    scatter_tooltips = [
        ("population", '@population'),
        ("frequency", '@frequency'),
        ("position", '@position'),
        ("aa change", '@aa_change'),
        ("effect", '@effect')
    ]

    fig1 = bkplt.figure(
        title=f'Transcript - {transcript}',
        tools=tools,
        active_scroll='xwheel_zoom',
        active_drag='xpan',
        plot_width=width,
        plot_height=height,
        tooltips=scatter_tooltips,
        toolbar_location="above")

    # Only these columns (plus the population's own frequency column) are
    # read by the glyphs and tooltips. Restricting each data source to them
    # avoids embedding the full wide frame once per population.
    shared_columns = [name for name in ("pos_Mbp", "position", "aa_change", "effect")
                      if name in freq_df.columns]

    for pop in populations:
        pop_df = freq_df[shared_columns + [pop]].copy()
        # Uniform column names so a single tooltip spec serves every series.
        pop_df['frequency'] = pop_df[pop]
        pop_df['population'] = pop
        fig1.circle("pos_Mbp",
                    pop, size=10,
                    color=col_dict[pop],
                    alpha=0.8,
                    source=pop_df,
                    legend_label=pop)

    fig1.yaxis.axis_label = 'population nucleotide frequency'
    fig1.xaxis.visible = False
    # A legend only exists once at least one series was drawn; skip the
    # relocation instead of failing with IndexError on an empty selection.
    if fig1.legend:
        fig1.add_layout(fig1.legend[0], 'right')
        fig1.legend.click_policy = "hide"

    # GENE MODEL
    df_geneset = ag3.geneset()
    df_transcripts = df_geneset.query(f"feature_id == '{transcript}'")

    # define tooltips for hover
    gene_tooltips = [
        ("Type", '@type'),
        ("Location", '@seqid:@start{,}..@stop{,}'),
    ]

    # Share fig1's x range so panning/zooming keeps both figures aligned.
    fig2 = bkplt.figure(
        plot_width=width,
        plot_height=int(height / 5),
        tools=tools,
        active_scroll='xwheel_zoom',
        active_drag='xpan',
        tooltips=gene_tooltips,
        x_range=fig1.x_range,
    )

    # Child features of the transcript, as boxes centred on y = 0.
    data = df_geneset.query(f"parent_id == '{transcript}'").copy()
    data['left'] = data['start'] / 1e6  # plot in Mbp coordinates
    data['right'] = data['stop'] / 1e6  # plot in Mbp coordinates
    data['bottom'] = 0 - 0.4
    data['top'] = 0 + 0.4

    # plot exons
    exons = data.query("type == 'exon'")
    fig2.quad(bottom='bottom',
              top='top',
              left='left',
              right='right',
              source=exons,
              fill_color=None,
              line_color='black',
              line_width=.5,
              fill_alpha=0)

    # plot introns: a small peak between each pair of consecutive exon rows
    # (uses the row order of the geneset as given)
    for left_edge, right_edge in zip(exons[:-1]['right'], exons[1:]['left']):
        midpoint = (left_edge + right_edge) / 2
        fig2.line([left_edge, midpoint, right_edge], [0, 0 + .1, 0],
                  line_width=1, line_color="black")

    # plot UTRs
    fig2.quad(bottom='bottom', top='top', left='left', right='right',
              source=data.query("type == 'five_prime_UTR'"),
              fill_color='green', line_width=0, fill_alpha=.5)
    fig2.quad(bottom='bottom', top='top', left='left', right='right',
              source=data.query("type == 'three_prime_UTR'"),
              fill_color='red', line_width=0, fill_alpha=.5)

    # plot CDSs
    fig2.quad(bottom='bottom', top='top', left='left', right='right',
              source=data.query("type == 'CDS'"),
              fill_color='blue', line_width=0, fill_alpha=.5)

    fig2.y_range = bkmod.Range1d(-.5, len(df_transcripts) - .5)
    fig2.xaxis.axis_label = f'Contig {df_transcripts.seqid.item()} position (Mbp)'
    # One y tick per transcript row, labelled with the transcript ID.
    yticks = list(range(len(df_transcripts)))
    fig2.yaxis.ticker = yticks
    fig2.yaxis.major_label_overrides = dict(zip(yticks, df_transcripts.feature_id))
    fig2.toolbar.logo = None
    fig2.toolbar_location = None

    bkplt.show(column(fig1, fig2))
    return df_transcripts

In [17]:
# Colour used for each population's glyphs in the frequency scatter, keyed by
# the "<country> - <species>" population label.
col_dict = {
    "Angola - coluzzii": "maroon",
    "Burkina Faso - coluzzii": "tomato",
    "Burkina Faso - gambiae": "lightskyblue",
    "Cameroon - coluzzii": "crimson",
    "Cameroon - gambiae": "blue",
    "Central African Republic - gambiae": "midnightblue",
    "Cote d'Ivoire - coluzzii": "lightsalmon",
    "Democratic Republic of Congo - gambiae": "darkviolet",
    "Gabon - gambiae": "seagreen",
    "Gambia, The - coluzzii": "orangered",
    "Gambia, The - gambiae": "palevioletred",
    "Gambia, The - intermediate_gambiae_coluzzii": "darkorange",
    "Ghana - coluzzii": "firebrick",
    "Ghana - gambiae": "steelblue",
    "Guinea - gambiae": "powderblue",
    "Guinea-Bissau - gambiae": "hotpink",
    "Guinea-Bissau - intermediate_gambiae_coluzzii": "gold",
    "Kenya - gambiae": "lime",
    "Kenya - intermediate_gambiae_coluzzii": "yellow",
    "Mali - coluzzii": "brown",
    "Mali - gambiae": "darkturquoise",
    "Mayotte - gambiae": "mediumspringgreen",
    "Mozambique - gambiae": "darkolivegreen",
    "Tanzania - gambiae": "yellowgreen",
    "Uganda - gambiae": "palegreen",
}

In [22]:
# Draw the plot. populations_gc is a dict, so the function iterates over its
# keys (the population labels); the returned geneset rows for the transcript
# are kept in df_transcript.
df_transcript = plot_transcript_variation(freq_df=freq_filtered_df, col_dict=col_dict, transcript=vgsc, populations=populations_gc)