In [1]:
import malariagen_data
import numpy as np
import pandas as pd

### setup malariagen_data and parameters

In [7]:
# simplecache enables local caching: downloaded files are kept under "gcs_cache"
ag3 = malariagen_data.Ag3(
    "simplecache::gs://vo_agam_release/",
    simplecache={"cache_storage": "gcs_cache"},
)

In [3]:
# RDL transcript
transcript = "AGAP006028-RA"
site_mask = "gamb_colu" # site mask name passed to snp_effects / snp_allele_frequencies below; TODO list the other accepted values
sample_sets = "v3_wild" # sample-set selector passed to snp_allele_frequencies; presumably the wild-caught sets of release v3 - TODO confirm

In [None]:
# snp_allele_frequencies requires a `populations` parameter in the form of a dictionary
# that maps a population label to a sample query string.
# Here we want the allele frequencies from all Kenya samples, from Burkina Faso
# An. gambiae samples, and from all Mayotte samples.
# The labels become frequency column names and must match the keys of the colour
# dictionary used for plotting.
populations = {
    "Kenya": "country == 'Kenya'",
    "Burkina Faso": "country == 'Burkina Faso' and species == 'gambiae'",
    "Mayotte": "country == 'Mayotte'",
}

### generate statistic dataframes

In [None]:
%%time
snp_effects_df = ag3.snp_effects(transcript, site_mask)

In [None]:
# Number of SNPs in each (effect, impact) combination
snp_effects_df.groupby(["effect", "impact"]).size()

In [None]:
# Distinct effect labels present for this transcript
snp_effects_df["effect"].unique()

In [None]:
# Distinct impact classes present for this transcript
snp_effects_df["impact"].unique()

In [None]:
%%time
allele_freq_df = ag3.snp_allele_frequencies(transcript=transcript, 
                           populations=populations, 
                           site_mask=site_mask, 
                           sample_sets=sample_sets,
                           drop_invariants=False)

In [None]:
# Preview the first rows of the allele-frequency table
allele_freq_df.head()

### join dataframes together so we can do some filtering

In [None]:
# Inner join: keep SNPs present in both tables, matched on position and alleles
merge_keys = ["position", "ref_allele", "alt_allele"]
merged_df = snp_effects_df.merge(allele_freq_df, on=merge_keys, how="inner")

In [None]:
# Inspect the merged table (effect annotations joined with allele frequencies)
merged_df

### filter down to variants-of-interest

In [None]:
# Keep variants annotated with HIGH (e.g. STOP-LOST), MODERATE (e.g. NON-SYN) or LOW impact.
impacts_to_keep = ["HIGH", "MODERATE", "LOW"]
merged_df = merged_df[merged_df["impact"].isin(impacts_to_keep)].copy()

In [None]:
# Inspect the table again after the impact filter
merged_df

In [None]:
# Next, remove invariant positions: keep only rows whose maximum allele frequency is > 0
has_variation = merged_df["maximum"] > 0
freq_filtered_df = merged_df[has_variation].copy()

In [None]:
# Columns available for plotting and hover tooltips
freq_filtered_df.columns

In [None]:
# (rows, columns) left after filtering
freq_filtered_df.shape

In [None]:
# Position in Mbp; the plotting function reads this column for the x axis
freq_filtered_df["pos_Mbp"] = freq_filtered_df["position"] / 1e6

### plot

In [None]:
# # first we need a plotly friendly dataframe...
# plotly_cols = {
#     'position' : np.tile(freq_filtered_df.position/1e6, len(populations)),
#     'effect' : np.tile(freq_filtered_df.effect, len(populations)),
#     'aa_change' : np.tile(freq_filtered_df.aa_change, len(populations)),
#     'frequency' : [],
#     'population' : []
# }

# for pop in freq_filtered_df[populations]:
#     for freq in freq_filtered_df[pop]:
#         plotly_cols['frequency'].append(freq)
#         plotly_cols['population'].append(pop)
        
# plotly_df = pd.DataFrame(plotly_cols)

In [None]:
# fig = px.scatter(plotly_df, x="position", y="frequency", color="population", symbol='effect', hover_data=["population", "frequency", "effect", "aa_change"])
# fig.show()

### we're going to use bokeh for the user guide

In [None]:
# !poetry add bokeh

In [None]:
import bokeh.plotting as bkplt
import bokeh.models as bkmod
import bokeh.layouts as bklay
from bokeh.layouts import column
import bokeh.io as bkio
from bokeh.layouts import row 

In [None]:
# Send Bokeh output to the notebook so figures render inline
bkio.output_notebook()

In [None]:
# Gene annotations indexed by feature ID. The plotting function below queries this
# frame by "ID" and "Parent" and uses its index for the y tick labels.
df_geneset = ag3.geneset(attributes=["ID", "Parent", "Name", "description"]).set_index("ID")

# RDL transcript (same value as in the parameters cell near the top)
transcript = "AGAP006028-RA"

# Colour for each population label; needs one entry per key of `populations`.
col_dict = {
    "Kenya" : "pink",
    "Mayotte" : "orange",
    "Burkina Faso" : "grey"
}

In [None]:
# Reminder of the population labels and their sample queries
populations

In [None]:
def plot_transcript_variation(freq_df, col_dict, transcript, width=800, height=400, geneset_df=None):
    """Show per-population SNP allele frequencies above a gene model of `transcript`.

    Two vertically stacked Bokeh figures that share one x range are displayed
    with `bkplt.show`: a scatter of allele frequency against position (one
    glyph series per population) and a track drawing the exons, introns, UTRs
    and CDSs of the transcript.

    Parameters
    ----------
    freq_df : pandas.DataFrame
        One row per SNP. Must contain a "pos_Mbp" column, one frequency column
        per key of `col_dict`, and the "position" and "aa_change" columns
        referenced by the hover tooltips.
    col_dict : dict
        Maps a population label (a column of `freq_df`) to a Bokeh colour.
        Populations are plotted in the iteration order of this dict.
    transcript : str
        Transcript ID. It is matched against "ID" to find the transcript and
        against "Parent" to find its child features.
    width, height : int
        Size of the scatter plot; the gene-model track uses a quarter of
        `height`.
    geneset_df : pandas.DataFrame, optional
        Gene annotations indexed by "ID" with "Parent", "type", "seqid",
        "start" and "end" columns. Defaults to the notebook-level
        `df_geneset`.

    Raises
    ------
    ValueError
        If `transcript` is not found in the gene annotations.
    """
    if geneset_df is None:
        # fall back to the notebook-level annotation table
        geneset_df = df_geneset

    # look the transcript up first so an unknown ID fails before any plotting
    df_transcripts = geneset_df.query(f"ID == '{transcript}'")
    if df_transcripts.empty:
        raise ValueError(f"transcript {transcript!r} not found in gene annotations")

    # SCATTER
    # tooltips
    tooltips1 = [
        ("population", '@population'),
        ("frequency", '@frequency'),
        ("position", '@position'),
        ("aa change", '@aa_change'),
    ]

    # NOTE(review): plot_width / plot_height were removed in Bokeh 3 in favour of
    # width / height - update both figures when upgrading Bokeh.
    fig1 = bkplt.figure(
        title=f'Transcript - {transcript}',
        tools='xpan,xzoom_in,xzoom_out,xwheel_zoom,reset,hover',
        active_scroll='xwheel_zoom',
        active_drag='xpan',
        plot_width=width,
        plot_height=height,
        tooltips=tooltips1)

    # one glyph series per population, driven by the col_dict argument rather
    # than by a notebook-level variable
    for pop, colour in col_dict.items():
        # generic column names so the same tooltips work for every population
        pop_df = freq_df.assign(frequency=freq_df[pop], population=pop)
        fig1.circle("pos_Mbp",
                    "frequency",
                    size=10,
                    color=colour,
                    alpha=0.8,
                    source=pop_df,
                    legend_label=pop)

    fig1.yaxis.axis_label = 'population nucleotide frequency'
    # the x axis is shown on the gene-model track below instead
    fig1.xaxis.visible = False
    # move the legend outside the plot area
    fig1.add_layout(fig1.legend[0], 'right')
    fig1.legend.click_policy = "hide"

    # GENE MODEL
    # define tooltips for hover
    tooltips2 = [
        ("Type", '@type'),
        ("Location", '@seqid:@start{,}..@end{,}'),
    ]

    # make a figure; sharing x_range keeps pan/zoom in sync with the scatter
    fig2 = bkplt.figure(
        plot_width=width,
        plot_height=int(height / 4),
        tools='xpan,xzoom_in,xzoom_out,xwheel_zoom,reset,hover',
        active_scroll='xwheel_zoom',
        active_drag='xpan',
        tooltips=tooltips2,
        x_range=fig1.x_range,
    )

    # child features (exons, UTRs, CDSs) of the transcript
    data = geneset_df.query(f"Parent == '{transcript}'").copy()
    data['left'] = data['start'] / 1e6  # plot in Mbp coordinates
    data['right'] = data['end'] / 1e6  # plot in Mbp coordinates
    data['bottom'] = 0 - 0.4
    data['top'] = 0 + 0.4

    # plot exons; sorted by position so that consecutive rows are neighbours
    exons = data.query("type == 'exon'").sort_values('left')
    fig2.quad(bottom='bottom',
              top='top',
              left='left',
              right='right',
              source=exons,
              fill_color=None,
              line_color='black',
              line_width=.5,
              fill_alpha=0)

    # plot introns as a small peak between each pair of neighbouring exons
    for l, r in zip(exons[:-1]['right'], exons[1:]['left']):
        m = (l + r) / 2
        fig2.line([l, m, r], [0, 0 + .1, 0], line_width=1, line_color="black")

    # plot UTRs
    fig2.quad(bottom='bottom', top='top', left='left', right='right',
              source=data.query("type == 'five_prime_UTR'"),
              fill_color='green', line_width=0, fill_alpha=.5)
    fig2.quad(bottom='bottom', top='top', left='left', right='right',
              source=data.query("type == 'three_prime_UTR'"),
              fill_color='red', line_width=0, fill_alpha=.5)

    # plot CDSs
    fig2.quad(bottom='bottom', top='top', left='left', right='right',
              source=data.query("type == 'CDS'"),
              fill_color='blue', line_width=0, fill_alpha=.5)

    fig2.y_range = bkmod.Range1d(-.5, len(df_transcripts) - .5)
    # .iloc: the frame is indexed by ID strings, so [0] would rely on positional fallback
    fig2.xaxis.axis_label = f'Contig {df_transcripts.seqid.iloc[0]} position (Mbp)'
    # label each integer y tick with the transcript ID
    yticks = list(range(len(df_transcripts)))
    yticklabels = df_transcripts.index
    fig2.yaxis.ticker = yticks
    fig2.yaxis.major_label_overrides = dict(zip(yticks, yticklabels))

    bkplt.show(column(fig1, fig2))

In [None]:
# Draw the frequency scatter and gene model for the filtered variants
plot_transcript_variation(freq_df=freq_filtered_df, col_dict=col_dict, transcript=transcript)

## TODO
- get lasso tool working
- legend outside or inside (the offset bokeh toolbar bugs me a bit)?

## more OPTIMIZE PRIME (profiling with snakeviz)

In [None]:
# One-off environment setup: run this once (e.g. in a terminal), not on every notebook run.
# !poetry add snakeviz

In [4]:
# Registers the %snakeviz / %%snakeviz magics used in the cells below
%load_ext snakeviz

In [5]:
# Different transcript for the profiling runs below; this overrides the RDL
# transcript set earlier, so re-run the earlier cells before re-plotting.
transcript = "AGAP009194-RA"

## first time SNP effects, without simplecache

In [8]:
# Profile a single snp_effects call
%snakeviz ag3.snp_effects(transcript, site_mask=site_mask)

transcript : AGAP009194-RA
chromosome : 3R 
start : 28597652
stop : 28598640
strand : -
 
*** Profile stats marshalled to file '/tmp/tmpzmth_iho'. 
Embedding SnakeViz in this document...


## second time - simplecached

In [None]:
%%snakeviz
snp_effects_df = ag3.snp_effects(transcript, site_mask)

## first time allele freq 

In [None]:
%%snakeviz
allele_freq_df = ag3.snp_allele_frequencies(transcript=transcript, 
                           populations=populations, 
                           site_mask=site_mask, 
                           sample_sets=sample_sets,
                           drop_invariants=False)

## second time allele freq - simplecached

In [None]:
%%snakeviz
allele_freq_df = ag3.snp_allele_frequencies(transcript=transcript, 
                           populations=populations, 
                           site_mask=site_mask, 
                           sample_sets=sample_sets,
                           drop_invariants=False)