In [1]:
import malariagen_data
import numpy as np
import pandas as pd

### setup malariagen_data and parameters

In [2]:
# simplecache allows local cacheing
ag3 = malariagen_data.Ag3("simplecache::gs://vo_agam_release/", simplecache=dict(cache_storage="gcs_cache"))

In [3]:
# RDL transcript
transcript = "AGAP006028-RA"
site_mask = "gamb_colu" # other options...
sample_sets = "v3_wild" # explain?

In [4]:
# snp_allele_frequencies requires a population parameter in the form of a dictionary.
# Here we want the allele frequencies from all Kenya samples and from Burkina Faso 
# An. coluzzii samples collected in 2012
populations = {
    "Kenya": "country == 'Ghana' and species == 'gambiae'",
    "Burkina Faso": "country == 'Burkina Faso' and species == 'gambiae'",
    "Mayotte": "country == 'Mayotte'",
}

### generate statistic dataframes

In [5]:
%%time
snp_effects_df = ag3.snp_effects(transcript, site_mask)

transcript : AGAP006028-RA
chromosome : 2L 
start : 25363652
stop : 25434556
strand : +
CPU times: user 1min 36s, sys: 1 s, total: 1min 37s
Wall time: 1min 38s


In [6]:
snp_effects_df.groupby(['effect','impact']).size()

effect                 impact  
INTRONIC               MODIFIER    164055
NON_SYNONYMOUS_CODING  MODERATE      3272
SPLICE_CORE            HIGH            87
SPLICE_REGION          MODERATE       204
START_LOST             HIGH             3
STOP_GAINED            HIGH           190
STOP_LOST              HIGH             4
SYNONYMOUS_CODING      LOW           1019
dtype: int64

In [7]:
snp_effects_df.effect.unique()

array(['START_LOST', 'NON_SYNONYMOUS_CODING', 'STOP_GAINED',
       'SYNONYMOUS_CODING', 'INTRONIC', 'SPLICE_REGION', 'SPLICE_CORE',
       'STOP_LOST'], dtype=object)

In [8]:
snp_effects_df.impact.unique()

array(['HIGH', 'MODERATE', 'LOW', 'MODIFIER'], dtype=object)

In [9]:
%%time
allele_freq_df = ag3.snp_allele_frequencies(transcript=transcript, 
                           populations=populations, 
                           site_mask=site_mask, 
                           sample_sets=sample_sets,
                           drop_invariants=False)

transcript : AGAP006028-RA
chromosome : 2L 
start : 25363652
stop : 25434556
strand : +
CPU times: user 23.5 s, sys: 5.62 s, total: 29.1 s
Wall time: 1min 20s


In [10]:
allele_freq_df.head()

Unnamed: 0,position,ref_allele,alt_allele,Kenya,Burkina Faso,Mayotte,maximum
0,25363652,A,C,0.0,0.0,0.0,0.0
1,25363652,A,T,0.0,0.0,0.0,0.0
2,25363652,A,G,0.0,0.0,0.0,0.0
3,25363653,T,A,0.0,0.0,0.0,0.0
4,25363653,T,C,0.0,0.0,0.0,0.0


### join dataframes together so we can do some filtering

In [11]:
merged_df = pd.merge(snp_effects_df, allele_freq_df, on=['position', 'ref_allele', 'alt_allele'])

In [12]:
merged_df

Unnamed: 0,position,ref_allele,alt_allele,effect,impact,ref_codon,alt_codon,aa_pos,ref_aa,alt_aa,aa_change,Kenya,Burkina Faso,Mayotte,maximum
0,25363652,A,C,START_LOST,HIGH,Atg,Ctg,1.0,M,L,M1L,0.0,0.0,0.0,0.0
1,25363652,A,T,START_LOST,HIGH,Atg,Ttg,1.0,M,L,M1L,0.0,0.0,0.0,0.0
2,25363652,A,G,START_LOST,HIGH,Atg,Gtg,1.0,M,V,M1V,0.0,0.0,0.0,0.0
3,25363653,T,A,NON_SYNONYMOUS_CODING,MODERATE,aTg,aAg,1.0,M,K,M1K,0.0,0.0,0.0,0.0
4,25363653,T,C,NON_SYNONYMOUS_CODING,MODERATE,aTg,aCg,1.0,M,T,M1T,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168829,25434555,A,T,STOP_LOST,HIGH,tAa,tTa,556.0,*,L,*556L,0.0,0.0,0.0,0.0
168830,25434555,A,G,SYNONYMOUS_CODING,LOW,tAa,tGa,556.0,*,*,*556*,0.0,0.0,0.0,0.0
168831,25434556,A,C,STOP_LOST,HIGH,taA,taC,556.0,*,Y,*556Y,0.0,0.0,0.0,0.0
168832,25434556,A,T,STOP_LOST,HIGH,taA,taT,556.0,*,Y,*556Y,0.0,0.0,0.0,0.0


### filter down to variants-of-interest

In [13]:
# #let's keep  variants that have MODERATE (e.g. NON-SYN) or HIGH (e.g. STOP-LOST) impact.
merged_df = merged_df[(merged_df.impact == "HIGH") | (merged_df.impact == "MODERATE" ) | (merged_df.impact == "LOW" )].copy()

In [14]:
merged_df

Unnamed: 0,position,ref_allele,alt_allele,effect,impact,ref_codon,alt_codon,aa_pos,ref_aa,alt_aa,aa_change,Kenya,Burkina Faso,Mayotte,maximum
0,25363652,A,C,START_LOST,HIGH,Atg,Ctg,1.0,M,L,M1L,0.0,0.0,0.0,0.0
1,25363652,A,T,START_LOST,HIGH,Atg,Ttg,1.0,M,L,M1L,0.0,0.0,0.0,0.0
2,25363652,A,G,START_LOST,HIGH,Atg,Gtg,1.0,M,V,M1V,0.0,0.0,0.0,0.0
3,25363653,T,A,NON_SYNONYMOUS_CODING,MODERATE,aTg,aAg,1.0,M,K,M1K,0.0,0.0,0.0,0.0
4,25363653,T,C,NON_SYNONYMOUS_CODING,MODERATE,aTg,aCg,1.0,M,T,M1T,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168829,25434555,A,T,STOP_LOST,HIGH,tAa,tTa,556.0,*,L,*556L,0.0,0.0,0.0,0.0
168830,25434555,A,G,SYNONYMOUS_CODING,LOW,tAa,tGa,556.0,*,*,*556*,0.0,0.0,0.0,0.0
168831,25434556,A,C,STOP_LOST,HIGH,taA,taC,556.0,*,Y,*556Y,0.0,0.0,0.0,0.0
168832,25434556,A,T,STOP_LOST,HIGH,taA,taT,556.0,*,Y,*556Y,0.0,0.0,0.0,0.0


In [15]:
#next, we can remove invariant positions (where maximum allele frequency is >0)
freq_filtered_df = merged_df[merged_df.maximum > 0].copy()

In [16]:
freq_filtered_df.columns

Index(['position', 'ref_allele', 'alt_allele', 'effect', 'impact', 'ref_codon',
       'alt_codon', 'aa_pos', 'ref_aa', 'alt_aa', 'aa_change', 'Kenya',
       'Burkina Faso', 'Mayotte', 'maximum'],
      dtype='object')

In [17]:
freq_filtered_df.shape

(88, 15)

In [18]:
freq_filtered_df['pos_Mbp'] = freq_filtered_df.position/1e6

### plot

In [19]:
# # first we need a plotly friendly dataframe...
# plotly_cols = {
#     'position' : np.tile(freq_filtered_df.position/1e6, len(populations)),
#     'effect' : np.tile(freq_filtered_df.effect, len(populations)),
#     'aa_change' : np.tile(freq_filtered_df.aa_change, len(populations)),
#     'frequency' : [],
#     'population' : []
# }

# for pop in freq_filtered_df[populations]:
#     for freq in freq_filtered_df[pop]:
#         plotly_cols['frequency'].append(freq)
#         plotly_cols['population'].append(pop)
        
# plotly_df = pd.DataFrame(plotly_cols)

In [20]:
# fig = px.scatter(plotly_df, x="position", y="frequency", color="population", symbol='effect', hover_data=["population", "frequency", "effect", "aa_change"])
# fig.show()

### we're going to use bokeh for the user guide

In [21]:
# !poetry add bokeh

In [22]:
import bokeh.plotting as bkplt
import bokeh.models as bkmod
import bokeh.layouts as bklay
from bokeh.layouts import column
import bokeh.io as bkio
from bokeh.layouts import row 

In [23]:
bkio.output_notebook()

In [41]:
df_geneset = ag3.geneset(attributes=["ID", "Parent", "Name", "description"]).set_index("ID")

#RDL
transcript = "AGAP006028-RA"

col_dict = {
    "Kenya" : "pink",
    "Mayotte" : "orange",
    "Burkina Faso" : "grey"
}

In [42]:
populations

{'Kenya': "country == 'Ghana' and species == 'gambiae'",
 'Burkina Faso': "country == 'Burkina Faso' and species == 'gambiae'",
 'Mayotte': "country == 'Mayotte'"}

In [51]:
def plot_transcript_variation(freq_df, col_dict, transcript, width=800, height=400):

    # SCATTER
    # tooltips
    tooltips1 = [
        ("population", '@population'),
        ("frequency", '@frequency'),
        ("position", '@position'),
        ("aa change", '@aa_change'),
    ]

    fig1 = bkplt.figure(
        title=f'Transcript - {transcript}',
        tools='xpan,xzoom_in,xzoom_out,xwheel_zoom,reset,hover',
        active_scroll='xwheel_zoom',
        active_drag='xpan',
        plot_width=width, 
        plot_height=height, 
        tooltips=tooltips1,
        toolbar_location="above")

    for pop in populations:
        df = freq_df.copy()
        df['frequency'] = df[pop]
        df['population'] = pop
        fig1.circle("pos_Mbp", 
                    pop, size=10, 
                    color=col_dict[pop], 
                    alpha=0.8, 
                    source=df,
                    legend_label=pop)

    fig1.yaxis.axis_label = f'population nucleotide frequency'
    fig1.xaxis.visible = False
#     fig1.legend.location = "top_left"
    fig1.add_layout(fig1.legend[0], 'right')
    fig1.legend.click_policy="hide"
    
    # GENE MODEL
    df_transcripts = df_geneset.query(f"ID == '{transcript}'")

    # define tooltips for hover
    tooltips2 = [
        ("Type", '@type'),
        ("Location", '@seqid:@start{,}..@end{,}'),
    ]

    # make a figure
    fig2 = bkplt.figure(   
        plot_width=width, 
        plot_height=int(height / 4),
        tools='xpan,xzoom_in,xzoom_out,xwheel_zoom,reset,hover',
        #toolbar_location='above',
        active_scroll='xwheel_zoom',
        active_drag='xpan',
        tooltips=tooltips2,
        x_range=fig1.x_range,
    )

    # plot the transcripts


    data = df_geneset.query(f"Parent == '{transcript}'").copy()
    data['left'] = data['start'] / 1e6  # plot in Mbp coordinates
    data['right'] = data['end'] / 1e6  # plot in Mbp coordinates
    data['bottom'] = 0 - 0.4
    data['top'] = 0 + 0.4

    # plot exons
    exons = data.query("type == 'exon'")
    fig2.quad(bottom='bottom', 
              top='top', 
              left='left', 
              right='right',
              source=exons, 
              fill_color=None, 
              line_color='black', 
              line_width=.5, 
              fill_alpha=0)

    # plot introns
    for l, r in zip(exons[:-1]['right'], exons[1:]['left']):
        m = (l + r) / 2
        fig2.line([l, m, r], [0, 0 +.1, 0], line_width=1, line_color="black")

    # plot UTRs
    fig2.quad(bottom='bottom', top='top', left='left', right='right',
             source=data.query("type == 'five_prime_UTR'"), 
             fill_color='green', line_width=0, fill_alpha=.5)
    fig2.quad(bottom='bottom', top='top', left='left', right='right',
             source=data.query("type == 'three_prime_UTR'"), 
             fill_color='red', line_width=0, fill_alpha=.5)

    # plot CDSs
    fig2.quad(bottom='bottom', top='top', left='left', right='right',
             source=data.query("type == 'CDS'"), 
             fill_color='blue', line_width=0, fill_alpha=.5)

    fig2.xaxis.axis_label = f'Position (Mbp)'
    fig2.y_range = bkmod.Range1d(-.5, len(df_transcripts)-.5)
    fig2.xaxis.axis_label = f'Contig {df_transcripts.seqid[0]} position (Mbp)'
    yticks = list(range(len(df_transcripts)))
    yticklabels = df_transcripts.index
    fig2.yaxis.ticker = yticks
    fig2.yaxis.major_label_overrides = {k: v for k, v in zip(yticks, yticklabels)}
    fig2.toolbar.logo = None 
    fig2.toolbar_location = None

    bkplt.show(column(fig1, fig2))

In [52]:
plot_transcript_variation(freq_df=freq_filtered_df, col_dict=col_dict, transcript=transcript)

## TODO
- get lasso tool working
- legend outside or inside (the offset bokeh toolbar bugs me a bit)?

## more OPTIMIZE PRIME

In [28]:
!poetry add snakeviz

The following packages are already present in the pyproject.toml and will be skipped:

  • [36msnakeviz[0m

If you want to update it to the latest compatible version, you can use `poetry update package`.
If you prefer to upgrade it to the latest available version, you can use `poetry add package@latest`.

Nothing to add.


In [29]:
%load_ext snakeviz

In [30]:
transcript = "AGAP009194-RA"

## first time SNP effects, without simplecache

In [31]:
%snakeviz ag3.snp_effects(transcript, site_mask=site_mask)

transcript : AGAP009194-RA
chromosome : 3R 
start : 28597652
stop : 28598640
strand : -
 
*** Profile stats marshalled to file '/tmp/tmpqgh2271r'. 
Embedding SnakeViz in this document...


## second time - simplecached

In [32]:
%%snakeviz
snp_effects_df = ag3.snp_effects(transcript, site_mask)

transcript : AGAP009194-RA
chromosome : 3R 
start : 28597652
stop : 28598640
strand : -
 
*** Profile stats marshalled to file '/tmp/tmpfwc_bye5'. 
Embedding SnakeViz in this document...


## first time allele freq 

In [33]:
%%snakeviz
allele_freq_df = ag3.snp_allele_frequencies(transcript=transcript, 
                           populations=populations, 
                           site_mask=site_mask, 
                           sample_sets=sample_sets,
                           drop_invariants=False)

transcript : AGAP009194-RA
chromosome : 3R 
start : 28597652
stop : 28598640
strand : -
 
*** Profile stats marshalled to file '/tmp/tmptrafh9pd'. 
Embedding SnakeViz in this document...


## simplecached allelefreq

In [34]:
%%snakeviz
allele_freq_df = ag3.snp_allele_frequencies(transcript=transcript, 
                           populations=populations, 
                           site_mask=site_mask, 
                           sample_sets=sample_sets,
                           drop_invariants=False)

transcript : AGAP009194-RA
chromosome : 3R 
start : 28597652
stop : 28598640
strand : -
 
*** Profile stats marshalled to file '/tmp/tmpm52xuiz6'. 
Embedding SnakeViz in this document...
