In [4]:
import malariagen_data
import numpy as np
import pandas as pd

In [5]:
# !poetry add plotly
import plotly.express as px

### setup malariagen_data and parameters

In [6]:
# simplecache allows local cacheing
ag3 = malariagen_data.Ag3("simplecache::gs://vo_agam_release/")

In [7]:
# RDL transcript
transcript = "AGAP006028-RA"
site_mask = "gamb_colu" # other options...
sample_sets = "v3_wild" # explain?

In [8]:
# snp_allele_frequencies requires a population parameter in the form of a dictionary.
# Here we want the allele frequencies from all Kenya samples and from Burkina Faso 
# An. coluzzii samples collected in 2012
populations = {
    "Kenya": "country == 'Ghana' and species == 'gambiae'",
    "Burkina Faso": "country == 'Burkina Faso' and species == 'gambiae'",
    "Mayotte": "country == 'Mayotte'",
}

### generate statistic dataframes

In [9]:
%%time
snp_effects_df = ag3.snp_effects(transcript, site_mask)

transcript : AGAP006028-RA
chromosome : 2L 
start : 25363652
stop : 25434556
strand : +
CPU times: user 1min 29s, sys: 1.58 s, total: 1min 31s
Wall time: 4min 15s


In [10]:
snp_effects_df.groupby(['effect','impact']).size()

effect                 impact  
INTRONIC               MODIFIER    164055
NON_SYNONYMOUS_CODING  MODERATE      3272
SPLICE_CORE            HIGH            87
SPLICE_REGION          MODERATE       204
START_LOST             HIGH             3
STOP_GAINED            HIGH           190
STOP_LOST              HIGH             4
SYNONYMOUS_CODING      LOW           1019
dtype: int64

In [11]:
snp_effects_df.effect.unique()

array(['START_LOST', 'NON_SYNONYMOUS_CODING', 'STOP_GAINED',
       'SYNONYMOUS_CODING', 'INTRONIC', 'SPLICE_REGION', 'SPLICE_CORE',
       'STOP_LOST'], dtype=object)

In [12]:
snp_effects_df.impact.unique()

array(['HIGH', 'MODERATE', 'LOW', 'MODIFIER'], dtype=object)

In [13]:
%%time
allele_freq_df = ag3.snp_allele_frequencies(transcript=transcript, 
                           populations=populations, 
                           site_mask=site_mask, 
                           sample_sets=sample_sets,
                           drop_invariants=False)

transcript : AGAP006028-RA
chromosome : 2L 
start : 25363652
stop : 25434556
strand : +
CPU times: user 24.1 s, sys: 5.06 s, total: 29.1 s
Wall time: 1min 59s


In [14]:
allele_freq_df.head()

Unnamed: 0,position,ref_allele,alt_allele,Kenya,Burkina Faso,Mayotte,maximum
0,25363652,A,C,0.0,0.0,0.0,0.0
1,25363652,A,T,0.0,0.0,0.0,0.0
2,25363652,A,G,0.0,0.0,0.0,0.0
3,25363653,T,A,0.0,0.0,0.0,0.0
4,25363653,T,C,0.0,0.0,0.0,0.0


### join dataframes together so we can do some filtering

In [15]:
merged_df = pd.merge(snp_effects_df, allele_freq_df, on=['position', 'ref_allele', 'alt_allele'])

In [16]:
merged_df

Unnamed: 0,position,ref_allele,alt_allele,effect,impact,ref_codon,alt_codon,aa_pos,ref_aa,alt_aa,aa_change,Kenya,Burkina Faso,Mayotte,maximum
0,25363652,A,C,START_LOST,HIGH,Atg,Ctg,1.0,M,L,M1L,0.0,0.0,0.0,0.0
1,25363652,A,T,START_LOST,HIGH,Atg,Ttg,1.0,M,L,M1L,0.0,0.0,0.0,0.0
2,25363652,A,G,START_LOST,HIGH,Atg,Gtg,1.0,M,V,M1V,0.0,0.0,0.0,0.0
3,25363653,T,A,NON_SYNONYMOUS_CODING,MODERATE,aTg,aAg,1.0,M,K,M1K,0.0,0.0,0.0,0.0
4,25363653,T,C,NON_SYNONYMOUS_CODING,MODERATE,aTg,aCg,1.0,M,T,M1T,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168829,25434555,A,T,STOP_LOST,HIGH,tAa,tTa,556.0,*,L,*556L,0.0,0.0,0.0,0.0
168830,25434555,A,G,SYNONYMOUS_CODING,LOW,tAa,tGa,556.0,*,*,*556*,0.0,0.0,0.0,0.0
168831,25434556,A,C,STOP_LOST,HIGH,taA,taC,556.0,*,Y,*556Y,0.0,0.0,0.0,0.0
168832,25434556,A,T,STOP_LOST,HIGH,taA,taT,556.0,*,Y,*556Y,0.0,0.0,0.0,0.0


### filter down to variants-of-interest

In [89]:
# #let's keep  variants that have MODERATE (e.g. NON-SYN) or HIGH (e.g. STOP-LOST) impact.
merged_df = merged_df[(merged_df.impact == "HIGH") | (merged_df.impact == "MODERATE" ) | (merged_df.impact == "LOW" )].copy()

In [90]:
merged_df

Unnamed: 0,position,ref_allele,alt_allele,effect,impact,ref_codon,alt_codon,aa_pos,ref_aa,alt_aa,aa_change,Kenya,Burkina Faso,Mayotte,maximum
0,25363652,A,C,START_LOST,HIGH,Atg,Ctg,1.0,M,L,M1L,0.0,0.0,0.0,0.0
1,25363652,A,T,START_LOST,HIGH,Atg,Ttg,1.0,M,L,M1L,0.0,0.0,0.0,0.0
2,25363652,A,G,START_LOST,HIGH,Atg,Gtg,1.0,M,V,M1V,0.0,0.0,0.0,0.0
3,25363653,T,A,NON_SYNONYMOUS_CODING,MODERATE,aTg,aAg,1.0,M,K,M1K,0.0,0.0,0.0,0.0
4,25363653,T,C,NON_SYNONYMOUS_CODING,MODERATE,aTg,aCg,1.0,M,T,M1T,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168827,25434553,G,T,NON_SYNONYMOUS_CODING,MODERATE,aaG,aaT,555.0,K,N,K555N,0.0,0.0,0.0,0.0
168828,25434555,A,C,STOP_LOST,HIGH,tAa,tCa,556.0,*,S,*556S,0.0,0.0,0.0,0.0
168829,25434555,A,T,STOP_LOST,HIGH,tAa,tTa,556.0,*,L,*556L,0.0,0.0,0.0,0.0
168831,25434556,A,C,STOP_LOST,HIGH,taA,taC,556.0,*,Y,*556Y,0.0,0.0,0.0,0.0


In [108]:
#next, we can remove invariant positions (where maximum allele frequency is >0)
freq_filtered_df = merged_df[merged_df.maximum > 0].copy()

In [109]:
freq_filtered_df.columns

Index(['position', 'ref_allele', 'alt_allele', 'effect', 'impact', 'ref_codon',
       'alt_codon', 'aa_pos', 'ref_aa', 'alt_aa', 'aa_change', 'Kenya',
       'Burkina Faso', 'Mayotte', 'maximum'],
      dtype='object')

In [110]:
freq_filtered_df.shape

(35, 15)

In [112]:
freq_filtered_df['pos_Mbp'] = freq_filtered_df.position/1e6

In [115]:
freq_filtered_df

Unnamed: 0,position,ref_allele,alt_allele,effect,impact,ref_codon,alt_codon,aa_pos,ref_aa,alt_aa,aa_change,Kenya,Burkina Faso,Mayotte,maximum,pos_Mbp
13,25363656,C,T,NON_SYNONYMOUS_CODING,MODERATE,tCg,tTg,2.0,S,L,S2L,0.0,0.003185,0.0,0.003185,25.363656
206,25363723,G,T,NON_SYNONYMOUS_CODING,MODERATE,ttG,ttT,24.0,L,F,L24F,0.0,0.003185,0.0,0.003185,25.363723
37332,25382773,C,A,SPLICE_REGION,MODERATE,,,,,,,0.0,0.003185,0.0,0.003185,25.382773
48429,25387091,G,A,NON_SYNONYMOUS_CODING,MODERATE,gGa,gAa,75.0,G,E,G75E,0.0,0.003185,0.0,0.003185,25.387091
48646,25387164,A,T,SPLICE_REGION,MODERATE,,,,,,,0.0,0.003185,0.0,0.003185,25.387164
116113,25413317,C,T,NON_SYNONYMOUS_CODING,MODERATE,Cca,Tca,228.0,P,S,P228S,0.0,0.003185,0.0,0.003185,25.413317
157736,25429236,C,G,NON_SYNONYMOUS_CODING,MODERATE,gCa,gGa,296.0,A,G,A296G,0.888889,0.388535,0.0,0.888889,25.429236
167126,25433158,C,G,SPLICE_REGION,MODERATE,,,,,,,0.0,0.009554,0.0,0.009554,25.433158
167145,25433166,C,A,STOP_GAINED,HIGH,taC,taA,343.0,Y,*,Y343*,0.0,0.003185,0.0,0.003185,25.433166
167158,25433171,C,T,NON_SYNONYMOUS_CODING,MODERATE,aCg,aTg,345.0,T,M,T345M,0.888889,0.388535,0.0,0.888889,25.433171


### plot

In [38]:
# # first we need a plotly friendly dataframe...
# plotly_cols = {
#     'position' : np.tile(freq_filtered_df.position/1e6, len(populations)),
#     'effect' : np.tile(freq_filtered_df.effect, len(populations)),
#     'aa_change' : np.tile(freq_filtered_df.aa_change, len(populations)),
#     'frequency' : [],
#     'population' : []
# }

# for pop in freq_filtered_df[populations]:
#     for freq in freq_filtered_df[pop]:
#         plotly_cols['frequency'].append(freq)
#         plotly_cols['population'].append(pop)
        
# plotly_df = pd.DataFrame(plotly_cols)

In [39]:
# fig = px.scatter(plotly_df, x="position", y="frequency", color="population", symbol='effect', hover_data=["population", "frequency", "effect", "aa_change"])
# fig.show()

### we're going to use bokeh for the user guide

In [28]:
# !poetry add bokeh

The following packages are already present in the pyproject.toml and will be skipped:

  • [36mbokeh[0m

If you want to update it to the latest compatible version, you can use `poetry update package`.
If you prefer to upgrade it to the latest available version, you can use `poetry add package@latest`.

Nothing to add.


In [116]:
import bokeh.plotting as bkplt
import bokeh.models as bkmod
import bokeh.layouts as bklay
import bokeh.io as bkio
from bokeh.layouts import row 
from bokeh.plotting import figure

In [117]:
bkio.output_notebook()

In [132]:
# TODO if we want to have the pop name in the tool tip, we should build the "source" per pop with a pop column.
# TODO add legend
# TODO add gene model


col_dict = {
    "Kenya" : "pink",
    "Mayotte" : "orange",
    "Burkina Faso" : "grey"
}

# define tooltips for hover
tooltips = [
    
    ("position", '@position'),
    ("aa change", '@aa_change'),
]

fig = figure(plot_width=800, plot_height=400, tooltips=tooltips)

source = freq_filtered_df
for pop in populations:

    fig.circle("pos_Mbp", pop, size=10, color=col_dict[pop], alpha=0.8, source=source)


fig.xaxis.axis_label = f'position (Mbp)'
fig.yaxis.axis_label = f'population nucleotide frequency'
# show the results
bkplt.show(fig)

In [121]:
df_geneset = ag3.geneset(attributes=["ID", "Parent", "Name", "description"]).set_index("ID")
df_geneset

Unnamed: 0_level_0,seqid,source,type,start,end,score,strand,phase,Parent,Name,description
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2L,2L,VectorBase,chromosome,1,49364325,,,,,,
AGAP004677,2L,VectorBase,gene,157348,186936,,-,,,,methylenetetrahydrofolate dehydrogenase(NAD ) ...
AGAP004677-RA,2L,VectorBase,mRNA,157348,181305,,-,,AGAP004677,,
,2L,VectorBase,three_prime_UTR,157348,157495,,-,,AGAP004677-RA,,
,2L,VectorBase,exon,157348,157623,,-,,AGAP004677-RA,AGAP004677-RB-E4,
...,...,...,...,...,...,...,...,...,...,...,...
,Y_unplaced,VectorBase,five_prime_UTR,47932,48111,,+,,AGAP029375-RA,,
,Y_unplaced,VectorBase,exon,47932,48138,,+,,AGAP029375-RA,AGAP029375-RA-E2,
AGAP029375-PA,Y_unplaced,VectorBase,CDS,48112,48138,,+,0.0,AGAP029375-RA,,
,Y_unplaced,VectorBase,exon,48301,48385,,+,,AGAP029375-RA,AGAP029375-RA-E3,


In [86]:
def plot_transcripts(gene_id, width=700, track_height=30):

    # fing the gene
    gene = df_geneset.loc[gene_id]

    # find mRNA records for the given gene
    df_transcripts = df_geneset.query(f"Parent == '{gene_id}'")

    # calculate plot height depending on number of transcripts
    height = 70 + len(df_transcripts) * track_height

    # define tooltips for hover
    tooltips = [
        ("Type", '@type'),
        ("Location", '@seqid:@start{,}..@end{,}'),
    ]

    # make a figure
    fig = bkplt.figure(
        title=f'Transcripts - {gene_id} ({gene.strand})',
        plot_width=width, 
        plot_height=height,
        tools='xpan,xzoom_in,xzoom_out,xwheel_zoom,reset,hover',
        toolbar_location='above',
        active_scroll='xwheel_zoom',
        active_drag='xpan',
        tooltips=tooltips,
    )

    # plot the transcripts
    for i, transcript in enumerate(df_transcripts.itertuples()):

        data = df_geneset.query(f"Parent == '{transcript.Index}'").copy()
        data['left'] = data['start'] / 1e6  # plot in Mbp coordinates
        data['right'] = data['end'] / 1e6  # plot in Mbp coordinates
        data['bottom'] = i - 0.4
        data['top'] = i + 0.4

        # plot exons
        exons = data.query("type == 'exon'")
        fig.quad(bottom='bottom', top='top', left='left', right='right',
                 source=exons, 
                 fill_color=None, line_color='black', line_width=.5, fill_alpha=0)
        
        # plot introns
        for l, r in zip(exons[:-1]['right'], exons[1:]['left']):
            m = (l + r) / 2
            fig.line([l, m, r], [i, i+.1, i], line_width=1, line_color="black")

        # plot UTRs
        fig.quad(bottom='bottom', top='top', left='left', right='right',
                 source=data.query("type == 'five_prime_UTR'"), 
                 fill_color='green', line_width=0, fill_alpha=.5)
        fig.quad(bottom='bottom', top='top', left='left', right='right',
                 source=data.query("type == 'three_prime_UTR'"), 
                 fill_color='red', line_width=0, fill_alpha=.5)

        # plot CDSs
        fig.quad(bottom='bottom', top='top', left='left', right='right',
                 source=data.query("type == 'CDS'"), 
                 fill_color='blue', line_width=0, fill_alpha=.5)

    fig.xaxis.axis_label = f'Position (Mbp)'
    fig.y_range = bkmod.Range1d(-.5, len(df_transcripts)-.5)
    fig.xaxis.axis_label = f'Contig {gene.seqid} position (Mbp)'
    yticks = list(range(len(df_transcripts)))
    yticklabels = df_transcripts.index
    fig.yaxis.ticker = yticks
    fig.yaxis.major_label_overrides = {k: v for k, v in zip(yticks, yticklabels)}

    # show the figure
    bkplt.show(fig)

In [87]:
plot_transcripts("AGAP004050")

In [40]:
p = figure(title = transcript+": position x frequency")
p.circle('position','frequency',source=plotly_df,fill_alpha=0.2, size=10)

In [41]:
bkplt.show(p)