# Imports and configuration

In [1]:
import altair as alt
import pandas as pd
import plotly.express as px

from pathlib import Path

## Sample Diagram
The following example plots a single category, Cellular Component, using data extracted from a manual rerun of REViGO.

In [3]:
# Column order of the REViGO scatter-plot rows listed below.
columns = [
    "term_ID", "description", "frequency", "plot_X",
    "plot_Y", "log_size", "uniqueness", "dispensability",
]

# One tuple per GO term of the Cellular Component sample, in `columns` order.
data = [
    ("GO:0005874", "microtubule", 0.590923058896654, -4.00372136407618, 3.93380303408685, 5.07433362293908, 0.743307611944267, 0),
    ("GO:0042555", "MCM complex", 0.0516392267501353, 5.29409032883786, -0.438484234906433, 4.01582063426207, 0.736867388621876, 0),
    ("GO:0005886", "plasma membrane", 15.5064680247866, -2.59930578712986, -4.79438349762051, 6.49331205332051, 0.980465972776413, 4.124E-05),
    ("GO:0030173", "integral component of Golgi membrane", 0.0482779463204013, -0.0820996416106789, 6.61844221537962, 3.98659260682221, 0.720783016873817, 0.16417986),
    ("GO:0031083", "BLOC-1 complex", 0.0157955281823943, 6.03044083325888, 2.61728943021364, 3.50147007210041, 0.638408624494431, 0.22740185),
    ("GO:0030532", "small nuclear ribonucleoprotein complex", 0.138166054523554, 2.1939043417736, 2.03060434260059, 4.44321603416583, 0.571526896999077, 0.2622474),
    ("GO:0008250", "oligosaccharyltransferase complex", 0.0394539627330108, 2.22238070210506, 4.52148800747906, 3.89894446686651, 0.602647357590838, 0.39260902),
]

df = pd.DataFrame(data, columns=columns)
df.head()

Unnamed: 0,term_ID,description,frequency,plot_X,plot_Y,log_size,uniqueness,dispensability
0,GO:0005874,microtubule,0.590923,-4.003721,3.933803,5.074334,0.743308,0.0
1,GO:0042555,MCM complex,0.051639,5.29409,-0.438484,4.015821,0.736867,0.0
2,GO:0005886,plasma membrane,15.506468,-2.599306,-4.794383,6.493312,0.980466,4.1e-05
3,GO:0030173,integral component of Golgi membrane,0.048278,-0.0821,6.618442,3.986593,0.720783,0.16418
4,GO:0031083,BLOC-1 complex,0.015796,6.030441,2.617289,3.50147,0.638409,0.227402


In [4]:
# Encodings shared by the bubble layer and the label layer.
semantic_axes = {
    "x": alt.X('plot_X', title="Semantic Space X"),
    "y": alt.Y('plot_Y', title="Semantic Space Y"),
}
uniqueness_color = alt.Color('uniqueness', scale=alt.Scale(scheme='viridis'))

# Bubble layer: one semi-transparent filled point per GO term, sized by log_size.
base = alt.Chart(df).mark_point(filled=True, fillOpacity=0.5).encode(
    size=alt.Size('log_size', scale=alt.Scale(base=0, domain=[3.5,6], range=[1000, 7000])),
    color=uniqueness_color,
    tooltip=["term_ID", "description", "frequency", "log_size", "uniqueness", "dispensability"],
    **semantic_axes,
)

# Label layer: the term description drawn at the same coordinates, using the
# default text alignment.
text = alt.Chart(df).mark_text().encode(
    text="description",
    color=uniqueness_color,
    **semantic_axes,
)

# Overlay both layers; the layered chart is the cell's displayed output.
alt.layer(base, text).properties(
    title="REViGO plot of top non-redundant GO terms for K. alvarezii protein predictions",
    height=600,
    width=800
).interactive()

# All output data from complete REViGO run

In [19]:
# Parse results
# Input files: the stringent whitelist read further down in the notebook, plus one
# REViGO CSV export per GO subontology.
stringent_results = "Revigo_Result.txt"
all_results = {
    "MF": "RevigoMF.csv",
    "BP": "RevigoBP.csv",
    "CC": "RevigoCC.csv"
}
ontologies = {
    "MF": "Molecular function",
    "CC": "Cellular component",
    "BP": "Biological process"
}

# Fail fast with the export instructions. Previously the cell printed a hint and
# then crashed anyway inside pd.read_csv on the first missing file.
missing = [f for f in all_results.values() if not Path(f).exists()]
if missing:
    raise FileNotFoundError(
        f"No file exists for {', '.join(missing)}. "
        "\nPlease export CSV data from http://revigo.irb.hr/Results.aspx."
    )

all_dfs = []
for category, f in all_results.items():
    df = pd.read_csv(f, sep=",", skipinitialspace=True, quotechar='"')
    # Tag each row with its subontology so rows stay distinguishable after concat.
    df["Subontology symbol"] = category
    df["Subontology"] = ontologies[category]
    all_dfs.append(df)

# The per-file row labels are kept as-is (no ignore_index), so index values repeat
# across subontologies.
master_df = pd.concat(all_dfs)
master_df.head() # 634 rows before filtering to Gabriel's list

Unnamed: 0,TermID,Name,Frequency,PlotX,PlotY,LogSize,Value,Uniqueness,Dispensability,Representative,Eliminated,Subontology symbol,Subontology
0,GO:0003777,microtubule motor activity,0.186775,-1.277958,-1.163526,4.741884,,0.980471,0.0,,False,MF,Molecular function
1,GO:0009976,tocopherol cyclase activity,0.002332,4.078306,7.417087,2.838849,,0.96666,0.0,,False,MF,Molecular function
2,GO:0003963,RNA-3'-phosphate cyclase activity,0.005404,,,3.203577,,0.862812,0.70421,9976.0,True,MF,Molecular function
3,GO:0003911,DNA ligase (NAD+) activity,0.030633,,,3.956793,,0.826803,0.744027,9976.0,True,MF,Molecular function
4,GO:0003910,DNA ligase (ATP) activity,0.053418,,,4.198272,,0.820893,0.904895,9976.0,True,MF,Molecular function


In [20]:
# Plot graphs for each category of GO term
# Take the size domain from the data instead of the fixed [3.5, 6]. The loaded
# results contain plotted terms with LogSize below 3.5, which a linear size scale
# maps below the minimum of `range`. The export loop further down derives its
# domain from the data in the same way.
size_domain = [float(master_df["LogSize"].min()), float(master_df["LogSize"].max())]

# Bubble layer: no data bound here; alt.layer() supplies it to both layers.
base = alt.Chart().mark_point(filled=True, fillOpacity=0.5).encode(
    y=alt.Y('PlotY', title="Semantic Space Y"),
    x=alt.X('PlotX', title="Semantic Space X"),
    size=alt.Size('LogSize', scale=alt.Scale(base=0, domain=size_domain, range=[1000, 7000])),
    color=alt.Color('Uniqueness', scale=alt.Scale(scheme='viridis')),
    tooltip=["TermID", "Name", "Frequency", "LogSize", "Uniqueness", "Dispensability"]
)

# Label layer: the term name drawn at the bubble position.
text = alt.Chart().mark_text().encode(
    y=alt.Y('PlotY', title="Semantic Space Y"),
    x=alt.X('PlotX', title="Semantic Space X"),
    color=alt.Color('Uniqueness', scale=alt.Scale(scheme='viridis')),
    text="Name"
)

# One facet row per subontology, all drawn from master_df.
plot = alt.layer(base, text, data=master_df).facet(
    row='Subontology:O'
).interactive()

In [21]:
# Display the faceted chart assigned to `plot`.
plot

In [22]:
# Plot graphs for each category of GO term separately
category = "Molecular function"
df = master_df[master_df["Subontology"] == category]

# Derive the size domain from this category's LogSize values, as the export loop
# further down does. The fixed [3.5, 6] domain left terms such as GO:0009976
# (LogSize about 2.84) outside the scale, so their bubble size was extrapolated
# below the minimum of `range`.
domain = [float(df["LogSize"].min()), float(df["LogSize"].max())]

# Bubble layer: one semi-transparent filled point per GO term.
base = alt.Chart(df).mark_point(filled=True, fillOpacity=0.5).encode(
    y=alt.Y('PlotY', title="Semantic Space Y"),
    x=alt.X('PlotX', title="Semantic Space X"),
    size=alt.Size('LogSize', scale=alt.Scale(base=0, domain=domain, range=[1000, 7000])),
    color=alt.Color('Uniqueness', scale=alt.Scale(scheme='viridis')),
    tooltip=["TermID", "Name", "Frequency", "LogSize", "Uniqueness", "Dispensability"]
)

# Label layer: the term name drawn at the bubble position.
text = alt.Chart(df).mark_text().encode(
    y=alt.Y('PlotY', title="Semantic Space Y"),
    x=alt.X('PlotX', title="Semantic Space X"),
    color=alt.Color('Uniqueness', scale=alt.Scale(scheme='viridis')),
    text="Name"
)

plot = alt.layer(base, text).properties(
    title=f"REViGO plot of top non-redundant '{category}' GO terms for K. alvarezii protein predictions",
    height=600,
    width=800
).interactive()

plot

In [23]:
# Plot graphs for each category of GO term separately
category = "Cellular component"
df = master_df[master_df["Subontology"] == category]

# Derive the size domain from this category's LogSize values, as the export loop
# further down does, so every bubble falls inside the declared size range. The
# fixed [3.5, 6] domain does not track the data.
domain = [float(df["LogSize"].min()), float(df["LogSize"].max())]

# Bubble layer: one semi-transparent filled point per GO term.
base = alt.Chart(df).mark_point(filled=True, fillOpacity=0.5).encode(
    y=alt.Y('PlotY', title="Semantic Space Y"),
    x=alt.X('PlotX', title="Semantic Space X"),
    size=alt.Size('LogSize', scale=alt.Scale(base=0, domain=domain, range=[1000, 7000])),
    color=alt.Color('Uniqueness', scale=alt.Scale(scheme='viridis')),
    tooltip=["TermID", "Name", "Frequency", "LogSize", "Uniqueness", "Dispensability"]
)

# Label layer: the term name drawn at the bubble position.
text = alt.Chart(df).mark_text().encode(
    y=alt.Y('PlotY', title="Semantic Space Y"),
    x=alt.X('PlotX', title="Semantic Space X"),
    color=alt.Color('Uniqueness', scale=alt.Scale(scheme='viridis')),
    text="Name"
)

plot = alt.layer(base, text).properties(
    title=f"REViGO plot of top non-redundant '{category}' GO terms for K. alvarezii protein predictions",
    height=600,
    width=800
).interactive()

plot

In [24]:
# Plot graphs for each category of GO term separately
category = "Biological process"
df = master_df[master_df["Subontology"] == category]

# Derive the size domain from this category's LogSize values, as the export loop
# further down does, so every bubble falls inside the declared size range. The
# fixed [3.5, 6] domain does not track the data.
domain = [float(df["LogSize"].min()), float(df["LogSize"].max())]

# Bubble layer: one semi-transparent filled point per GO term.
base = alt.Chart(df).mark_point(filled=True, fillOpacity=0.5).encode(
    y=alt.Y('PlotY', title="Semantic Space Y"),
    x=alt.X('PlotX', title="Semantic Space X"),
    size=alt.Size('LogSize', scale=alt.Scale(base=0, domain=domain, range=[1000, 7000])),
    color=alt.Color('Uniqueness', scale=alt.Scale(scheme='viridis')),
    tooltip=["TermID", "Name", "Frequency", "LogSize", "Uniqueness", "Dispensability"]
)

# Label layer: the term name drawn at the bubble position.
text = alt.Chart(df).mark_text().encode(
    y=alt.Y('PlotY', title="Semantic Space Y"),
    x=alt.X('PlotX', title="Semantic Space X"),
    color=alt.Color('Uniqueness', scale=alt.Scale(scheme='viridis')),
    text="Name"
)

plot = alt.layer(base, text).properties(
    title=f"REViGO plot of top non-redundant '{category}' GO terms for K. alvarezii protein predictions",
    height=600,
    width=800
).interactive()

plot

# Filtered output data from stringent REViGO run

In [25]:
# Parse stringent results from Gabriel's list
# Read the tab-separated whitelist and rename a "Subontoloy" column, if present,
# to "Subontology".  (192 rows in white list)
filter_df = pd.read_csv(stringent_results, sep="\t").rename(
    columns={"Subontoloy": "Subontology"}
)

# Filter original results using this whitelist: keep only rows whose TermID
# appears in the whitelist.
whitelisted = master_df["TermID"].isin(filter_df["TermID"])
master_df = master_df[whitelisted]
master_df.head() # 192 rows after filtering

Unnamed: 0,TermID,Name,Frequency,PlotX,PlotY,LogSize,Value,Uniqueness,Dispensability,Representative,Eliminated,Subontology symbol,Subontology
0,GO:0003777,microtubule motor activity,0.186775,-1.277958,-1.163526,4.741884,,0.980471,0.0,,False,MF,Molecular function
1,GO:0009976,tocopherol cyclase activity,0.002332,4.078306,7.417087,2.838849,,0.96666,0.0,,False,MF,Molecular function
5,GO:0016531,copper chaperone activity,0.008897,3.567694,7.249564,3.419956,,0.993093,0.0,,False,MF,Molecular function
9,GO:0017056,structural constituent of nuclear pore,0.06578,-5.36961,-1.436137,4.288674,,0.967328,0.0,,False,MF,Molecular function
10,GO:0042030,ATPase inhibitor activity,0.016437,-1.858998,0.421116,3.686458,,0.982641,0.0,,False,MF,Molecular function


In [27]:
# Plot graphs for each category of GO term separately
# Saving fails if the target directory is missing, so create it up front.
Path("output-plots").mkdir(parents=True, exist_ok=True)

plots = []
for category in master_df["Subontology"].unique():
    df = master_df[master_df["Subontology"] == category]
    # Size scale spans exactly this category's LogSize values.
    domain = [df["LogSize"].min(), df["LogSize"].max()]

    # Bubble layer: one semi-transparent filled point per GO term.
    base = alt.Chart(df).mark_point(filled=True, fillOpacity=0.5).encode(
        y=alt.Y('PlotY', title="Semantic Space Y"),
        x=alt.X('PlotX', title="Semantic Space X"),
        size=alt.Size('LogSize', scale=alt.Scale(base=0, domain=domain, range=[1000, 7000])),
        color=alt.Color('Uniqueness', scale=alt.Scale(scheme='viridis')),
        tooltip=["TermID", "Name", "Frequency", "LogSize", "Uniqueness", "Dispensability"]
    )

    # Label layer: the term name drawn at the bubble position.
    text = alt.Chart(df).mark_text().encode(
        y=alt.Y('PlotY', title="Semantic Space Y"),
        x=alt.X('PlotX', title="Semantic Space X"),
        color=alt.Color('Uniqueness', scale=alt.Scale(scheme='viridis')),
        text="Name"
    )

    plot = alt.layer(base, text).properties(
        title=f"REViGO plot of top non-redundant '{category}' GO terms for K. alvarezii protein predictions",
        height=600,
        width=900
    ).interactive()

    # The file name uses the short symbol of the subset's first row. The path is
    # passed as a plain string.
    plot.save(f'output-plots/Revigo-PCA-{df["Subontology symbol"].values[0]}.html')
    plots.append(plot)

In [142]:
# Display the first chart collected in `plots`.
plots[0]

In [143]:
# Display the second chart collected in `plots`.
plots[1]

In [144]:
# Display the third chart collected in `plots`.
plots[2]

### Treemap view 

In [9]:
# Parse results
# One REViGO treemap CSV export per GO subontology.
all_results = {
    "MF": "RevigoTreeMapMF.csv",
    "BP": "RevigoTreeMapBP.csv",
    "CC": "RevigoTreeMapCC.csv"
}
ontologies = {
    "MF": "Molecular function",
    "CC": "Cellular component",
    "BP": "Biological process"
}

# Fail fast with the export instructions. Previously the cell printed a hint and
# then crashed anyway inside pd.read_csv on the first missing file.
missing = [f for f in all_results.values() if not Path(f).exists()]
if missing:
    raise FileNotFoundError(
        f"No file exists for {', '.join(missing)}. "
        "\nPlease export treemap CSV data from http://revigo.irb.hr/Results.aspx."
    )

all_dfs = []
for category, f in all_results.items():
    # The first four lines of each export are skipped (presumably a preamble before
    # the header row -- confirm against the exported file).
    df = pd.read_csv(f, sep=",", skipinitialspace=True, quotechar='"', skiprows=[0,1,2,3])
    # Tag each row with its subontology so rows stay distinguishable after concat.
    df["Subontology symbol"] = category
    df["Subontology"] = ontologies[category]
    all_dfs.append(df)

# Replace missing representatives with the literal string "None". A frame-level
# fillna is used because the chained
# `master_df["Representative"].fillna(..., inplace=True)` form acts on a temporary
# column object and does not update the frame under pandas Copy-on-Write.
master_df = pd.concat(all_dfs).fillna({"Representative": "None"})
master_df.head() # 124 rows before any filtering

Unnamed: 0,TermID,Name,Frequency,Value,Uniqueness,Dispensability,Representative,Subontology symbol,Subontology
0,GO:0003777,microtubule motor activity,0.186775,,0.980471,0.0,,MF,Molecular function
1,GO:0009976,tocopherol cyclase activity,0.002332,,0.96666,0.0,,MF,Molecular function
2,GO:0016531,copper chaperone activity,0.008897,,0.993093,0.0,,MF,Molecular function
3,GO:0017056,structural constituent of nuclear pore,0.06578,,0.967328,0.0,,MF,Molecular function
4,GO:0005458,GDP-mannose transmembrane transporter activity,0.004389,,0.956222,0.159301,structural constituent of nuclear pore,MF,Molecular function


In [17]:
# Saving fails if the target directory is missing, so create it up front.
Path("output-plots").mkdir(parents=True, exist_ok=True)

plots = []
colors=['#fae588','#f79d65','#f9dc5c','#e8ac65','#e76f51','#ef233c','#b7094c'] #color palette

# Build and save one treemap per subontology present in master_df.
for category in master_df["Subontology"].unique():
    df = master_df[master_df["Subontology"] == category]

    # Hierarchy: constant root -> representative term -> term name -> GO id,
    # with tile area given by Uniqueness.
    fig = px.treemap(df, path=[
        px.Constant("all"),
        'Representative',
        'Name',
        'TermID'  # note: leave this out to reduce information displayed on map (i.e. dont display the GO terms)
    ], values='Uniqueness')

    fig.update_layout(
        title={
            'text': f"Tree Map view of REViGO results for '{category}' GO terms",
            'y':0.98,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        },
        treemapcolorway = colors, #defines the colors in the treemap
        margin = dict(t=50, l=25, r=25, b=25),
    )
    fig.update_traces(root_color="lightgrey")
    # The file name uses the short symbol of the subset's first row.
    fig.write_html(f"output-plots/Revigo-treemap-{df['Subontology symbol'].values[0]}.html")

    plots.append(fig)

In [18]:
# Render the first figure collected in `plots`.
plots[0].show()

In [110]:
# Render the first figure collected in `plots`.
plots[0].show()

In [111]:
# Render the second figure collected in `plots`.
plots[1].show()

In [112]:
# Render the third figure collected in `plots`.
plots[2].show()

### Sunburst view

In [117]:
# Saving fails if the target directory is missing, so create it up front.
Path("output-plots").mkdir(parents=True, exist_ok=True)

plots = []
colors=['#fae588','#f79d65','#f9dc5c','#e8ac65','#e76f51','#ef233c','#b7094c'] #color palette

# Build and save one sunburst per subontology present in master_df.
for category in master_df["Subontology"].unique():
    df = master_df[master_df["Subontology"] == category]

    # Hierarchy: constant root -> representative term -> term name -> GO id,
    # with sector size given by Uniqueness.
    fig = px.sunburst(df, path=[
        px.Constant("all"),
        'Representative',
        'Name',
        'TermID'
        # note: leave this out to reduce information displayed on map (i.e. dont display the GO terms)
        # maybe even make it toggle-able in the "app" somehow.
    ], values='Uniqueness')

    fig.update_layout(
        title={
            'text': f"Sunburst view of REViGO results for '{category}' GO terms",
            'y':0.98,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'
        },
        # Sunburst traces take their palette from `sunburstcolorway`; the previous
        # `treemapcolorway` only applies to treemap traces, so the palette was
        # never used here.
        sunburstcolorway = colors,
        margin = dict(t=50, l=25, r=25, b=25),
    )
    fig.update_traces(root_color="lightgrey")
    # The file name uses the short symbol of the subset's first row.
    fig.write_html(f"output-plots/Revigo-sunburst-{df['Subontology symbol'].values[0]}.html")

    plots.append(fig)

In [106]:
# Display the first figure collected in `plots`.
plots[0]

In [107]:
# Display the second figure collected in `plots`.
plots[1]

In [108]:
# Display the third figure collected in `plots`.
plots[2]

---