In [43]:
import math

import numpy as np
import pandas as pd


In [44]:
# Annotated gene table; every row is tagged with the "-log_10(padj)<=0.01"
# label that later cells use to recognise these genes after the merge.
df = pd.read_table(
    "each_gene_in_one_row.tsv",
    index_col=False,
    dtype={"padj": float},
).assign(Significance="-log_10(padj)<=0.01")

# Full result table; every row starts out tagged "Not significant".
df_all = pd.read_table(
    "shrinked_not-filtered.tsv",
    index_col=False,
    dtype={"padj": float},
).assign(Significance="Not significant")

df_all

Unnamed: 0,Genes,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,Significance
0,gbgene898,50282.293342,3.186651,0.375224,8.492814,2.016921e-17,2.716995e-13,Not significant
1,gbgene18399,4719.410642,5.472171,0.660615,8.307088,9.807912e-17,1.321224e-12,Not significant
2,gbgene7672,57572.198955,-4.537037,0.952829,-7.383063,1.546893e-13,2.083820e-09,Not significant
3,gbgene2487,98.963962,-1.061089,0.868373,-7.245668,4.303125e-13,5.796739e-09,Not significant
4,gbgene15915,36591.453440,-4.502580,0.953318,-7.178496,7.048255e-13,9.494705e-09,Not significant
...,...,...,...,...,...,...,...,...
17003,gbgene20331,1.848771,-0.198827,0.937246,-0.215010,8.297598e-01,,Not significant
17004,gbgene20332,1.910325,0.253916,0.974964,0.263201,7.923954e-01,,Not significant
17005,gbgene20333,1.208407,0.025135,0.950943,0.026113,9.791675e-01,,Not significant
17006,gbgene20336,0.461617,-0.642723,0.833875,-0.698582,4.848132e-01,,Not significant


In [45]:
# Left join on the gene name: every row of df_all is kept, and rows without a
# match in df get NaN in the columns that come from df. Column names present
# in both frames stay unchanged for df_all and receive a "_y" suffix for df.
data = pd.merge(
    left=df_all,
    right=df,
    how="left",
    on="Genes",
    suffixes=("", "_y"),
    copy=True,
)
data

Unnamed: 0,Genes,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,Significance,baseMean_y,log2FoldChange_y,...,Significant_MF,Expected_MF,Rank_in_classicFisher_MF,classicFisher_MF,elimFisher_MF,weight01Fisher_MF,parentchildFisher_MF,weightFisher_MF,leaFisher_MF,Significance_y
0,gbgene898,50282.293342,3.186651,0.375224,8.492814,2.016921e-17,2.716995e-13,Not significant,50282.293342,3.186651,...,,,,,,,,,,-log_10(padj)<=0.01
1,gbgene18399,4719.410642,5.472171,0.660615,8.307088,9.807912e-17,1.321224e-12,Not significant,4719.410642,5.472171,...,,,,,,,,,,-log_10(padj)<=0.01
2,gbgene7672,57572.198955,-4.537037,0.952829,-7.383063,1.546893e-13,2.083820e-09,Not significant,57572.198955,-4.537037,...,,,,,,,,,,-log_10(padj)<=0.01
3,gbgene2487,98.963962,-1.061089,0.868373,-7.245668,4.303125e-13,5.796739e-09,Not significant,98.963962,-1.061089,...,2,0.85;0.77,27;25,0.208;0.181,0.208;0.181,0.208;0.181,0.157;0.0213,0.208;0.181,0.208;0.181,-log_10(padj)<=0.01
4,gbgene15915,36591.453440,-4.502580,0.953318,-7.178496,7.048255e-13,9.494705e-09,Not significant,36591.453440,-4.502580,...,,,,,,,,,,-log_10(padj)<=0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17003,gbgene20331,1.848771,-0.198827,0.937246,-0.215010,8.297598e-01,,Not significant,,,...,,,,,,,,,,
17004,gbgene20332,1.910325,0.253916,0.974964,0.263201,7.923954e-01,,Not significant,,,...,,,,,,,,,,
17005,gbgene20333,1.208407,0.025135,0.950943,0.026113,9.791675e-01,,Not significant,,,...,,,,,,,,,,
17006,gbgene20336,0.461617,-0.642723,0.833875,-0.698582,4.848132e-01,,Not significant,,,...,,,,,,,,,,


In [46]:
def significator(row_val):
    """Turn one cell of the merged ``Significance_y`` column into a label.

    A cell counts as missing when it does not compare equal to itself, which
    is how NaN behaves. Every other value (including None and empty strings)
    counts as present.

    Returns:
        "Not significant" for a missing cell, "-log_10(padj)<=0.01" otherwise.
    """
    is_missing = row_val != row_val
    if is_missing:
        return "Not significant"
    return "-log_10(padj)<=0.01"


# A gene is labelled "-log_10(padj)<=0.01" when the left merge found it in the
# annotated table (its Significance_y is present), "Not significant" otherwise.
# Done on the whole column at once instead of a row-wise apply.
data["Significance"] = np.where(
    data["Significance_y"].notna(), "-log_10(padj)<=0.01", "Not significant"
)


# Drop the "_y" duplicates the merge created for columns shared by both tables
cols = [
    "baseMean_y",
    "log2FoldChange_y",
    "lfcSE_y",
    "stat_y",
    "pvalue_y",
    "Significance_y",
    "padj_y",
]
data = data.drop(columns=cols)


# -log10 of the adjusted p-value, vectorised.
# math.log10 raises ValueError for padj == 0, so zeros are lifted to the
# smallest positive normal float first (about 307.65 after the transform).
# A missing padj stays NaN.
data["-log_10_padj"] = -np.log10(data["padj"].clip(lower=np.finfo(float).tiny))

# Statistics that are blanked out for genes labelled "Not significant".
# log2FoldChange and -log_10_padj are deliberately not in this list, so they
# keep their values for every gene.
nanable_cols = [
    "baseMean",
    "lfcSE",
    "stat",
    "pvalue",
    "padj",
]

not_significant = data["Significance"] == "Not significant"
data.loc[not_significant, nanable_cols] = np.nan

# Every remaining NaN becomes an empty string; the hover-text cell further
# down skips empty cells. Side effect: numeric columns that contained NaN now
# hold a mix of numbers and "" (object dtype).
data = data.fillna("")
data

Unnamed: 0,Genes,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,Significance,transcript_ID,FlyBase_ID,...,Significant_MF,Expected_MF,Rank_in_classicFisher_MF,classicFisher_MF,elimFisher_MF,weight01Fisher_MF,parentchildFisher_MF,weightFisher_MF,leaFisher_MF,-log_10_padj
0,gbgene898,50282.293342,3.186651,0.375224,8.492814,0.0,0.0,-log_10(padj)<=0.01,gbgene898.t1,FBgn0053126,...,,,,,,,,,,12.565911
1,gbgene18399,4719.410642,5.472171,0.660615,8.307088,0.0,0.0,-log_10(padj)<=0.01,gbgene18399.t1,FBgn0000042,...,,,,,,,,,,11.879024
2,gbgene7672,57572.198955,-4.537037,0.952829,-7.383063,0.0,0.0,-log_10(padj)<=0.01,gbgene7672.t1,,...,,,,,,,,,,8.68114
3,gbgene2487,98.963962,-1.061089,0.868373,-7.245668,0.0,0.0,-log_10(padj)<=0.01,gbgene2487.t1,FBgn0030592,...,2,0.85;0.77,27;25,0.208;0.181,0.208;0.181,0.208;0.181,0.157;0.0213,0.208;0.181,0.208;0.181,8.236816
4,gbgene15915,36591.45344,-4.502580,0.953318,-7.178496,0.0,0.0,-log_10(padj)<=0.01,gbgene15915.t1,,...,,,,,,,,,,8.022519
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17003,gbgene20331,,-0.198827,,,,,Not significant,,,...,,,,,,,,,,
17004,gbgene20332,,0.253916,,,,,Not significant,,,...,,,,,,,,,,
17005,gbgene20333,,0.025135,,,,,Not significant,,,...,,,,,,,,,,
17006,gbgene20336,,-0.642723,,,,,Not significant,,,...,,,,,,,,,,


In [57]:
# Maps column names of `data` to the human-readable labels shown in the hover
# text. The hover-text cell looks every column up here with `.get`, so a
# column that is not listed is left out of the hover text.
hover_cols = {
    # The gene name is rendered as a bold header, hence the empty label
    "Genes": "",
    # Per-gene statistics
    "baseMean": "Base mean",
    "log2FoldChange": "Log2 fold change",
    "lfcSE": "Log fold change SE",
    "stat": "Stat",
    "pvalue": "P-value",
    "padj": "Adj p-value",
    "-log_10_padj": "-log10 adj p-value",
    # Identifiers and cross-references
    "transcript_ID": "Transcript ID",
    "FlyBase_ID": "FlyBase ID",
    "FlyBase_reverse_hits_IDs": "FlyBase reverse hits IDs",
    "FlyBase_symbol_name": "FlyBase symbol name",
    "gbue11": "gbue11",
    "gbue11_revhits": "gbue11 revhits",
    "orthodb": "Orthodb",
    "orthodb_revhits": "Orthodb revhits",
    "Annotated_Gene_Ontology_terms": "Annotated Gene Ontology terms",
    "FlyBase_reference_ID1": "FlyBase reference ID 1",
    "FlyBase_reference_ID2": "FlyBase reference ID 2",
    "FlyBase_Annotation_Symbol_ID1": "FlyBase Annotation Symbol ID 1",
    "FlyBase_Annotation_Symbol_ID2": "FlyBase Annotation Symbol ID 2",
    "FlyMine_ID1": "FlyMine ID 1",
    "FlyMine_ID2": "FlyMine ID 2",
    "GB_protein_ID1": "GB protein ID 1",
    "GB_protein_ID2": "GB protein ID 2",
    "GB_protein_ID3": "GB protein ID 3",
    "GB_protein_ID4": "GB protein ID 4",
    "GB_protein_ID5": "GB protein ID 5",
    "NCBI_Reference_Sequence_ID1": "NCBI Reference Sequence ID 1",
    "NCBI_Reference_Sequence_ID2": "NCBI Reference Sequence ID 2",
    "UniProt_Swiss-Prot_ID": "UniProt Swiss-Prot ID",
    "UniProt_TrEMBL_ID1": "UniProt TrEMBL ID 1",
    "UniProt_TrEMBL_ID2": "UniProt TrEMBL ID 2",
    "modMine": "ModMine",
    "modMine_2": "ModMine 2",
    # GO IDs and terms, one pair per BP / CC / MF column suffix
    "GO_ID_BP": "BP GO IDs",
    "Term_BP": "BP terms",
    "GO_ID_CC": "CC GO IDs",
    "Term_CC": "CC terms",
    "GO_ID_MF": "MF GO IDs",
    "Term_MF": "MF terms",
}

In [60]:
# Build one HTML hover text per row of `data`, in row order.
hover_texts: list[str] = []
for _, row in data.iterrows():
    # Lines of this row's hover text; they are joined with <br> at the end,
    # so no trailing <br> has to be stripped afterwards.
    hover_lines: list[str] = []
    for col_name, row_value in row.items():
        # Present the gene name as a bold header line
        if col_name == "Genes":
            hover_lines.append(f"<b>{row_value}</b>")
            continue
        hr_colname = hover_cols.get(col_name)  # e.g. "Base mean"
        # Skip columns that have no hover label
        if not hr_colname:
            continue
        # Skip cells with nothing to show. The check is explicit (missing or
        # empty string) instead of a truthiness test, so a legitimate value
        # of 0 is still displayed.
        if pd.isna(row_value) or row_value == "":
            continue
        hover_lines.append(f"{hr_colname}: {row_value}")
    hover_texts.append("<br>".join(hover_lines))

In [75]:
# Add the hover texts as a column to the whole df so they go along with their
# rows when the frame is split below.
# DataFrame.insert raises ValueError if the column already exists, which made
# this cell fail when it was run a second time; overwrite the column then.
if "hover_text" in data.columns:
    data["hover_text"] = hover_texts
else:
    data.insert(2, "hover_text", hover_texts)

# Split by the label stored in the "Significance" column
df_not_sig = data[data["Significance"] == "Not significant"]
df_sig = data[data["Significance"] == "-log_10(padj)<=0.01"]
df_sig

Unnamed: 0,Genes,baseMean,hover_text,log2FoldChange,lfcSE,stat,pvalue,padj,Significance,transcript_ID,...,Significant_MF,Expected_MF,Rank_in_classicFisher_MF,classicFisher_MF,elimFisher_MF,weight01Fisher_MF,parentchildFisher_MF,weightFisher_MF,leaFisher_MF,-log_10_padj
0,gbgene898,50282.293342,<b>gbgene898</b><br>Base mean: 50282.293342262...,3.186651,0.375224,8.492814,0.0,0.0,-log_10(padj)<=0.01,gbgene898.t1,...,,,,,,,,,,12.565911
1,gbgene18399,4719.410642,<b>gbgene18399</b><br>Base mean: 4719.41064222...,5.472171,0.660615,8.307088,0.0,0.0,-log_10(padj)<=0.01,gbgene18399.t1,...,,,,,,,,,,11.879024
2,gbgene7672,57572.198955,<b>gbgene7672</b><br>Base mean: 57572.19895476...,-4.537037,0.952829,-7.383063,0.0,0.0,-log_10(padj)<=0.01,gbgene7672.t1,...,,,,,,,,,,8.68114
3,gbgene2487,98.963962,<b>gbgene2487</b><br>Base mean: 98.96396230428...,-1.061089,0.868373,-7.245668,0.0,0.0,-log_10(padj)<=0.01,gbgene2487.t1,...,2,0.85;0.77,27;25,0.208;0.181,0.208;0.181,0.208;0.181,0.157;0.0213,0.208;0.181,0.208;0.181,8.236816
4,gbgene15915,36591.45344,<b>gbgene15915</b><br>Base mean: 36591.4534399...,-4.502580,0.953318,-7.178496,0.0,0.0,-log_10(padj)<=0.01,gbgene15915.t1,...,,,,,,,,,,8.022519
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,gbgene11592,1301.303916,<b>gbgene11592</b><br>Base mean: 1301.30391571...,2.320423,0.462493,5.016983,0.000001,0.007071,-log_10(padj)<=0.01,gbgene11592.t1,...,,,,,,,,,,2.150529
97,gbgene14914,36.348107,<b>gbgene14914</b><br>Base mean: 36.3481070818...,2.976539,0.590946,4.997882,0.000001,0.007808,-log_10(padj)<=0.01,gbgene14914.t1,...,2,0.85;0.77,27;25,0.208;0.181,0.208;0.181,0.208;0.181,0.157;0.0213,0.208;0.181,0.208;0.181,2.107445
98,gbgene6556,276.704689,<b>gbgene6556</b><br>Base mean: 276.7046886001...,-3.991248,0.806298,-4.986408,0.000001,0.008286,-log_10(padj)<=0.01,gbgene6556.t1;gbgene6556.t2,...,1,0.2;0.28;0.25,24;32;28,0.18;0.246;0.221,0.18;0.246;0.221,0.18;0.246;0.221,0.5345;0.0333;0.0296,0.18;0.246;0.221,0.18;0.246;0.221,2.081638
99,gbgene883,135.764437,<b>gbgene883</b><br>Base mean: 135.76443714879...,3.094281,0.619886,4.981549,0.000001,0.008497,-log_10(padj)<=0.01,gbgene883.t1,...,1,0.06,13,0.062,0.062,0.062,1,0.062,0.062,2.070728


In [76]:
# Display the rows whose "Significance" label is "Not significant"
df_not_sig

Unnamed: 0,Genes,baseMean,hover_text,log2FoldChange,lfcSE,stat,pvalue,padj,Significance,transcript_ID,...,Significant_MF,Expected_MF,Rank_in_classicFisher_MF,classicFisher_MF,elimFisher_MF,weight01Fisher_MF,parentchildFisher_MF,weightFisher_MF,leaFisher_MF,-log_10_padj
101,gbman.GBUE000615-PA.1,,<b>gbman.GBUE000615-PA.1</b><br>Log2 fold chan...,-4.222245,,,,,Not significant,,...,,,,,,,,,,1.995378
102,gbgene13992,,<b>gbgene13992</b><br>Log2 fold change: -4.136...,-4.136911,,,,,Not significant,,...,,,,,,,,,,1.975924
103,gbgene7742,,<b>gbgene7742</b><br>Log2 fold change: -4.1864...,-4.186480,,,,,Not significant,,...,,,,,,,,,,1.923486
104,gbgene7728,,<b>gbgene7728</b><br>Log2 fold change: -1.1289...,-1.128962,,,,,Not significant,,...,,,,,,,,,,1.917184
105,gbgene2091,,<b>gbgene2091</b><br>Log2 fold change: 2.69263...,2.692637,,,,,Not significant,,...,,,,,,,,,,1.874355
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17003,gbgene20331,,<b>gbgene20331</b><br>Log2 fold change: -0.198...,-0.198827,,,,,Not significant,,...,,,,,,,,,,
17004,gbgene20332,,<b>gbgene20332</b><br>Log2 fold change: 0.2539...,0.253916,,,,,Not significant,,...,,,,,,,,,,
17005,gbgene20333,,<b>gbgene20333</b><br>Log2 fold change: 0.0251...,0.025135,,,,,Not significant,,...,,,,,,,,,,
17006,gbgene20336,,<b>gbgene20336</b><br>Log2 fold change: -0.642...,-0.642723,,,,,Not significant,,...,,,,,,,,,,


In [79]:
import plotly.graph_objects as go

fig = go.Figure()

# One marker trace per significance group: (legend name, rows of that group).
# The order fixes the order of the traces and of the legend entries.
for trace_name, subset in (
    ("padj<=0.01", df_sig),
    ("Not significant", df_not_sig),
):
    fig.add_trace(
        go.Scatter(
            # Missing values were replaced by "" earlier, so the columns can
            # mix numbers with empty strings. Coerce them back to numeric
            # (empty strings become NaN) so both axes only receive numbers.
            x=pd.to_numeric(subset["log2FoldChange"], errors="coerce"),
            y=pd.to_numeric(subset["-log_10_padj"], errors="coerce"),
            mode="markers",
            hovertext=subset["hover_text"].tolist(),
            hoverinfo="text",
            name=trace_name,
        )
    )

# Title and axis labels so the exported figure can be read on its own
fig.update_layout(
    title_text="Volcano plot",
    xaxis_title="log2 fold change",
    yaxis_title="-log10(adjusted p-value)",
)
fig.write_html("fig.html", config={"displaylogo": False})