In [None]:
import plotly.express as px
import pandas as pd
import numpy as np
import yaml

from pathlib import Path

## File configuration

In [None]:
# Parse config.yaml from the current working directory; the resulting object
# is echoed as the cell output so the active settings are visible.
with Path("config.yaml").open("r") as config_handle:
    notebook_configuration = yaml.safe_load(config_handle)
notebook_configuration

In [None]:
# --- Project locations ---------------------------------------------------
# Root directory read from the notebook configuration loaded earlier.
bgcflow_dir = Path(notebook_configuration["bgcflow_dir"])
project_name = "mq_saccharopolyspora"
report_dir = bgcflow_dir / f"data/processed/{project_name}"
# Used further down to build the figure output folder and file names.
FIGURE = "Figure_4"

# --- Input tables --------------------------------------------------------
# The BiG-SCAPE cluster table is located by file-name suffix. Exactly one
# match is required; the assertion message lists what was found so that a
# missing or ambiguous run can be diagnosed without re-globbing by hand.
bigscape_dir = report_dir / "bigscape/for_cytoscape_antismash_6.1.1/"
bigscape_matches = sorted(bigscape_dir.glob("*df_clusters_0.30.csv"))
assert len(bigscape_matches) == 1, (
    f"expected exactly one '*df_clusters_0.30.csv' in {bigscape_dir}, "
    f"found {len(bigscape_matches)}: {[match.name for match in bigscape_matches]}"
)
bigscape = bigscape_matches[0]

bigfam = report_dir / "bigslice/query_as_6.1.1/query_network.csv"
arts = report_dir / "tables/df_arts_as-6.1.1.csv"
bgcs = report_dir / "tables/df_regions_antismash_6.1.1.csv"

# One DataFrame per input table; later cells join them on "bgc_id".
df_bigscape = pd.read_csv(bigscape)
df_bigfam = pd.read_csv(bigfam)
df_arts = pd.read_csv(arts)
df_bgcs = pd.read_csv(bgcs)

In [None]:
# Display the BiG-SCAPE cluster table read from the CSV located above.
df_bigscape

In [None]:
# Display labels for the two BiG-SCAPE family types at the 0.30 cutoff.
mapping = {"known_family": "BiG-SCAPE known",
           "unknown_family": "BiG-SCAPE unknown"}
# Keep only the BiG-SCAPE columns needed downstream, indexed by BGC id.
df = df_bigscape.loc[:, ["bgc_id", "bigscape_class", "fam_type_0.30", "fam_id_0.30"]].set_index("bgc_id")
# Relabel element-wise instead of doing one `.loc` lookup per row: this is
# faster and also works when the "bgc_id" index contains duplicates. The
# lookup goes through mapping[...] so an unexpected family type (including a
# missing value) still raises KeyError rather than silently becoming NaN.
df["fam_type_0.30"] = df["fam_type_0.30"].map(lambda fam_type: mapping[fam_type])
df.head()

In [None]:
# Display labels for the thresholded "similarity" column.
mapping = {True: "antiSMASH known",
           False: "antiSMASH unknown"}
# Missing values are replaced by 0 before indexing, so a region without a
# similarity value falls below the threshold and is labelled "unknown".
df2 = df_bgcs.loc[:, ["bgc_id", "similarity"]].fillna(0).set_index("bgc_id")
# A region counts as "known" when its similarity is at least 0.8. The
# comparison yields only True/False, so the dict-based map covers every row;
# mapping the whole column at once replaces the per-row `.loc` lookups, which
# were slow and failed on a duplicated "bgc_id" index.
df2["known_cluster_blast"] = (df2["similarity"] >= 0.8).map(mapping)
# Only the label column is needed for the later concat (result is a Series).
df2 = df2.loc[:, "known_cluster_blast"]
df2

In [None]:
# Flag every BGC that has a rank-0 row in the BiG-FAM query table
# (presumably the best hit per BGC -- confirm against the BiG-SLiCE output).
rank_zero = df_bigfam["rank"] == 0
df3 = (
    df_bigfam[rank_zero]
    .set_index("bgc_id")
    .assign(bigfam_hits="BiG-FAM match")
    .loc[:, ["bigfam_hits"]]
)
df3

In [None]:
# Every BGC listed in the ARTS table is labelled as an ARTS2 match; the
# result is a single Series named "arts_hits" indexed by "bgc_id".
df4 = (
    df_arts.loc[:, ["bgc_id", "Core hits"]]
    .set_index("bgc_id")
    .assign(arts_hits="ARTS2 match")
    .loc[:, "arts_hits"]
)

In [None]:
# Join the four annotation tables column-wise on their shared "bgc_id" index.
annotation_tables = [df, df2, df3, df4]
df_final = pd.concat(annotation_tables, axis=1)
# BGCs absent from the BiG-FAM / ARTS2 tables have no label after the join;
# give them an explicit "unknown" label, then turn the index back into a column.
df_final = df_final.assign(
    bigfam_hits=df_final["bigfam_hits"].fillna("BiG-FAM unknown"),
    arts_hits=df_final["arts_hits"].fillna("ARTS2 unknown"),
).reset_index()
# Quick sanity check: value counts of every column, separated by blank lines.
for column_name in df_final.columns:
    print(df_final[column_name].value_counts(), end="\n\n")

In [None]:
# Count BGCs for every combination of the five annotation labels; the count
# column is renamed to the label that the treemap uses for values and colour.
label = "Number of BGCs"
group_keys = ["fam_type_0.30", "known_cluster_blast", "bigfam_hits", "bigscape_class", "arts_hits"]
df_grouped = (
    df_final.groupby(group_keys)["bgc_id"]
    .count()
    .reset_index()
    .rename(columns={"bgc_id": label})
)

In [None]:
# Treemap of the per-BGC annotation counts, nested from the constant root
# "all" down to the BiG-SCAPE class; tile size and colour both use the count.
treemap_path = [
    px.Constant("all"),
    "known_cluster_blast",
    "fam_type_0.30",
    "bigfam_hits",
    "arts_hits",
    "bigscape_class",
]
fig = px.treemap(
    df_grouped,
    path=treemap_path,
    values=label,
    color=label,
    width=800,
    height=1000,
    color_continuous_scale="RdBu",
)
fig.update_layout(margin={"t": 25, "l": 25, "r": 25, "b": 25})
fig.show()

In [None]:
# Make sure the figure folder exists before exporting; on a fresh checkout
# the export would otherwise fail because the directory is missing.
Path(f"assets/figures/{FIGURE}").mkdir(parents=True, exist_ok=True)
# Static SVG export of the treemap currently bound to `fig`.
fig.write_image(f"assets/figures/{FIGURE}/{FIGURE}a.svg")
# Echo the column names of the joined table for reference.
df_final.columns

In [None]:
# Build one consensus record per BiG-SCAPE family (fam_id_0.30):
#   * "family_size" -- number of distinct bgc_id values in the family
#   * every other column -- the single value shared by the family; when the
#     members disagree, values containing "unknown" are discarded and exactly
#     one value must remain (enforced by the assertion).
val_dict = {}
for family_id in df_final["fam_id_0.30"].unique():
    members = df_final[df_final["fam_id_0.30"] == family_id]
    family_summary = {}
    for column in df_final.columns:
        if "fam_id" in column:
            # The family id is the dictionary key, not part of the record.
            continue
        if "bgc_id" in column:
            family_summary["family_size"] = len(members[column].unique())
            continue
        observed = list(members[column].unique())
        if len(observed) > 1:
            observed = [entry for entry in observed if "unknown" not in entry]
            assert len(observed) == 1
        family_summary[column] = observed[0]
    val_dict[family_id] = family_summary

In [None]:
# Propagate the per-family consensus values from val_dict to every member
# BGC. val_dict maps family id -> {column: value}; turned into a frame it has
# one row per family and one column per consensus field (including
# "family_size"), so each field can be looked up for all rows at once via the
# family id instead of writing one cell at a time with `.loc`.
family_consensus = pd.DataFrame.from_dict(val_dict, orient="index")
df_edited = df_final.copy()
for column in family_consensus.columns:
    # Existing annotation columns are overwritten in place; "family_size" is
    # new and gets appended. Because the column is filled in one step it keeps
    # an integer dtype when every row's family id is present in val_dict.
    df_edited[column] = df_final["fam_id_0.30"].map(family_consensus[column])
df_edited

In [None]:
# Same aggregation as for the per-BGC table, now on the family-consensus
# labels: count BGCs per combination of the five annotation columns.
label = "Number of BGCs"
group_keys = ["fam_type_0.30", "known_cluster_blast", "bigfam_hits", "bigscape_class", "arts_hits"]
df_grouped = (
    df_edited.groupby(group_keys)["bgc_id"]
    .count()
    .reset_index()
    .rename(columns={"bgc_id": label})
)
df_grouped.head()

In [None]:
# Treemap of the family-consensus counts, nested as
# all > BiG-SCAPE family type > KnownClusterBlast > BiG-FAM > ARTS2 > BiG-SCAPE class.
source = df_grouped.copy()
fig = px.treemap(
    source,
    path=[px.Constant("all"), 'fam_type_0.30', 'known_cluster_blast', "bigfam_hits", 'arts_hits', 'bigscape_class'],
    values=label, color=label,
    width=1000, height=1000,
    color_continuous_scale=[(0, "#ef6f6c"), (0.5, "white"), (1, "#006e90")],
)

figure_data = fig["data"][0]

# Override the colour of every node above the deepest level (ids with fewer
# than six "/"-separated parts):
#   * last path part contains "unknown" -> +max colour value
#   * the root (a single path part)     -> 0
#   * anything else                     -> -max colour value
# Presumably this pins "unknown" branches to one end of the colour scale,
# everything else to the other end and the root to the midpoint -- confirm on
# the rendered figure.
# The maximum is taken once up front: the loop only ever writes +max, 0 or
# -max, and never touches the six-part nodes, so recomputing it on every
# iteration (as before) only added a full scan per node.
# NOTE(review): this assumes the largest colour value sits on a six-part node,
# which holds if plotly colours parents by a weighted mean of their children.
top_color = max(figure_data['marker']['colors'], default=0)
for num, item in enumerate(figure_data['ids']):
    parts = item.split("/")
    depth = len(parts)
    if depth >= 6:
        continue
    if 'unknown' in parts[-1]:
        figure_data['marker']['colors'][num] = top_color
    elif depth == 1:
        figure_data['marker']['colors'][num] = 0
    else:
        figure_data['marker']['colors'][num] = -top_color

fig.update_layout(
    hoverlabel=dict(
        font_size=12,
        font_family="Calibri"
    ),
    template="plotly_white",
    paper_bgcolor="white",
)
fig.data[0].hovertemplate = (
    '<b>%{label}</b>'
    '<br>' +
    '# Number of BGCs: %{customdata[0]}'
)

# The colour bar is hidden because the colours were overridden by hand above.
fig.update_coloraxes(showscale=False)

fig.data[0].textinfo = 'label+text+value'
fig.show()
# Make sure the figure folder exists so the export also works on a fresh checkout.
Path(f"assets/figures/{FIGURE}").mkdir(parents=True, exist_ok=True)
fig.write_image(f"assets/figures/{FIGURE}/{FIGURE}x.svg")

In [None]:
# BGCs that are unknown to both BiG-SCAPE and antiSMASH but do have an ARTS2
# match, taken from the per-BGC table and indexed by BGC id.
df_select = df_final.set_index("bgc_id")
unknown_with_arts = (
    df_select["fam_type_0.30"].eq("BiG-SCAPE unknown")
    & df_select["known_cluster_blast"].eq("antiSMASH unknown")
    & df_select["arts_hits"].eq("ARTS2 match")
)
df_select = df_select[unknown_with_arts]

In [None]:
# The 15 most frequent family ids (fam_id_0.30) among the selected BGCs.
df_select.value_counts("fam_id_0.30").head(15)

In [None]:
# Display the selected BGCs (the filtered table built two cells above).
df_select

In [None]:
# Rank-0 BiG-FAM rows restricted to the BGCs kept in df_select.
# NOTE(review): this rebinds df3 and df4, which an earlier cell feeds into
# pd.concat to build df_final -- re-running that cell after this one would
# pick up these frames instead of the original label tables.
df3 = df_bigfam[df_bigfam["rank"] == 0]
df4 = df3[df3.bgc_id.isin(df_select.index)]
# The cell previously ended in a dangling "df4." (a syntax error); show a
# preview of the filtered hits instead.
df4.head()

In [None]:
# Look for the GCF summary under report_dir, in the same BiG-SLiCE query
# folder the query network table is read from, so the cell does not depend on
# the notebook's working directory. Fall back to the previously hard-coded
# relative path when the file is not found there.
gcf_summary_path = report_dir / "bigslice/query_as_6.1.1/gcf_summary.csv"
if not gcf_summary_path.exists():
    gcf_summary_path = Path("../bigslice/query_as_6.1.1/gcf_summary.csv")
df_bigfam_gcf = pd.read_csv(gcf_summary_path, index_col=0)
# Summary rows of the GCFs hit by the selected BGCs (ids taken from df4.gcf_id).
df_bigfam_gcf.loc[df4.gcf_id.unique()]

In [None]:
# BGC-level treemap: the same hierarchy as the consensus treemap, extended by
# the family id and the individual BGC id, with every BGC weighted as 1.
source = df_edited.copy()
# Prefix the family id with "GCF_". The column is replaced as a whole rather
# than written through `.loc[:, col]`, which would try to store strings in the
# existing column's dtype; the f-string keeps the exact text formatting.
source["fam_id_0.30"] = source["fam_id_0.30"].map(lambda family_id: f"GCF_{family_id}")
source['values'] = 1
fig = px.treemap(
    source,
    path=[px.Constant("all"), 'fam_type_0.30', 'known_cluster_blast', "bigfam_hits", 'arts_hits', 'bigscape_class', 'fam_id_0.30', 'bgc_id'],
    values="values", color="values",
    width=1000, height=1000,
    color_continuous_scale=[(0, "#ef6f6c"), (0.5, "white"), (1, "#006e90")],
)

figure_data = fig["data"][0]

# Override the colour of the upper levels (ids with fewer than six
# "/"-separated parts):
#   * last path part contains "unknown" -> +max colour value
#   * the root (a single path part)     -> 0
#   * anything else                     -> -max colour value
# Deeper nodes (class, family and BGC tiles) keep the colour assigned by
# plotly. The maximum is taken once up front: the loop only writes +max, 0 or
# -max and leaves the deeper nodes untouched, so recomputing it per node (as
# before) only added a full scan on every iteration.
# NOTE(review): this assumes the largest colour value also occurs on a deeper
# node -- confirm against the rendered figure.
top_color = max(figure_data['marker']['colors'], default=0)
for num, item in enumerate(figure_data['ids']):
    parts = item.split("/")
    depth = len(parts)
    if depth >= 6:
        continue
    if 'unknown' in parts[-1]:
        figure_data['marker']['colors'][num] = top_color
    elif depth == 1:
        figure_data['marker']['colors'][num] = 0
    else:
        figure_data['marker']['colors'][num] = -top_color

fig.update_layout(
    hoverlabel=dict(
        font_size=12,
        font_family="Calibri"
    ),
    template="plotly_white",
    paper_bgcolor="white",
)
fig.data[0].hovertemplate = (
    '<b>%{label}</b>'
    '<br>' +
    '# Number of BGCs: %{customdata[0]}'
)

# The colour bar is hidden because the colours were overridden by hand above.
fig.update_coloraxes(showscale=False)

fig.data[0].textinfo = 'label+text+value'
# Make sure both output folders exist so the exports work on a fresh checkout.
Path("assets/tables").mkdir(parents=True, exist_ok=True)
Path("assets/figures").mkdir(parents=True, exist_ok=True)
source.to_csv("assets/tables/network.csv")
fig.show()
fig.write_html("assets/figures/Fig_4.html")

In [None]:
# Largest colour value currently stored on the trace bound to figure_data.
max(figure_data['marker']['colors'])

In [None]:
# Colour value at position 1 of the trace's marker colours (spot check).
figure_data['marker']['colors'][1]

In [None]:
# For each annotation source: the column holding its label and the label value
# that counts as a hit. The Venn sets are the BGC ids carrying that label in
# the family-consensus table.
venn_criteria = {
    "BiG-SCAPE MIBIG hits": ("fam_type_0.30", "BiG-SCAPE known"),
    "KnownClusterBlast hits": ("known_cluster_blast", "antiSMASH known"),
    "BiG-FAM hits": ("bigfam_hits", "BiG-FAM match"),
    "ARTS2 hits": ("arts_hits", "ARTS2 match"),
}
venn_category = {
    title: set(df_edited[df_edited[column] == hit_label].bgc_id)
    for title, (column, hit_label) in venn_criteria.items()
}

In [None]:
#! pip install venn # we will use this later for visualization
from venn import venn
%matplotlib inline

In [None]:
# Draw the Venn diagram of the hit sets collected in venn_category; the
# object returned by venn() is kept and echoed as the cell output.
venn_diagram = venn(venn_category)
venn_diagram

In [None]:
# Scratch example: a minimal two-level treemap used to inspect the structure
# of the trace object that plotly express builds.
# The toy table gets its own name so it no longer overwrites `df`, the
# BiG-SCAPE annotation frame that the df_final cell concatenates. pandas,
# numpy and plotly.express are already imported in the first cell, so the
# duplicate imports were dropped.
# NOTE(review): `fig` and `figure_data` are still rebound here on purpose --
# the inspection cells below display this toy trace.
toy_df = pd.DataFrame({'Parent': ['Alba', 'John', 'John', 'Alba'],
                       'Child': ['John', 'Alba', 'Jane', 'Mark']})

# Create the treemap
fig = px.treemap(toy_df, path=['Parent', 'Child'])
figure_data = fig["data"][0]

In [None]:
# Display the trace object currently bound to figure_data (the toy treemap
# from the previous cell when the notebook is run top to bottom).
figure_data

In [None]:
# Display the marker colours stored on the trace bound to figure_data.
figure_data['marker']['colors']

In [None]:
# Display the table last bound to `source` (the BGC-level treemap input when
# the notebook is run top to bottom).
source