# Figure 4a - Treemap distribution of the mapping of detected BGCs across different databases such as MIBIG, BiG-FAM, and ARTS.
BGCs are grouped into antiSMASH known or unknown categories based on a cutoff of greater than 80% similarity to the MIBIG DB, using the KnownClusterBlast function of antiSMASH. Using BiG-SCAPE, BGCs are grouped as known or unknown depending on whether their GCF contains a MIBIG BGC, with a 0.3 cutoff on the similarity metric. Using BiG-SLICE, BGCs are checked for a match against the BiG-FAM DB of over 1.2 M BGCs detected across public genomes. Finally, the ARTS match represents whether a BGC contains resistance-related genes that can help in prioritization.

In [None]:
import plotly.express as px
import pandas as pd
import numpy as np
import yaml

from pathlib import Path

## File configurations
This first part configures the paths and tables used to create the figure.

In [None]:
# Parse the notebook settings from the YAML file in the working directory.
config_path = Path("config.yaml")
with config_path.open("r") as config_file:
    notebook_configuration = yaml.safe_load(config_file)
# Trailing expression: show the parsed settings in the cell output.
notebook_configuration

In [None]:
# Resolve the BGCflow report folder for this project and load the four input
# tables (BiG-SCAPE clusters, BiG-FAM query network, ARTS hits, antiSMASH regions).
bgcflow_dir = Path(notebook_configuration["bgcflow_dir"])
project_name = "mq_saccharopolyspora"
report_dir = bgcflow_dir / f"data/processed/{project_name}"
FIGURE = "Figure_4"

# The BiG-SCAPE cluster table is located by its file-name suffix; exactly one
# match is required. Explicit errors replace a bare assert so the check still
# runs under `python -O` and reports what was actually found.
bigscape_dir = report_dir / "bigscape/for_cytoscape_antismash_6.1.1/"
bigscape_matches = sorted(bigscape_dir.glob("*df_clusters_0.30.csv"))
if not bigscape_matches:
    raise FileNotFoundError(
        f"No '*df_clusters_0.30.csv' file found in {bigscape_dir}"
    )
if len(bigscape_matches) > 1:
    raise ValueError(
        f"Expected one '*df_clusters_0.30.csv' file in {bigscape_dir}, "
        f"found {len(bigscape_matches)}: {[p.name for p in bigscape_matches]}"
    )
bigscape = bigscape_matches[0]

bigfam = report_dir / "bigslice/query_as_6.1.1/query_network.csv"
arts = report_dir / "tables/df_arts_as-6.1.1.csv"
bgcs = report_dir / "tables/df_regions_antismash_6.1.1.csv"

df_bigscape = pd.read_csv(bigscape)
df_bigfam = pd.read_csv(bigfam)
df_arts = pd.read_csv(arts)
df_bgcs = pd.read_csv(bgcs)

## Building category for treemap
### Categorising BiG-SCAPE result into knowns and unknowns

In [None]:
# Display the BiG-SCAPE cluster table read from the *df_clusters_0.30.csv file.
df_bigscape

In [None]:
# Uses the result from the "fam_type_0.30" column: relabel each BGC's family
# type with the treemap category names.
mapping = {"known_family": "BiG-SCAPE known",
           "unknown_family": "BiG-SCAPE unknown"}
df = df_bigscape.loc[:, ["bgc_id", "bigscape_class", "fam_type_0.30", "fam_id_0.30"]].set_index("bgc_id")

# Series.map would silently turn labels missing from `mapping` into NaN, so
# keep failing with a KeyError when an unexpected label shows up.
unmapped = set(df["fam_type_0.30"].unique()) - set(mapping)
if unmapped:
    raise KeyError(f"Unexpected fam_type_0.30 value(s): {sorted(unmapped, key=str)}")

# Relabel the whole column in one vectorised step; unlike per-row .loc
# lookups this also works when a bgc_id occurs more than once in the index.
df["fam_type_0.30"] = df["fam_type_0.30"].map(mapping)
df.head()

### Categorising AntiSMASH result into knowns and unknowns

In [None]:
# Similarity cutoff (as a fraction): BGCs at or above it are labelled
# "antiSMASH known", all others "antiSMASH unknown".
antismash_cutoff = 0.80

mapping = {True: "antiSMASH known",
           False: "antiSMASH unknown"}

# Missing values are filled with 0, so BGCs without a similarity value fall
# below the cutoff.
similarity = df_bgcs.loc[:, ["bgc_id", "similarity"]].fillna(0).set_index("bgc_id")["similarity"]

# Vectorised comparison + relabel; unlike per-row .loc lookups this also works
# when a bgc_id occurs more than once in the index.
df2 = (similarity >= antismash_cutoff).map(mapping).rename("known_cluster_blast")
df2

### Categorising BiG-FAM hits

In [None]:
# Keep only the rank-0 rows of the BiG-FAM query table and flag each of those
# BGCs as a match; the result is a one-column frame indexed by bgc_id.
is_top_rank = df_bigfam.loc[:, "rank"] == 0
df3 = (
    df_bigfam[is_top_rank]
    .set_index("bgc_id")
    .assign(bigfam_hits="BiG-FAM match")
    .loc[:, ["bigfam_hits"]]
)
df3

### Categorising ARTS hits

In [None]:
# Flag every BGC listed in the ARTS table as a match; the result is a Series
# named "arts_hits" indexed by bgc_id.
# NOTE(review): "Core hits" is selected but never used to filter rows, so all
# rows of df_arts are labelled as matches - confirm that is intended.
df4 = (
    df_arts.loc[:, ["bgc_id", "Core hits"]]
    .set_index("bgc_id")
    .assign(arts_hits="ARTS2 match")
    .loc[:, "arts_hits"]
)

### Table Cleaning

In [None]:
# Join the four per-BGC annotations side by side on bgc_id. BGCs absent from
# the BiG-FAM / ARTS tables have no value there and are labelled "unknown".
df_final = (
    pd.concat([df, df2, df3, df4], axis=1)
    .fillna({"bigfam_hits": "BiG-FAM unknown", "arts_hits": "ARTS2 unknown"})
    .reset_index()
)

# Print the value distribution of every column as a quick sanity check.
for column in df_final.columns:
    print(df_final[column].value_counts())
    print("")

## Drawing treemap

In [None]:
# Count BGCs for every combination of the five category columns; the count
# column is named after `label` so it can be used directly in the treemap.
label = "Number of BGCs"
category_columns = ["fam_type_0.30", "known_cluster_blast", "bigfam_hits", "bigscape_class", "arts_hits"]
df_grouped = (
    df_final.groupby(category_columns)["bgc_id"]
    .count()
    .reset_index(name=label)
)

In [None]:
# Treemap hierarchy, outermost level first.
treemap_path = [px.Constant("all"), 'fam_type_0.30', 'known_cluster_blast', "bigfam_hits", 'arts_hits', 'bigscape_class']

# Tile size and tile colour are both driven by the BGC count.
fig = px.treemap(
    df_grouped,
    path=treemap_path,
    values=label,
    color=label,
    width=800,
    height=1200,
    color_continuous_scale='Blues',
)
fig.update_layout(
    uniformtext={"minsize": 12, "mode": 'hide'},
    margin={"t": 50, "l": 25, "r": 25, "b": 25},
)
fig.show()

In [None]:
# Export the treemap twice: as an SVG image and as an HTML page.
outfile, outfile_html = (
    Path(f"assets/figures/{FIGURE}/{FIGURE}a.svg"),
    Path(f"assets/figures/{FIGURE}/{FIGURE}a.html"),
)
# Both files live in the same folder; create it (and any parents) if needed.
outfile.parent.mkdir(parents=True, exist_ok=True)
fig.write_image(outfile)
fig.write_html(outfile_html)

## Adding labels in treemap

In [None]:
# Build one summary per BiG-SCAPE family (fam_id_0.30):
#   - "family_size": number of distinct bgc_id values in the family
#   - every other column (except the family id itself): the family's label.
# When a family carries more than one label in a column, the labels containing
# "unknown" are discarded and exactly one label must remain (asserted).
val_dict = {}
for family_id in df_final["fam_id_0.30"].unique():
    members = df_final[df_final['fam_id_0.30'] == family_id]
    summary = {}
    for column in df_final.columns:
        if 'fam_id' in column:
            continue
        if 'bgc_id' in column:
            summary["family_size"] = len(members[column].unique())
            continue
        labels = list(members[column].unique())
        if len(labels) > 1:
            labels = [entry for entry in labels if 'unknown' not in entry]
            assert len(labels) == 1
        summary[column] = labels[0]
    val_dict[family_id] = summary

In [None]:
# Copy of df_final in which every BGC takes the family-level values stored in
# val_dict for its family (fam_id_0.30); this also adds the "family_size"
# column. The former write-back of bgc_id onto itself was a no-op (df_edited
# starts as a copy of df_final) and has been removed.
df_edited = df_final.copy()
for row_label, gcf in df_final["fam_id_0.30"].items():
    for column, family_value in val_dict[gcf].items():
        df_edited.loc[row_label, column] = family_value
df_edited

In [None]:
# Recount BGCs per category combination, this time from the table with
# family-level labels (df_edited).
label = "Number of BGCs"
category_columns = ["fam_type_0.30", "known_cluster_blast", "bigfam_hits", "bigscape_class", "arts_hits"]
df_grouped = (
    df_edited.groupby(category_columns)["bgc_id"]
    .count()
    .reset_index(name=label)
)
df_grouped.head()

In [None]:
# Treemap hierarchy, outermost level first.
treemap_path = [px.Constant("all"), 'fam_type_0.30', 'known_cluster_blast', "bigfam_hits", 'arts_hits', 'bigscape_class']

# Tile size and tile colour are both driven by the BGC count.
fig = px.treemap(
    df_grouped.copy(),
    path=treemap_path,
    values=label,
    color=label,
    width=800,
    height=1200,
    color_continuous_scale='Blues',
)
fig.update_layout(
    uniformtext={"minsize": 12, "mode": 'hide'},
    margin={"t": 50, "l": 25, "r": 25, "b": 25},
)

# White theme and hover-label font.
fig.update_layout(
    hoverlabel={"font_size": 12, "font_family": "Calibri"},
    template="plotly_white",
    paper_bgcolor="white",
)

# Hover box: bold tile label, then the first customdata entry on a new line.
treemap_trace = fig.data[0]
hover_lines = ('<b>%{label}</b>', '# Number of BGCs: %{customdata[0]}')
treemap_trace.hovertemplate = '<br>'.join(hover_lines)

treemap_trace.textinfo = 'label+text+value'
fig.show()

## Figure Text

Here, various genome mining tools were used to predict BGCs and their association with databases such as MIBIG, BiGFAM-DB, and ARTS. The predicted mappings of BGCs against these different databases were used to reconstruct a treemap distribution (Figure 4a). Using antiSMASH, a total of 721 BGCs were predicted across 26 Saccharopolyspora genomes with a median of 23 BGCs per genome. The BGCs were distributed across various types such as terpenes (165 BGCs), RiPPs (121 BGCs), NRPS (89 BGCs), T1PKS (46  BGCs), other PKS types (59 BGCs), PKS-NRPS hybrids (37 BGCs), saccharides (8 BGCs) and other types (196 BGCs). Based on the antiSMASH KnownClusterBlast similarity of greater than 80%, a total of 112 BGCs were mapped to the 16 MIBIG database entries of characterized secondary metabolites. Most common hits were geosmin (31 BGCs), ectoine (26 BGCs), 2-methylisoborneol (15 BGCs), and erythreapeptin (14 BGCs) which were found across multiple species. Whereas erythromycin (5 BGCs), spinosyn (5 BGCs), flaviolin (5 BGCs), erythrochelin (5 BGCs), coelibactin (4 BGCs), and E-837 furanone (4 BGCs) were detected in specific species. 


In [None]:
# Headline numbers used in the generated figure text.
n_BGCs = len(df_final)
genome_ids = df_bigscape.genome_id
n_genomes = len(genome_ids.unique())
n_bgc_median = genome_ids.value_counts().median()

# "<class> (<n> BGCs)" entries ordered by ascending count, with "and" inserted
# before the last entry, joined into one comma-separated string.
class_counts = df_bigscape.bigscape_class.value_counts().to_dict()
class_entries = [
    f"{k} ({v} BGCs)"
    for k, v in sorted(class_counts.items(), key=lambda pair: pair[1])
]
class_entries.insert(-1, "and")
n_bigscape_class = ", ".join(class_entries)

# Regions whose similarity reaches the antiSMASH cutoff.
df_bgcs_known = df_bgcs[df_bgcs.similarity >= antismash_cutoff]

In [None]:
# Number of known BGCs per most-similar known cluster description.
df_bgcs_known_dict = df_bgcs_known.most_similar_known_cluster_description.value_counts().to_dict()

# Manually curated short names for the long cluster descriptions.
manual_correction = {"Ery-9 / Ery-6 / Ery-8 / Ery-7 / Ery-5 / Ery-4 / Ery-3" : "erythreapeptin",
                    "erythromycin A / erythromycin B / erythromycin C / erythromycin D" : "erythromycin",
                    "A83543A" : "spinosyn",
                    "flaviolin rhamnoside / 3,3'-diflaviolin / flaviolin" : "flaviolin",
                    "E-837" : "E-837 furanone"}

# Apply the short names. Counts are accumulated so that two descriptions that
# resolve to the same name add up instead of the later one overwriting the
# earlier one.
df_bgcs_known_dict_corrected = {}
for description, n_hits in df_bgcs_known_dict.items():
    compound = manual_correction.get(description, description)
    df_bgcs_known_dict_corrected[compound] = df_bgcs_known_dict_corrected.get(compound, 0) + n_hits

# "<name> (<n> BGCs)" entries, most frequent first.
df_bgcs_known_list = [f"{k} ({v} BGCs)" for k, v in sorted(df_bgcs_known_dict_corrected.items(), key=lambda x: x[1], reverse=True)]

# First four entries as one string, with "and" inserted before the last entry.
df_bgcs_known_list_top = df_bgcs_known_list[:4]
df_bgcs_known_list_top.insert(-1, "and")
df_bgcs_known_list_top = ", ".join(df_bgcs_known_list_top)

# Entries five to nine, formatted the same way.
df_bgcs_known_list_top2 = df_bgcs_known_list[4:9]
df_bgcs_known_list_top2.insert(-1, "and")
df_bgcs_known_list_top2 = ", ".join(df_bgcs_known_list_top2)

In [None]:
# Assemble the figure text from the numbers computed in the previous cells.
text1 = f"Here, various genome mining tools were used to predict BGCs and their association with databases such as MIBIG, BiGFAM-DB, and ARTS. The predicted mappings of BGCs against these different databases were used to reconstruct a treemap distribution (Figure 4a). Using antiSMASH, a total of {n_BGCs} BGCs were predicted across {n_genomes} Saccharopolyspora genomes with a median of {n_bgc_median:.0f} BGCs per genome."
text2 = f"The BGCs were distributed across various types such as {n_bigscape_class}. Based on the antiSMASH KnownClusterBlast similarity of greater than {antismash_cutoff:.0%}, a total of {len(df_bgcs_known)} BGCs were mapped to the {len(df_bgcs_known.most_similar_known_cluster_id.unique())} MIBIG database entries of characterized secondary metabolites."
text3 = f"Most common hits were {df_bgcs_known_list_top} which were found across multiple species. Whereas {df_bgcs_known_list_top2} were detected in specific species."

# The enumerations were joined with ", " after inserting a bare "and", which
# leaves "and," in the text; drop that stray comma.
final_text = " ".join([text1, text2, text3]).replace("and,", "and")

# Make sure the output folder exists, then write the text in a single call
# (writelines on a str would iterate it character by character).
text_path = Path(f"assets/figures/{FIGURE}/{FIGURE}a_text.txt")
text_path.parent.mkdir(parents=True, exist_ok=True)
with open(text_path, "w") as f:
    f.write(final_text)

## Other Exploratory Analysis

In [None]:
# Exploratory variant of the treemap with a diverging red/white/blue scale.
source = df_grouped.copy()
fig = px.treemap(source,
                 path=[px.Constant("all"), 'fam_type_0.30', 'known_cluster_blast', "bigfam_hits", 'arts_hits', 'bigscape_class'],
                 values=label, color=label,
                 width=1000, height=1000,
                 color_continuous_scale=[(0,"#ef6f6c"), (0.5,"white"), (1,"#006e90")],
                )

figure_data = fig["data"][0]

# Recolour the upper levels of the hierarchy (ids with fewer than six
# "/"-separated parts): tiles whose own label contains "unknown" get the
# largest colour value, the root gets 0 and the remaining tiles get the
# negated largest value. Ids with six parts keep their count-based colour.
# The largest value is computed once up front instead of on every iteration;
# the loop only ever writes that value, 0 or its negation.
max_color = max(figure_data['marker']['colors'], default=0)
for num, item in enumerate(figure_data['ids']):
    id_parts = item.split("/")
    depth = len(id_parts)
    if depth >= 6:
        continue
    if 'unknown' in id_parts[-1]:
        figure_data['marker']['colors'][num] = max_color
    elif depth == 1:
        figure_data['marker']['colors'][num] = 0
    else:
        figure_data['marker']['colors'][num] = -max_color

# White theme and hover-label font.
fig.update_layout(
  hoverlabel=dict(
        font_size=12,
        font_family="Calibri"
    ),
  template="plotly_white",
  paper_bgcolor="white",
)
# Hover box: bold tile label, then the first customdata entry on a new line.
fig.data[0].hovertemplate = (
  '<b>%{label}</b>'
  '<br>' +
  '# Number of BGCs: %{customdata[0]}'
)

fig.update_coloraxes(showscale=True)

fig.data[0].textinfo = 'label+text+value'
fig.show()

# Save the exploratory figure next to the main one.
outfile = Path(f"assets/figures/{FIGURE}/{FIGURE}x.svg")
outfile.parent.mkdir(parents=True, exist_ok=True)
fig.write_image(outfile)

In [None]:
# Save the combined annotation table. The folder is created first because
# to_csv does not create missing directories.
# NOTE(review): the target has no file extension ("assets/tables/Figure_4");
# confirm whether ".csv" was intended before renaming it.
table_path = Path(f"assets/tables/{FIGURE}")
table_path.parent.mkdir(parents=True, exist_ok=True)
df_final.to_csv(table_path)

In [None]:
# Shortlist the BGCs that are unknown to both BiG-SCAPE and antiSMASH but do
# have an ARTS2 match; the result is indexed by bgc_id.
df_select = df_final.set_index("bgc_id")
unknown_to_bigscape = df_select["fam_type_0.30"] == "BiG-SCAPE unknown"
unknown_to_antismash = df_select["known_cluster_blast"] == "antiSMASH unknown"
has_arts_match = df_select["arts_hits"] == "ARTS2 match"
df_select = df_select[unknown_to_bigscape & unknown_to_antismash & has_arts_match]

In [None]:
# Number of shortlisted BGCs per family (fam_id_0.30), showing the first 15 rows.
df_select.value_counts("fam_id_0.30").head(15)

In [None]:
# Display the shortlisted BGCs.
df_select

In [None]:
# Rank-0 BiG-FAM rows restricted to the shortlisted BGCs.
# Note: this rebinds df3 and df4, which held the BiG-FAM / ARTS labels earlier
# in the notebook.
top_rank_rows = df_bigfam["rank"] == 0
df3 = df_bigfam.loc[top_rank_rows]
in_shortlist = df3["bgc_id"].isin(df_select.index)
df4 = df3.loc[in_shortlist]
df4

In [None]:
# Load the BiG-SLICE GCF summary (first column as index) and show the rows for
# the GCFs that the shortlisted BGCs were assigned to.
gcf_summary_path = report_dir / "bigslice/query_as_6.1.1/gcf_summary.csv"
df_bigfam_gcf = pd.read_csv(gcf_summary_path, index_col=0)
shortlisted_gcfs = df4["gcf_id"].unique()
df_bigfam_gcf.loc[shortlisted_gcfs]

In [None]:
# Alternative treemap that goes two levels deeper: family and single BGC.
source = df_edited.copy()
# Prefix the family ids with "GCF_". The column is replaced outright rather
# than written through .loc[:, col], which would try to store strings in the
# existing column's dtype.
source["fam_id_0.30"] = [f"GCF_{fam_id}" for fam_id in source["fam_id_0.30"]]
# Every BGC contributes a weight of 1, so tile sizes equal BGC counts.
source['values'] = 1
fig = px.treemap(source,
                 path=[px.Constant("all"), 'fam_type_0.30', 'known_cluster_blast', "bigfam_hits", 'arts_hits', 'bigscape_class', 'fam_id_0.30', 'bgc_id'],
                 values="values", color="values",
                 width=1000, height=1000,
                 color_continuous_scale=[(0,"#ef6f6c"), (0.5,"white"), (1,"#006e90")],
                )

figure_data = fig["data"][0]

# Recolour the upper levels of the hierarchy (ids with fewer than six
# "/"-separated parts): tiles whose own label contains "unknown" get the
# largest colour value, the root gets 0 and the remaining tiles get the
# negated largest value. Deeper tiles keep their original colour.
# The largest value is computed once up front instead of on every iteration;
# the loop only ever writes that value, 0 or its negation.
max_color = max(figure_data['marker']['colors'], default=0)
for num, item in enumerate(figure_data['ids']):
    id_parts = item.split("/")
    depth = len(id_parts)
    if depth >= 6:
        continue
    if 'unknown' in id_parts[-1]:
        figure_data['marker']['colors'][num] = max_color
    elif depth == 1:
        figure_data['marker']['colors'][num] = 0
    else:
        figure_data['marker']['colors'][num] = -max_color

# White theme and hover-label font.
fig.update_layout(
  hoverlabel=dict(
        font_size=12,
        font_family="Calibri"
    ),
  template="plotly_white",
  paper_bgcolor="white",
)
# Hover box: bold tile label, then the first customdata entry on a new line.
fig.data[0].hovertemplate = (
  '<b>%{label}</b>'
  '<br>' +
  '# Number of BGCs: %{customdata[0]}'
)

fig.update_coloraxes(showscale=False)

fig.data[0].textinfo = 'label+text+value'

# Create the output folders before writing; neither to_csv nor write_html
# creates missing directories.
Path("assets/tables").mkdir(parents=True, exist_ok=True)
Path(f"assets/figures/{FIGURE}").mkdir(parents=True, exist_ok=True)
source.to_csv(f"assets/tables/{FIGURE}a_treemap.csv")
fig.show()
fig.write_html(f"assets/figures/{FIGURE}/{FIGURE}a_alternative.html")

In [None]:
# Inspect the largest colour value currently stored on the treemap trace.
max(figure_data['marker']['colors'])

In [None]:
# Inspect the colour value stored at position 1 of the treemap trace.
figure_data['marker']['colors'][1]

In [None]:
# One set of bgc_ids per evidence source, used as input for the Venn diagram.
# Each entry is (set name, df_edited column, label that counts as a hit).
venn_sources = (
    ("BiG-SCAPE MIBIG hits", 'fam_type_0.30', 'BiG-SCAPE known'),
    ("KnownClusterBlast hits", 'known_cluster_blast', 'antiSMASH known'),
    ("BiG-FAM hits", 'bigfam_hits', 'BiG-FAM match'),
    ("ARTS2 hits", 'arts_hits', 'ARTS2 match'),
)
venn_category = {
    set_name: set(df_edited[df_edited[column] == hit_label].bgc_id)
    for set_name, column, hit_label in venn_sources
}

In [None]:
#! pip install venn # we will use this later for visualization
from venn import venn
%matplotlib inline

In [None]:
# Draw the Venn diagram of the four hit sets in venn_category; the trailing
# expression shows the returned object in the cell output.
venn_diagram = venn(venn_category)
venn_diagram