## Figure 5. Expanded similarity network with additional connections to different BGC knowledgebases
This notebook integrates the BiG-SCAPE similarity network with additional information from antiSMASH KnownClusterBlast, BiG-FAM, and ARTS.

In [None]:
import pandas as pd
import networkx as nx
import plotly.graph_objects as go
import numpy as np
import scipy.spatial as spatial

from pathlib import Path
import json, yaml
import shutil

In [None]:
def create_node_trace(G, node_trace_category, color, showtextlabel=False, nodesize=10, nodeopacity=0.8,
                      nodesymbol="circle", linewidth=1, linecolor="black", textposition="top center", showlegend=False):
    """Build a Plotly scatter trace for all nodes of ``G`` in one category.

    A node is selected when its ``node_trace`` attribute equals
    ``node_trace_category``. Its ``pos`` attribute supplies the x/y
    coordinates and its ``text`` attribute the label / hover text.

    Args:
        G: Graph whose nodes carry ``node_trace``, ``pos`` and ``text`` attributes.
        node_trace_category: Category to select; also used as the trace name.
        color: Marker fill colour.
        showtextlabel: Draw the text next to the marker when true.
        nodesize, nodeopacity, nodesymbol: Marker size, opacity and symbol.
        linewidth, linecolor: Marker outline width and colour.
        textposition: Placement of the text relative to the marker.
        showlegend: Whether the trace gets a legend entry.

    Returns:
        A ``go.Scatter`` trace (empty when no node matches the category).
    """
    selected = [name for name, attrs in G.nodes(data=True)
                if attrs["node_trace"] == node_trace_category]

    # reshape(-1, 2) keeps the column slicing valid even when nothing is selected
    coords = np.array([G.nodes[name]['pos'] for name in selected]).reshape(-1, 2)
    labels = np.array([G.nodes[name]['text'] for name in selected])

    marker_style = dict(
        symbol=nodesymbol,
        opacity=nodeopacity,
        showscale=False,
        color=color,
        size=nodesize,
        line=dict(width=linewidth, color=linecolor),
    )
    return go.Scatter(
        x=coords[:, 0].tolist(),
        y=coords[:, 1].tolist(),
        text=labels.tolist(),
        textposition=textposition,
        mode="markers+text" if showtextlabel else "markers",
        hoverinfo='text',
        name=node_trace_category,
        showlegend=showlegend,
        marker=marker_style,
    )

In [None]:
def create_edge_trace(Graph, name, showlegend=False, color='#888', width=0.5, opacity=0.8):
    """Build a Plotly line trace for all edges of ``Graph`` with one relation type.

    An edge is selected when its ``relation_type`` attribute equals ``name``.
    End-point coordinates are read from the ``pos`` attribute of the nodes.

    Args:
        Graph: Graph whose edges carry ``relation_type`` and whose nodes carry ``pos``.
        name: Relation type to select; also used as the trace name.
        showlegend: Whether the trace gets a legend entry.
        color: Line colour.
        width: Line width.
        opacity: Trace opacity.

    Returns:
        A ``go.Scatter`` trace in ``lines`` mode (empty when no edge matches).
    """
    # Filter on the graph that was passed in, not on a notebook-level global.
    edges = np.array([edge for edge in Graph.edges()
                      if Graph.edges[edge]["relation_type"] == name])

    # One row per edge end point: rows 2k and 2k+1 belong to the k-th edge.
    pos = np.array([Graph.nodes[e]['pos'] for e in edges.flatten()]).reshape(-1, 2)

    # Insert a None (stored as NaN) between consecutive edges so the line is
    # broken there instead of joining unrelated end points.
    gap_positions = np.arange(2, len(pos), 2)
    xs = np.insert(pos[:, 0], gap_positions, None)
    ys = np.insert(pos[:, 1], gap_positions, None)

    return go.Scatter(
        x=xs,
        y=ys,
        name=name,
        opacity=opacity,
        line=dict(width=width, color=color),
        hoverinfo='none',
        mode='lines',
        showlegend=showlegend)

In [None]:
def group_markers(points, name, color="red", width=0.5, showlegend=False, fill="toself", opacity=0.8, textposition="bottom center", offset='default'):
    """Return a closed outline around ``points``: their convex hull, scaled about its centre.

    Args:
        points: Sequence of (x, y) coordinates. ``scipy.spatial.ConvexHull``
            raises when no hull can be built (e.g. too few or collinear points).
        name: Trace name.
        color: Line colour.
        width: Line width.
        showlegend: Whether the trace gets a legend entry.
        fill: Plotly ``fill`` mode (``"toself"`` fills the polygon, ``None`` draws the outline only).
        opacity: Trace opacity.
        textposition: Passed through to ``go.Scatter``.
        offset: Scale factor applied to the hull about its centre. With
            ``'default'`` the factor falls linearly from 1.8 as the number
            of points grows and is 1 (no enlargement) for 50 or more points.

    Returns:
        A ``go.Scatter`` trace in ``lines`` mode describing the closed polygon.
    """
    coords = np.asarray(points, dtype=float)

    # Find the convex hull of the points
    hull = spatial.ConvexHull(coords)
    vertices = coords[hull.vertices]

    # Centre of the hull vertices. Each vertex is counted exactly once; the
    # closing point is appended only after scaling so it cannot bias the mean.
    center = vertices.mean(axis=0)

    if offset == "default":
        if len(points) < 50:
            offset = len(points) * (-0.5/47) + 1.8
        else:
            offset = 1

    # Scale every vertex away from (or towards) the centre by ``offset``
    scaled = (vertices - center) * offset + center

    # Repeat the first vertex at the end to close the polygon
    outline = np.vstack([scaled, scaled[:1]])

    trace = go.Scatter(name=name, x=outline[:, 0].tolist(), y=outline[:, 1].tolist(), mode='lines',
                       line=dict(width=width, color=color),
                       showlegend=showlegend, fill=fill, opacity=opacity, textposition=textposition)
    return trace

## File configuration

In [None]:
# Load the notebook-level settings; later cells read "bgcflow_dir" from this mapping.
notebook_configuration = yaml.safe_load(Path("config.yaml").read_text())
notebook_configuration

In [None]:
# Project whose processed BGCFlow results are analysed in this notebook
project_name = "mq_saccharopolyspora"
bgcflow_dir = Path(notebook_configuration["bgcflow_dir"])
report_dir = bgcflow_dir / "data" / "processed" / project_name

# Prefixes used to locate input tables of other figures and to name this notebook's outputs.
# NOTE(review): the notebook title says "Figure 5" while outputs are written under "Figure_7" - confirm.
FIGURE = "Figure_7"
FIGURE_ARTS = "Figure_4"
FIGURE_MASH = "Figure_3"
FIGURE_BIGFAM = "Figure_4"

## Setting Up Cutoff and Inputs

In [None]:
# BiG-SCAPE cutoff, kept as a string because it is interpolated into the
# glob patterns for the input tables and into the output file names.
cutoff = "0.30"

In [None]:
# Directory holding the BiG-SCAPE tables that are globbed by cutoff below
path_bigscape = report_dir / "bigscape" / "for_cytoscape_antismash_6.1.1"

In [None]:
def _first_match(directory, pattern):
    """Return the lexicographically first path in ``directory`` matching ``pattern``.

    Sorting makes the choice deterministic when several files match.

    Raises:
        FileNotFoundError: If nothing matches, instead of an opaque IndexError.
    """
    matches = sorted(directory.glob(pattern))
    if not matches:
        raise FileNotFoundError(f"No file matching '{pattern}' found in {directory}")
    return matches[0]


# BiG-SCAPE network (edge list) and per-BGC cluster table for the chosen cutoff
df_bigscape = pd.read_csv(_first_match(path_bigscape, f"*df_network*{cutoff}*"))
df_bigscape_cluster = pd.read_csv(_first_match(path_bigscape, f"*df_cluster*{cutoff}*"))
# antiSMASH regions and BiG-SLiCE/BiG-FAM query hits from the BGCFlow report
df_antismash = pd.read_csv(report_dir / "tables/df_regions_antismash_6.1.1.csv")
df_bigfam = pd.read_csv(report_dir / "bigslice/query_as_6.1.1/query_network.csv")
# Tables exported by the notebooks of other figures
df_arts = pd.read_csv(f"assets/tables/{FIGURE_ARTS}_df_arts_hits.csv")
df_mash = pd.read_csv(f"assets/tables/{FIGURE_MASH}b_mash_hcluster.csv", index_col=0)

In [None]:
# Per-model BiG-FAM statistics, indexed by the first column of the table
bigfam_models_stats = pd.read_csv(
    Path("assets/tables") / f"{FIGURE_BIGFAM}_bigfam_models.csv",
    index_col=0,
)

## Building Edges for Networkx

In [None]:
# BGC <-> BGC edges taken from the BiG-SCAPE network table
edge_bigscape = (
    df_bigscape[['Clustername 1', 'Clustername 2']]
    .rename(columns={'Clustername 1': 'source', 'Clustername 2': 'target'})
    .assign(relation_type='bigscape_similarity')
)

In [None]:
# BGC -> most similar known cluster edges; rows with any missing field are dropped
edge_antismash = (
    df_antismash[['bgc_id', 'most_similar_known_cluster_id', 'similarity']]
    .dropna()
    .rename(columns={'bgc_id': 'source',
                     'most_similar_known_cluster_id': 'target',
                     'similarity': 'value'})
    .assign(relation_type='antismash_knownclusterblast')
)

In [None]:
# ARTS profile -> BGC edges
edge_arts = (
    df_arts[['profile', 'bgc_id']]
    .rename(columns={'profile': 'source', 'bgc_id': 'target'})
    .assign(relation_type='arts_hits')
)

In [None]:
# BGC -> BiG-SCAPE class edges
edge_bigscape_class = (
    df_bigscape_cluster[['bgc_id', 'bigscape_class']]
    .rename(columns={'bgc_id': 'source', 'bigscape_class': 'target'})
    .assign(relation_type='bigscape_class')
)

In [None]:
# BiG-FAM model -> BGC edges, restricted to the rows with rank 0
edge_bigfam = (
    df_bigfam.loc[df_bigfam['rank'] == 0, ['gcf_id', 'bgc_id']]
    .rename(columns={'gcf_id': 'source', 'bgc_id': 'target'})
    .assign(relation_type='bigfam_hits')
)

In [None]:
# Attach the phylogroup of each BGC's genome in one vectorized lookup instead
# of assigning cell by cell. A genome_id missing from df_mash still raises
# KeyError; the lookup assumes df_mash has one row per genome_id (a duplicated
# index would make the lengths differ and raise on assignment).
df_bigscape_cluster["phylogroup"] = df_mash.loc[
    df_bigscape_cluster["genome_id"].tolist(), "phylogroup"
].to_numpy()

# BGC -> phylogroup edges
edge_mash = df_bigscape_cluster.loc[:, ["bgc_id", "phylogroup"]]
edge_mash = edge_mash.rename(columns={'bgc_id' : 'source',
                                      'phylogroup' : 'target'})
edge_mash['relation_type'] = 'phylogroup'

## Build mapping categories

In [None]:
# {bgc_id: {field: text}} lookup; similarity is formatted to two decimals
# (missing values become the string "nan") and every field is cast to str.
nodemap_antismash = (
    df_antismash[['bgc_id', 'product', 'most_similar_known_cluster_description', 'similarity']]
    .set_index('bgc_id')
    .assign(similarity=lambda table: table['similarity'].map("{:.2f}".format))
    .astype(str)
    .T.to_dict()
)

In [None]:
# {MIBiG accession: {"biosyn_class": ..., "compounds": ...}} lookup
nodemap_mibig = (
    pd.read_csv(bgcflow_dir / "resources/mibig/df_mibig_bgcs.csv", index_col=0)
    [["biosyn_class", "compounds"]]
    .T.to_dict()
)

In [None]:
# Unique ids starting with "BGC" that occur on either end of a BiG-SCAPE edge
nodemap_mibig_bigscape = list({
    cluster
    for column in (edge_bigscape.source, edge_bigscape.target)
    for cluster in column
    if cluster.startswith("BGC")
})

In [None]:
# {profile: {"hits_type": ..., "function": ..., "description": ...}} lookup with
# missing values replaced by "".
# NOTE(review): if a profile occurs in several rows of df_arts, only one entry per
# profile can survive in the resulting dict - confirm the rows agree on these fields.
nodemap_arts = df_arts.set_index("profile").fillna("").loc[:, ["hits_type", "function", "description"]].T.to_dict()

In [None]:
# {model id: {statistic: text}} lookup built from the BiG-FAM model statistics
nodemap_bigfam = (
    bigfam_models_stats[[
        "top_chemical_class",
        "top_chemical_class_proportion",
        "top_chemical_subclass",
        "top_chemical_subclass_proportion",
        "H-index",
        "richness",
        "top_taxa",
        "top_taxa_proportion",
    ]]
    .astype(str)
    .T.to_dict()
)

In [None]:
# Unique identifiers per data source, used later to decide the category of a node.
arts_map = list(df_arts["profile"].unique())
# Graph node names are strings, so the BiG-FAM ids are compared as strings too
bigfam_map = list(df_bigfam["gcf_id"].astype("str").unique())
antismash_map = list(df_antismash["bgc_id"].unique())
phylogroup_map = list(df_mash["phylogroup"].unique())

## Build Network

In [None]:
# Stack the edge tables and cast everything to str. Only the antiSMASH edges
# carry a 'value', so that attribute is the string "nan" on all other edges.
# The bigscape_class and phylogroup edge tables are not part of this graph.
df = pd.concat([edge_antismash, edge_bigscape, edge_bigfam, edge_arts]).astype(str)
G = nx.from_pandas_edgelist(
    df,
    source='source',
    target='target',
    edge_attr=['relation_type', 'value'],
)

## Filtering
### Remove Unreliable BiG-FAM models
This part requires analysis from [FigS12_query_bigfam.ipynb](FigS12_query_bigfam.ipynb)

In [None]:
# Drop BiG-FAM models whose top taxon accounts for at most this share of the model
bigfam_taxa_cutoff = 0.3
bigfam_filter = bigfam_models_stats.loc[bigfam_models_stats["top_taxa_proportion"] <= bigfam_taxa_cutoff]

deleted_bigfam = []
for n in bigfam_filter.index:
    try:
        G.remove_node(str(n))
    except nx.NetworkXError as e:
        # The model is not in the graph: report it and carry on
        print(e)
    else:
        deleted_bigfam.append(n)

# Comma-separated list of the removed models, reused in the figure caption
deleted_bigfam = ', '.join(map(str, deleted_bigfam))

### Network Annotation

In [None]:
# position nodes
pos = nx.nx_agraph.graphviz_layout(G)

# Build the lookup structures once so that each node costs a constant-time
# membership test instead of a list scan / re-indexing of edge_mash.
arts_nodes = set(arts_map)
bigfam_nodes = set(bigfam_map)
antismash_nodes = set(antismash_map)
mibig_bigscape_nodes = set(nodemap_mibig_bigscape)
phylogroup_by_bgc = edge_mash.set_index("source")["target"]

for n, p in pos.items():
    # Reset the category for every node: without this a node matching no
    # branch would silently inherit the category of the previous node.
    node_trace = "Unclassified"
    G.nodes[n]['pos'] = p
    G.nodes[n]['text'] = n
    if n in arts_nodes:
        node_trace = 'ARTS model'
        G.nodes[n]['text'] = n + "<br>" + "<br>".join(nodemap_arts[n].values())
    elif n in bigfam_nodes:
        node_trace = 'BiG-FAM model'
        try:
            # nodemap_bigfam is keyed by the integer model id
            G.nodes[n]['text'] = n + "<br>" + "<br>".join(nodemap_bigfam[int(n)].values())
        except KeyError:
            print(f"Node {n} not found in bigfam_mapping")
    elif n in antismash_nodes:
        node_trace = "BGC"
        try:
            G.nodes[n]['text'] = n + "<br>" + "<br>".join(nodemap_antismash[n].values())
            # A missing similarity parses to NaN, fails every comparison and
            # therefore leaves the plain "BGC" category in place.
            similarity = float(nodemap_antismash[n]['similarity'])
            if similarity < 0.4:
                node_trace = "BGC < 40% similarity"
            elif similarity < 0.8:
                node_trace = "BGC < 80% similarity"
            elif similarity >= 0.8:
                node_trace = "BGC >= 80% similarity"
        except TypeError as e:
            print(n, e)
        if 'ranthipeptide' in G.nodes[n]['text']:
            node_trace = 'Ranthipeptide' + " " + node_trace
    elif n.startswith("BGC"):
        node_trace = "MIBIG (KnownClusterBlast)"
        if n in mibig_bigscape_nodes:
            node_trace = "MIBIG (BiG-SCAPE)"
        try:
            G.nodes[n]['text'] = n + "<br>" + "<br>".join(nodemap_mibig[n].values())
        except KeyError:
            # No MIBiG metadata for this accession: keep the bare id as text
            pass
    else:
        # Make unmatched nodes visible; no trace is defined for this category
        print(f"Node {n} does not match any known category")
    if n in phylogroup_by_bgc.index:
        # Prefix the hover text with the phylogroup of the BGC's genome
        phylogroup = phylogroup_by_bgc.loc[n]
        G.nodes[n]['text'] = "<br>".join([phylogroup, G.nodes[n]['text']])
    G.nodes[n]['node_trace'] = node_trace

## Define visualizations

In [None]:
# Line style per edge relation_type (colour and width).
# NOTE(review): only 'color' is read by the plotting cells in this notebook; the
# 'width' entries and the 'phylogroup' style are not used by them - confirm intent.
edge_annotation_map = {'bigscape_similarity' : {'color':'black',
                                                'width':0.5}, 
                       'antismash_knownclusterblast': {'color':'blue', 
                                                       'width':0.5}, 
                       'arts_hits': {'color':'red', 
                                     'width':0.5}, 
                       'bigfam_hits': {'color':'orange', 
                                       'width':0.5},
                       'phylogroup' : {'color' : 'black',
                                     'width' : 0.5}}

# Marker style per node category; the keys match the 'node_trace' values assigned
# during network annotation (with a "Ranthipeptide " prefixed variant of every BGC category).
node_annotation_map = {'ARTS model' : {'color':'red',
                                       'node_symbol' : 'star'},
                       'BiG-FAM model' : {'color':'orange',
                                          'node_symbol' : 'triangle-up'},
                       'MIBIG (KnownClusterBlast)' : {'color':'green',
                                                      'node_symbol' : 'square-dot'},
                       'MIBIG (BiG-SCAPE)' : {'color':'blue',
                                              'node_symbol' : 'square'},
                       "BGC >= 80% similarity" : {'color':'blue',
                                                  'node_symbol' : 'circle'},
                       "BGC < 80% similarity" : {'color':'#0077b6',
                                                 'node_symbol' : 'circle'},
                       "BGC < 40% similarity" : {'color':'#90e0ef',
                                                 'node_symbol' : 'circle'},
                       'BGC' : {'color':'#90e0ef',
                                          'node_symbol' : 'circle'},
                       "Ranthipeptide BGC >= 80% similarity" : {'color':'blue',
                                                  'node_symbol' : 'circle'},
                       "Ranthipeptide BGC < 80% similarity" : {'color':'#0077b6',
                                                 'node_symbol' : 'circle'},
                       "Ranthipeptide BGC < 40% similarity" : {'color':'#90e0ef',
                                                 'node_symbol' : 'circle'},
                       'Ranthipeptide BGC' : {'color':'#90e0ef',
                                          'node_symbol' : 'circle'},
                       'phylogroup' : {'color' : 'black',
                                       'node_symbol' : 'star'}
                      }

## Draw Target Box

In [None]:
# Corners of a 400 x 400 box (in layout units) centred on the TIGR03997 node
x, y = G.nodes['TIGR03997']['pos']
x1, x2 = x - 200, x + 200
y1, y2 = y - 200, y + 200

# Figure 5D

In [None]:
# Collects the Plotly traces of the overview panel in drawing order
traces = []

In [None]:
showlegend = False
color = "grey"

# drop all ARTS
# Work on a copy so the ARTS models stay in G; components are then computed
# without the links that only exist through an ARTS model.
G_arts = G.copy()
for n in df_arts.profile.unique():
    try:
        G_arts.remove_node(str(n))
    except nx.NetworkXError as e:
        print(e)

# Outline every connected component that has more than two members.
# The hull offset is left at its size-dependent default (an explicit 1.5 was tried before).
for num, query in enumerate(nx.connected_components(G_arts)):
    if len(query) <= 2:
        continue
    points = [G.nodes[n]['pos'] for n in query]
    traces.append(group_markers(points, str(num), color="#2a9d8f",
                                width=1, showlegend=showlegend, fill='toself',
                                opacity=0.8))

In [None]:
# Edge traces are appended before the node traces
for e in ('bigscape_similarity', 'arts_hits', 'bigfam_hits', "antismash_knownclusterblast"):
    traces.append(create_edge_trace(G, e, showlegend=True, color=edge_annotation_map[e]['color']))

# One node trace per category; Ranthipeptide categories are emphasised with a
# higher opacity and a thicker outline, all others are faded.
for trace, style in node_annotation_map.items():
    is_ranthipeptide = trace.startswith("Ranthipeptide")
    traces.append(create_node_trace(
        G, trace, style['color'],
        showtextlabel=False,
        nodesymbol=style['node_symbol'],
        nodeopacity=0.8 if is_ranthipeptide else 0.2,
        showlegend=True,
        linecolor="black",
        linewidth=1 if is_ranthipeptide else 0.5,
        nodesize=8,
        textposition="top left",
    ))
    

In [None]:
# Both axes share the same frame-only styling (no grid, zero line or tick labels)
axis_style = dict(showgrid=False, zeroline=False, showticklabels=False,
                  linecolor='black', mirror=True, linewidth=1)
fig5D = go.Figure(
    data=traces,
    layout=go.Layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='white',
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=axis_style,
        yaxis=axis_style,
        width=600,
        height=600,
    ),
)

# Interactive HTML export of the overview panel
outfile = Path("assets/figures") / FIGURE / f"{FIGURE}D_{cutoff}.html"
outfile.parent.mkdir(exist_ok=True, parents=True)
fig5d_html = fig5D.update_layout(width=600, height=600)
fig5d_html.write_html(outfile)

In [None]:
# Mark the region around TIGR03997 with a red rectangle
square = [x1, y1, x2, y2]
box_x0, box_y0, box_x1, box_y1 = square
fig5D.add_shape(
    type="rect",
    x0=box_x0, y0=box_y0, x1=box_x1, y1=box_y1,
    line=dict(color="Red", width=2),
)

# Small font for the static export
fig5D.update_layout(width=600, height=600, font=dict(size=8))

In [None]:
# Static SVG export of the overview panel
outfile = Path("assets/figures") / FIGURE / f"{FIGURE}D_{cutoff}.svg"
outfile.parent.mkdir(parents=True, exist_ok=True)
fig5D.write_image(outfile)

# Figure 5B

In [None]:
# {bgc_id: known compounds of its family}. The column name is derived from
# `cutoff` so it stays in sync with the tables that were loaded for that cutoff.
nodemap_bigscape_cluster = df_bigscape_cluster.set_index("bgc_id").loc[:, f"fam_known_compounds_{cutoff}"].to_dict()

In [None]:
## Correct labels
# Replace the hover text by short labels for the zoomed panel:
#   - nodes whose category contains `keyword`: the known compounds of their
#     family, shown only on the first node carrying that label;
#   - MIBiG (BiG-SCAPE), BiG-FAM and ARTS nodes: their own id;
#   - everything else: no label.
keyword = 'Ranthipeptide'
container = []
for n, attrs in G.nodes(data=True):
    category = attrs['node_trace']
    if keyword in category:
        text = nodemap_bigscape_cluster[n]
        already_shown = text in container
        if not already_shown:
            container.append(text)
        attrs['text'] = "" if already_shown else text
    elif category in ('MIBIG (BiG-SCAPE)', 'BiG-FAM model', 'ARTS model'):
        attrs['text'] = n
    else:
        attrs['text'] = ""

In [None]:
# Start a fresh trace list for the zoomed panel
traces = []

In [None]:
# Edge traces (thicker lines for the zoomed panel) are appended before the node traces
for e in ('bigscape_similarity', 'arts_hits', 'bigfam_hits', "antismash_knownclusterblast"):
    traces.append(create_edge_trace(G, e, showlegend=True, width=3,
                                    color=edge_annotation_map[e]['color']))

# One labelled node trace per category; categories starting with `keyword`
# or "ARTS" are emphasised.
for trace, style in node_annotation_map.items():
    emphasised = trace.startswith(keyword) or trace.startswith("ARTS")
    traces.append(create_node_trace(
        G, trace, style['color'],
        showtextlabel=True,
        nodesymbol=style['node_symbol'],
        nodeopacity=0.8 if emphasised else 0.5,
        showlegend=True,
        linecolor="black",
        linewidth=2 if emphasised else 0.5,
        nodesize=25,
        textposition="middle right",
    ))
    

In [None]:
showlegend = False
color = "grey"

# drop all ARTS
# Work on a copy so the ARTS models stay in G
G_arts = G.copy()
for n in df_arts.profile.unique():
    try:
        G_arts.remove_node(str(n))
    except nx.NetworkXError as e:
        print(e)

# Unfilled blue outline around every connected component with more than two members
for num, query in enumerate(nx.connected_components(G_arts)):
    if len(query) <= 2:
        continue
    points = [G.nodes[n]['pos'] for n in query]
    traces.append(group_markers(points, str(num), color="blue", width=1,
                                showlegend=showlegend, fill=None, opacity=0.8))

In [None]:
# Both axes share the same frame-only styling (no grid, zero line or tick labels)
axis_style = dict(showgrid=False, zeroline=False, showticklabels=False,
                  linecolor='black', mirror=True, linewidth=1)
fig5B = go.Figure(
    data=traces,
    layout=go.Layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='white',
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=axis_style,
        yaxis=axis_style,
        width=600,
        height=600,
    ),
)

In [None]:
# Restrict both axes to the box around TIGR03997 so the panel shows only that region
square = [x1, y1, x2, y2]

zoom_axis = dict(showgrid=False, zeroline=False, showticklabels=False,
                 linecolor='black', mirror=True, linewidth=1)
fig5B.update_layout(xaxis=dict(zoom_axis, range=[square[0], square[2]]),
                    yaxis=dict(zoom_axis, range=[square[1], square[3]]))

In [None]:
# Static SVG export of the zoomed panel (without legend)
outfile = Path("assets/figures") / FIGURE / f"{FIGURE}B_{cutoff}.svg"
outfile.parent.mkdir(parents=True, exist_ok=True)
fig5B.write_image(outfile)

In [None]:
# Second export of the same panel, this time with the legend switched on
fig5B.update_layout(showlegend=True)
outfile = Path("assets/figures") / FIGURE / f"{FIGURE}B_{cutoff}_legend.svg"
outfile.parent.mkdir(parents=True, exist_ok=True)
fig5B.write_image(outfile)

# Figure 5 Final

In [None]:
from svgutils.compose import *
from svgutils.compose import Figure

In [None]:
# Compose the final figure: the zoomed panel at full scale, with the overview
# panel scaled to 30% and placed at the top-left corner.
figure_dir = f"assets/figures/{FIGURE}"
panel_zoom = Panel(SVG(f"{figure_dir}/{FIGURE}B_{cutoff}.svg").scale(1).move(20, 0))
panel_overview = Panel(SVG(f"{figure_dir}/{FIGURE}D_{cutoff}.svg").scale(0.3).move(0, 0))
final_figure = Figure("650", "650", panel_zoom, panel_overview)

outfile = Path(figure_dir) / f"{FIGURE}_{cutoff}.svg"
outfile.parent.mkdir(parents=True, exist_ok=True)
final_figure.save(outfile)
final_figure

In [None]:
# Print the figure caption, filling in the cutoff, the removed BiG-FAM models and the taxa threshold.
# NOTE(review): the caption refers to panels (C) and (D) while this notebook exports panels B and D - confirm.
print(f"Enrichment of BiG-SCAPE similarity network (cutoff {cutoff}) with KnownClusterBLAST, BiG-FAM, and ARTS models. (C) Blue polygons depicts GCF assigned through BiG-SCAPE similarity network. (D) Blue polygons depicts connected components except through ARTS model. BiG-FAM models {deleted_bigfam} were removed because of assigned top genus in the model are below {bigfam_taxa_cutoff:.0%}")

In [None]:
## Add mycobacterium tuberculosis genome
# One-row sample sheet: the outer key becomes a column, so the frame is
# transposed on export to get one genome per row.
df_mycobacterium = {0: {'genome_id': "GCF_000195955.2", "source": "ncbi"}}
df_mycofactocin = pd.DataFrame(df_mycobacterium)

outfile = bgcflow_dir / "config" / "Mycobacterium" / "samples.csv"
outfile.parent.mkdir(parents=True, exist_ok=True)
df_mycofactocin.T.to_csv(outfile, index=False)

## Generate config file
project_config = {}

In [None]:
outfile_dir = bgcflow_dir / "config/Mycobacterium"
# Create the directory here as well so this cell does not depend on an
# earlier cell having created it (same pattern as the TIGR03997 project cell).
outfile_dir.mkdir(parents=True, exist_ok=True)

project_config = {'name': 'Mycobacterium',
                  'pep_version': '2.1.0',
                  'description': "A Mycobacterium genome for reference of Mycofactocin",
                  'sample_table': 'samples.csv',
                  'rules' : {'antismash' : True}
                 }

# Reuse the taxonomy summary and Prokka database table of the main project
for filename in ("gtdbtk.bac120.summary.tsv", "prokka-db.csv"):
    source = bgcflow_dir / "config/mq_saccharopolyspora" / filename
    destination = outfile_dir / source.name
    shutil.copy(source, destination)

# The config is serialised as JSON into a .yaml file.
# NOTE(review): this relies on the consumer's YAML parser accepting JSON syntax - confirm.
with open(outfile_dir / "project_config.yaml", "w") as f:
    json.dump(project_config, f, indent=2)

In [None]:
# Every node that shares an edge with the TIGR03997 ARTS model, then the
# antiSMASH region rows of those nodes.
query_node = "TIGR03997"
targets = [n for e in G.edges(query_node) for n in e if n != query_node]
df_TIGR03997 = df_antismash.set_index("bgc_id").loc[targets]
df_TIGR03997

In [None]:
# Add the redox-cofactor region(s) of the Mycobacterium reference genome to the
# TIGR03997-linked BGCs and mark every row as coming from bgcflow.
df_mycobacterium_region = pd.read_csv(bgcflow_dir / "data/processed/Mycobacterium/tables/df_regions_antismash_6.1.1.csv")
is_redox_cofactor = df_mycobacterium_region['product'] == "['redox-cofactor']"
mycofactocin = df_mycobacterium_region[is_redox_cofactor]
df_TIGR03997_with_reference = pd.concat(
    [df_TIGR03997, mycofactocin.set_index("bgc_id")]
).assign(source="bgcflow")
df_TIGR03997_with_reference.shape

In [None]:
# Sample sheet of the TIGR03997 project (the bgc_id index is written as the first column)
outfile = bgcflow_dir / "config" / "TIGR03997_with_reference" / "samples.csv"
outfile.parent.mkdir(exist_ok=True, parents=True)
df_TIGR03997_with_reference.to_csv(outfile)

In [None]:
outfile_dir = bgcflow_dir / "config" / "TIGR03997_with_reference"
outfile_dir.mkdir(exist_ok=True, parents=True)

# Rule switches of the project: only clinker and mmseqs2 are enabled
rules = {"bigslice": False,
         "bigscape": False,
         "query-bigslice": False,
         "clinker": True,
         "interproscan": False,
         "mmseqs2": True}
project_config = {
    'name': 'TIGR03997_with_reference',
    'schema': 'BGC',
    'pep_version': '2.1.0',
    'description': "A selection of putative BGCs with Mycofactocin signature and references",
    'sample_table': 'samples.csv',
    'rules': rules,
}

# Reuse the taxonomy summary of the main project
source = bgcflow_dir / "config/mq_saccharopolyspora/gtdbtk.bac120.summary.tsv"
destination = outfile_dir / source.name
shutil.copy(source, destination)

# The config is serialised as JSON into a .yaml file
with open(outfile_dir / "project_config.yaml", "w") as f:
    json.dump(project_config, f, indent=2)

In [None]:
# Styles for the MMseqs2 network; these assignments replace the edge and node
# style maps defined earlier in the notebook.
edge_annotation_map = {'MMSEQS2' : {'color':'blue',
                                     'width':1}
                      }

node_annotation_map = {'CDS' : {'color':'blue',
                                'node_symbol' : 'circle'},
                       'CDS_precursor_peptide': {'color':'red',
                                                 'node_symbol' : 'circle'},
                       'CDS_Mft': {'color':'green',
                                    'node_symbol' : 'circle'},
                      }

In [None]:
# Start a fresh trace list for the MMseqs2 figure
traces = []

In [None]:
# NOTE(review): at this point G is still the graph built earlier in the notebook;
# the MMseqs2 edge list is only read into G in a later cell, and nothing visible
# here assigns the 'MMSEQS2' / 'CDS*' categories - confirm the intended cell order.
for e, edge_style in edge_annotation_map.items():
    traces.append(create_edge_trace(G, e, showlegend=True, color=edge_style['color']))

for trace, style in node_annotation_map.items():
    traces.append(create_node_trace(
        G, trace, style['color'],
        showtextlabel=False,
        nodesymbol=style['node_symbol'],
        nodeopacity=0.8,
        showlegend=True,
        linecolor="black",
        linewidth=0.5,
        nodesize=8,
        textposition="top left",
    ))
    

In [None]:
# Both axes share the same frame-only styling (no grid, zero line or tick labels)
axis_style = dict(showgrid=False, zeroline=False, showticklabels=False,
                  linecolor='black', mirror=True, linewidth=1)
fig = go.Figure(
    data=traces,
    layout=go.Layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='white',
        showlegend=True,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=axis_style,
        yaxis=axis_style,
        width=800,
        height=600,
    ),
)
fig

In [None]:
# Read the MMseqs2 cluster table as an edge list. This rebinds G, replacing
# the graph used in the cells above.
G = nx.read_edgelist(bgcflow_dir / "data/interim/mmseqs2/TIGR03997_with_reference/TIGR03997_with_reference_6.1.1_cluster.tsv")

# Connected components of G, largest first
components = nx.connected_components(G)
sorted_components = sorted(components, key=len, reverse=True)

In [None]:
# One row per member of every component; components are numbered from
# cog_01 (the largest) onwards.
result = {"feature_id": [], "cluster_id" : [], "cluster_n" : [], "locus_tag" : []}
for num, g in enumerate(sorted_components):
    size = len(g)
    members = list(g)
    result['feature_id'] += [f"cds-{item}" for item in members]
    result['cluster_id'] += [f"cog_{num+1:02d}"] * size
    result['cluster_n'] += [size] * size
    result['locus_tag'] += members
df_TIGR03997_mmseqs2 = pd.DataFrame(result)
df_TIGR03997_mmseqs2.to_csv("assets/tables/mmseqs_cog.csv", index=False)
df_TIGR03997_mmseqs2.cluster_id == "cog_01"

In [None]:
# Inspect the annotations of the CDSs of one cluster (cog_12 is hard-coded here)
df_cds = pd.read_parquet(report_dir / "data_warehouse" / "6.1.1" / "cdss.parquet")
cog_members = df_TIGR03997_mmseqs2.loc[df_TIGR03997_mmseqs2.cluster_id == "cog_12", "locus_tag"]
cog = df_cds.set_index("locus_tag").loc[cog_members]
# Only the last expression is displayed by the notebook
cog.gene_function.astype(str).unique()
cog['product'].astype(str).unique()

In [None]:
# Annotation assigned to each cluster id. The dict is not bound to a name; it
# is only displayed as the cell output.
{'cog_01' :'SMCOG1217_NADH:flavin oxidoreductase/NADH oxidase',
 'cog_02' : 'SMCOG1001:short-chain dehydrogenase/reductase SDR',
 'cog_03' : 'Mycofactocin precursor',
 'cog_04' : 'Ferredoxin-like protein FixX / NapF',
 'cog_05' : 'Thiamine thiazole synthase / Electron transfer flavoprotein-ubiquinone oxidoreductase / SMCOG1119:halogenase',
 'cog_06' : 'Electron transfer flavoprotein subunit alpha, Caffeyl-CoA reductase-Etf complex subunit CarE, Protein FixB',
 'cog_07' : 'Mycofactocin_RRE',
 'cog_08' : 'Putative mycofactocin biosynthesis glycosyltransferase MftF',
 'cog_09' : 'FixA',
 'cog_10' : 'TIGR04085 / Fer4_12 / SPASM / Putative mycofactocin radical SAM maturase MftC',
 'cog_11' : 'Universal stress protein',
 'cog_12' : 'DNA polymerase IV 1'
}

In [None]:
# Rebind df_cds to the CDS table of a different data warehouse, addressed
# relative to the notebook's working directory.
df_cds = pd.read_parquet("../../Mycofactocin_genomes/data_warehouse/6.1.1/cdss.parquet")

In [None]:
# Locus tag, name and product of every CDS in region NC_000962.3.region003
df_cds.loc[df_cds["region_id"] == 'NC_000962.3.region003', ["locus_tag", "name", "product"]]