# ARTS2
Summary of [ARTS2](https://arts.ziemertlab.com) results from project: `[{{ project().name }}]`

## Description
Antibiotic Resistant Target Seeker

In [None]:
import pandas as pd
from pathlib import Path
import plotly.graph_objects as go

import altair as alt
import ast
import yaml

import networkx as nx
#! pip install pygraphviz

import warnings
warnings.filterwarnings('ignore')

from IPython.display import display, Markdown, HTML
from itables import to_html_datatable as DT
import itables.options as opt
# Global itables options; they apply to every table rendered with DT() below.
# The CSS shrinks the font of table cells and headers.
opt.css = """
.itables table td { font-style: italic; font-size: .8em;}
.itables table th { font-style: oblique; font-size: .8em; }
"""
# CSS classes attached to the rendered tables.
opt.classes = ["display", "compact"]
# Page-length choices — presumably the "rows per page" menu of the table widget; confirm against itables docs.
opt.lengthMenu = [5, 10, 20, 50, 100, 200, 500]

## File configuration

In [None]:
# Read the notebook settings from the YAML file in the working directory and
# echo them so the rendered notebook records which configuration was used.
notebook_configuration = yaml.safe_load(Path("config.yaml").read_text())
notebook_configuration

In [None]:
# Resolve the project report directory inside the BGCFlow checkout and load
# the ARTS2 and BiG-SCAPE tables used throughout the notebook.
bgcflow_dir = Path(notebook_configuration["bgcflow_dir"])
project_name = "mq_saccharopolyspora"
report_dir = bgcflow_dir / f"data/processed/{project_name}"
FIGURE = "Figure_4"  # prefix of every table/figure/data file written below

# ARTS2 summary table; its rows are parsed into individual hits further down.
df_arts = pd.read_csv(report_dir / "tables/df_arts_as-6.1.1.csv")

# BiG-SCAPE cluster table. The file name carries a variable prefix, so it is
# located with a glob; sorting makes the choice deterministic if several match.
bigscape_dir = report_dir / "bigscape/for_cytoscape_antismash_6.1.1"
cluster_tables = sorted(bigscape_dir.glob("*df_clusters_0.30.csv"))
if not cluster_tables:
    raise FileNotFoundError(
        f"No '*df_clusters_0.30.csv' table found in {bigscape_dir}"
    )
bgc_path = cluster_tables[0]

df_bgcs = pd.read_csv(bgc_path, index_col=0)

In [None]:
# Taxonomy metadata; the first CSV column becomes the index, which is later
# looked up by genome_id to fetch the "Organism" label for BGC nodes.
taxonomy_table = report_dir / "tables/df_gtdb_meta.csv"
df_tax = pd.read_csv(taxonomy_table, index_col=0)

In [None]:
# Collect the index labels of ARTS2 rows whose BGC type mentions the keyword,
# then report how many rows matched and their summed "Core hits".
keyword = "lanthi"
lanthi_hits = [
    row_label
    for row_label, bgc_type in df_arts["Type"].items()
    if keyword in bgc_type
]
lanthi_rows = df_arts.loc[lanthi_hits]
print(f"There are {len(lanthi_rows)} BGCs matched with keyword: {keyword}, with total of {lanthi_rows['Core hits'].sum()} ARTS2 core hits")

In [None]:
# Flatten the "Genelist" column of the ARTS2 table into one row per hit.
# Each Genelist cell is the string repr of a list of 7-item entries:
# (arts_id, profile, start, stop, hits_type, description, function).
hit_columns = [
    'arts_id', 'genome_id', 'bgc_id', 'bgc_type', 'profile',
    'start', 'stop', 'hits_type', 'description', 'function',
]
hit_records = []
for _, arts_row in df_arts.iterrows():
    for hit in ast.literal_eval(arts_row["Genelist"]):
        if len(hit) != 7:
            raise ValueError(
                f"Expected 7 fields per ARTS2 hit for {arts_row['bgc_id']}, got {len(hit)}: {hit!r}"
            )
        hit_records.append({
            'arts_id': int(hit[0]),
            'genome_id': arts_row["genome_id"],
            'bgc_id': arts_row["bgc_id"],
            'bgc_type': arts_row["Type"],
            'profile': hit[1],
            'start': int(hit[2]),
            'stop': int(hit[3]),
            'hits_type': hit[4],
            'description': hit[5],
            # Hits without a functional category are labelled as resistance models.
            'function': "ResModel" if hit[6] == "N/A" else hit[6],
        })

# Build the frame once from the records; the index starts at 1 so the exported
# CSV keeps the same row labels as before.
df_hits = pd.DataFrame(
    hit_records,
    columns=hit_columns,
    index=pd.RangeIndex(1, len(hit_records) + 1),
)

# Create the output folder first so a fresh checkout does not fail on export.
hits_table = Path(f"assets/tables/{FIGURE}_df_arts_hits.csv")
hits_table.parent.mkdir(parents=True, exist_ok=True)
df_hits.to_csv(hits_table)

display(HTML(DT(df_hits, columnDefs=[{"className": "dt-center", "targets": "_all"}],)))

In [None]:
# One-sentence overview: distinct BGCs, genomes and ARTS2 profiles among the hits.
n_bgcs = df_hits["bgc_id"].nunique(dropna=False)
n_genomes = df_hits["genome_id"].nunique(dropna=False)
n_profiles = df_hits["profile"].nunique(dropna=False)
summary_report = f"A total of {n_bgcs} BGCs from {n_genomes} genomes have hits with {n_profiles} ARTS2 target."
summary_report

In [None]:
# Row labels of hits whose BGC type mentions "lanthi", followed by the number
# of distinct BGCs and distinct ARTS2 profiles among those rows.
lanthi_hits = [
    row_label
    for row_label, bgc_type in df_hits["bgc_type"].items()
    if 'lanthi' in bgc_type
]
lanthi_rows = df_hits.loc[lanthi_hits]
lanthipeptide_report = f"{lanthi_rows.bgc_id.nunique(dropna=False)}, {lanthi_rows.profile.nunique(dropna=False)}"
lanthipeptide_report

In [None]:
# Print the number of hits per BGC type containing "lanthipeptide" and show
# their total as the cell output.
bgc_type_counts = df_hits.bgc_type.value_counts().to_dict()
ctr = 0
for item, n_hits in bgc_type_counts.items():
    if 'lanthipeptide' not in item:
        continue
    print(item, n_hits)
    ctr += n_hits
ctr

In [None]:
# Palette for the ARTS2 functional categories, assigned in order of frequency.
color = ["#264653", "#287271", "#2a9d8f", "#8ab17d", "#e9c46a", "#f4a261", "#ee8959", "#e76f51"]
function_map = df_hits.function.value_counts().to_dict()
arts_function_color_map = {}
ctr = 0
for item in function_map:
    if item == "Unclassified":
        # Fixed grey; does not consume a palette slot.
        arts_function_color_map[item] = 'grey'
        continue
    if ctr >= len(color):
        # Palette exhausted: remaining categories share grey.
        arts_function_color_map[item] = 'grey'
    elif item == "ResModel":
        # Fixed red, but it still advances the palette position.
        arts_function_color_map[item] = 'red'
    else:
        arts_function_color_map[item] = color[ctr]
    ctr += 1

# Per-node attribute lookups. Both node kinds carry the same six keys so the
# graph has uniform attributes; keys of the other kind are filled with None.
arts_columns = ["profile", "description", "function"]
bgc_columns = ["bgc_id", "bgc_type", "genome_id"]

arts_node_table = (
    df_hits[arts_columns]
    .drop_duplicates()
    .set_index("profile", drop=False)
    .assign(**{column: None for column in bgc_columns})
)
bgc_node_table = (
    df_hits[bgc_columns]
    .drop_duplicates()
    .set_index("bgc_id", drop=False)
    .assign(**{column: None for column in arts_columns})
)

# {node id: {attribute: value}}
arts_node_mapping = arts_node_table.T.to_dict()
bgc_id_mapping = bgc_node_table.T.to_dict()

In [None]:
# Build the BGC <-> ARTS2 profile graph (one edge per hit) and annotate nodes.
color_map = []
G = nx.from_pandas_edgelist(df_hits, source='bgc_id', target='profile')

# Graphviz layout via networkx's pygraphviz bridge; the coordinates are
# attached to the nodes in a later cell.
pos = nx.nx_agraph.graphviz_layout(G)

for g in G.nodes:
    node_attrs = G.nodes[g]
    # annotate ARTS model
    if g in arts_node_mapping:
        node_attrs.update(arts_node_mapping[g])
        # Use a dedicated name so the global `color` palette list is not
        # overwritten with a single colour string.
        node_color = arts_function_color_map[node_attrs['function']]
        color_map.append(node_color)
        node_attrs["color"] = node_color
        node_attrs["node_type"] = node_attrs["function"]
        node_attrs["text"] = f"{node_attrs['profile']}<br>{node_attrs['function']}<br>{node_attrs['description']}"
    # annotate BGCs
    elif g in bgc_id_mapping:
        node_attrs.update(bgc_id_mapping[g])
        node_color = "blue"
        color_map.append(node_color)
        node_attrs["color"] = node_color
        node_attrs["node_type"] = "BGC"
        # Hover text combines BGC id/type, the BiG-SCAPE family label,
        # the genome id and the organism name from the taxonomy table.
        taxonomy = df_tax.loc[node_attrs['genome_id'], "Organism"]
        GCF = df_bgcs.loc[g, "fam_known_compounds_0.30"]
        node_attrs["text"] = f"{node_attrs['bgc_id']}<br>{node_attrs['bgc_type']}<br>{GCF}<br>{node_attrs['genome_id']}<br>{taxonomy}"

In [None]:
# Store each node's layout coordinates on the node itself under 'pos'.
nx.set_node_attributes(G, pos, 'pos')

In [None]:
# Collect all edge segments in a single line trace; a None after each pair of
# endpoints breaks the line so separate edges are not joined.
edge_x, edge_y = [], []
for source, target in G.edges():
    x0, y0 = G.nodes[source]['pos']
    x1, y1 = G.nodes[target]['pos']
    edge_x.extend((x0, x1, None))
    edge_y.extend((y0, y1, None))

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    name="ARTS2 hit",
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines',
)

In [None]:
def create_node_trace(G, node_type, shape="circle", opacity=0.8):
    """Build a plotly marker trace for all nodes of one ``node_type``.

    Args:
        G: graph whose nodes carry 'node_type', 'pos' (an x/y pair),
            'color' and 'text' attributes.
        node_type: value of the 'node_type' attribute to select; also used
            as the trace (legend) name.
        shape: plotly marker symbol used for the selected nodes.
        opacity: opacity of the trace.

    Returns:
        A ``go.Scatter`` in 'markers' mode with one point per selected node,
        coloured by the node's 'color' and showing its 'text' on hover.

    Raises:
        KeyError: if a node lacks 'node_type', or a selected node lacks
            'pos', 'color' or 'text'.
    """
    node_x, node_y, node_color, node_text = [], [], [], []
    # A single pass keeps coordinates, colours and hover texts aligned.
    for _, attrs in G.nodes(data=True):
        if attrs['node_type'] != node_type:
            continue
        x, y = attrs['pos']
        node_x.append(x)
        node_y.append(y)
        # Only the node's own colour goes into marker.color (no neighbour
        # counts appended after the colours).
        node_color.append(attrs['color'])
        node_text.append(attrs['text'])

    return go.Scatter(
        x=node_x,
        y=node_y,
        name=node_type,
        text=node_text,
        mode='markers',
        hoverinfo='text',
        marker_symbol=shape,
        opacity=opacity,
        marker=dict(
            showscale=False,
            color=node_color,
            size=10,
            line=dict(width=0)))

In [None]:
# One marker trace per ARTS2 functional category plus one for the BGC nodes,
# drawn on top of the edge trace. BGCs are circles, resistance models stars,
# every other category squares.
node_traces = [*function_map.keys(), "BGC"]
shape_by_type = {"BGC": "circle", "ResModel": "star"}
traces = [edge_trace] + [
    create_node_trace(G, trace_name, shape=shape_by_type.get(trace_name, "square"))
    for trace_name in node_traces
]

In [None]:
# Network figure: transparent background, hidden axes and a horizontal legend
# anchored at the bottom-left corner.
transparent = 'rgba(0,0,0,0)'
hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
network_layout = go.Layout(
    paper_bgcolor=transparent,
    plot_bgcolor=transparent,
    showlegend=True,
    hovermode='closest',
    margin=dict(b=20, l=5, r=5, t=40),
    xaxis=hidden_axis,
    yaxis=hidden_axis,
    width=750,
    height=900,
    legend=dict(
        orientation="h",
        yanchor="top",
        y=0,
        xanchor="left",
        x=0,
    ),
)
fig = go.Figure(data=traces, layout=network_layout)
fig

In [None]:
# Save the network as an SVG panel, creating the figure folder if needed.
figure_dir = Path(f"assets/figures/{FIGURE}")
figure_dir.mkdir(parents=True, exist_ok=True)
outfile = figure_dir / f"{FIGURE}c.svg"
fig.write_image(outfile)

In [None]:
# Export the annotated network as GraphML. Tuple and None attribute values are
# stringified before export, but on a copy: G itself keeps its original values
# (e.g. 'pos' stays an (x, y) pair) so the plotting cells can be re-run.
G_export = G.copy()
for node in G_export.nodes:
    node_attrs = G_export.nodes[node]
    for attrib, value in list(node_attrs.items()):
        if value is None or isinstance(value, tuple):
            node_attrs[attrib] = str(value)

output_path = Path(f"assets/data/{FIGURE}_arts.graphml")
output_path.parent.mkdir(exist_ok=True, parents=True)
nx.write_graphml(G_export, output_path)

## Figure DESCRIPTION

In [None]:
# Headline numbers quoted in the figure description below.
n_bgc_hits = df_hits["bgc_id"].nunique(dropna=False)
n_arts_profile = len(df_hits["arts_id"])
n_arts_profile_unique = df_hits["profile"].nunique(dropna=False)

# Load region table and keep only the BGCs that have an ARTS2 hit.
antismash_regions = pd.read_csv(report_dir / "tables/df_regions_antismash_6.1.1.csv")
df_antismash = antismash_regions[antismash_regions.bgc_id.isin(df_hits.bgc_id)]
# Despite its name this is a count: regions with similarity below 0.8.
df_antismash_unknown = int((df_antismash.similarity < 0.8).sum())

# Distinct profiles split into resistance models and all other (core) models.
is_resmodel = df_hits.function == 'ResModel'
resmodel_hits = df_hits.loc[is_resmodel, "profile"].nunique(dropna=False)
core_hits = df_hits.loc[~is_resmodel, "profile"].nunique(dropna=False)

In [None]:
# How many BGCs are connected to a ResModel node?
resmodel_nodes = [
    node for node, attrs in G.nodes(data=True) if attrs['function'] == "ResModel"
]
# Neighbours of every resistance-model node (a BGC linked to several models
# appears more than once here; the count below de-duplicates).
bgc_resmodel = [
    neighbor
    for resmodel in resmodel_nodes
    for neighbor in G.neighbors(resmodel)
]
n_bgc_resmodel = len(set(bgc_resmodel))

In [None]:
# Reuse the category table exported for Fig 4a and count, for the BGCs linked
# to a resistance model, how many fall in each KnownClusterBlast category.
df_treetable = pd.read_csv("assets/tables/Figure_4a_network.csv", index_col=0)
resmodel_rows = df_treetable[df_treetable.bgc_id.isin(bgc_resmodel)]
df_treetable_resfam = resmodel_rows.known_cluster_blast.value_counts().to_dict()
df_treetable_resfam


ARTS2 detection relies on three criteria: duplication, localization within a biosynthetic gene cluster, and evidence of Horizontal Gene Transfer (HGT). We found 98 BGCs that had at least one gene meeting all three criteria and 61 others with at least one gene meeting two criteria. The remaining 64 BGCs had genes that satisfied only one of the criteria, indicating relatively lower confidence of bioactivity.

In [None]:
# First paragraph of the figure description; the counts are filled in from
# n_bgc_hits, n_arts_profile and n_arts_profile_unique computed above.
text1=f"""
Many BGCs coding for known antibiotics also contain genes assisting in self-resistance. By looking for these resistance models, ARTS2 assists in prioritizing novel target screening with potential bioactivity. We detected {n_bgc_hits} BGCs that had hits against {n_arts_profile} ({n_arts_profile_unique} unique) gene profiles from the ARTS model (Figure 4C).
"""

In [None]:
# Second paragraph of the figure description.
# .get() reports 0 instead of raising KeyError when none of the ResModel-linked
# BGCs falls in the "antiSMASH unknown" category.
n_resmodel_unknown = df_treetable_resfam.get('antiSMASH unknown', 0)
text2=f"""
The interaction network represented an overview of BGCs in proximity to the ARTS resistance gene models ({resmodel_hits} unique) or the core genes models ({core_hits} unique) from different functional categories (Figure 4C). We found that {n_bgc_resmodel} BGCs had hits against resistance genes and are more likely to have an antibiotic potential. We also noted that {n_resmodel_unknown} of the {n_bgc_resmodel} BGCs had no similarity to known clusters (either from antiSMASH KnownClusterBlast or BiG-SCAPE results), thus representing novel bioactive potential of the genus. This analysis further motivated an exploratory analysis of set of BGCs of unknown function that shared proximity to the same resistance or core gene model ({df_antismash_unknown} BGCs), as represented in the last section of mycofactocin-related BGCs of this study.
"""

In [None]:
# Show both description paragraphs (print separates them with a space).
print(text1, text2)

In [None]:
# BGCs linked to a resistance model that are unknown to both antiSMASH
# KnownClusterBlast and BiG-SCAPE.
linked_to_resmodel = df_treetable.bgc_id.isin(bgc_resmodel)
antismash_unknown = df_treetable.known_cluster_blast == "antiSMASH unknown"
bigscape_unknown = df_treetable['fam_type_0.30'] == "BiG-SCAPE unknown"
df_subset_resfam_unknown = df_treetable[linked_to_resmodel & antismash_unknown & bigscape_unknown]

In [None]:
# For the doubly-unknown, ResModel-linked BGCs: tally the closest known
# cluster description reported in the antiSMASH region table.
unknown_regions = df_antismash[df_antismash.bgc_id.isin(df_subset_resfam_unknown.bgc_id)]
unknown_regions.most_similar_known_cluster_description.value_counts()

In [None]:
# Display the ResModel-linked BGCs that are unknown to both antiSMASH and BiG-SCAPE.
df_subset_resfam_unknown

In [None]:
# NOTE(review): `df_unknown` was not defined anywhere in this notebook, so this
# cell raised NameError on a fresh kernel. It is rebuilt here with the same
# "unknown" criteria used for df_subset_resfam_unknown (antiSMASH unknown AND
# BiG-SCAPE unknown) — confirm this matches the originally intended table.
df_unknown = df_treetable[
    (df_treetable.known_cluster_blast == "antiSMASH unknown")
    & (df_treetable['fam_type_0.30'] == "BiG-SCAPE unknown")
]
# Unknown BGCs that are NOT connected to any resistance model.
df_unknown[~df_unknown.bgc_id.isin(bgc_resmodel)]

## References

<font size="2">

{% for i in project().rule_used['arts']['references'] %}
- *{{ i }}*
{% endfor %}

</font>