# autoMLST Wrapper
Summary of [autoMLST Wrapper](https://github.com/KatSteinke/automlst-simplified-wrapper) results from project: `[{{ project().name }}]`

## Description
A fork of [autoMLST](https://bitbucket.org/ziemertlab/automlst) with a simplified wrapper script bypassing additional organism selection.

In [None]:
import pandas as pd
from pathlib import Path
import yaml

import warnings
# Suppress every Python warning for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Load the rpy2 IPython extension; it provides the %%R cell magic used by the
# R plotting cells further down.
%load_ext rpy2.ipython

## File configurations

In [None]:
# Parse the notebook settings stored in config.yaml (relative to the working directory).
config_text = Path("config.yaml").read_text()
notebook_configuration = yaml.safe_load(config_text)
# Bare expression so the notebook displays the parsed configuration.
notebook_configuration

In [None]:
# Set up paths and input file
# Project and figure identifiers reused in the file names of this notebook.
project_name = "mq_saccharopolyspora"
FIGURE = "Figure_3"

# The BGCflow installation directory comes from the notebook configuration;
# the processed report of the project lives underneath it.
bgcflow_dir = Path(notebook_configuration["bgcflow_dir"])
report_dir = bgcflow_dir.joinpath(f"data/processed/{project_name}")

# Rewrite the leaf labels of the autoMLST tree so that they match the genome
# ids listed in df_genomes_tree.csv, then save the result as a new newick file.
# Only the first line of final.newick is read and rewritten.
with open(report_dir / "automlst_wrapper/final.newick", "r") as f:
    data = f.readlines()

# Candidate labels: the text before the first ":" of every comma-separated
# chunk of the tree, after all opening parentheses have been stripped.
value_to_replace = [i.split(":")[0] for i in data[0].replace("(", "").split(",")]

new_dict = {}
df = pd.read_csv(report_dir / "automlst_wrapper/df_genomes_tree.csv")
genome_ids = list(df.genome_id)
for g in genome_ids:
    # Part of the genome id before its first "." (the whole id if there is none).
    prefix = g.split(".")[0]
    # Loop over a snapshot of the list: removing items from the list that is
    # being iterated makes the loop skip the element following each match.
    for v in list(value_to_replace):
        if v.startswith(prefix):
            new_dict[v] = g
            # A label that has been assigned is not offered to later genome ids.
            value_to_replace.remove(v)

data = data[0]
for k, v in new_dict.items():
    data = data.replace(k, v)

with open(report_dir / "automlst_wrapper/final_corrected.newick", "w") as f:
    f.write(data)

# Prepare data tables
# Metadata tables taken from the BGCflow report of the project.
gtdtbk = report_dir.joinpath("tables/df_gtdb_meta.csv")
ncbi = report_dir.joinpath("tables/df_ncbi_meta.csv")

# Tables stored under assets/tables, named after the figure panel they belong to.
seqfu = Path(f"assets/tables/{FIGURE}a_df_seqfu_annotated.csv")
mash = Path(f"assets/tables/{FIGURE}b_mash_hcluster.csv")
bigscape = Path(f"assets/tables/{FIGURE}c_bigscape_class.csv")
antismash = Path(f"assets/tables/{FIGURE}c_bgcs_count.csv")

# merge data tables for tree
taxonomy_columns = ["Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species", "Organism"]

# Every table is read with its first column as the index, which is the join key.
df_mash = pd.read_csv(mash, index_col=0)
df_gtdbtk = pd.read_csv(gtdtbk, index_col=0)[taxonomy_columns]
df_seqfu = pd.read_csv(seqfu, index_col=0)["sequence_quality"]
df_ncbi = pd.read_csv(ncbi, index_col=0)["strain"]

# Inner-join the annotation tables onto the MASH table one at a time, so only
# index values present in every table are kept.
df = df_mash
for annotation in (df_gtdbtk, df_seqfu, df_ncbi):
    df = pd.concat([df, annotation], axis=1, join="inner")

## manual annotation for tree
### Rename MASH based Species Phylogroup
# df["Species_Phylogroup"] = [f"P{i + 1}" for i in df.hcluster]
# Move the index into a regular column; an unnamed index ends up as "genome_id".
df = df.reset_index().rename(columns={"index": "genome_id"})

In [None]:
# Renaming of some strains based on redundancy
# Keys are row labels of df, values are the strain names written into that row.
# NOTE(review): the labels are hard-coded, so they presumably rely on the row
# order of the merged table staying the same — confirm when inputs change.
strain_overrides = {
    9: "NRRL 30141",
    8: "DSM 44228",
    22: "DSM 40517",
    24: "DSM 40517",
}
for row_label, strain_name in strain_overrides.items():
    df.loc[row_label, "strain"] = strain_name

In [None]:
### Set up tip labels to show
# One label per row: "<phylogroup> | <genome_id> | <organism> (<strain>)", with
# the "s__Saccharopolyspora" part of the organism name shortened to "S".
tip_labels = []
for phylogroup, genome_id, organism, strain in zip(
    df["phylogroup"], df["genome_id"], df["Organism"], df["strain"]
):
    short_organism = organism.replace('s__Saccharopolyspora', 'S')
    tip_labels.append(f"{phylogroup} | {genome_id} | {short_organism} ({strain})")
df['tip_label'] = tip_labels

# save to intermediate file
df.to_csv(f"assets/tables/{FIGURE}c_ggtree.csv", index=False)

In [None]:
# Build a per-genome table with two columns: BGCs located on a contig edge and
# the remaining BGCs (total count minus the ones on a contig edge).
df_antismash = pd.read_csv(report_dir / "tables/df_antismash_6.1.1_summary.csv").set_index("genome_id")
df_bgcs_count = df_antismash[["bgcs_on_contig_edge"]].copy()
df_bgcs_count["complete_bgcs"] = df_antismash["bgcs_count"] - df_bgcs_count["bgcs_on_contig_edge"]

In [None]:
# Write the BGC count table (index included) to the path stored in `antismash`.
df_bgcs_count.to_csv(antismash)

In [None]:
# barchart on column 1
# Reload the BGC count table and reshape it to long format: one row per
# (index value, count column) pair, as read by the R bar panel.
df_antismash = pd.read_csv(antismash, index_col=0)
stacked_counts = df_antismash.stack()
df_antismash = stacked_counts.reset_index()
df_antismash = df_antismash.rename(columns={"level_1" : "bgc_status", 0 : "value"})
df_antismash.to_csv(f"assets/tables/{FIGURE}c_bgcs_count_stacked.csv", index=False)

In [None]:
# barchart on column 2
# Exactly one cluster table matching the pattern is expected in the BiG-SCAPE folder.
bigscape_dir = report_dir / "bigscape/for_cytoscape_antismash_6.1.1/"
bigscape_path = list(bigscape_dir.glob("*df_clusters_0.30.csv"))
assert len(bigscape_path) == 1

# Count the non-missing bgc_id entries per genome and BiG-SCAPE class.
cluster_table = pd.read_csv(bigscape_path[0])
class_counts = cluster_table.groupby(["genome_id", "bigscape_class"])["bgc_id"].count()
df_bigscape = class_counts.reset_index(name="value")
df_bigscape.to_csv(f"assets/tables/{FIGURE}c_bigscape_class_stacked.csv", index=False)

In [None]:
%%R
library("treeio")
library("ggtree")
library("tidyverse")
library("ggstar")
library("ggnewscale")
library("ggtreeExtra")
library("phangorn")
library("svglite")

In [None]:
# File paths as plain strings; the %%R cells import these variables with -i.
# Corrected newick tree written earlier in this notebook.
R_tree = str(report_dir / "automlst_wrapper/final_corrected.newick")
# Per-genome annotation table including the tip labels.
R_data = f"assets/tables/{FIGURE}c_ggtree.csv"
# Long-format BGC counts (on contig edge vs. complete).
R_bgc_data = f"assets/tables/{FIGURE}c_bgcs_count_stacked.csv"
# Long-format BGC counts per BiG-SCAPE class.
R_bigscape_data = f"assets/tables/{FIGURE}c_bigscape_class_stacked.csv"

In [None]:
%%R  -w 1200 -h 1200 -i R_tree -i R_data
# Load the annotation table and the corrected newick tree passed in from Python.
data <- read.csv(R_data)
tree <- read.tree(R_tree)

# Root the tree at its midpoint, then reorder and ladderize it for plotting.
rooted_tree <- phangorn::midpoint(tree)
tree <- ladderize(reorder(rooted_tree))

In [None]:
%%R  -w 600 -h 300

# Base tree plot with the annotation table attached via ggtree's %<+% operator,
# so later layers can map its columns as aesthetics.
p <- ggtree(tree) %<+% data

In [None]:
%%R
# Legend labels and colours of the phylogroup colour scale, given positionally.
# NOTE(review): presumably the order follows the sorted phylogroup values — confirm.
phylogroup_labels <- c("P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8")
phylogroup_colours <- c('#264653','#e9c46a','#808080', '#808080','#f4a261','#808080', '#e76f51', '#2a9d8f')

# Tip labels taken from the tip_label column, drawn with dotted lines.
tip_labels <- geom_tiplab(aes(label=tip_label, fill=phylogroup),
                          size=2.6, hjust=-0.03, family='sans',
                          linetype = "dotted", linesize = 1)

# Tip points: colour by phylogroup, shape by sequence quality.
tip_points <- geom_tippoint(size=2.6, alpha=0.8,
                            aes(colour=phylogroup, stroke=1, shape=sequence_quality))

p2 <- p +
    tip_labels +
    hexpand(1.5) +
    tip_points +
    scale_color_manual(labels=phylogroup_labels, values=phylogroup_colours)

In [None]:
%%R -i R_bgc_data
# add antismash_data
bgc_data <- read.csv(R_bgc_data)

# Stacked horizontal bars of BGC counts per genome, split by bgc_status,
# drawn in a separate panel next to the tree.
bgc_panel <- geom_facet(panel='BGCs Status',
                        geom=geom_bar,
                        data=bgc_data,
                        mapping=aes(x=value, fill=bgc_status),
                        position="stack",
                        orientation="y",
                        stat="identity")

# Fill scale of the panel; labels and colours are given positionally.
bgc_fill <- scale_fill_manual(labels=c("bgcs_on_contig_edge", "complete_bgcs"),
                              values=c("#cc2936", "#08415c"))

# new_scale_fill() starts a fresh fill scale before the panel is added.
p3 <- p2 + new_scale_fill() + bgc_panel + bgc_fill

In [None]:
%%R -w 800 -h 500 -u px -i R_bigscape_data
# add bigscape_data
# Stacked horizontal bars of BGC counts per genome, split by BiG-SCAPE class,
# drawn in a further panel next to the tree.
bigscape_data <- read.csv(R_bigscape_data)
# NOTE(review): labels and colours are positional; presumably they follow the
# sorted bigscape_class values present in the data — confirm.
p4 <- p3 + new_scale_fill() + geom_facet(panel='BiG-SCAPE Class',
                     geom=geom_bar,
                     data=bigscape_data,
                     mapping=aes(x=value, fill=bigscape_class),
                     position="stack",
                     orientation="y",
                     stat="identity") +
        scale_fill_manual(labels=c("NRPS", "Others", "PKS-NRP Hybrids", "PKSI",
                                   "PKSOther", "RiPPs", "Saccharides", "Terpene"),
                          values=c("#734f5a", "#264653", "#2a9d8f", "#e9c46a",
                                   "#f4a261", "#e76f51", "#941c2f", "#c05761"))

# theme_bw() is a complete theme and replaces every theme element set before
# it, so it goes first and the text size/family override is added afterwards.
p5 <- p4 + theme_bw() + theme(text=element_text(size=10, family="sans"))

# Relative widths of the facet panels.
p6 <- facet_widths(p5, widths = c(80, 20, 30))
p6

In [None]:
# Output paths of the final figure (SVG and PDF); imported by the %%R cell that saves it.
R_fig3c_svg = f"assets/figures/{FIGURE}/c.svg"
R_fig3c_pdf = f"assets/figures/{FIGURE}/c.pdf"

In [None]:
%%R -i R_fig3c_svg -i R_fig3c_pdf
# Create the output folders first so saving does not fail on a fresh checkout.
dir.create(dirname(R_fig3c_svg), recursive=TRUE, showWarnings=FALSE)
dir.create(dirname(R_fig3c_pdf), recursive=TRUE, showWarnings=FALSE)

# Save the combined figure as SVG and PDF. The argument is spelled `filename`
# in full rather than relying on partial matching of `file`.
ggsave(filename=R_fig3c_svg, plot=p6, device=svglite, width=3600, height=1800, units="px")
# useDingbats is not a ggsave argument; it is forwarded to the graphics device.
ggsave(filename=R_fig3c_pdf, plot=p6, width=3200, height=1600, units="px", dpi=300, useDingbats=FALSE)

## References
<font size="2">
{% for i in project().rule_used['automlst-wrapper']['references'] %}
- *{{ i }}*
{% endfor %}
</font>