In [None]:
import re
from pathlib import Path

import pandas as pd
import yaml

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning)
# raised by later cells - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Load the rpy2 IPython extension, which provides the %%R cell magic used by
# the R plotting cells below.
%load_ext rpy2.ipython

## File configuration

In [None]:
# Parse the notebook settings from config.yaml (read relative to the working
# directory) and display them as the cell output.
config_text = Path("config.yaml").read_text()
notebook_configuration = yaml.safe_load(config_text)
notebook_configuration

In [None]:
# Base directory read from the `bgcflow_dir` key of config.yaml.
bgcflow_dir = Path(notebook_configuration["bgcflow_dir"])
# Project whose processed outputs (under data/processed/) this notebook reads.
project_name = "mq_saccharopolyspora"
report_dir = bgcflow_dir / f"data/processed/{project_name}"
# Prefix of the tables and figures this notebook writes under assets/.
FIGURE = "Figure_6"
# Prefix of the existing `<prefix>c_ggtree.csv` table that later cells read.
FIGURE_TREE = "Figure_3"
# Kept as a string on purpose: it is interpolated into file-name globs and
# column names as "0.30", whereas the float 0.30 would format as "0.3".
cutoff = "0.30"

In [None]:
def _glob_single(directory, pattern):
    """Return the one-element list of files in ``directory`` matching ``pattern``.

    Raises:
        AssertionError: if the pattern matches zero or several files; the
            message lists what was found so the run can be fixed quickly.
    """
    matches = sorted(directory.glob(pattern))
    assert len(matches) == 1, (
        f"expected exactly one file matching {pattern!r} in {directory}, "
        f"found {len(matches)}: {[m.name for m in matches]}"
    )
    return matches


# The table file names carry a prefix that is matched with a wildcard, so each
# table is located by its suffix and must be unique in the directory.
bigscape_dir = report_dir / "bigscape/for_cytoscape_antismash_6.1.1/"
bigscape_mapping = _glob_single(bigscape_dir, f"*df_families_{cutoff}.csv")
bigscape_cluster = _glob_single(bigscape_dir, f"*df_clusters_{cutoff}.csv")
bigscape_presence = _glob_single(bigscape_dir, f"*df_family_presence_{cutoff}.csv")

# The first CSV column is used as the index of every table.
df_mapping = pd.read_csv(bigscape_mapping[0], index_col=0)
df_cluster = pd.read_csv(bigscape_cluster[0], index_col=0)
df_presence = pd.read_csv(bigscape_presence[0], index_col=0)


In [None]:
# Select the family ids of every cluster whose `product` string mentions
# "lanthipeptide". The match is a plain substring test done column-wise;
# rows with a missing product are treated as non-matching instead of raising.
is_lanthipeptide = df_cluster["product"].str.contains(
    "lanthipeptide", regex=False, na=False
)
query = list(df_cluster.loc[is_lanthipeptide, f"fam_id_{cutoff}"].unique())

# Map family id -> family name. Keys are stringified because they are later
# matched against the column labels of the presence/absence table.
fam_mapping = {
    str(fam_id): fam_name
    for fam_id, fam_name in df_mapping.loc[query, "fam_name"].items()
}

In [None]:
# Family ids as strings, then keep only those columns of the presence table.
query = list(map(str, query))
df_presence = df_presence[query]

In [None]:
# Relabel the family-id columns with their family names and write the wide
# presence/absence table (index included) for the figure.
presence_wide_csv = f"assets/tables/{FIGURE}a_presence_absence_lanthipeptide.csv"
df_presence = df_presence.rename(fam_mapping, axis="columns")
df_presence.to_csv(presence_wide_csv)

In [None]:
%%R
# Attach the R packages used by the tree-plotting cells, in the same order
# as before (attach order decides which package masks which function).
r_packages <- c("treeio", "ggtree", "tidyverse", "ggstar",
                "ggnewscale", "ggtreeExtra", "phangorn", "svglite")
for (pkg in r_packages) {
    library(pkg, character.only = TRUE)
}

In [None]:
# Inputs handed to the %%R cells: the absolute path of the Newick tree and the
# per-genome annotation table.
newick_path = report_dir / "automlst_wrapper/final_corrected.newick"
tree_file = str(newick_path.resolve())
tree_data = f"assets/tables/{FIGURE_TREE}c_ggtree.csv"

In [None]:
%%R  -w 800 -h 500 -i tree_data -i tree_file
# Preview of the phylogenetic tree with tips coloured by phylogroup.
# `tree_data` and `tree_file` are the Python path strings passed in with -i.
tree <- read.tree(tree_file)
#data <- read.csv("../automlst_wrapper/df_genomes_tree.csv")
data <- read.csv(tree_data)

# midpoint root
tree <- phangorn::midpoint(tree)
tree <- ladderize(reorder(tree))

p <- ggtree(tree)
# Tip labels show the genome id.
data$tip_label2 <- data$genome_id
# Attach the annotation table to the tree plot (ggtree %<+% operator).
p <- p %<+% data

# NOTE(review): in ggtree, `linetype`/`linesize` of geom_tiplab are documented
# for aligned labels (align = TRUE) and `fill` for label-style geoms; confirm
# whether they have any effect here.
# The manual colour scale supplies eight labels (P1..P8) and eight colours;
# three of the entries share the grey '#808080'.
p2 <- p + geom_tiplab(aes(label=tip_label2, fill=phylogroup),
                     size=2.6, hjust=-0.03, family='sans',
                    linetype = "dotted", linesize = 1) + # size of label border) +
        #hexpand(1.5) +
        geom_tippoint(size=2.6, alpha=0.8, aes(colour=phylogroup, stroke=1)) + 
        scale_color_manual(labels=c("P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8"),
                           values=c('#264653','#e9c46a','#808080', '#808080','#f4a261','#808080', '#e76f51', '#2a9d8f'))

p2

In [None]:
# Per-genome annotation table, indexed by genome id.
df_phylogroup = pd.read_csv(f"assets/tables/{FIGURE_TREE}c_ggtree.csv", index_col="genome_id")

# Fail loudly (as the previous row-by-row lookup did) when a cluster belongs
# to a genome that has no phylogroup entry.
missing_genomes = set(df_cluster["genome_id"]) - set(df_phylogroup.index)
if missing_genomes:
    raise KeyError(
        f"genomes without a phylogroup entry: {sorted(map(str, missing_genomes))}"
    )

# Look up the phylogroup of every cluster's genome in one vectorised step.
df_cluster["phylogroup"] = df_cluster["genome_id"].map(df_phylogroup["phylogroup"])

In [None]:
def map_value_to_color(value):
    """Return 'black' when ``value`` equals 1 and 'white' for anything else."""
    return 'black' if value == 1 else 'white'

# Reload the wide presence/absence table and reshape it to long form:
# one row per (genome, GCF) pair with its presence value.
df = pd.read_csv(f"assets/tables/{FIGURE}a_presence_absence_lanthipeptide.csv")
df['xindex'] = df.index
df = pd.melt(df, ["genome_id", "xindex"]).rename(columns={"variable": "GCF"})
df['color'] = df['value'].apply(map_value_to_color)

# Keep only clusters whose family appears as a GCF column in the table.
# .copy() makes the filtered frame independent, so the column added below is
# written to it directly rather than to a slice of the original frame.
df_cluster = df_cluster[df_cluster[f'fam_known_compounds_{cutoff}'].isin(df.GCF)].copy()

# Extract the lanthipeptide part(s) of each "."-separated product string.
df_cluster['lanthipeptide_class'] = [
    "".join(part for part in product.split(".") if 'lanthipeptide' in part)
    for product in df_cluster['product']
]

In [None]:
# Determine one lanthipeptide class label per GCF from the product strings of
# its member clusters.
cluster_category = {}
for cluster in df_cluster[f"fam_known_compounds_{cutoff}"].unique():
    subset = df_cluster[df_cluster[f"fam_known_compounds_{cutoff}"] == cluster]
    # sorted() makes the joined label deterministic; iterating a plain set of
    # strings can change order between kernel sessions, which would break the
    # exact-text match used later to override mixed categories.
    classes = sorted({
        "".join(part for part in product.split(".") if "lanthipeptide" in part)
        for product in subset["product"]
    })
    # Drop the empty class only when other classes are present.
    if len(classes) > 1 and "" in classes:
        classes.remove("")
    cluster_category[cluster] = " or ".join(classes)

df["class_category"] = [cluster_category[gcf] for gcf in df.GCF]
df["phylogroup"] = [df_phylogroup.loc[genome, "phylogroup"] for genome in df.genome_id]

# Colour key is "<phylogroup>_1" where the GCF is present, empty otherwise.
is_present = df["value"] == 1
df["color"] = (df["phylogroup"].astype(str) + "_1").where(is_present, "")

# Only present (genome, GCF) pairs are plotted; copy so later column
# assignments act on an independent frame.
df = df[is_present].copy()

# Number the GCFs 1..N along the x axis, grouped by class category and
# alphabetically within each category.
xindex = {}
for cat in sorted(set(cluster_category.values())):
    gcfs_in_category = sorted(df.loc[df["class_category"] == cat, "GCF"].unique())
    print(cat, len(xindex))
    first_position = len(xindex) + 1
    xindex.update(
        {gcf: first_position + offset for offset, gcf in enumerate(gcfs_in_category)}
    )
xindex

In [None]:
# Manual curation: GCFs with one of these mixed class labels are shown as
# lanthipeptide class iii.
mixed_categories = [
    'lanthipeptide-class-ii or lanthipeptide-class-iii',
    'lanthipeptide-class-iii or lanthipeptide-class-iv',
]
override = df.index[df["class_category"].isin(mixed_categories)]
df.loc[override, "class_category"] = 'lanthipeptide-class-iii'

In [None]:
# Replace the row-based xindex with the per-GCF x position computed above,
# then write the long table consumed by the R plotting cells.
df["xindex"] = [xindex[gcf] for gcf in df["GCF"]]

presence_absence_table = f"assets/tables/{FIGURE}a_presence_absence_lanthipeptide_long.csv"
df.to_csv(presence_absence_table, index=False)

In [None]:
%%R -w 800 -h 400 -i tree_file -i tree_data -i presence_absence_table
# Preview: tree on the left, one tile per present (genome, GCF) pair on the
# right, filled by lanthipeptide class. The three -i variables are Python
# path strings.
df <- read.csv(presence_absence_table,
               stringsAsFactors = TRUE)

tree <- read.tree(tree_file)
#data <- read.csv("../automlst_wrapper/df_genomes_tree.csv")
data <- read.csv(tree_data)

# midpoint root
tree <- phangorn::midpoint(tree)
tree <- ladderize(reorder(tree))

p <- ggtree(tree)
# Tip labels show the genome id.
data$tip_label2 <- data$genome_id
# Attach the annotation table to the tree plot (ggtree %<+% operator).
p <- p %<+% data

# new_scale_fill() (ggnewscale) is added last - presumably so the tile fill
# in the facet panel gets its own scale; confirm.
p2 <- p + geom_tiplab(aes(label=tip_label2, fill=phylogroup),
                     size=2.8, hjust=-0.03, family='sans',
                    linetype = "dotted", linesize = 1) + # size of label border) +
        hexpand(0.5) +
        geom_tippoint(size=1, alpha=0.8, aes(colour=phylogroup, stroke=1)) + 
        scale_color_manual(labels=c("P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8"),
                           values=c('#264653','#e9c46a','#808080', '#808080','#f4a261','#808080', '#e76f51', '#2a9d8f')) + 
        new_scale_fill() 

# The legend is kept in this preview (the theme call is commented out).
p3 <- p2 #+ theme(legend.position='none')

# One row per distinct xindex; only referenced by the commented-out text layer.
df_text <- df[!duplicated(df$xindex),]
# `y` is not a column of the CSV; NOTE(review): it is presumably supplied by
# facet_plot after matching the table to the tree tips - confirm.
p4 <- facet_plot(p3, panel = "BiG-SCAPE GCFs", data = df, geom=geom_tile, mapping = aes(x=xindex, y=y, fill = class_category), 
                 color = "white", lwd = 1,
                 linetype = 1) 
#p3 <- facet_plot(p3, panel = "BiG-SCAPE GCFs", data = df_text, geom=geom_text, mapping = aes(x=xindex, y = 0, label = GCF), size=2, angle = 90)
# Relative panel widths: tree 1, tile panel 1.5.
p4 <- facet_widths(p4, widths = c(1, 1.5)) 
#p3 + xlim_expand(c(150, 150), "BiG-SCAPE GCFs")
p4

In [None]:
from svgutils.compose import *
from svgutils.compose import Figure

In [None]:
# Column names derived from the configured cutoff, so this cell stays in step
# with the rest of the notebook if the cutoff changes.
fam_column = f"fam_known_compounds_{cutoff}"
fam_id_column = f"fam_id_{cutoff}"
description_column = "most_similar_known_cluster_description"

# GCF name -> 0-based position along the x axis of the tile panel.
gcf_order = {
    gcf: position
    for position, gcf in enumerate(df.sort_values(by="xindex").GCF.unique())
}

# Copy the most-similar-known-cluster annotation onto each cluster, but only
# when the reported similarity is at least 0.5.
df_region = pd.read_csv(report_dir / "tables/df_regions_antismash_6.1.1.csv", index_col="bgc_id")
known_cluster_columns = ["similarity", "most_similar_known_cluster_id", description_column]
for bgc_id in df_cluster.index:
    if df_region.loc[bgc_id, "similarity"] >= 0.5:
        for column in known_cluster_columns:
            df_cluster.loc[bgc_id, column] = df_region.loc[bgc_id, column]

# Display names used in the figure for the known-cluster descriptions.
naming_dict = {'venezuelin' : "Venezuelin",
               'labyrinthopeptin A2 / labyrinthopeptin A1 / labyrinthopeptin A3' : "Labyrinthopeptin",
               'Ery-9 / Ery-6 / Ery-8 / Ery-7 / Ery-5 / Ery-4 / Ery-3': "Erythreapeptin",
               'cinnamycin' : "Kyamicin", 
               'A83543A' : "Spinosyn", 
               'anantin C' : "Anantin", 
               'planosporicin' : "Planosporicin"}

# Exact-match replacement; descriptions not listed above are left unchanged.
df_cluster[description_column] = df_cluster[description_column].replace(naming_dict)

# One label per GCF: "GCF_<id>" plus, when known, the compound name(s) in
# parentheses. Several names are separated by "|" instead of being run together.
label_mapping = {}
for gcf in df_cluster[fam_column].unique():
    subset = df_cluster[df_cluster[fam_column] == gcf]
    gcf_number = subset[fam_id_column].unique()
    known_names = [
        name for name in subset[description_column].fillna("").unique() if name != ""
    ]
    if known_names:
        label_mapping[gcf] = f"GCF_{gcf_number[0]} ({'|'.join(known_names)})"
    else:
        label_mapping[gcf] = f"GCF_{gcf_number[0]}"

# x-axis position -> label text, consumed by the SVG annotation cell.
labels = {position: label_mapping[gcf] for gcf, position in gcf_order.items()}
labels

In [None]:
# Make sure the output directory for this figure exists before anything is saved.
figure_dir = Path(f"assets/figures/{FIGURE}/")
figure_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Output paths handed to the R cell that renders the final figure.
# The SVG is additionally read back by the later Python cells that locate the
# tiles and add the GCF labels.
figure_output_svg = f"assets/figures/{FIGURE}/a.svg"
figure_output_pdf = f"assets/figures/{FIGURE}/a.pdf"

In [None]:
%%R -w 800 -h 400 -i presence_absence_table -i tree_data -i tree_file -i figure_output_svg -i figure_output_pdf 
# Final figure: same tree + tile layout as the preview cell, without legend,
# saved as SVG and PDF. The saved SVG is parsed by a later Python cell to find
# the tile x positions, so layout changes here affect the label placement.
df <- read.csv(presence_absence_table,
               stringsAsFactors = TRUE)

tree <- read.tree(tree_file)
#data <- read.csv("../automlst_wrapper/df_genomes_tree.csv")
data <- read.csv(tree_data)

# midpoint root
tree <- phangorn::midpoint(tree)
tree <- ladderize(reorder(tree))

p <- ggtree(tree)
# Tip labels show the genome id.
data$tip_label2 <- data$genome_id
# Attach the annotation table to the tree plot (ggtree %<+% operator).
p <- p %<+% data

p2 <- p + geom_tiplab(aes(label=tip_label2, fill=phylogroup),
                     size=2.7, hjust=-0.08, family='sans',
                    linetype = "dotted", linesize = 1) + # size of label border) +
        hexpand(1.8) +
        geom_tippoint(size=1, alpha=0.8, aes(colour=phylogroup, stroke=1)) + 
        scale_color_manual(labels=c("P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8"),
                           values=c('#264653','#e9c46a','#808080', '#808080','#f4a261','#808080', '#e76f51', '#2a9d8f')) + 
        new_scale_fill() 

# Legend removed for the published panel.
p3 <- p2+ theme(legend.position='none')

# One row per distinct xindex; only referenced by the commented-out text layer.
df_text <- df[!duplicated(df$xindex),]
# `y` is not a column of the CSV; NOTE(review): it is presumably supplied by
# facet_plot after matching the table to the tree tips - confirm.
p4 <- facet_plot(p3, panel = "Lanthipeptide BiG-SCAPE GCFs", data = df, geom=geom_tile, mapping = aes(x=xindex, y=y, fill = class_category), 
                 color = "white", lwd = 1,
                 linetype = 1) 
#p3 <- facet_plot(p3, panel = "BiG-SCAPE GCFs", data = df_text, geom=geom_text, mapping = aes(x=xindex, y = 0, label = GCF), size=2, angle = 90)
# Relative panel widths: tree 1, tile panel 3.
p4 <- facet_widths(p4, widths = c(1, 3)) 
#p3 + xlim_expand(c(150, 150), "BiG-SCAPE GCFs")
# Both files are written at 2000 x 1400 px; the SVG goes through the svglite device.
ggsave(filename=figure_output_svg, plot=p4, device=svglite, width=2000, height=1400, units="px")
ggsave(plot=p4, width=2000, height=1400, units="px", dpi=300, filename=figure_output_pdf, useDingbats=FALSE)
p4

In [None]:
# get coordinates of tiles
# A tile is a <rect ...> line whose style contains the stroke settings below.
# The x attribute is captured with an anchored pattern: str.strip("x=") would
# treat "x=" as a set of characters, and a bare 'x=' substring test would also
# match other attributes whose name ends in "x".
tile_x_pattern = re.compile(r"""<rect x=(['"])(.*?)\1""")

with open(f'assets/figures/{FIGURE}/a.svg', "r") as f:
    data = f.readlines()

unique_x = set()
for line in data:
    if 'stroke-linecap: butt; stroke-linejoin:' not in line:
        continue
    found = tile_x_pattern.search(line)
    if found:
        unique_x.add(float(found.group(2)))

# One distinct x position per GCF column, left to right.
x_points = sorted(unique_x)
assert len(x_points) == len(labels), (
    f"found {len(x_points)} distinct tile x positions in the SVG "
    f"but {len(labels)} GCF labels"
)

In [None]:
import svgutils.transform as sg

# Load the SVG written by the R cell; the GCF labels are appended to it.
fig = sg.fromfile(f'assets/figures/{FIGURE}/a.svg')

# Label placement, hand-tuned for this figure (values are in the SVG's own
# coordinate units).
label_y = 330
label_size = 7.5
label_angle = 45

# add text labels
for n, label in labels.items():
    x = x_points[n]
    print(n, x, label)
    # Labels mentioning Erythreapeptin are emphasised in bold.
    weight = "bold" if 'Erythreapeptin' in label else "normal"
    txt1 = sg.TextElement(x, label_y, label, size=label_size, weight=weight)
    # Rotate around the label's own anchor point.
    txt1.rotate(label_angle, x, label_y)
    fig.append(txt1)

# Save once, after all labels are attached (also covers the case of no labels,
# so the file read below always exists).
fig.save(f"assets/figures/{FIGURE}/a_annotated.svg")

# Re-wrap the annotated SVG in a 650 x 650 canvas at 1.2x scale. The file is
# read when SVG(...) is constructed and then overwritten by the final save.
final_figure = Figure("650", "650",
                      Panel(
                          SVG(f"assets/figures/{FIGURE}/a_annotated.svg").scale(1.2),
                      ))
final_figure.save(f"assets/figures/{FIGURE}/a_annotated.svg")
final_figure