In [None]:
import pandas as pd
from pathlib import Path
import yaml

import warnings
warnings.filterwarnings('ignore')

%load_ext rpy2.ipython

## File configurations

In [None]:
# Parse the notebook settings stored in config.yaml (resolved relative to the
# working directory) and display them as the cell output.
notebook_configuration = yaml.safe_load(Path("config.yaml").read_text())
notebook_configuration

In [None]:
# Project-level settings shared by the cells below.
project_name = "mq_saccharopolyspora"

# Root directory taken from the "bgcflow_dir" key of the loaded configuration.
bgcflow_dir = Path(notebook_configuration["bgcflow_dir"])
# Directory holding the processed outputs of this project.
report_dir = bgcflow_dir.joinpath(f"data/processed/{project_name}")

# Identifiers used to name the tables and figures read/written by this notebook.
FIGURE, FIGURE_TREE = "Figure_S8", "Figure_3"

In [None]:
# CSV paths (long/stacked format) written by the Python cells below; the
# feature table is later passed to an R cell via `-i`.
R_feature_data, R_bases_data, R_contigs_data = (
    f"assets/tables/{FIGURE}_{kind}_stacked.csv"
    for kind in ("features", "bases", "contigs")
)

In [None]:
# Build one row of annotation statistics per genome from the files under
# <report_dir>/genbank and export the combined table as CSV.
#   * "<genome>.txt": "key: value" summary lines (first line skipped).
#   * "<genome>.tsv": tab-separated table with a "product" column.
genbank_dir = report_dir / "genbank"
stat_columns = ['contigs', 'bases', 'CDS', 'rRNA', 'tRNA', 'tmRNA', 'repeat_region']
cds_columns = ["CDS (Unknown)", "CDS (Known)"]

summary = []
for item in sorted(genbank_dir.glob("*.txt")):
    # One column named after the genome, transposed so each genome is one row.
    genome_stats = (
        pd.read_csv(item, sep=":", skiprows=1, header=None, index_col=0)
        .rename(columns={1: item.stem})
        .T
    )
    # HTML links to the per-genome files; the "{{ ... }}" part is left as a
    # template placeholder to be expanded by whatever renders the report.
    server_path = "<a href='{{ project().file_server() }}/genbank/"
    genome_stats.loc[item.stem, "gbk file"] = server_path + f"{item.stem}.gbk' target='_blank'>{item.stem}.gbk</a>"
    genome_stats.loc[item.stem, "CDS table"] = server_path + f"{item.stem}.tsv' target='_blank'>{item.stem}.tsv</a>"
    summary.append(genome_stats)

if not summary:
    # pd.concat([]) would otherwise fail with an uninformative message.
    raise FileNotFoundError(f"No '*.txt' annotation summaries found in {genbank_dir}")

df = pd.concat(summary).sort_index()
# reindex (instead of .loc) so that a feature type missing from every genome
# becomes a zero-filled column rather than raising KeyError.
df_table = df.reindex(columns=stat_columns).fillna(0)

result = {}
for gbk_file in sorted(genbank_dir.glob("*.tsv")):
    gbk_table = pd.read_csv(gbk_file, sep="\t")
    genome_id = gbk_file.stem
    # Boolean sum is 0 when no row matches, whereas indexing value_counts()
    # raises KeyError for genomes without any hypothetical protein.
    unknown_cds = int((gbk_table['product'] == 'hypothetical protein').sum())
    # NOTE(review): every remaining row is counted as a "known" CDS; if the
    # table also lists non-CDS features (rRNA, tRNA, ...) they are included
    # here as well — TODO confirm against the table schema.
    known_cds = gbk_table.shape[0] - unknown_cds
    result[genome_id] = {"CDS (Unknown)" : unknown_cds,
                         "CDS (Known)" : known_cds}

# orient="index" with explicit columns keeps both CDS columns even when no
# "*.tsv" file was found.
df_cds = pd.DataFrame.from_dict(result, orient="index", columns=cds_columns)
df_table = pd.concat([df_table, df_cds], axis=1)

prokka_csv = Path(f"assets/tables/{FIGURE}_prokka.csv")
prokka_csv.parent.mkdir(parents=True, exist_ok=True)  # fresh checkouts lack the folder
df_table.to_csv(prokka_csv, index=True)

In [None]:
def _stack_for_r(table, columns):
    """Return `columns` of `table` in long format for the R plotting cells.

    Parameters
    ----------
    table : pandas.DataFrame
        Wide table with one row per genome (genome id as the index).
    columns : list of str
        Columns of `table` to stack.

    Returns
    -------
    pandas.DataFrame
        Columns "genome_id", "category" (original column name) and "count"
        (cell value), one row per genome/column pair.
    """
    return (
        table.loc[:, columns]
        # Name the axes explicitly instead of relying on the default
        # "index"/"variable" names produced by reset_index()/melt().
        .rename_axis(index="genome_id")
        .reset_index()
        .melt(id_vars="genome_id", var_name="category", value_name="count")
    )


# Make sure the output folders exist before writing.
for csv_path in (R_feature_data, R_bases_data, R_contigs_data):
    Path(csv_path).parent.mkdir(parents=True, exist_ok=True)

df_features = _stack_for_r(df_table, ['rRNA', 'tRNA', 'tmRNA', 'repeat_region', "CDS (Unknown)", "CDS (Known)"])
df_features.to_csv(R_feature_data, index=False)

df_bases = _stack_for_r(df_table, ['bases'])
df_bases.to_csv(R_bases_data, index=False)

df_contigs = _stack_for_r(df_table, ['contigs'])
df_contigs.to_csv(R_contigs_data, index=False)

In [None]:
%%R
# Attach the R packages used by the plotting cells, in the original order
# (attachment order determines which package masks which).
for (pkg in c("treeio", "ggtree", "tidyverse", "ggstar",
              "ggnewscale", "ggtreeExtra", "phangorn", "svglite")) {
    library(pkg, character.only = TRUE)
}

In [None]:
# Inputs handed to the R cell below via `-i`:
#   R_tree - Newick tree located under the project's report directory.
#   R_data - CSV table joined onto the tree (read from assets/tables; it is
#            not written by any cell visible in this notebook).
R_data = f"assets/tables/{FIGURE_TREE}c_ggtree.csv"
R_tree = str(report_dir.joinpath("automlst_wrapper/final_corrected.newick"))

In [None]:
%%R  -w 1200 -h 1200 -i R_tree -i R_data
# Load the per-genome table and the Newick tree passed in from Python.
data <- read.csv(R_data)
raw_tree <- read.tree(R_tree)

# Midpoint-root the tree, then reorder and ladderize it for plotting.
rooted_tree <- phangorn::midpoint(raw_tree)
tree <- ladderize(reorder(rooted_tree))

In [None]:
%%R  -w 600 -h 300

# Base tree plot with the per-genome table attached (%<+%), so its columns
# can be used as aesthetics by later layers. Nothing is drawn in this cell.
p <- ggtree(tree) %<+% data

In [None]:
%%R
# Decorate the tree with tip labels and tip points.
# tip_label, phylogroup and sequence_quality are referenced as aesthetics;
# presumably they are columns of the table attached to `p` — verify.
phylogroup_labels <- c("P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8")
phylogroup_colours <- c('#264653','#e9c46a','#808080', '#808080','#f4a261','#808080', '#e76f51', '#2a9d8f')

tip_labels <- geom_tiplab(aes(label=tip_label, fill=phylogroup),
                          size=2.6, hjust=-0.03, family='sans',
                          linetype = "dotted", linesize = 1)
tip_points <- geom_tippoint(size=2.6, alpha=0.8,
                            aes(colour=phylogroup, stroke=1, shape=sequence_quality))

# hexpand(5) widens the x range so the labels have room next to the tips.
p2 <- p + tip_labels +
        hexpand(5) +
        tip_points +
        scale_color_manual(labels=phylogroup_labels, values=phylogroup_colours)
p2

In [None]:
%%R -i R_feature_data
# Add a second panel next to the tree: stacked bars of feature counts per
# genome, coloured by feature category.
feature_data <- read.csv(R_feature_data)

# new_scale_fill() starts an independent fill scale for the bar panel.
# NOTE(review): bars are stacked and then shown on a log10 x axis, so segment
# lengths are not proportional to counts, and zero counts cannot be drawn on
# a log scale — confirm this is the intended presentation.
p3 = p2 + new_scale_fill() + geom_facet(panel='Feature count (log scale)',
                     geom=geom_bar,
                     data=feature_data,
                     mapping=aes(x=count, fill=category),
                     position="stack",
                     orientation="y",
                     stat="identity") +
        scale_x_log10() +
        # theme_bw() is a complete theme and replaces any theme settings added
        # before it, so the text override has to come afterwards to take effect.
        theme_bw() +
        theme(text=element_text(size=10, family="sans"))

# Relative widths of the tree panel and the feature-count panel.
p3 = facet_widths(p3, widths = c(3, 2))
p3

In [None]:
# Output paths for the final figure (SVG and PDF share the same stem); the
# containing folder is created up front so the R export cell can write to it.
_figure_stem = f"assets/figures/{FIGURE}/{FIGURE}"
R_fig3c_svg = _figure_stem + ".svg"
R_fig3c_pdf = _figure_stem + ".pdf"
Path(R_fig3c_svg).parent.mkdir(exist_ok=True, parents=True)

In [None]:
%%R -i R_fig3c_svg -i R_fig3c_pdf
# Export the final plot as SVG (svglite device) and as PDF.
# NOTE(review): the SVG is saved 1800 px high and the PDF 1600 px high, so the
# two exports have different aspect ratios — confirm this is intentional.
ggsave(filename=R_fig3c_svg, plot=p3, device=svglite, width=4000, height=1800, units="px")
ggsave(filename=R_fig3c_pdf, plot=p3, width=4000, height=1600, units="px", dpi=300, useDingbats=FALSE)