In [None]:
# Load libraries
from pathlib import Path
import pandas as pd
import os
# import ncbi.datasets
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
from matplotlib import rcParams
import yaml

import altair as alt

# Set seaborn's plotting context to the "paper" preset.
sns.set_context("paper")

# Figure S6.A

## File Configurations

In [None]:
# Parse the notebook settings from config.yaml (safe_load only builds plain
# YAML types) and echo the resulting object as the cell output.
config_path = Path("config.yaml")
with config_path.open("r") as config_file:
    notebook_configuration = yaml.safe_load(config_file)
notebook_configuration

In [None]:
# Project name and the prefix used for this figure's asset files.
project_name = "qc_saccharopolyspora"
FIGURE = "Figure_S6"

# BGCflow root directory (taken from the notebook configuration) and the
# processed-data folder of this project inside it.
bgcflow_dir = Path(notebook_configuration["bgcflow_dir"])
report_dir = bgcflow_dir.joinpath(f"data/processed/{project_name}")

In [None]:
# Standard sub-folders of the BGCflow directory used in this notebook.
external_data_dir = bgcflow_dir.joinpath('data/external/')
interim_data_dir = bgcflow_dir.joinpath('data/interim/')
processed_data_dir = bgcflow_dir.joinpath('data/processed/')
config_dir = bgcflow_dir.joinpath('config/')

In [None]:
# Load the input tables.
# The NCBI, seqfu and CheckM tables are indexed by genome_id while keeping the
# genome_id column (drop=False); the GTDB table keeps its default index here.
project_tables_root = processed_data_dir / project_name

df_ncbi = pd.read_csv(project_tables_root / 'tables/df_ncbi_meta.csv')
df_ncbi = df_ncbi.set_index('genome_id', drop=False)

df_seqfu = pd.read_csv(f'assets/tables/{FIGURE}_df_seqfu_annotated.csv')
df_seqfu = df_seqfu.set_index('genome_id', drop=False)

df_checkm = pd.read_csv(project_tables_root / 'tables/df_checkm_stats.csv')
df_checkm = df_checkm.set_index('genome_id', drop=False)

df_gtdb = pd.read_csv(f"assets/tables/{FIGURE}_df_gtdb.csv")

In [None]:
# Assign a quality tier (column MIMAG_quality) to every genome in df_checkm
# from its Completeness / Contamination values:
#   HQ: completeness >  90 and contamination <  5
#   MQ: completeness >= 50 and contamination < 10 (and not already HQ)
#   LQ: completeness <  50 and contamination < 10
# Genomes matching none of the tiers (including rows with missing values) keep
# a missing MIMAG_quality and are reported below. They are only reported: no
# row is actually removed from df_checkm.
# ALTERNATIVE: use NCBI assembly category for HQ, use contig < 50 for MQ
completeness = df_checkm["Completeness"]
contamination = df_checkm["Contamination"]

is_hq = (completeness > 90) & (contamination < 5)
is_mq = ~is_hq & (completeness >= 50) & (contamination < 10)
is_lq = (completeness < 50) & (contamination < 10)

# Start from an all-missing object column so MIMAG_quality always exists,
# even when no genome falls into any tier.
mimag_quality = pd.Series(float("nan"), index=df_checkm.index, dtype="object")
mimag_quality[is_hq.to_numpy()] = "HQ"
mimag_quality[is_mq.to_numpy()] = "MQ"
mimag_quality[is_lq.to_numpy()] = "LQ"

# Assign positionally so the result does not depend on index alignment.
df_checkm["MIMAG_quality"] = mimag_quality.to_numpy()

for genome_id in df_checkm.index[mimag_quality.isna().to_numpy()]:
    print(f"{genome_id} dropped!")

In [None]:
# Build the per-genome timeline table, one row per entry of df_ncbi and
# carrying the same index:
#   Date           - the date string copied from df_ncbi["date"]
#   Year           - calendar year of that date, stored as a string
#   assembly_level - copied from df_ncbi["assembly_level"]
# Rows are collected first and the frame is built once, instead of growing it
# cell by cell with .loc.
timeline_rows = []
for date_str, assembly_level in zip(df_ncbi['date'], df_ncbi['assembly_level']):
    # strptime raises if the value is not a YYYY-MM-DD string.
    date_obj = datetime.datetime.strptime(date_str, "%Y-%m-%d")
    timeline_rows.append({
        'Date': date_str,
        'Year': str(date_obj.year),
        'assembly_level': assembly_level,
    })

df_year = pd.DataFrame(
    timeline_rows,
    index=df_ncbi.index,
    columns=['Date', 'Year', 'assembly_level'],
)

# Order by the date string; returns a new frame rather than sorting in place.
df_year = df_year.sort_values(by='Date')

In [None]:
# Copy the CheckM and seqfu metrics of each genome into df_year.
# A genome (or column) missing from either table raises KeyError, which is
# printed together with the genome id; columns copied before the error stay set.
checkm_columns = ("MIMAG_quality", "Completeness", "Contamination")
# df_year column name -> df_seqfu column name
seqfu_columns = {
    "sequence_quality": "sequence_quality",
    "N50": "N50",
    "number_of_contigs": "Count",
    "sequence_length": "Total",
}

for genome_id in df_year.index:
    try:
        for column in checkm_columns:
            df_year.loc[genome_id, column] = df_checkm.loc[genome_id, column]
        for year_column, seqfu_column in seqfu_columns.items():
            df_year.loc[genome_id, year_column] = df_seqfu.loc[genome_id, seqfu_column]
    except KeyError as e:
        print(genome_id, e)

In [None]:
# Count genomes per (Year, sequence_quality). After the groupby every remaining
# column holds a non-null count; the genome_id count is exposed as genome_count.
grouped_counts = df_year.copy().reset_index(drop=False)
grouped_counts = grouped_counts.groupby(['Year', 'sequence_quality']).count()
source = grouped_counts.reset_index(drop=False)
source = source.rename(columns={"genome_id" : "genome_count"})

# fill missing years: every year between the first and last observed year that
# has no row at all gets one zero-count row per quality tier
observed_years = [int(year) for year in source.Year.unique()]
first_year = source.Year.astype(int).min()
last_year = source.Year.astype(int).max()
missing_values = [year for year in range(first_year, last_year + 1) if year not in observed_years]

idx = source.index.max() + 1
for y in missing_values:
    for q in ['HQ', 'MQ', 'LQ']:
        source.loc[idx, "Year"] = str(y)
        source.loc[idx, 'sequence_quality'] = q
        source.loc[idx, 'genome_count'] = 0
        idx += 1

# The appended rows are empty in the other count columns; set those to 0 too.
source = source.fillna(0)
source = source.sort_values(by="Year").reset_index()

In [None]:
# calculate cumulative data
# Walk through the years in the order they appear in `source` (sorted by Year
# in the previous cell) and keep a running total of genome_count per quality
# tier. The result has one row per (year, sequence_quality) with columns
# year / sequence_quality / count and replaces `source` as the plotting table.
quality_levels = ["HQ", "MQ", "LQ"]
running_total = dict.fromkeys(quality_levels, 0)

cumulative_records = []
for year in source.Year.unique():
    source_subset_year = source[source.loc[:, "Year"] == year]
    for sequence_quality in quality_levels:
        is_quality = source_subset_year.loc[:, "sequence_quality"] == sequence_quality
        # No matching row means no genome of that tier in that year (sum of nothing is 0).
        yearly_count = source_subset_year.loc[is_quality, "genome_count"].sum()
        running_total[sequence_quality] += int(yearly_count)
        cumulative_records.append({
            "year" : year,
            "sequence_quality" : sequence_quality,
            "count" : running_total[sequence_quality],
        })

source = pd.DataFrame(cumulative_records, columns=["year", "sequence_quality", "count"])

In [None]:
# Colour mapping of the quality tiers (LQ red, MQ orange, HQ blue);
# `domain` and `range_` are module-level and reused by the other panels.
domain = ['LQ', 'MQ', 'HQ']
range_ = ['red', 'orange', 'blue']

quality_color = alt.Color(
    "sequence_quality:N",
    legend=None,
    scale=alt.Scale(domain=domain, range=range_),
)

# Bar chart of the cumulative genome count per year, coloured by quality tier.
bars = alt.Chart(source).mark_bar().encode(
    x=alt.X('year', title='Year'),
    y=alt.Y('count', title="Number of Genomes"),
    color=quality_color,
)

base = (
    bars
    .configure_axis(labelFontSize=10, titleFontSize=12)
    .configure_legend(labelFontSize=10, titleFontSize=12)
    .configure_view(continuousHeight=250, continuousWidth=250)
)

fig_S6a = base

In [None]:
# Display the cumulative bar chart built in the previous cell.
fig_S6a

# Figure S6.B

In [None]:
# Scatter plot of CheckM completeness vs contamination, one point per genome,
# coloured by sequence_quality using the domain/range_ mapping defined for panel A.
source = df_year.copy().reset_index(drop=False)
x_col = 'Completeness'
y_col = 'Contamination'

# y axis runs from 0 to one unit above the largest observed contamination value.
y_upper = round(source[y_col].max() + 1, 2)

# NOTE(review): the x domain is fixed to 92-101 %, so genomes with lower
# completeness would fall outside the initial view - confirm this is intended.
scatter = alt.Chart(source).mark_point(
    filled=True,
    stroke='black',
    strokeWidth=0.5,
    opacity=0.8,
    size=100,
).encode(
    x=alt.X(x_col, scale=alt.Scale(domain=(92, 101)), title=f'{x_col} (%)'),
    y=alt.Y(y_col, scale=alt.Scale(domain=(0, y_upper)), title=f"{y_col} (%)"),
    color=alt.Color("sequence_quality:N", legend=None, scale=alt.Scale(domain=domain, range=range_)),
    tooltip=['genome_id', 'Date', 'Year', 'assembly_level', 'MIMAG_quality', 'Completeness',
             'Contamination', 'sequence_quality', 'N50', 'number_of_contigs',
             'sequence_length'],
)

chart_one = (
    scatter
    .configure_axis(labelFontSize=10, titleFontSize=12)
    .configure_legend(labelFontSize=10, titleFontSize=12)
    .configure_view(continuousHeight=250, continuousWidth=250)
    .interactive()
)

fig_S6b = chart_one

In [None]:
# Display the completeness vs contamination scatter plot.
fig_S6b

# Figure S6.C

In [None]:
import plotly.graph_objects as go

In [None]:
# Project folder inside the BGCflow processed-data tree and the path of its
# NCBI metadata table.
project_name = "qc_saccharopolyspora"
project_dir = bgcflow_dir.joinpath(f"data/processed/{project_name}")
ncbi_table = project_dir.joinpath("tables/df_ncbi_meta.csv")

In [None]:
# Read the NCBI metadata table from ncbi_table, indexed by genome_id
# (drop=False keeps genome_id available as a regular column too).
df_ncbi = pd.read_csv(ncbi_table).set_index("genome_id", drop=False)

In [None]:
def _remove_rank_prefix(name, prefix):
    """Return `name` without a leading rank prefix such as "g__".

    Only an exact leading occurrence of `prefix` is removed. str.strip("g__")
    is not suitable here: it treats its argument as a set of characters and
    would also remove any leading or trailing "g" / "_" of the name itself.
    Values that are not strings are returned unchanged.
    """
    if isinstance(name, str) and name.startswith(prefix):
        return name[len(prefix):]
    return name


# Index the GTDB table by genome_id. Guarded so that re-running this cell does
# not fail once genome_id has already been moved into the index.
if df_gtdb.index.name != "genome_id":
    df_gtdb = df_gtdb.set_index("genome_id")

# One record per GTDB genome: the GTDB genus (without its "g__" prefix) and
# species next to the NCBI genus and species of the same genome. A genome that
# is missing from df_ncbi raises KeyError.
taxonomy_records = []
for gid, gtdb_genus, gtdb_species in zip(df_gtdb.index, df_gtdb["Genus"], df_gtdb["Species"]):
    taxonomy_records.append({
        "genome_id": gid,
        "GTDB_Genus": _remove_rank_prefix(gtdb_genus, "g__"),
        "GTDB_Species": gtdb_species,
        "NCBI_Genus": df_ncbi.loc[gid, "genus"],
        "NCBI_Species": df_ncbi.loc[gid, "species"],
    })

# Built in one go; rows are labelled by genome id and the index is left unnamed.
df_taxonomy = pd.DataFrame(
    taxonomy_records,
    index=list(df_gtdb.index),
    columns=["genome_id", "GTDB_Genus", "GTDB_Species", "NCBI_Genus", "NCBI_Species"],
)
df_taxonomy

In [None]:
# Turn the species columns into full "<genus> <species>" labels.
df = df_taxonomy.copy()
df["GTDB_Species"] = [
    f'{genus} {species}' for genus, species in zip(df["GTDB_Genus"], df["GTDB_Species"])
]
df["NCBI_Species"] = [
    f'{genus} {species}' for genus, species in zip(df["NCBI_Genus"], df["NCBI_Species"])
]

# Palette for the most frequent species; any further species is drawn in grey.
coolors_list = ["#001219","#005f73","#0a9396","#94d2bd","#e9d8a6","#ee9b00","#ca6702","#bb3e03","#ae2012","#9b2226"]

# generate color list
# Total number of occurrences of each species label over both taxonomies.
gtdb_species_count = df.GTDB_Species.value_counts().to_dict()
ncbi_species_count = df.NCBI_Species.value_counts().to_dict()
dict_species_count = {}
for species_count in (ncbi_species_count, gtdb_species_count):
    for species, occurrences in species_count.items():
        dict_species_count[species] = dict_species_count.get(species, 0) + occurrences

# Rank by descending count and break ties alphabetically, so that the colour
# assignment is the same on every run (it previously depended on set order).
ranked_species = sorted(dict_species_count, key=lambda name: (-dict_species_count[name], name))
coolors_dict = {
    species: coolors_list[rank] if rank < len(coolors_list) else "#808080"
    for rank, species in enumerate(ranked_species)
}
#coolors_dict

In [None]:
# Number of genomes per (GTDB species, NCBI species) pair; only the first
# counted column (selected by iloc[:, :1]) is kept.
pair_counts = df.groupby(['GTDB_Species','NCBI_Species']).count()
df_sankey = pair_counts.iloc[:,:1].reset_index()

# Working copy in which every GTDB label gets one leading space, so a GTDB
# label never equals the NCBI label of the same name.
df = df_sankey.copy()
df["GTDB_Species"] = [f' {gtdb_label}' for gtdb_label in df["GTDB_Species"]]

# Sankey columns: NCBI species first, GTDB species second; genome_id holds the counts.
cat_cols=['NCBI_Species', 'GTDB_Species']
value_cols='genome_id'

In [None]:
# Display the Sankey input table.
df

In [None]:
# Map every Sankey label to its species colour, per taxonomy column.
# coolors_dict is keyed by the plain "<genus> <species>" name, while the GTDB
# labels in df carry one extra leading space, so exactly that prefix is removed
# before the lookup. (str.strip(' GTDB') / strip(' NCBI') would remove any of
# those characters from both ends and mangle names starting or ending with them.)
color_dict = dict()
column_name = ["NCBI_Species", "GTDB_Species"]
label_prefix = {"NCBI_Species": "", "GTDB_Species": " "}
for col in column_name:
    color_dict[col] = {}
    prefix = label_prefix[col]
    for species in df[col]:
        s = species[len(prefix):] if species.startswith(prefix) else species
        color_dict[col][species] = coolors_dict[s]
color_dict

In [None]:
# Build the Sankey node labels and their colours as two parallel lists.
# Labels are taken column by column in first-seen order, so the node order is
# identical on every run, and every label gets exactly one colour: a label
# without an entry in color_dict falls back to grey instead of silently
# leaving colorList shorter than labelList.
labelList = []
colorList = []
seen_labels = set()
for col in cat_cols:
    column_colors = color_dict.get(col, {})
    # dict.fromkeys de-duplicates while keeping first-seen order
    for label in dict.fromkeys(df[col].values):
        if label in seen_labels:
            continue
        seen_labels.add(label)
        labelList.append(label)
        colorList.append(column_colors.get(label, "#808080"))
#labelList

In [None]:
# transform df into a source-target pair
# Each pair of neighbouring category columns contributes its
# (source, target, count) rows; counts of identical pairs are summed after
# every step.
sourceTargetDf = None
for source_col, target_col in zip(cat_cols[:-1], cat_cols[1:]):
    pairDf = df[[source_col, target_col, value_cols]]
    pairDf.columns = ['source','target','count']
    if sourceTargetDf is None:
        stackedDf = pairDf
    else:
        stackedDf = pd.concat([sourceTargetDf, pairDf])
    sourceTargetDf = stackedDf.groupby(['source','target']).agg({'count':'sum'}).reset_index()

# add index for source-target pair
# Node ids are the positions of the labels inside labelList.
label_position = labelList.index
sourceTargetDf['sourceID'] = sourceTargetDf['source'].apply(label_position)
sourceTargetDf['targetID'] = sourceTargetDf['target'].apply(label_position)
sourceTargetDf = sourceTargetDf.sort_values(by=["count"]).reset_index()

In [None]:
# creating the sankey diagram
# Node section: labels and colours come from the parallel lists labelList / colorList.
sankey_nodes = {
    "pad": 15,
    "thickness": 100,
    "line": {"color": "black", "width": 0},
    "label": labelList,
    "color": colorList,
}

# Link section: source/target are node ids, value is the genome count of the
# pair. No per-link colour is set.
sankey_links = {
    "source": sourceTargetDf['sourceID'],
    "target": sourceTargetDf['targetID'],
    "value": sourceTargetDf['count'],
}

data = {"type": 'sankey', "node": sankey_nodes, "link": sankey_links}

fig_data = {"data": [data]}

In [None]:
# Build the plotly figure from the Sankey trace and fix its font size and
# pixel dimensions; fig_S6c is whatever update_layout returns.
sankey_layout = {"font_size": 11, "width": 800, "height": 1200}

fig = go.Figure(data=fig_data)
fig_S6c = fig.update_layout(**sankey_layout)

In [None]:
# Display the NCBI vs GTDB species Sankey diagram.
fig_S6c

# Figure S6.D

In [None]:
# Set up paths and input file
report_dir = processed_data_dir / project_name

# Seqfu result
seqfu_table = report_dir / "tables/df_seqfu_stats.csv"

# NCBI report for later use (automatically build when using samples from NCBI)
ncbi_meta_table = processed_data_dir / project_name / "tables/df_ncbi_meta.csv"
df_ncbi = pd.read_csv(ncbi_meta_table)
df_ncbi = df_ncbi.set_index("genome_id")

# Seqfu statistics: the File column is the genome id and becomes the index.
df_seqfu = pd.read_csv(seqfu_table)
df_seqfu = df_seqfu.rename(columns={'File' : 'genome_id'})
df_seqfu = df_seqfu.set_index('genome_id')

# Integrate both result into single table
# The taxonomy comes from df_gtdb as left by the earlier cells (no GTDB table
# is read here); it was indexed by genome_id in the taxonomy cell above, which
# the column-wise concat relies on. The merged table is indexed by genome_id
# and keeps genome_id as a column as well.
merged = pd.concat([df_seqfu, df_gtdb], axis=1)
df = merged.reset_index().set_index("genome_id", drop=False)

In [None]:
# adds NCBI assembly level column in main table
# Looked up genome by genome in df_ncbi; a genome missing there raises KeyError.
df["assembly_level"] = [df_ncbi.loc[genome_id, "assembly_level"] for genome_id in df.index]

In [None]:
# Two subsets of the main table used for the quality tiers:
#   df_filtered   - genomes with fewer than count_cutoff contigs (Count)
#   df_filtered_2 - genomes with an N50 above 5,000,000
count_cutoff = 50
has_few_contigs = df["Count"] < count_cutoff
has_large_n50 = df["N50"] > 5000000

df_filtered = df[has_few_contigs]
df_filtered_2 = df[has_large_n50]

In [None]:
# Label every genome with a sequence_quality tier. The N50 subset takes
# precedence: HQ if the genome is in df_filtered_2, otherwise MQ if it is in
# df_filtered, otherwise LQ.
in_n50_subset = df.index.isin(df_filtered_2.index)
in_contig_subset = df.index.isin(df_filtered.index)

df["sequence_quality"] = "LQ"
df.loc[in_contig_subset, "sequence_quality"] = "MQ"
# Assigned last so that HQ overrides MQ where a genome is in both subsets.
df.loc[in_n50_subset, "sequence_quality"] = "HQ"
#df

In [None]:
# Panel D: N50 vs number of contigs on log-log axes, with one rule per cutoff.
source = df.copy()

n50_cutoff = 5000000
count_cutoff = 50

# The cutoffs are stored as constant columns so the rule layers can read them
# from the same data as the points.
source['count_cutoff'] = count_cutoff
source['N50_cutoff'] = n50_cutoff

# Layer 1: one point per genome, coloured by sequence_quality (domain/range_
# are the colour mapping defined for panel A).
point_style = dict(filled=True, stroke='black', strokeWidth=0.5, opacity=0.8, size=100)
chart_one = alt.Chart().mark_point(**point_style).encode(
    x=alt.X('Count:Q',
            scale=alt.Scale(type="log", domain=(0.99, 1000)),
            axis=alt.Axis(format='.1s')),
    y=alt.Y('N50:Q',
            scale=alt.Scale(type="log", domain=(10000, 10000001)),
            axis=alt.Axis(format='.1s')),
    color=alt.Color("sequence_quality:N", legend=None, scale=alt.Scale(domain=domain, range=range_)),
    tooltip=['genome_id', 'Organism', 'Count:Q', 'Total:Q', 'gc:Q', 'N50:Q', 'N75:Q', 'N90:Q', 'AuN:Q', 'Min:Q', 'Max:Q'],
)

# Layer 2: vertical rule at the contig-count cutoff (its title labels the x axis).
chart_two = alt.Chart().mark_rule().encode(
    x=alt.X('count_cutoff', title=f'Contigs (cutoff = {count_cutoff})'),
    strokeWidth=alt.value(0.01),
)

# Layer 3: horizontal rule at the N50 cutoff (its title labels the y axis).
chart_three = alt.Chart().mark_rule().encode(
    y=alt.Y('N50_cutoff', title=f"N50 (cutoff = {n50_cutoff})"),
    strokeWidth=alt.value(0.01),
)

# The layers get their data through the facet; the facet header is hidden.
# NOTE(review): assumes `source` has a 'Quality filtering' column - it is not
# created in this notebook, so presumably it comes from df_gtdb; TODO confirm.
layered = chart_one + chart_two + chart_three
faceted = layered.facet(column='Quality filtering', data=source).interactive()

merged_chart = (
    faceted
    .configure_header(title=None, labels=False)
    .configure_axis(labelFontSize=10, titleFontSize=12)
    .configure_legend(labelFontSize=10, titleFontSize=12)
    .configure_view(continuousHeight=250, continuousWidth=250)
)

fig_S6d = merged_chart

In [None]:
#! pip install altair_saver
#! pip install selenium==4.2.0
#from altair_saver import save

In [None]:
# Export the N50 vs contigs chart as <FIGURE>d.svg, creating the figure
# folder first if it does not exist yet.
outfile = Path(f"assets/figures/{FIGURE}/{FIGURE}d.svg")
figure_dir = outfile.parent
figure_dir.mkdir(parents=True, exist_ok=True)
fig_S6d.save(outfile)

In [None]:
# Export the cumulative bar chart as <FIGURE>a.svg, creating the figure
# folder first if it does not exist yet.
# TODO: have the stacks in the order of HQ, MQ, and LQ
outfile = Path(f"assets/figures/{FIGURE}/{FIGURE}a.svg")
figure_dir = outfile.parent
figure_dir.mkdir(parents=True, exist_ok=True)
fig_S6a.save(outfile)

In [None]:
# Export the completeness vs contamination scatter plot as <FIGURE>b.svg,
# creating the figure folder first if it does not exist yet.
# NOTE(review): the original cell carried a "have stacks in the order of HQ,
# MQ, and LQ" remark here, which looks copied from the bar-chart cell.
outfile = Path(f"assets/figures/{FIGURE}/{FIGURE}b.svg")
figure_dir = outfile.parent
figure_dir.mkdir(parents=True, exist_ok=True)
fig_S6b.save(outfile)

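Why is there an HQ sequence with more than 5% contamination? Note that the colours in this plot encode `sequence_quality`, not the CheckM-based `MIMAG_quality` (which requires contamination < 5% for HQ). `sequence_quality` is presumably derived from N50 and contig count only, as in the Figure S6.D cells, so a genome can be HQ by that measure and still exceed 5% contamination.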

In [None]:
# Display the Sankey diagram again before exporting it.
fig_S6c

In [None]:
# Export the Sankey diagram as <FIGURE>c.svg through plotly's write_image,
# creating the figure folder first if it does not exist yet.
outfile = Path(f"assets/figures/{FIGURE}/{FIGURE}c.svg")
figure_dir = outfile.parent
figure_dir.mkdir(parents=True, exist_ok=True)
fig_S6c.write_image(outfile)

In [None]:
# Shape (rows, columns) of the chart's data restricted to genomes whose
# sequence_quality is not "LQ".
fig_S6d.data[fig_S6d.data.sequence_quality != "LQ"].shape

**Figure S6. Overview of timeline, quality and taxonomic placement of 42 Saccharopolyspora genomes**

- a) Sankey diagram representing the species assignment differences between NCBI and GTDB.

- b) Cumulative bar chart of the number of genomes over the last 15 years, coloured by assembly quality.

- c) Distribution of contamination vs completeness metrics calculated using CheckM, where colors represent the assembly qualities. 

- d) Scatterplot representing the distribution of N50 values vs number of contigs. A cutoff of **50** contigs is used to filter out the low-quality genomes, whereas an N50 cutoff of 5 Mbp is used to define high-quality genomes. The remaining genomes are defined as medium-quality.


In [None]:
# Show the (rows, columns) shape of the NCBI metadata table.
df_ncbi.shape

In [None]:
#! pip install svgutils
import svgutils.compose as sc

In [None]:
# Import only the names used below instead of `from svgutils.compose import *`,
# so it stays clear where Figure / Panel / SVG / Text come from.
from svgutils.compose import Figure, Panel, SVG, Text

# Compose the four exported panel SVGs into one 800 x 720 figure.
# The panel letters drawn here differ from the file suffixes:
#   <FIGURE>c.svg (Sankey)         -> "(a)", with NCBI / GTDB headings
#   <FIGURE>a.svg (cumulative bar) -> "(b)"
#   <FIGURE>b.svg (CheckM scatter) -> "(c)"
#   <FIGURE>d.svg (N50 vs contigs) -> "(d)"
# Scale factors and offsets are hand-tuned layout values.
final_figure = Figure("800", "720",
                      Panel(
                          SVG(f"assets/figures/{FIGURE}/{FIGURE}c.svg").scale(0.62),
                          Text("(a)", 0, 25, size=12, weight='bold'),
                          Text("NCBI", 50, 35, size=12, weight='bold'),
                          Text("GTDB", 375, 35, size=12, weight='bold'),
                      ).move(0, 0),
                      Panel(
                          SVG(f"assets/figures/{FIGURE}/{FIGURE}a.svg").scale(0.7),
                          Text("(b)", -10, 0, size=12, weight='bold')
                      ).move(480, 25),
                      Panel(
                          SVG(f"assets/figures/{FIGURE}/{FIGURE}b.svg").scale(0.8),
                          Text("(c)", -10, 0, size=12, weight='bold')
                      ).move(480, 250),
                      Panel(
                          SVG(f"assets/figures/{FIGURE}/{FIGURE}d.svg").scale(0.8),
                          Text("(d)", 0, 0, size=12, weight='bold'),
                      ).move(470, 485),
                     )
final_figure

In [None]:
# Write the composed multi-panel figure to assets/figures/<FIGURE>/<FIGURE>.svg.
final_figure.save(f"assets/figures/{FIGURE}/{FIGURE}.svg")

In [None]:
import svgutils.transform as sg

In [None]:
from cairosvg import svg2png
# Convert the composed SVG figure into a PNG file written next to it.
svg2png(url=f"assets/figures/{FIGURE}/{FIGURE}.svg", write_to=f"assets/figures/{FIGURE}/{FIGURE}.png")