# SeqFu2
Summary of [SeqFu2](https://github.com/telatin/seqfu2) results from project: `[{{ project().name }}]`

## Description
[SeqFu2](https://github.com/telatin/seqfu2) provides an overview of sequence statistics for the genomes in the dataset.

## Genome Statistics Overview

In [None]:
import pandas as pd
from pathlib import Path
import altair as alt
import yaml
import warnings
# Silence every warning category for the remainder of the notebook so the
# rendered report stays clean. Note that this also hides deprecation and
# future-behaviour warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

## File Configuration

In [None]:
# Load the notebook settings from the YAML file that sits next to this
# notebook. The encoding is given explicitly so the file is read the same way
# regardless of the platform's default locale.
with open("config.yaml", "r", encoding="utf-8") as f:
    notebook_configuration = yaml.safe_load(f)
# Echo the parsed configuration as the cell output.
notebook_configuration

In [None]:
# Resolve the BGCflow checkout and the per-project report directory.
bgcflow_dir = Path(notebook_configuration["bgcflow_dir"])
project_name = "mq_saccharopolyspora"
FIGURE = "Figure_3"  # prefix used for every table/figure written by this notebook
report_dir = bgcflow_dir.joinpath(f"data/processed/{project_name}")

# Seqfu statistics table.
# NOTE: this deliberately reads the table of the `qc_saccharopolyspora` project
# rather than the one under `report_dir`; the per-project alternative would be
#   seqfu_table = report_dir / "tables/df_seqfu_stats.csv"
seqfu_table = bgcflow_dir.joinpath(
    "data/processed/qc_saccharopolyspora/tables/df_seqfu_stats.csv"
)


In [None]:
# Taxonomic placement table (merged GTDB API and GTDB-Tk results) for the
# current project.
# Alternative source kept for reference:
#   gtdb_table = "../../qc_saccharopolyspora/tables/df_gtdb_gtdbtk_meta.csv"
gtdb_table = report_dir.joinpath("tables/df_gtdb_meta.csv")

In [None]:
# NCBI metadata, indexed by genome; used further down to annotate the main
# table (the file is produced automatically when samples come from NCBI).
df_ncbi = (
    pd.read_csv(report_dir / "tables/df_ncbi_meta.csv")
    .set_index("genome_id")
)

# Seqfu statistics: the `File` column holds the genome identifier, so rename it
# and use it as the index.
df_seqfu = pd.read_csv(seqfu_table)
df_seqfu = (
    df_seqfu
    .rename(columns={'File' : 'genome_id'})
    .set_index('genome_id')
)

# GTDB taxonomy, indexed the same way.
df_gtdb = pd.read_csv(gtdb_table).set_index('genome_id')

# Column-wise inner join: only genomes present in both tables are kept.
# `genome_id` is then restored as a regular column while also staying the index.
df = (
    pd.concat([df_seqfu, df_gtdb], join="inner", axis=1)
    .reset_index()
    .set_index("genome_id", drop=False)
)

# The complete seqfu table of the `qc_saccharopolyspora` project
# ("../../qc_saccharopolyspora/tables/df_seqfu_stats.csv") can be loaded the
# same way if an unfiltered overview is needed.

In [None]:
# Add the NCBI assembly level to the main table.
# `reindex` aligns the NCBI values on the genome ids of `df`: genomes without an
# NCBI record get NaN, and the column is always created (the previous row-wise
# loop left the column missing entirely when no genome matched, which broke the
# column selection further down).
# Assumes `df_ncbi` has one row per genome_id; `reindex` raises on a duplicated
# source index.
df["assembly_level"] = df_ncbi["assembly_level"].reindex(df.index)

In [None]:
# Classify every genome into a sequence-quality tier from the seqfu statistics:
#   HQ: N50 above `n50_cutoff`
#   MQ: not HQ, but `Count` below `count_cutoff`
#   LQ: everything else
count_cutoff = 50
n50_cutoff = 5000000

below_count_cutoff = df["Count"] < count_cutoff
above_n50_cutoff = df["N50"] > n50_cutoff

# Filtered views kept under their original names for any later cell that
# still refers to them.
df_filterd = df[below_count_cutoff]
df_filterd_2 = df[above_n50_cutoff]

# Assign from the lowest to the highest tier so that HQ overrides MQ, matching
# the priority of the former if/elif chain.
df["sequence_quality"] = "LQ"
df.loc[below_count_cutoff, "sequence_quality"] = "MQ"
df.loc[above_n50_cutoff, "sequence_quality"] = "HQ"

# Persist the annotated table.
Path("assets/tables").mkdir(parents=True, exist_ok=True)
df.to_csv(f"assets/tables/{FIGURE}a_df_seqfu_annotated.csv")

# Preview of the key columns; kept as the last expression of the cell so the
# notebook actually renders it.
df.loc[:, ["Total", "Count", "N50", "assembly_level", "sequence_quality"]].sort_values(by="Count")

In [None]:
# Combine the annotated seqfu table with the `mash_hcluster` table.
# NOTE(review): the CSV is only read here, never written, so it is expected to
# have been produced elsewhere before this cell runs.
outfile = Path(f"assets/tables/{FIGURE}b_mash_hcluster.csv")
outfile.parent.mkdir(parents=True, exist_ok=True)
df_mash_cluster = pd.read_csv(outfile, index_col=0)

# Column-wise outer join on the index, so genomes missing from either table
# are kept and get NaN in the other table's columns.
source = pd.concat([df, df_mash_cluster], axis=1)

# Fill the gaps left by the join: 99 for a missing cluster id and "NA" for a
# missing phylogroup ("NA" is one of the categories of the colour scale used in
# the plot). Columns are assigned explicitly instead of calling
# `fillna(inplace=True)` on an attribute, which acts on a temporary object and
# does not update the frame under pandas Copy-on-Write.
source["hcluster"] = source["hcluster"].fillna(99)
source["N50_ratio"] = source["N50"] / source["Total"]
source["phylogroup"] = source["phylogroup"].fillna("NA")

In [None]:
# Scatter plot of GC content (x) against total genome length (y), one point per
# genome, shaped by sequence quality and coloured by phylogroup.
source["Genome length (Mbp)"] = source["Total"] / 1000000
source = source.rename(columns={"sequence_quality" : "Sequence Quality"})
x_col = 'gc'
#y_col = 'Genome length (Mbp)'
y_col = 'Total'


# set up manual color: phylogroups listed in `domain` map position-wise onto
# the colours in `range_`
domain = ["P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8", "NA"]
range_ = ['#264653','#e9c46a','#808080', '#808080','#f4a261','#808080', "#e76f51", "#2a9d8f", "#FFFFFF"]

# The mark properties are given once, up front (the chart previously called
# `mark_point()` twice; only the last call takes effect).
chart = alt.Chart(source).mark_point(
    filled=True,
    stroke='black',
    strokeWidth=0.5,
    opacity=0.8,
    size=100
).encode(
    # Axis range padded by 0.01 around the observed GC values; the 'p' format
    # renders the values as percentages (assumes `gc` is stored as a fraction).
    alt.X(x_col,
          scale=alt.Scale(domain=(round(source[x_col].min() - 0.01, 2),
                                  round(source[x_col].max() + 0.01, 2))),
          axis=alt.Axis(format='p',
                        title="GC Content")
         ),
    # NOTE(review): the 0.2 padding looks like a leftover from plotting
    # 'Genome length (Mbp)'; with `Total` in bp it is effectively no padding.
    alt.Y(y_col,
          scale=alt.Scale(domain=(round(source[y_col].min() - 0.2, 2),
                                  round(source[y_col].max() + 0.2, 2))),
          axis=alt.Axis(format='.2s',
                        title="Genome Length (bp)")
         ),
    shape=alt.Shape('Sequence Quality', legend=alt.Legend(orient='top')),
    #size="N50_ratio",
    color=alt.Color("phylogroup:N", scale=alt.Scale(domain=domain, range=range_), legend=None),
    tooltip=['genome_id', 'Organism', 'Count', 'Total', 'gc', 'N50', 'AuN', 'Min', 'Max', "phylogroup"],
).configure_header(
    title=None,
    labels=False
).configure_axis(
    labelFontSize=10,
    titleFontSize=12
).configure_legend(
    labelFontSize=10,
    titleFontSize=12,
).configure_view(
    continuousHeight=250,
    continuousWidth=250,
)

# Make sure the output directory exists before writing the figure files;
# nothing earlier in the notebook creates it.
Path(f"assets/figures/{FIGURE}").mkdir(parents=True, exist_ok=True)

# Static SVG plus an interactive (pan/zoom) HTML version.
chart.save(f"assets/figures/{FIGURE}/a.svg")
chart.interactive().save(f"assets/figures/{FIGURE}/a.html")
# Render the static chart as the cell output.
chart

[Download Table]({{ project().file_server() }}/tables/df_seqfu_stats.csv){:target="_blank" .md-button}

## References
<font size="2">
{% for i in project().rule_used['seqfu']['references'] %}
- *{{ i }}*
{% endfor %}
</font>