# SeqFu2
Summary of [Seqfu](https://github.com/telatin/seqfu2) results from project: `[{{ project().name }}]` 

## Description
[SeqFu2](https://github.com/telatin/seqfu2) provides an overview of sequence statistics for the genomes in the dataset.

## Genome Statistics Overview

In [None]:
import pandas as pd
from pathlib import Path
import altair as alt
import warnings
import yaml
warnings.filterwarnings('ignore')

## File Configurations

In [None]:
# Read the notebook settings from the YAML file next to this notebook.
# safe_load is used, so only plain YAML types are constructed.
with open("config.yaml") as config_handle:
    notebook_configuration = yaml.safe_load(config_handle)

# Bare expression: shown as the cell output in the notebook
notebook_configuration

In [None]:
# Identifiers for this analysis: the BGCflow project to read from and the
# figure label used as a prefix for the tables written by this notebook.
project_name = "qc_saccharopolyspora"
FIGURE = "Figure_S6"

# Processed results are read from <bgcflow_dir>/data/processed/<project_name>
bgcflow_dir = Path(notebook_configuration["bgcflow_dir"])
report_dir = bgcflow_dir / "data" / "processed" / project_name

In [None]:
# Location of the SeqFu statistics table inside the project's report folder
seqfu_table = report_dir / "tables" / "df_seqfu_stats.csv"


In [None]:
# Inputs for the taxonomic placement: the GTDB metadata table and the GTDB-Tk
# summary (the notebook merges the GTDB API and GTDB-Tk results)
gtdbtk_table = report_dir / "tables" / "gtdbtk.bac120.summary.tsv"
gtdb_table = report_dir / "tables" / "df_gtdb_meta.csv"

# Load the GTDB metadata and key it by genome id
df_gtdb = pd.read_csv(gtdb_table)
df_gtdb = df_gtdb.set_index("genome_id")

In [None]:
# Overlay the GTDB-Tk classification onto the GTDB metadata table (df_gtdb)
# and export the result.
df_gtdbtk = (
    pd.read_csv(gtdbtk_table, sep="\t")
    .rename(columns={"user_genome": "genome_id"})
    .set_index("genome_id")
)

# Rank prefix used in the "classification" string -> column name in df_gtdb
tax_mapping = {
    "d": "Domain",
    "p": "Phylum",
    "c": "Class",
    "o": "Order",
    "f": "Family",
    "g": "Genus",
    "s": "Organism",
}

for index, classification in df_gtdbtk["classification"].items():
    # Reset per genome so a genus parsed for the previous genome can never
    # leak into this one when its classification has no "g__" entry.
    genus = None
    for level in classification.split(";"):
        key = level.split("__")[0]
        if key not in tax_mapping:
            # Entry without a known rank prefix: leave the existing metadata
            # untouched instead of failing with a KeyError.
            continue
        if key == "g":
            genus = level.split("__")[-1]
        if level == "s__" and genus is not None:
            # No species assigned: fall back to "<genus> sp."
            level = f"s__{genus} sp."
        df_gtdb.loc[index, tax_mapping[key]] = level

# Species epithet = last whitespace-separated token of the organism name.
# Item assignment is required here: attribute assignment (df_gtdb.Species = ...)
# does not create the column when it is not already present.
df_gtdb["Species"] = [organism.split()[-1] for organism in df_gtdb["Organism"]]

# Make sure the output folder exists before writing; this is the first write
# into assets/tables in the notebook.
Path("assets/tables").mkdir(parents=True, exist_ok=True)
df_gtdb.to_csv(f"assets/tables/{FIGURE}_df_gtdb.csv")

In [None]:
# NCBI metadata keyed by genome id, kept for later use (per the original note,
# this table is built automatically when the samples come from NCBI)
df_ncbi = pd.read_csv(report_dir / "tables" / "df_ncbi_meta.csv")
df_ncbi = df_ncbi.set_index("genome_id")

# SeqFu statistics: the "File" column is renamed to genome_id and used as index
df_seqfu = (
    pd.read_csv(seqfu_table)
    .rename(columns={"File": "genome_id"})
    .set_index("genome_id")
)

# Join the SeqFu and GTDB tables column-wise, keeping only the genome ids
# present in both, then expose genome_id as both the index and a column
df = (
    pd.concat([df_seqfu, df_gtdb], axis=1, join="inner")
    .reset_index()
    .set_index("genome_id", drop=False)
)

In [None]:
# Add the NCBI assembly level to the main table.
# The genome_id -> assembly_level lookup is built once instead of doing a
# guarded .loc read per row. A missing "assembly_level" column in the NCBI
# table is tolerated (empty lookup), keeping the previous best-effort behaviour.
if "assembly_level" in df_ncbi.columns:
    ncbi_assembly_levels = df_ncbi["assembly_level"].to_dict()
else:
    ncbi_assembly_levels = {}

# Always (re)build the column, even when no genome has NCBI metadata: the
# summary selection further down reads "assembly_level" and would otherwise
# raise. Genomes without an NCBI record get NaN.
df["assembly_level"] = [
    ncbi_assembly_levels.get(genome_id, float("nan")) for genome_id in df.index
]

In [None]:
# Subsets used for the quality labels: rows whose "Count" is below the cutoff,
# and rows whose "N50" exceeds 5,000,000
count_cutoff = 50
df_filterd = df[df["Count"] < count_cutoff]
df_filterd_2 = df[df["N50"] > 5_000_000]

# Label every genome; the N50 criterion takes precedence over the Count one:
#   HQ - in the high-N50 subset
#   MQ - otherwise, in the low-Count subset
#   LQ - everything else
hq_genomes = df_filterd_2.index
mq_genomes = df_filterd.index
for genome in df.index:
    quality = "HQ" if genome in hq_genomes else "MQ" if genome in mq_genomes else "LQ"
    df.loc[genome, "sequence_quality"] = quality

# Summary view ordered by "Count" (not the last statement of the cell, so the
# result is computed but not rendered)
df.loc[:, ["Total", "Count", "N50", "assembly_level", "sequence_quality"]].sort_values(by="Count")

# Ensure the output folder exists, then export the annotated table
Path("assets/tables").mkdir(parents=True, exist_ok=True)

df.to_csv(f"assets/tables/{FIGURE}_df_seqfu_annotated.csv")

[Download Table]({{ project().file_server() }}/tables/df_seqfu_stats.csv){:target="_blank" .md-button}

## References
<font size="2">
{% for i in project().rule_used['seqfu']['references'] %}
- *{{ i }}*
{% endfor %}
</font>