# Sunbeam Clean
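This notebook cleans the Kraken taxonomic reports produced by the Sunbeam metagenomics pipeline into standardized relative abundance (RA) tables. Each output table has the following columns, where the first column is named after the requested rank (for example `Species` or `Genus`):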

| Species | RA | TAX_ID |
|---------|----|--------|

Sunbeam was chosen since it is an assembly-based DNA-to-DNA pipeline.

In [1]:
import os
import sys  # noqa
sys.path.append("../../")  # noqa
from utils.data_paths import make_data_list, MockCommData

import pandas as pd
import seaborn as sns
from typing import List

# Build the list of dataset descriptors with the utils helper. clean_sunbeam()
# below reads the `.sunbeam` (pipeline output directory) and `.path` (dataset
# directory) attributes of every entry.
paths = make_data_list()


In [2]:
# Maps a taxonomic rank name to the single-letter rank code that is matched
# against the rank-code column of the Kraken report in clean_kraken().
tax_dict = {
    "Genus": "G",
    "Species": "S",
    "Family": "F",
    "Order": "O",
    "Class": "C",
    "Phylum": "P",
    "Kingdom": "K",
}


def clean_sunbeam(paths: List[MockCommData], rank: str):
    """ Cleans the sunbeam output files and saves them to the sunbeam directory.

    For every entry whose ``sunbeam`` path is set and contains "MBARC", each
    file in that directory whose name ends with ``taxa.tsv`` is handed to
    ``clean_kraken``. The cleaned tables are written to ``<p.path>/sunbeam``,
    which is created if it does not exist.

    Parameters:
        paths: list[MockCommData]
            A list of MockCommData objects. Typically from the utils module.
            Only the ``sunbeam`` and ``path`` attributes are read here.
        rank: str
            The taxonomic rank to clean the data to. Typically: Genus, Species.
            See tax_dict for more options. The value is capitalized before use,
            so "genus" and "Genus" are equivalent.
    """
    rank = rank.capitalize()

    for p in paths:
        sunbeam_path = p.sunbeam
        # Skip entries without sunbeam output (empty or missing path) and any
        # dataset other than MBARC.
        if not sunbeam_path or "MBARC" not in sunbeam_path:
            continue

        # Full paths of the Kraken reports; sorted so the processing order does
        # not depend on the arbitrary order returned by os.listdir().
        files = [os.path.join(sunbeam_path, f)
                 for f in sorted(os.listdir(sunbeam_path))
                 if f.endswith("taxa.tsv")]

        output_dir = os.path.join(p.path, "sunbeam")
        os.makedirs(output_dir, exist_ok=True)

        for f in files:
            print(f)
            print(p.path)
            # `f` is already joined with sunbeam_path. Joining it a second time
            # only worked for absolute paths and duplicated the directory
            # prefix for relative ones.
            clean_kraken(f, rank, output_dir)


def clean_kraken(file: str, rank: str, output_dir: str) -> None:
    """ Cleans the kraken outputs files from sunbeam.

    Follows the same steps as WGSA2. Reads a tab-separated, header-less Kraken
    report, keeps the rows at the requested rank plus the unclassified row,
    converts their counts to relative abundance and writes the result as
    ``<PREFIX>_<rank>_relabund_annotated.csv`` in ``output_dir``, where PREFIX
    is the upper-cased part of the file name before the first "-".

    Note that the total RA is not 100% because reads assigned only to higher
    taxonomic ranks are not included in the output rows.

    Parameters:
        file: str
            Path to the Kraken report. Columns 1, 3, 4 and 5 are used as
            count, rank code, tax id and taxon name respectively.
        rank: str
            A key of tax_dict (e.g. "Genus"). Also used as the name of the
            first output column.
        output_dir: str
            Existing directory the CSV is written to.

    Raises:
        KeyError: if ``rank`` is not a key of tax_dict.
    """
    # The unclassified tax_id is actually 12908, not 0.
    unclassified_tax_id = 12908

    report = pd.read_csv(file, sep="\t", header=None, usecols=[1, 3, 4, 5])

    # The sum of the unclassified and root counts is the total. The rows are
    # selected by rank code "U" / root tax id 1 rather than by position: a
    # report without an unclassified line starts with root followed by its
    # first child, and summing "the first two rows" would double-count.
    is_unclassified = report[3] == "U"
    is_root = report[4] == 1
    total_counts = report.loc[is_unclassified | is_root, 1].sum()

    # Keep the requested rank and, via the OR, the unclassified row.
    selected = report.loc[(report[3] == tax_dict[rank]) | is_unclassified]
    selected = selected.sort_values(by=1, ascending=False)

    clean_genus = selected[[5, 1, 4]].copy(deep=True)
    clean_genus.columns = [rank, "RA", "TAX_ID"]

    # Convert the RA column from counts to RA.
    clean_genus["RA"] = clean_genus["RA"] / total_counts

    # Remap the tax id only on rows that really are unclassified. Assigning by
    # the "unclassified" label would silently append a row with a NaN RA when
    # the report has no unclassified line.
    clean_genus["TAX_ID"] = clean_genus["TAX_ID"].where(
        selected[3] != "U", unclassified_tax_id)

    clean_genus = clean_genus.set_index(rank)

    # Strip the leading indentation of the names and remove any [ and ]
    # characters from the indices.
    clean_genus.index = [name.lstrip().replace("[", "").replace("]", "")
                         for name in clean_genus.index]

    clean_genus = clean_genus.astype({"TAX_ID": int})

    prefix = os.path.basename(file).split("-")[0]
    output_file_name = f"{prefix.upper()}_{rank.lower()}_relabund_annotated.csv"
    output_file_path = os.path.join(output_dir, output_file_name)
    print("Saved to: ", output_file_path)

    clean_genus.to_csv(output_file_path, sep=",",
                       header=True, index_label=rank)


# Main
Run the cell below to write the genus- and species-level relative abundance tables for every matching dataset.

In [3]:
# Write one cleaned table per rank, genus first and then species.
for target_rank in ("Genus", "Species"):
    clean_sunbeam(paths, target_rank)


/Volumes/TBHD_share/valencia/pipelines/MBARC/pipelines/sunbeam/sunbeam_output/classify/kraken/MBARC-taxa.tsv
/Users/valenciaem/coding/pipelines/pipelines/mbarc
Saved to:  /Users/valenciaem/coding/pipelines/pipelines/mbarc/sunbeam/MBARC_genus_relabund_annotated.csv
/Volumes/TBHD_share/valencia/pipelines/MBARC/pipelines/sunbeam/sunbeam_output/classify/kraken/MBARC-taxa.tsv
/Users/valenciaem/coding/pipelines/pipelines/mbarc
Saved to:  /Users/valenciaem/coding/pipelines/pipelines/mbarc/sunbeam/MBARC_species_relabund_annotated.csv
