# Annotated Species Data to Annotated Genus Data

This notebook creates annotated genus-level relative abundance files from the annotated species-level data in the "../../expected_pipelines" folder. For each species, it looks up the lineage, checks whether the lineage contains a genus, and, if it does, uses that genus's TAXID. If there is no genus-level parent, it uses the direct parent of the species.

In [19]:
import os
from os.path import basename, dirname
from typing import Iterator

import pandas as pd

from lineage import make_annotated_lineage, make_annotation_dataframes, cleanup_lineage

In [20]:
# Sample input for manually trying species_to_genus_annotated (see the
# commented-out call below the function definition).
# NOTE(review): this path starts with "../../pipelines/", while the batch run
# in this notebook walks "../../expected_pipelines" -- confirm the location.
test_file = "../../pipelines/bmock12/S1_expected_species_annotated.csv"
def species_to_genus_annotated(fp: str, lineage_df: pd.DataFrame, nodes_dict: dict, delim: str = "_"):
    """Collapse an annotated species-level abundance CSV into genus-level rows.

    The file at ``fp`` is read as three columns (species label, relative
    abundance, taxid) with its first line skipped. Every species taxid is
    passed through ``make_annotated_lineage`` and ``cleanup_lineage(..., "genus")``
    from the external ``lineage`` module; per the notebook introduction this
    yields the genus TAXID, or the direct parent when the lineage has no genus.
    Rows that resolve to the same value are merged and their abundances summed.

    Args:
        fp: Path of the annotated species CSV.
        lineage_df: Lineage table forwarded unchanged to ``make_annotated_lineage``.
        nodes_dict: Nodes lookup forwarded unchanged to ``make_annotated_lineage``.
        delim: Separator inside the species label; the text before its first
            occurrence is used as the genus name.

    Returns:
        A DataFrame with the columns ``genus``, ``RA`` and ``tax_id``, ordered
        by ``RA`` from largest to smallest.
    """
    species_table = pd.read_csv(
        fp,
        index_col=0,
        skiprows=1,
        header=None,
        names=["species", "RA", "tax_id"],
    )

    # Normalise the taxids to integer-formatted strings before the lookup.
    species_table["tax_id"] = species_table["tax_id"].astype(int).astype(str)

    def _resolve_genus(tax_id):
        # Build the annotated lineage, then reduce it to the "genus" entry.
        lineage = make_annotated_lineage(str(tax_id), lineage_df, nodes_dict)
        return cleanup_lineage(lineage, "genus")

    species_table["genus_lineage"] = species_table["tax_id"].apply(_resolve_genus)

    # The genus name is whatever precedes the first delimiter in the label.
    species_table.index = [label.split(delim)[0] for label in species_table.index.tolist()]

    # Keep only what is needed and expose the genus name as a regular column.
    genus_rows = (
        species_table[["RA", "genus_lineage"]]
        .copy()
        .rename_axis("genus")
        .reset_index()
    )

    # Several species can share one genus: sum their abundance and keep the
    # first genus name seen for each resolved taxid.
    aggregations = {"genus": "first", "RA": "sum", "genus_lineage": "first"}
    collapsed = genus_rows.groupby("genus_lineage", as_index=False).agg(aggregations)

    ranked = collapsed.sort_values(by="RA", ascending=False)
    return ranked.rename(columns={"genus_lineage": "tax_id"})

# Manual check (needs `lineage_df, nodes_dict = make_annotation_dataframes()` first):
# genus_df = species_to_genus_annotated(test_file, lineage_df, nodes_dict, delim=" ")

In [21]:
# We start at the annotated species level and work our way up to the genus level.
# To do this, we use the lineage file to get the parent tax_ids of each tax_id,
# then search for the rank of interest in the nodes file.

# First, walk a pipelines directory and collect the files whose name contains
# "expected_species_annotated" or "species_relabund_annotated".
# Pipeline names of interest; in the visible part of this notebook the list is
# only referenced by a commented-out filter inside annotate_species_files.
wanted_pipelines = ["biobakery4", "bio4", "bio3", "biobakery3", "jams"]

def get_annotated_species_files(expected: bool, root_dir: str) -> Iterator[str]:
    """Yield the paths of annotated species files found below ``root_dir``.

    This is a generator: nothing is walked until it is iterated.

    Args:
        expected: When True, match file names containing
            ``"expected_species_annotated"``; otherwise match names containing
            ``"species_relabund_annotated"``.
        root_dir: Directory to search recursively. It is converted to an
            absolute path, so every yielded path is absolute.

    Yields:
        The absolute path of each matching file. Directories and files are
        visited in sorted order so repeated runs process them in the same
        sequence regardless of how the filesystem enumerates entries.
    """
    marker = "expected_species_annotated" if expected else "species_relabund_annotated"
    for root, dirs, files in os.walk(os.path.abspath(root_dir)):
        # Sorting `dirs` in place controls the order in which os.walk descends.
        dirs.sort()
        for file in sorted(files):
            if marker in file:
                yield os.path.join(root, file)

def annotate_species_files(expected: bool, root_dir: str):
    """Write a genus-level annotated CSV next to every annotated species file.

    The annotation tables are loaded once with ``make_annotation_dataframes``.
    Each file returned by ``get_annotated_species_files`` is then converted
    with ``species_to_genus_annotated`` and saved in the same directory as its
    input. Progress is reported by printing the directory names, the sample
    prefix and the input path.

    Args:
        expected: Selects which files are searched for and how the output is
            named: ``<sample>expected_genus_annotated.csv`` when True,
            ``<sample>genus_relabund_annotated.csv`` when False.
        root_dir: Directory that is searched recursively for input files.
    """
    lineage_df, nodes_dict = make_annotation_dataframes()
    for file in get_annotated_species_files(expected=expected, root_dir=root_dir):
        parent_dir = dirname(file)
        # basename() rather than split("/") so the directory names are also
        # extracted correctly where the path separator is not "/".
        pipeline_name = basename(parent_dir)
        src_name = basename(dirname(parent_dir))

        # The sample prefix is the text before the first "_" of the file name.
        # A file without a sample prefix starts directly with "expected_" or
        # "species_"; neither of those is a sample name, so use no prefix.
        sample_name = basename(file).split("_")[0] + "_"
        if sample_name in ("expected_", "species_"):
            sample_name = ""

        # Files whose grandparent directory is "gut" or "tongue" are skipped.
        if src_name in ("gut", "tongue"):
            continue

        print(pipeline_name, src_name, sample_name)
        print(file)
        suffix = "expected_genus_annotated.csv" if expected else "genus_relabund_annotated.csv"
        output_file = os.path.join(parent_dir, f"{sample_name}{suffix}")

        output = species_to_genus_annotated(file, lineage_df, nodes_dict, "_")
        output.to_csv(output_file, index=False)
   
# Build the genus-level CSV for every expected species file found under
# ../../expected_pipelines; each output is written next to its input file.
annotate_species_files(expected=True, root_dir="../../expected_pipelines")

mixed amos 
/Users/valenciaem/coding/pipelines/expected_pipelines/amos/mixed/expected_species_annotated.csv
hilo amos 
/Users/valenciaem/coding/pipelines/expected_pipelines/amos/hilo/expected_species_annotated.csv
bmock12 expected_pipelines S1_
/Users/valenciaem/coding/pipelines/expected_pipelines/bmock12/S1_expected_species_annotated.csv
Desired rank not found in lineage, using last value instead.. {'131567': 'no rank', '2': 'superkingdom', '1783272': 'clade', '201174': 'phylum', '1760': 'class', '85009': 'order', '31957': 'family', '185283': 'no rank'}
tourlousse expected_pipelines 
/Users/valenciaem/coding/pipelines/expected_pipelines/tourlousse/expected_species_annotated.csv
nist expected_pipelines EG_
/Users/valenciaem/coding/pipelines/expected_pipelines/nist/EG_expected_species_annotated.csv
nist expected_pipelines MIX-A_
/Users/valenciaem/coding/pipelines/expected_pipelines/nist/MIX-A_expected_species_annotated.csv
nist expected_pipelines MIX-B_
/Users/valenciaem/coding/pipeline