In [37]:
import pandas as pd
import os
from os.path import basename, dirname
from lineage import make_annotated_lineage, make_annotation_dataframes, cleanup_lineage

In [45]:
test_file = "../../pipelines/bmock12/S1_expected_species_annotated.csv"
def species_to_genus_annotated(fp: str, lineage_df: pd.DataFrame, nodes_dict: dict, output_path: str, delim: str = "_"):
    df = pd.read_csv(fp, index_col=0)

    # On the tax_id column, apply the make_annotated_lineage function, then cleanup the lineage.
    df["genus_lineage"] = df["tax_id"].apply(lambda x: cleanup_lineage(make_annotated_lineage(str(x), lineage_df, nodes_dict), "genus"))

    # Now, we will split the index column on "_" and take the first element.
    index = df.index.tolist()
    index = [i.split(delim)[0] for i in index]
    df.index = index

    # Select only the columns we need.
    genus_df = df[["RA", "genus_lineage"]].copy()

    genus_df.index.name = "genus"
    genus_df.reset_index(inplace=True)

    # Group by genus and sum the RA column just in case there multiple species in the same genus.
    genus_df = genus_df.groupby("genus_lineage", as_index=False).agg({"genus": "first", "RA": "sum", "genus_lineage": "first"})

    genus_df.sort_values(by="RA", ascending=False, inplace=True)

    return genus_df

# genus_df = species_to_genus_annotated(test_file, " ", lineage_df, nodes_dict)

Desired rank not found in lineage, using last value instead.. {'131567': 'no rank', '2': 'superkingdom', '1783272': 'clade', '201174': 'phylum', '1760': 'class', '85009': 'order', '31957': 'family', '185283': 'no rank'}


Unnamed: 0,RA,genus_lineage
Halomonas,0.19493,2745
Muricauda,0.180311,111500
Halomonas,0.174278,2745
Marinobacter,0.157486,2742
Psychrobacter,0.111128,497
Cohaesibacter,0.070702,655352
Marinobacter,0.048779,2742
Thioclava,0.031534,285107
Propionibacteriaceae,0.011641,185283
Micromonospora,0.010391,1873


Unnamed: 0,genus,RA,genus_lineage
0,Halomonas,0.19493,2745
1,Muricauda,0.180311,111500
2,Halomonas,0.174278,2745
3,Marinobacter,0.157486,2742
4,Psychrobacter,0.111128,497
5,Cohaesibacter,0.070702,655352
6,Marinobacter,0.048779,2742
7,Thioclava,0.031534,285107
8,Propionibacteriaceae,0.011641,185283
9,Micromonospora,0.010391,1873


In [None]:
# We need to start at the annotated species level and work our way up to the genus level.
# To do this, we will use the new lineage file to get the parent tax_ids of each tax_id.
# Then, we will search for the rank of interest in the nodes file.

# First, go into the pipelines directory and search for files with "species_relabund_annotated" in the name.
wanted_pipelines = ["biobakery4", "bio4", "bio3", "biobakery3", "jams"]

def get_annotated_species_files() -> pd.DataFrame:
    root_dir = os.path.abspath("../../pipelines/")
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if "species_relabund_annotated" in file:
                yield os.path.join(root, file)

def annotate_species_files():
    lineage_df, nodes_dict = make_annotation_dataframes()
    for file in get_annotated_species_files():
        src_name = dirname(dirname(file)).split("/")[-1]
        pipeline_name = dirname(file).split("/")[-1]
        sample_name = basename(file).split("_")[0]
        if pipeline_name in wanted_pipelines:
            print(pipeline_name, src_name, sample_name)
            output_file = os.path.join(dirname(file), f"{sample_name}_genus_relabund_annotated.csv")
            species_to_genus_annotated(file, output_file, lineage_df, nodes_dict)
    
annotate_species_files()

bio3 mixed SRR11487938
bio3 mixed SRR11487940
bio3 mixed SRR11487939
bio3 mixed SRR11487941
bio3 mixed SRR11487937
jams mixed SRR11487938
jams mixed SRR11487940
jams mixed SRR11487939
jams mixed SRR11487941
jams mixed SRR11487937
biobakery4 mixed SRR11487938
biobakery4 mixed SRR11487940
biobakery4 mixed SRR11487939
biobakery4 mixed SRR11487941
biobakery4 mixed SRR11487937
bio3 hilo SRR11487934
bio3 hilo SRR11487932
bio3 hilo SRR11487933
bio3 hilo SRR11487935
bio3 hilo SRR11487931
jams hilo SRR11487934
jams hilo SRR11487932
jams hilo SRR11487933
jams hilo SRR11487935
jams hilo SRR11487931
biobakery4 hilo SRR11487934
biobakery4 hilo SRR11487932
biobakery4 hilo SRR11487933
biobakery4 hilo SRR11487935
biobakery4 hilo SRR11487931
jams bmock12 S1
biobakery3 bmock12 S1
biobakery4 bmock12 S1
bio3 tourlousse SRR17380246
bio3 tourlousse SRR17380241
bio3 tourlousse SRR17380244
bio3 tourlousse SRR17380242
bio3 tourlousse SRR17380243
bio3 tourlousse SRR17380245
jams tourlousse SRR17380246
jams tour