In [1]:
import pandas as pd
import os
from os.path import basename, dirname
from lineage import make_annotated_lineage, make_annotation_dataframes, cleanup_lineage

In [2]:
test_file = "../../pipelines/bmock12/S1_expected_species_annotated.csv"
def species_to_genus_annotated(fp: str, lineage_df: pd.DataFrame, nodes_dict: dict, delim: str = "_"):
    df = pd.read_csv(fp, index_col=0, skiprows=1, header=None, names=["species", "RA", "tax_id"])

    # Convert to int, then to string.
    df["tax_id"] = df["tax_id"].astype(int).astype(str)
    display(df)
    print(df.dtypes)

    # On the tax_id column, apply the make_annotated_lineage function, then cleanup the lineage.
    df["genus_lineage"] = df["tax_id"].apply(lambda x: cleanup_lineage(make_annotated_lineage(str(x), lineage_df, nodes_dict), "genus"))

    # Now, we will split the index column on "_" and take the first element.
    index = df.index.tolist()
    index = [i.split(delim)[0] for i in index]
    df.index = index

    # Select only the columns we need.
    genus_df = df[["RA", "genus_lineage"]].copy()

    genus_df.index.name = "genus"
    genus_df.reset_index(inplace=True)

    # Group by genus and sum the RA column just in case there multiple species in the same genus.
    genus_df = genus_df.groupby("genus_lineage", as_index=False).agg({"genus": "first", "RA": "sum", "genus_lineage": "first"})

    genus_df.sort_values(by="RA", ascending=False, inplace=True)

    return genus_df

# genus_df = species_to_genus_annotated(test_file, " ", lineage_df, nodes_dict)

In [3]:
# We need to start at the annotated species level and work our way up to the genus level.
# To do this, we will use the new lineage file to get the parent tax_ids of each tax_id.
# Then, we will search for the rank of interest in the nodes file.

# First, go into the pipelines directory and search for files with "species_relabund_annotated" in the name.
wanted_pipelines = ["biobakery4", "bio4", "bio3", "biobakery3", "jams"]

def get_annotated_species_files() -> pd.DataFrame:
    root_dir = os.path.abspath("../../pipelines")
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if "species_relabund_annotated" in file:
                yield os.path.join(root, file)

def annotate_species_files():
    lineage_df, nodes_dict = make_annotation_dataframes()
    for file in get_annotated_species_files():
        src_name = dirname(dirname(file)).split("/")[-1]
        pipeline_name = dirname(file).split("/")[-1]
        # src_name = "nist"
        # pipeline_name = "jams"
        sample_name = basename(file).split("_")[0]
        if src_name == "gut" or src_name == "tongue":
            continue
            
        if src_name != "camisimGI":
            continue

        if pipeline_name in wanted_pipelines:
            print(pipeline_name, src_name, sample_name)
            print(file)
            output_file = os.path.join(dirname(file), f"{sample_name}_genus_relabund_annotated.csv")
            output = species_to_genus_annotated(file, lineage_df, nodes_dict, "_")
            display(output.head())
            output.to_csv(output_file, index=False)
    
annotate_species_files()

jams camisimGI S2
/Users/valenciaem/coding/pipelines/pipelines/camisimGI/jams/S2_species_relabund_annotated.csv


Unnamed: 0_level_0,RA,tax_id
species,Unnamed: 1_level_1,Unnamed: 2_level_1
BORDETELLA_PERTUSSIS,0.326897,520
ACHROMOBACTER_XYLOSOXIDANS,0.219329,85698
CLOSTRIDIUM_BOTULINUM,0.100597,1491
ROSEBURIA_HOMINIS,0.072701,301301
CLOSTRIDIUM_PERFRINGENS,0.059441,1502
...,...,...
BURKHOLDERIA_TERRITORII,0.000000,1503055
BURKHOLDERIA_STAGNALIS,0.000000,1503054
BURKHOLDERIA_SINGULARIS,0.000000,1503053
BURKHOLDERIA_PURAQUAE,0.000000,1904757


RA        float64
tax_id     object
dtype: object
520
85698
1491
301301
1502
1548
817
818
357276
1493
1492
84022
1515
823
1501
873
185008
169679
1510
12908
Desired rank not found in lineage, using last value instead.. {'2787823': 'no rank'}
821
35814
1796646
820
1509
28111
88431
2709398
2292943
2293014
1496
28116
2716538
39493
674529
371601
1538095
2320086
291644
1807691
518
46503
658662
387661
691816
544645
2302940
2447885
165179
137838
2731211
2292952
1560217
2293117
563193
310298
329854
40575
246787
91623
1289519
469591
39481
1622073
1962263
1715194
29367
100716
1121298
46506
1917883
28901
2293116
328812
2290935
36834
100174
2293123
392838
204516
28117
1540257
2759022
36745
469589
2292282
1538552
539813
2093864
2854780
319475
208962
2093861
1691940
562
564
33954
1538
2508714
1169321
2202164
299767
2044467
2749084
61645
2044462
2484854
2723312
2723308
550
1648
158836
1347366
1736266
394958
39489
306026
420412
1512
1513
1553
1504
150336
1465809
1519
119641
52704
84031
2762229
84029


IndexError: index 0 is out of bounds for axis 0 with size 0