In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import sys

sys.path.append("../../../utils")
from data_paths import *

In [2]:
def find_output_files(rank, data_dir):
    """Collect every ``<rank>.tsv`` file found beneath a directory tree.

    The tree rooted at ``data_dir`` is walked recursively and a file is
    selected only when its name is exactly ``f"{rank}.tsv"``.

    Parameters
    ----------
    rank : str
        Taxonomic rank; used to build the file name to look for.
    data_dir : str
        Root directory to search.

    Returns
    -------
    list
        Full paths (walk directory joined with file name) of all matches,
        in the order ``os.walk`` yields them.
    """
    wanted = f"{rank}.tsv"
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(data_dir)
        for name in names
        if name == wanted
    ]

In [3]:
def rel_abundance(df, output, rank, plot=False):
    """Convert counts to relative abundance and write the result as CSV.

    Each row's ``Count`` is divided by the column total to give ``RA``; rows
    are sorted by ``RA`` in descending order, ``[`` and ``]`` are stripped
    from the index labels, and the ``RA`` and ``TAX_ID`` columns are written
    to ``output``. The input frame is left unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``Count`` and ``TAX_ID`` columns and a string index.
    output : str
        Path of the CSV file to write. When ``plot`` is true the figure is
        saved next to it as ``output + ".png"``.
    rank : str
        Taxonomic rank; only used for plot labels.
    plot : bool, optional
        If true, also draw and save a bar chart of the relative abundances.

    Returns
    -------
    None
    """
    # Work on derived frames only: the original mutated the caller's
    # DataFrame (added "RA", dropped "Count"), so a second call on the same
    # frame failed with a KeyError.
    ranked = (
        df.assign(RA=df["Count"] / df["Count"].sum())
        .drop(columns="Count")
        .sort_values(by="RA", ascending=False)
    )

    # Remove any [ and ] from the index.
    ranked.index = (
        ranked.index.str.replace("[", "", regex=False)
        .str.replace("]", "", regex=False)
    )

    final_df = ranked[["RA", "TAX_ID"]]
    final_df.to_csv(output, sep=",")

    print("saved to: ", output)

    if plot:
        # Plot only the abundance column: TAX_ID holds identifiers, not
        # fractions, and would otherwise be drawn as bar heights.
        # NOTE(review): the title mentions a 0.005% cutoff, but no threshold
        # is applied in this function - confirm which is intended.
        ax = final_df[["RA"]].T.plot.bar(
            figsize=(10, 10),
            xlabel=f"{rank} Name",  # was missing the f-prefix
            ylabel="Fraction",
            title=f"{rank} Relative Abundance above 0.005%",
        )
        ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), title=f"{rank}")
        ax.get_figure().savefig(output + ".png", bbox_inches='tight')

In [4]:
# Find output files.
def find_and_save(input_data: str, rank: str, output_dir: str):
    """Find Woltka classify outputs for a rank and save relative abundances.

    Every ``<rank>.tsv`` file under ``input_data`` is read and passed to
    ``rel_abundance``, which writes
    ``<SAMPLE>_<rank>_relabund_annotated.csv`` into ``output_dir``. The
    sample ID is the name of the file's parent directory up to the first
    underscore, upper-cased.

    Parameters
    ----------
    input_data : str
        Path to directory containing output files.
    rank : str
        Taxonomic rank.
    output_dir : str
        Path to directory to save output files. Created if it does not
        exist.
    """
    for file in find_output_files(rank, input_data):
        print(file)
        # header=0 discards the file's own header row in favour of `names`;
        # the third column ("Species") becomes the index.
        df = pd.read_csv(
            file,
            sep="\t",
            names=["TAX_ID", "Count", "Species"],
            header=0,
            index_col=2,
        )

        # basename() instead of split("/") so this also works with
        # non-POSIX path separators.
        parent_name = os.path.basename(os.path.dirname(file))
        sample_id = parent_name.split("_")[0]
        output_path = os.path.join(
            output_dir, f"{sample_id.upper()}_{rank}_relabund_annotated.csv"
        )

        # Make sure the destination exists; otherwise to_csv raises OSError.
        os.makedirs(output_dir, exist_ok=True)
        rel_abundance(df, output_path, rank)

# Main

In [5]:
# Process every dataset returned by make_data_list(): for each one, convert
# the genus- and species-level Woltka tables to relative abundance.
data_paths = make_data_list()
for d in data_paths:
    print(d.woltka)

    # Write into "<dataset>/wol" when it exists, otherwise "<dataset>/woltka".
    wol_dir = os.path.join(d.path, "wol")
    output_dir = wol_dir if os.path.exists(wol_dir) else os.path.join(d.path, "woltka")

    for rank in ("genus", "species"):
        find_and_save(input_data=d.woltka, rank=rank, output_dir=output_dir)

/Volumes/TBHD_share/valencia/pipelines/bmock12/woltka/classify/results
/Volumes/TBHD_share/valencia/pipelines/bmock12/woltka/classify/results/genus.tsv
saved to:  /Users/valenciaem/coding/pipelines/pipelines/bmock12/woltka/RESULTS_genus_relabund_annotated.csv
/Volumes/TBHD_share/valencia/pipelines/bmock12/woltka/classify/results/species.tsv
saved to:  /Users/valenciaem/coding/pipelines/pipelines/bmock12/woltka/RESULTS_species_relabund_annotated.csv
/Volumes/TBHD_share/cami_data/NOADAPTERS/pipelines/woltka/classify
/Volumes/TBHD_share/cami_data/NOADAPTERS/pipelines/woltka/classify/S2_classify/genus.tsv
saved to:  /Users/valenciaem/coding/pipelines/pipelines/camisimGI/woltka/S2_genus_relabund_annotated.csv
/Volumes/TBHD_share/cami_data/NOADAPTERS/pipelines/woltka/classify/S1_classify/genus.tsv
saved to:  /Users/valenciaem/coding/pipelines/pipelines/camisimGI/woltka/S1_genus_relabund_annotated.csv
/Volumes/TBHD_share/cami_data/NOADAPTERS/pipelines/woltka/classify/S2_classify/species.tsv
s