In [1]:
import numpy as np
# from biom.table import Table
# from biom import load_table
import pandas as pd
import os

import sys
sys.path.append("../..")

from utils.ncbi.names import generate_names_df, names_db_path, standardize_core
from utils.ncbi.jams_convert import fix_name, convert_jams_to_taxid
from utils.data_paths import *

In [2]:
# Anaerobutyricum hallii was originally named Eubacterium hallii: the genus was renamed, but the Amos paper still uses the original genus name.
# replacement_dict = {"Anaerobutyricum hallii": "Eubacterium hallii", "Anaerobutyricum": "Eubacterium"}
# For the Amos paper comparison, the genus is changed back to Eubacterium. This is the only affected result, so we can simply replace it.

In [3]:
def standardize_wgsa(df: pd.DataFrame):
    """
    Convert a WGSA table to tax IDs by reusing the JAMS converter.

    The names table is built with ``generate_names_df`` from ``names_db_path``
    and handed, together with the data, to ``convert_jams_to_taxid``; the
    original author notes that the WGSA and JAMS formats are the same.

    NOTE(review): only the first 50 rows (``df.head(50)``) are converted --
    confirm whether this truncation is intentional.

    Returns:
        The two objects produced by ``convert_jams_to_taxid``, in order
        (presumably the annotated and unannotated results -- verify against
        that helper).
    """
    names_table = generate_names_df(names_db_path, load_pickle=True)

    # Only a 50-row preview of the input is passed to the converter.
    preview = df.head(50)
    annotated, unannotated = convert_jams_to_taxid(preview, names_table)

    return annotated, unannotated

In [4]:
# Maps a taxonomic rank name to the one-letter rank code found in column 3 of the report.
tax_dict = {"Genus": "G", "Species": "S", "Family": "F", "Order": "O", "Class": "C", "Phylum": "P", "Kingdom": "K"}

def clean_and_parse_wgsa(data_path, output_dir, rank="Genus", left_prefix=""):
    """
    Turn one tab-separated WGSA taxonomy report into a relative-abundance CSV.

    Columns 1, 3, 4 and 5 of the header-less report are read as count, rank
    code, tax ID and taxon name. Rows whose rank code matches ``rank`` (plus
    the "U" unclassified row) are kept, sorted by count in descending order,
    and their counts are divided by the report total.

    Args:
        data_path: Path to the report. The text before the first "_" in the
            file name is upper-cased and used as the sample name.
        output_dir: Directory for the CSV; created if it does not exist.
        rank: Key of ``tax_dict`` selecting the rank to keep.
        left_prefix: String prepended to the output file name.

    Returns:
        The path of the CSV that was written
        (``<left_prefix><SAMPLE>_<rank>_relabund_annotated.csv``).

    Raises:
        KeyError: If ``rank`` is not a key of ``tax_dict``.
    """
    report = pd.read_csv(data_path, sep="\t", header=None, usecols=[1, 3, 4, 5])

    # The total is the unclassified count plus the root count. These are the
    # first two rows only when an unclassified row is present; without one the
    # root row comes first, and summing two rows would double-count its child.
    # NOTE(review): assumes the root row directly follows the optional
    # unclassified row -- confirm against the report format.
    has_unclassified_row = len(report) > 0 and report[3].iloc[0] == "U"
    total_counts = report[1].iloc[: 2 if has_unclassified_row else 1].sum()

    # We need an OR statement to keep the unclassified row as well.
    keep = (report[3] == tax_dict[rank]) | (report[3] == "U")
    # Sort out-of-place: an in-place sort on a filtered slice triggers
    # SettingWithCopyWarning.
    selected = report.loc[keep].sort_values(by=1, ascending=False)

    table = selected[[5, 1, 4]].copy(deep=True)
    table.columns = [rank, "RA", "TAX_ID"]

    # Convert the RA column from counts to relative abundance.
    table["RA"] = table["RA"] / total_counts

    table = table.set_index(rank)

    # The unclassified tax_id is actually 12908, not 0. A boolean mask is used
    # so that a report without an unclassified row does not gain a phantom row
    # with a missing RA (label-based assignment would enlarge the frame).
    table.loc[table.index == "unclassified", "TAX_ID"] = 12908

    # Drop the leading indentation and any [ or ] characters from the names.
    table.index = [name.lstrip().replace("[", "").replace("]", "") for name in table.index]

    # Since the WGSA data already contains the TAXID, it is much faster than this function.
    # ann, uann = standardize_wgsa(clean_genus)

    table = table.astype({"TAX_ID": int})

    # Make sure the destination exists; to_csv does not create directories.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    sample = os.path.basename(data_path).split("_")[0]
    output_file = os.path.join(output_dir, left_prefix + sample.upper() + "_" + rank.lower() + "_" + "relabund_annotated.csv")

    table.to_csv(output_file, sep=",", header=True, index_label=rank)

    return output_file

# clean_and_parse_wgsa(cami_sim_data, output_path)

In [5]:
# There may be more than one report file, so every one found is processed in turn.
def combine_files(data_path: str, rank: str, output_dir: str, prefix: str = ""):
    """
    Run ``clean_and_parse_wgsa`` on every report found under ``data_path``.

    The directory tree is walked recursively, and each file whose name
    contains "REPORT" is cleaned into ``output_dir``.

    Args:
        data_path: Root directory to search.
        rank: Taxonomic rank passed through to ``clean_and_parse_wgsa``.
        output_dir: Directory that receives the cleaned CSV files.
        prefix: Passed through as ``left_prefix`` for the output file names.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
        Exception: If the whole tree under ``data_path`` contains no files.
    """
    if not os.path.exists(data_path):
        raise FileNotFoundError("The data path does not exist.")

    found_any_file = False
    for root, _dirs, files in os.walk(data_path):
        print(files)
        if files:
            found_any_file = True

        for file in files:
            if "REPORT" in file:
                report_path = os.path.join(root, file)
                print(report_path)
                clean_and_parse_wgsa(report_path, output_dir, rank=rank, left_prefix=prefix)

    # Decide only after the full walk: a directory that merely holds
    # sub-directories has no direct files, but the reports may sit deeper.
    if not found_any_file:
        raise Exception("No files found in output directory.")

# Main

In [6]:
def run_wgsa_clean(dataset_filter="nist", ranks=("Genus", "Species")):
    """
    Clean the WGSA reports of every configured dataset.

    Iterates over the entries returned by ``make_data_list()`` and, for each
    one that has WGSA output, runs ``combine_files`` once per rank.

    Args:
        dataset_filter: Only datasets whose ``path`` contains this substring
            are processed. Pass ``None`` to process all datasets. Defaults to
            "nist", the filter that used to be hard-coded.
        ranks: Taxonomic ranks to export, in order.
    """
    for p in make_data_list():
        # Skip if the data doesn't exist.
        if p.wgsa == "":
            continue

        if dataset_filter is not None and dataset_filter not in p.path:
            continue

        # Output names for the bmock12 and camisim datasets carry an "S" prefix.
        prefix = "S" if ("bmock12" in p.path or "camisim" in p.path) else ""

        # Write into "wgsa" when it exists, otherwise fall back to "wgsa2".
        out_path = os.path.join(p.path, "wgsa")
        if not os.path.exists(out_path):
            out_path = os.path.join(p.path, "wgsa2")

        for rank in ranks:
            combine_files(p.wgsa, rank, out_path, prefix)

# Run the WGSA clean-up when this cell executes.
run_wgsa_clean()

['Mix-D_taxREPORT.txt', 'EG_taxREPORT.txt', 'Neg_taxREPORT.txt', 'Mix-A_taxREPORT.txt', 'Mix-B_taxREPORT.txt', 'Mix-C_taxREPORT.txt']
/Volumes/TBHD_share/valencia/pipelines/NIST/pipelines/wgsa/outputs/TAXprofiles/TEDreadsTAX/reports/Mix-D_taxREPORT.txt
/Volumes/TBHD_share/valencia/pipelines/NIST/pipelines/wgsa/outputs/TAXprofiles/TEDreadsTAX/reports/EG_taxREPORT.txt
/Volumes/TBHD_share/valencia/pipelines/NIST/pipelines/wgsa/outputs/TAXprofiles/TEDreadsTAX/reports/Neg_taxREPORT.txt
/Volumes/TBHD_share/valencia/pipelines/NIST/pipelines/wgsa/outputs/TAXprofiles/TEDreadsTAX/reports/Mix-A_taxREPORT.txt
/Volumes/TBHD_share/valencia/pipelines/NIST/pipelines/wgsa/outputs/TAXprofiles/TEDreadsTAX/reports/Mix-B_taxREPORT.txt
/Volumes/TBHD_share/valencia/pipelines/NIST/pipelines/wgsa/outputs/TAXprofiles/TEDreadsTAX/reports/Mix-C_taxREPORT.txt
['Mix-D_taxREPORT.txt', 'EG_taxREPORT.txt', 'Neg_taxREPORT.txt', 'Mix-A_taxREPORT.txt', 'Mix-B_taxREPORT.txt', 'Mix-C_taxREPORT.txt']
/Volumes/TBHD_share/val