In [1]:
import numpy as np
# from biom.table import Table
# from biom import load_table
import pandas as pd
import os
from utils.ncbi.names import generate_names_df, names_db_path, standardize_core
from utils.ncbi.jams_convert import fix_name, convert_jams_to_taxid

In [2]:
# Use the TEDreadsTAX reports in TAXprofiles/TEDreadsTAX/reports.
data = "/Volumes/TBHD_share/valencia/pipelines/amos/nibsc/mixed/wgsa/outputs/TAXprofiles/TEDreadsTAX/reports"
output_path = "pipelines/amos/mixed/wgsa"

In [3]:
# The genus was changed, but the Amos paper uses the original genus name.
# replacement_dict = {"Anaerobutyricum hallii": "Eubacterium hallii", "Anaerobutyricum": "Eubacterium"}
# For the Amos paper, the genus was changed to Eubacterium. This is the only result, so we can just replace it.

In [4]:
def standardize_wgsa(df: pd.DataFrame):
    """
    Standardizes the WGSA data using the standardize_core function.
    """
    names_df = generate_names_df(names_db_path, load_pickle=True)

    # Let's remove anything below 0.5% (50 ppm). 
    # There are thousands of features and are taking too much time to annotate.
    print("Before: ", df.shape)
    df = df.where(df["RA"] >= 0.005).dropna()
    print("After: ", df.shape)

    # We can use the convert_jams_to_taxid function from the utils.ncbi.convert_jams since the format is the same.
    ann, unann = convert_jams_to_taxid(df.head(50), names_df)

    return ann, unann


In [5]:
tax_dict = {"Genus": "G", "Species": "S", "Family": "F", "Order": "O", "Class": "C", "Phylum": "P", "Kingdom": "K"}

def clean_and_parse_wgsa(data_path, output_dir, rank="Genus", left_prefix=""):
    df = pd.read_csv(data_path, sep="\t", header=None, usecols=[0, 3, 5])
    df = df.where(df[3] == tax_dict[rank]).dropna()
    df.sort_values(by=0, ascending=False, inplace=True)

    clean_genus = df[[5, 0]].copy(deep=True)
    clean_genus.columns = [rank, "RA"]
    clean_genus["RA"] = clean_genus["RA"].apply(lambda x: x / 100)

    clean_genus.set_index(rank, inplace=True)

    indices = clean_genus.index
    indices = [i.lstrip() for i in indices]

    # Remove any [ and ] characters from the indices.
    indices = [i.replace("[", "") for i in indices]
    indices = [i.replace("]", "") for i in indices]

    # Replace any genus names with the correct genus name.
    # indices = [replacement_dict[i] if i in replacement_dict else i for i in indices]

    clean_genus.index = indices

    ann, uann = standardize_wgsa(clean_genus)

    # If the shape of uann is not 0, then we have unannotated features.
    if uann.shape[0] != 0:
        print("WARNING: Unannotated features: ", uann.shape[0])
        print(uann.head(10))

    prefix = os.path.basename(data_path).split("_")[0]
    # left_prefix = "s"
    output_file = os.path.join(output_dir, left_prefix + prefix + "_" + rank.lower() + "_" + "relabund.csv")

    ann.to_csv(output_file, sep=",", header=True, index_label=rank)

# clean_and_parse_wgsa(cami_sim_data, output_path)

In [6]:
# There may be more than one output file, so we need to combine them.
def combine_files(data_path: str, rank: str):
    if not os.path.exists(data_path):
        raise FileNotFoundError("The data path does not exist.")

    for root, dirs, files in os.walk(data_path):
        print(files)
        if len(files) == 0:
            raise Exception("No files found in output directory.")
            
        for file in files:
            if "REPORT" in file:
                print(os.path.join(root, file))
                clean_and_parse_wgsa(os.path.join(root, file), output_path, rank=rank)

combine_files(data, "Genus")
combine_files(data, "Species")

['SRR11487940_taxREPORT.txt', 'SRR11487939_taxREPORT.txt', 'SRR11487938_taxREPORT.txt', 'SRR11487937_taxREPORT.txt', 'SRR11487941_taxREPORT.txt']
/Volumes/TBHD_share/valencia/pipelines/amos/nibsc/mixed/wgsa/outputs/TAXprofiles/TEDreadsTAX/reports/SRR11487940_taxREPORT.txt
Before:  (926, 1)
After:  (15, 1)
/Volumes/TBHD_share/valencia/pipelines/amos/nibsc/mixed/wgsa/outputs/TAXprofiles/TEDreadsTAX/reports/SRR11487939_taxREPORT.txt
Before:  (917, 1)
After:  (15, 1)
/Volumes/TBHD_share/valencia/pipelines/amos/nibsc/mixed/wgsa/outputs/TAXprofiles/TEDreadsTAX/reports/SRR11487938_taxREPORT.txt
Before:  (907, 1)
After:  (15, 1)
/Volumes/TBHD_share/valencia/pipelines/amos/nibsc/mixed/wgsa/outputs/TAXprofiles/TEDreadsTAX/reports/SRR11487937_taxREPORT.txt
Before:  (892, 1)
After:  (15, 1)
/Volumes/TBHD_share/valencia/pipelines/amos/nibsc/mixed/wgsa/outputs/TAXprofiles/TEDreadsTAX/reports/SRR11487941_taxREPORT.txt
Before:  (916, 1)
After:  (15, 1)
['SRR11487940_taxREPORT.txt', 'SRR11487939_taxREP