In [2]:
import pandas as pd
import numpy as np
import os
import openpyxl
import sys

sys.path.append("../..")

from utils.ncbi.jams_convert import convert_jams_to_taxid, generate_names_df, names_db_path
from utils.data_paths import *

In [3]:
# One rename rule: the key label is swapped for the value label.
# NOTE(review): nothing visible in this notebook section reads this dict;
# presumably it is applied to JAMS "LKT__s__" row labels — confirm the consumer.
replacement_dict = {
    "LKT__s__Anaerobutyricum_hallii": "LKT__s__Eubacterium_hallii",
}

In [4]:
# TODO: Update replacement dict for JAMS.

# Name substitutions for two CAMISIM organisms (key -> value).
# NOTE(review): not referenced by any code visible in this notebook section;
# presumably the keys are the names to be replaced — confirm with the consumer.
camisim_replacement_dict = dict(
    Acetivibrio_thermocellus="Ruminiclostridium_thermocellum",
    Thermoclostridium_stercorarium="Ruminiclostridium_stercorarium",
)

def clean_jams(input_file: str, rank: str = "Genus", input_type="csv"):
    """
    Clean a JAMSalpha table (bmock12 dataset) and write an annotated relative-abundance CSV.

    From now on, use the JAMSbeta function instead.

    The table is read with its first column as the index and must provide the
    ``NumBases`` and ``Species`` columns. Relative abundance ("RA") is each row's
    ``NumBases`` divided by the column total, summed per species. The result is
    passed through ``convert_jams_to_taxid`` and the annotated frame is written
    next to ``input_file`` as ``<STEM>_<rank>_relabund_annotated.csv``, where
    ``<STEM>`` is the upper-cased file name up to its first dot.

    Parameters
    ----------
    input_file : str
        Path to the JAMSalpha output table.
    rank : str
        ``"Genus"`` aggregates species to genus, taking the text before the first
        underscore of each species name as the genus. Any other value keeps the
        species-level table. The value is also used (lower-cased) in the output
        file name and as the CSV index label.
    input_type : str
        ``"csv"`` or ``"excel"``; selects the pandas reader.

    Raises
    ------
    ValueError
        If ``input_type`` is neither ``"csv"`` nor ``"excel"``.
    """
    output_dir = os.path.dirname(input_file)
    file_name = os.path.basename(input_file).split(".")[0]
    # Build the final file name directly. Deriving it with
    # str.replace(".csv", "_annotated.csv") on the full path would also rewrite
    # any ".csv" that happens to appear in a directory name.
    annotated_path = os.path.join(
        output_dir, f"{file_name.upper()}_{rank.lower()}_relabund_annotated.csv"
    )

    if input_type == "csv":
        df = pd.read_csv(input_file, index_col=0)
    elif input_type == "excel":
        df = pd.read_excel(input_file, index_col=0)
    else:
        # ValueError is still an Exception, so broad handlers keep working.
        raise ValueError("Input type not recognized.")

    df["RA"] = df["NumBases"] / df["NumBases"].sum()
    abundance_df = df[["Species", "RA"]].groupby("Species").sum()

    # Strip the "s__" marker from the species names (literal match, not a regex).
    abundance_df.index = abundance_df.index.str.replace("s__", "", regex=False)

    if rank == "Genus":
        # The genus is the part of the species name before the first underscore.
        abundance_df["Genus"] = [name.split("_")[0] for name in abundance_df.index]
        abundance_df = abundance_df[["Genus", "RA"]].groupby("Genus").sum()

    abundance_df = abundance_df.sort_values("RA", ascending=False)

    names_df = generate_names_df(names_db_path, load_pickle=True)
    # Only the annotated half of the result is saved.
    annotated, _ = convert_jams_to_taxid(abundance_df, names_df)
    annotated.to_csv(annotated_path, index_label=rank)

# clean_jams(input_file = "pipelines/camisimGI/jams/s1.csv", rank = "Genus", input_type = "csv")

In [5]:
def save_jams_to_csv(df: pd.DataFrame, taxid_df: pd.DataFrame, output_dir: str, rank: str):
    """Write one annotated relative-abundance CSV per column of ``df``.

    Each column of ``df`` is left-joined with ``taxid_df`` on the index, sorted
    by that column in descending order, and written to
    ``<output_dir>/<COLUMN>_<rank>_relabund_annotated.csv`` (column name
    upper-cased) with ``rank`` as the index label.

    Parameters
    ----------
    df : pd.DataFrame
        One column per sample; column names must be strings.
    taxid_df : pd.DataFrame
        Frame sharing ``df``'s index and containing a ``tax_id`` column.
    output_dir : str
        Directory the CSV files are written to.
    rank : str
        Used in the file name and as the CSV index label.
    """
    for sample in df.columns.to_list():
        sample_df = df[[sample]].join(taxid_df, how="left")

        # astype returns a new frame, so the result has to be re-bound for the
        # cast to take effect. The nullable "Int64" dtype keeps tax ids integral
        # ("561", not "561.0") even when the left join leaves some rows without
        # a tax id; those are written as empty cells.
        sample_df = sample_df.astype({"tax_id": "Int64"})

        sample_df = sample_df.sort_values(sample, ascending=False)

        sample_df.to_csv(
            os.path.join(output_dir, f"{sample.upper()}_{rank}_relabund_annotated.csv"),
            index_label=rank,
        )

In [6]:
def clean_jams_beta(input_file: str, rank="genus", output_dir=""):
    """Clean a JAMS output Excel file and save it in the preferred format.

    It will generate a separate file for each sample (each column of the sheet).

    The sheet at position 1 of the workbook is read with its first column as the
    index; that index column is expected to be labelled ``row.names``. Every
    sample column is divided by its own total, rows are filtered and renamed
    according to ``rank``, rows that end up with the same name are summed, and
    the result is annotated with ``convert_jams_to_taxid`` before being handed
    to ``save_jams_to_csv``.

    Parameters
    ----------
    input_file : str
        Path to the JAMS output Excel workbook.
    rank : str
        Taxonomic rank, ``"genus"`` or ``"species"``.
        For ``"genus"``, rows whose label contains ``g__``, ``s__`` or
        ``Unclassified`` are kept and named by the first underscore-separated
        token after the ``s__``/``g__`` marker.
        For ``"species"``, rows containing ``s__`` or ``Unclassified`` are kept,
        named by the text after ``s__``, with underscores turned into spaces.
        Labels without a marker are named by the text after ``LKT__``.
    output_dir : str
        Path to directory where output files will be saved.

    Raises
    ------
    ValueError
        If ``rank`` is neither ``"genus"`` nor ``"species"``.
    """
    # Fail with a clear message up front; otherwise an unknown rank only shows
    # up later as an index length mismatch.
    if rank not in ("genus", "species"):
        raise ValueError(f"Unsupported rank {rank!r}; expected 'genus' or 'species'.")

    print(input_file)
    df = pd.read_excel(input_file, index_col=0, sheet_name=1)
    # Make everything into relative abundances (fraction of each sample's total).
    df = df / df.sum(axis=0)

    # Move the taxon labels into a regular "row.names" column.
    df = df.reset_index()

    if rank == "genus":
        keep = df["row.names"].str.contains("g__|s__|Unclassified", na=False)
    else:
        keep = df["row.names"].str.contains("s__|Unclassified", na=False)

    # Filter with a boolean mask. Chaining where(...).dropna() would also throw
    # away kept taxa that have a missing value in any sample (and every row when
    # a sample column sums to zero); such values now count as 0 in the sum below.
    df = df[keep]

    index_names = []
    if rank == "genus":
        for label in df["row.names"]:
            if "s__" in label:
                # Species-level label: the genus is the first token of the name.
                index_names.append(label.split("s__")[1].split("_")[0])
            elif "g__" in label:
                index_names.append(label.split("g__")[1].split("_")[0])
            else:
                # Then it is LKT__Unclassified, so we need to remove the LKT__.
                index_names.append(label.split("LKT__")[1])
    else:
        for label in df["row.names"]:
            if "s__" in label:
                index_names.append(label.split("s__")[1])
            else:
                print(label)
                # Then it is LKT__Unclassified, so we need to remove the LKT__.
                index_names.append(label.split("LKT__")[1])

        index_names = [name.replace("_", " ") for name in index_names]

    df = df.drop(columns=["row.names"])
    df.index = index_names

    # Sum all of the rows with the same index name.
    df = df.groupby(df.index).sum()

    names_df = generate_names_df(names_db_path, load_pickle=True)
    annotated, _ = convert_jams_to_taxid(df.copy(), names_df)

    # Split off the tax_ids so each sample column can be saved as its own csv;
    # save_jams_to_csv joins them back onto every column.
    taxid_df = annotated[["tax_id"]]
    sample_df = annotated.drop(columns=["tax_id"])

    save_jams_to_csv(sample_df, taxid_df, output_dir, rank)

In [7]:
def clean_jams_beta_higher(path: str):
    """Combine a JAMS PPM workbook's abundances with its taxonomy sheet.

    Reads the sheet at position 1 (PPM values) and the sheet at position 4
    (taxonomy) from the workbook at ``path``, each with its first column as the
    index. The PPM values are divided by 10000 to give percentages, the two
    tables are inner-joined on their index, and the result is written as
    ``taxonomy_relabund.csv`` in the same directory as ``path``.
    """
    # PPM -> percent: 1,000,000 ppm == 100 %.
    relabund_pct = pd.read_excel(path, index_col=0, sheet_name=1).div(10000)
    taxonomy = pd.read_excel(path, index_col=0, sheet_name=4)

    # Keep only the rows present in both sheets.
    combined = relabund_pct.join(taxonomy, how="inner")

    destination = os.path.join(os.path.dirname(path), "taxonomy_relabund.csv")
    combined.to_csv(destination, index=True)

# Main

In [8]:
# Main
# Regenerate the per-sample genus and species CSVs for every dataset whose path
# contains "nist"; all other datasets are skipped.
# rank = "species"
paths = make_data_list()
for p in paths:
    # Bmock12 did not need to be resubmitted (it already has the unclassifieds
    # added); datasets other than NIST are not processed here.
    if "bmock12" in p.path or "nist" not in p.path:
        continue

    clean_jams_beta(p.jams, rank="genus", output_dir=os.path.join(p.path, "jams"))
    clean_jams_beta(p.jams, rank="species", output_dir=os.path.join(p.path, "jams"))

# Alternative kept for reference: process every dataset that has a JAMS file.
#     if p.jams != "":
#         clean_jams_beta(p.jams, rank="genus", output_dir=os.path.join(p.path, "jams"))
#         clean_jams_beta(p.jams, rank="species", output_dir=os.path.join(p.path, "jams"))


# One-off runs kept for reference:
# data_path = hmpTongue.jams
# output_dir = "pipelines/hmp/tongue/jams"

# clean_jams_beta(data_path, rank="species", output_dir=output_dir)
# clean_jams_beta(data_path, rank="genus", output_dir=output_dir)
# clean_jams_beta_higher("/Volumes/NRTS_share/SMS_NIAID_0162/fqfiles/Batch1/jams/brain_jams/brainjams_Relabund_PPM.xlsx")

/Volumes/TBHD_share/valencia/pipelines/NIST/pipelines/jams/beta_output/NIST_Relabund_PPM.xlsx
The pkl file was last modified (and hopefully generated) on 2022-10-27 14:39:35+00:00
/Volumes/TBHD_share/valencia/pipelines/NIST/pipelines/jams/beta_output/NIST_Relabund_PPM.xlsx
LKT__Unclassified
The pkl file was last modified (and hopefully generated) on 2022-10-27 14:39:35+00:00
