In [2]:
import pandas as pd
import numpy as np
import os
import openpyxl
import sys  # noqa

sys.path.append("../..")  # noqa

from utils.ncbi.jams_convert import convert_jams_to_taxid, generate_names_df, names_db_path
from utils.data_paths import *


In [3]:
# replacement_dict = {
#     "LKT__s__Anaerobutyricum_hallii": "LKT__s__Eubacterium_hallii"}

# camisim_replacement_dict = {
#     "Acetivibrio_thermocellus": "Ruminiclostridium_thermocellum",
#     "Thermoclostridium_stercorarium": "Ruminiclostridium_stercorarium",
# }

In [4]:
def clean_jams(input_file: str, rank: str = "Genus", input_type="csv"):
    """
    Turn a JAMSalpha table into an annotated relative-abundance CSV.

    This function cleans the output from JAMSalpha in the bmock12 dataset.
    From now on, use the JAMSbeta function.

    The table is read with its first column as the index. Every row's share
    of the total ``NumBases`` is summed per ``Species``; for ``rank="Genus"``
    the species rows are then collapsed onto the text before the first ``_``
    of each species name. The resulting frame is handed to
    ``convert_jams_to_taxid`` and the annotated frame it returns is written
    next to the input file as ``<STEM>_<rank>_relabund_annotated.csv``, where
    ``<STEM>`` is the upper-cased file name up to its first ``.``.

    Parameters:
        input_file: Path to the table; must contain ``NumBases`` and
            ``Species`` columns.
        rank: ``"Genus"`` aggregates to genus level. Any other value keeps the
            species-level rows; the value is still used (lower-cased) in the
            output file name and as the CSV index label.
        input_type: ``"csv"`` or ``"excel"``.

    Raises:
        ValueError: If ``input_type`` is neither ``"csv"`` nor ``"excel"``.
    """
    if input_type == "csv":
        df = pd.read_csv(input_file, index_col=0)
    elif input_type == "excel":
        df = pd.read_excel(input_file, index_col=0)
    else:
        raise ValueError("Input type not recognized.")

    # Build the final path directly. Patching ".csv" with str.replace would
    # also rewrite any ".csv" that happens to occur in the directory part.
    output_dir = os.path.dirname(input_file)
    file_name = os.path.basename(input_file).split(".")[0]
    annotated_path = os.path.join(
        output_dir, f"{file_name.upper()}_{rank.lower()}_relabund_annotated.csv")

    df["RA"] = df["NumBases"] / df["NumBases"].sum()

    species_df = (
        df[["Species", "RA"]]
        .groupby("Species")
        .sum()
        .sort_values("RA", ascending=False)
    )

    # Strip only the leading rank marker; an unanchored replace would also
    # eat an "s__" that occurs in the middle of a name.
    species_df.index = species_df.index.str.replace(r"^s__", "", regex=True)

    if rank == "Genus":
        # The genus is the part of the species name before the first "_".
        genus_names = [name.split("_")[0] for name in species_df.index]
        result_df = (
            species_df.assign(Genus=genus_names)[["Genus", "RA"]]
            .groupby("Genus")
            .sum()
            .sort_values("RA", ascending=False)
        )
    else:
        result_df = species_df

    names_df = generate_names_df(names_db_path, load_pickle=True)

    # The second return value (the rows that could not be annotated,
    # presumably -- confirm in utils.ncbi.jams_convert) is not used here.
    annotated, _unannotated = convert_jams_to_taxid(result_df, names_df)
    annotated.to_csv(annotated_path, index_label=rank)

# Species-level conversion of the bmock12 JAMSalpha dump.
clean_jams(
    input_file="../../pipelines/bmock12/jams/s1.csv",
    rank="Species",
    input_type="csv",
)


The pkl file was last modified (and hopefully generated) on 2023-02-14 18:37:13.183117+00:00


In [5]:
def clean_jams_alpha(input_file: str, rank: str = "Genus", input_type="csv"):
    """
    Aggregate a JAMSalpha table at ``rank`` and write an annotated CSV.

    This function cleans the output from JAMSalpha in the bmock12 dataset.
    From now on, use the JAMSbeta function.

    The table is read with its first column as the index. Every row's share
    of the total ``NumBases`` is summed per value of the ``rank`` column, the
    rank prefix is removed from each label (``"g__Name"`` -> ``"Name"``), and
    the frame is passed to ``convert_jams_to_taxid``. The annotated frame is
    written next to the input file as
    ``<STEM>_<rank>_relabund_annotated.csv`` and displayed.

    Parameters:
        input_file: Path to the table; must contain ``NumBases`` and a column
            named exactly ``rank``.
        rank: Name of the taxonomy column to group by (e.g. ``"Genus"``,
            ``"Species"``). Used lower-cased in the output file name and
            as-is for the CSV index label.
        input_type: ``"csv"`` or ``"excel"``.

    Raises:
        ValueError: If ``input_type`` is neither ``"csv"`` nor ``"excel"``.
    """
    if input_type == "csv":
        df = pd.read_csv(input_file, index_col=0)
    elif input_type == "excel":
        df = pd.read_excel(input_file, index_col=0)
    else:
        raise ValueError("Input type not recognized.")

    # Build the final path directly. Patching ".csv" with str.replace would
    # also rewrite any ".csv" that happens to occur in the directory part.
    output_dir = os.path.dirname(input_file)
    file_name = os.path.basename(input_file).split(".")[0]
    annotated_path = os.path.join(
        output_dir, f"{file_name.upper()}_{rank.lower()}_relabund_annotated.csv")

    df["RA"] = df["NumBases"] / df["NumBases"].sum()

    rank_df = df[[rank, "RA"]].groupby(rank).sum()

    # Keep the token after the first "__". Labels that carry no rank prefix
    # are kept unchanged instead of raising IndexError.
    cleaned_names = []
    for raw_name in rank_df.index:
        parts = raw_name.split("__")
        cleaned_names.append(parts[1] if len(parts) > 1 else raw_name)
    rank_df.index = cleaned_names
    rank_df.index.name = rank

    rank_df = rank_df.sort_values("RA", ascending=False)

    names_df = generate_names_df(names_db_path, load_pickle=True)

    # The second return value is not used by this function.
    annotated, _unannotated = convert_jams_to_taxid(rank_df, names_df)
    annotated.to_csv(annotated_path, index_label=rank)

    display(annotated)


# Same bmock12 dump, converted once per rank (genus first, then species).
for jams_rank in ("Genus", "Species"):
    clean_jams_alpha(
        input_file="../../pipelines/bmock12/jams/s1.csv",
        rank=jams_rank,
        input_type="csv",
    )


The pkl file was last modified (and hopefully generated) on 2023-02-14 18:37:13.183117+00:00


Unnamed: 0,RA,tax_id
HALOMONAS,0.3559839,2745
MARINOBACTER,0.2138735,2742
MURICAUDA,0.1781729,111500
PSYCHROBACTER,0.1158328,497
COHAESIBACTER,0.06859137,655352
THIOCLAVA,0.02607573,285107
MICROMONOSPORA,0.01850388,1873
ENEMELLA,0.01125847,2896773
UNCLASSIFIED,0.006811037,12908
LITOREIBACTER,0.002136694,947567


The pkl file was last modified (and hopefully generated) on 2023-02-14 18:37:13.183117+00:00


Unnamed: 0,RA,tax_id
HALOMONAS_SP._HL-93,0.3154949,1666906
MURICAUDA_LUTIMARIS,0.1759923,475082
MARINOBACTER_SP._LV10R510-11A,0.1482264,1415568
PSYCHROBACTER_SP._72-O-C,0.1070827,2774125
COHAESIBACTER_SP._ES-047,0.06859137,1798205
MARINOBACTER_SP._LV10MA510-1,0.04835039,1415567
HALOMONAS_SP._HL-4,0.03509602,1761789
THIOCLAVA_SP._ES-031,0.02019231,1798203
UNCLASSIFIED,0.01322326,12908
MARINOBACTER_SP._LV10R520-4,0.01205668,1761796


In [6]:
def save_jams_to_csv(df: pd.DataFrame, taxid_df: pd.DataFrame, output_dir: str, rank: str):
    """
    Write one annotated relative-abundance CSV per column of ``df``.

    For every column, a two-column frame (the column plus ``tax_id``) is built
    by left-joining ``taxid_df`` on the index, sorted by the column in
    descending order, and saved as
    ``<output_dir>/<COLUMN>_<rank>_relabund_annotated.csv``.

    Parameters:
        df: One column per output file, indexed by taxon name.
        taxid_df: Frame with a ``tax_id`` column, indexed like ``df``.
        output_dir: Destination directory; created if missing. An empty
            string writes into the current working directory.
        rank: Used verbatim in the file name and as the CSV index label.

    Raises:
        ValueError: From the ``int64`` cast (pandas' IntCastingNaNError, a
            ValueError subclass) when a row of ``df`` has no match in
            ``taxid_df``.
    """
    # os.makedirs("") raises FileNotFoundError, so only create the directory
    # when one was actually given. Done once rather than once per column.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for column in df.columns:
        sample_df = df[[column]].join(taxid_df, how="left")

        # Fails loudly if the left join left any tax_id missing.
        sample_df["tax_id"] = sample_df["tax_id"].astype("int64")

        sample_df = sample_df.sort_values(column, ascending=False)

        output_path = os.path.join(
            output_dir, f"{str(column).upper()}_{rank}_relabund_annotated.csv")

        sample_df.to_csv(output_path, index_label=rank)


In [7]:
def clean_jams_join(input_file: str, rank: str, output_dir="") -> None:
    """
    This function cleans jams, but uses the LKT_featuretable sheet to join the taxonomy to the relabund.

    The second sheet of the workbook (``sheet_name=1``) holds the abundance
    table and the ``LKT_featuretable`` sheet holds the taxonomy; both are
    read with their first column as the index. Each abundance column is
    scaled to sum to 1, the rows are summed per value of the ``rank`` column,
    the rank prefix is removed from each label, and the result is passed to
    ``convert_jams_to_taxid``. One CSV per abundance column is then written
    through ``save_jams_to_csv``.

    Parameters:
        input_file: The path to the input file.
        rank: The rank to use for the output file. It is capitalised to find
            the taxonomy column and lower-cased for the output file names.
        output_dir: The directory to save the output file to.

    Raises:
        AssertionError: If the inner join with the feature table does not
            keep the same number of rows as the abundance table.
    """
    rank = rank.capitalize()
    print(input_file)

    # Open and parse the workbook once; a list of sheet specifiers returns a
    # dict keyed by those same specifiers.
    sheets = pd.read_excel(
        input_file, index_col=0, sheet_name=[1, "LKT_featuretable"])
    relabund_df = sheets[1]
    featuretable_df = sheets["LKT_featuretable"]

    sample_columns = list(relabund_df.columns)

    # Make everything into relative abundances (i.e. pct).
    relabund_df = relabund_df / relabund_df.sum(axis=0)

    joined = relabund_df.join(featuretable_df, how="inner")

    # The two dataframes should have the same number of rows.
    assert relabund_df.shape[0] == joined.shape[0], "The relabund and featuretable dfs should have the same number of rows."

    # Sum the abundance columns per rank label; the taxonomy columns other
    # than the grouping key are dropped by the column selection.
    per_rank = joined.groupby(rank)[sample_columns].agg("sum")

    # Keep the token after the first "__". Labels that carry no rank prefix
    # are kept unchanged instead of raising IndexError.
    cleaned_names = []
    for raw_name in per_rank.index:
        parts = raw_name.split("__")
        cleaned_names.append(parts[1] if len(parts) > 1 else raw_name)
    per_rank.index = cleaned_names
    per_rank.index.name = rank

    # Now, we can run it through the annotation pipeline.
    names_df = generate_names_df(names_db_path, load_pickle=True)
    annotated, _unannotated = convert_jams_to_taxid(per_rank.copy(), names_df)

    taxid_df = annotated[["tax_id"]]
    annotated = annotated.drop(columns=["tax_id"])

    save_jams_to_csv(annotated, taxid_df, output_dir, rank.lower())

# clean_jams_join("/Volumes/TBHD_share/valencia/pipelines/amos/nibsc/hilo/jams2022/beta_output/hilo_Relabund_PPM.xlsx", "Genus")
# clean_jams_join("/Volumes/TBHD_share/valencia/pipelines/amos/nibsc/hilo/jams2022/beta_output/hilo_Relabund_PPM.xlsx", "Species")


# Main

In [8]:
# Main
# rank = "species"
paths = make_data_list()
for p in paths:
    # bmock12 uses a single sample, so jamsbeta was not run.
    # Use the clean_jams function on the csv dump from the R session.
    if "bmock12" in p.path:
        continue

    # Processing of the older `p.jams` export is disabled; only the
    # `jams202212` export is converted, and only when a path is set.
    if p.jams202212 == "":
        continue

    jams_output_dir = os.path.join(p.path, "jams202212")
    for jams_rank in ("genus", "species"):
        clean_jams_join(p.jams202212, rank=jams_rank,
                        output_dir=jams_output_dir)

# data_path = hmpTongue.jams
# output_dir = "pipelines/hmp/tongue/jams"

# clean_jams_beta(data_path, rank="species", output_dir=output_dir)
# clean_jams_beta(data_path, rank="genus", output_dir=output_dir)
# clean_jams_beta_higher("/Volumes/NRTS_share/SMS_NIAID_0162/fqfiles/Batch1/jams/brain_jams/brainjams_Relabund_PPM.xlsx")


/Volumes/TBHD_share/valencia/pipelines/microbio_spectrum/CLEANED/pipelines/jams2022/beta_output/filtered_PPM.xlsx
The pkl file was last modified (and hopefully generated) on 2023-02-14 18:37:13.183117+00:00
/Volumes/TBHD_share/valencia/pipelines/microbio_spectrum/CLEANED/pipelines/jams2022/beta_output/filtered_PPM.xlsx
The pkl file was last modified (and hopefully generated) on 2023-02-14 18:37:13.183117+00:00
/Volumes/TBHD_share/valencia/pipelines/amos/nibsc/mixed/jams2022/beta_output/filtered_PPM.xlsx
The pkl file was last modified (and hopefully generated) on 2023-02-14 18:37:13.183117+00:00
/Volumes/TBHD_share/valencia/pipelines/amos/nibsc/mixed/jams2022/beta_output/filtered_PPM.xlsx
The pkl file was last modified (and hopefully generated) on 2023-02-14 18:37:13.183117+00:00
/Volumes/TBHD_share/valencia/pipelines/amos/nibsc/hilo/jams2022/beta_output/filtered_PPM.xlsx
The pkl file was last modified (and hopefully generated) on 2023-02-14 18:37:13.183117+00:00
/Volumes/TBHD_share/val