In [2]:
import pandas as pd
import numpy as np
import os
import openpyxl

In [15]:
replacement_dict = {"LKT__s__Anaerobutyricum_hallii": "LKT__s__Eubacterium_hallii"}

In [17]:
# TODO: Update replacement dict for JAMS.

tourlousse_replacement_dict = {"Acetivibrio_thermocellus": "Clostridium_thermocellum"}

def clean_jams(input_file: str, rank: str = "Genus", input_type="csv"):
    """
    This function cleans the output from JAMSalpha in the tourlousse dataset. From now on, use the JAMSbeta function.
    """
    df = pd.DataFrame()
    if input_type == "csv":
        df = pd.read_csv(input_file, index_col=0)
    elif input_type == "excel":
        df = pd.read_excel(input_file, index_col=0)
    else:
        raise Exception("Input type not recognized.")
    
    df["RA"] = df["NumBases"] / df["NumBases"].sum()
    # display(df.head())
    species_df = df[["Species", "RA"]].groupby("Species").sum()
    species_df.sort_values("RA", ascending=False, inplace=True)

    # We need to remove g__ and s__ from the index names
    # genus_df.index = genus_df.index.str.replace("g__", "")
    species_df.index = species_df.index.str.replace("s__", "")

    # We need to replace the names of the species that have changed to the old ones.
    species_names = species_df.index.tolist()
    for i, name in enumerate(species_names):
        if name in tourlousse_replacement_dict:
            species_names[i] = tourlousse_replacement_dict[name]
    
    species_df.index = species_names

    if rank == "Genus":
        # We need to split the species names into genus and species on the _ character.
        species_names = species_df.index.to_list()
        genus_names = [x.split("_")[0] for x in species_names]

        species_df["Genus"] = genus_names

    output_dir = os.path.dirname(input_file)
    file_name = os.path.basename(input_file).split(".")[0]

    display(species_df.loc("Clostridium_thermocellum"))

    # genus_df.to_csv(os.path.join(output_dir, f"{file_name}_{rank.lower()}_relabund.csv"))

clean_jams("pipelines/camisimGI/jams/s2.csv", "Genus")

ValueError: No axis named Clostridium_thermocellum for object type DataFrame

In [23]:
def clean_jams_beta(input_file: str, rank="genus", output_dir=""):
    """Clean JAMS output excel file and save them in the preferred format. 
    It will generate a separate file for each sample.
    
    Parameters
    ----------
    input_dir : str
        Path to directory containing JAMS output files.
    rank : str
        Taxonomic rank.
    output_dir : str
        Path to directory where output files will be saved.
    """

    df = pd.read_excel(input_file, index_col=0, sheet_name=1)
    # Make everything into relative abundances
    df = df / df.sum(axis=0)
    # Convert PPM to percentage.
    # df = df / 10000


    # We need to find anything with s__ or g__ in the row.names.
    df.reset_index(inplace=True)

    # Replace from the replacement dictionary.
    df["row.names"] = df["row.names"].replace(replacement_dict)
    df.to_csv("test.csv")

    # If s__ is in the name, we need to get the genus name.
    index_names = []
    if rank == "genus":
        df = df.where(df["row.names"].str.contains("g__|s__")).dropna()
        lkt = df["row.names"].to_list()
        for i in lkt:
            if "s__" in i:
                index_names.append(i.split("s__")[1].split("_")[0])
            else:
                index_names.append(i.split("g__")[1].split("_")[0])

    elif rank == "species":
        df = df.where(df["row.names"].str.contains("s__")).dropna()
        lkt = df["row.names"].to_list()
        for i in lkt:
            index_names.append(i.split("s__")[1])
            
        index_names = [i.replace("_", " ") for i in index_names]


    df.index = index_names
    df.drop(columns=["row.names"], inplace=True)

    # Sum all of the rows with the same index name.
    df = df.groupby(df.index).sum()

    # Save each column as a separate file.
    if output_dir is not None:
        for col in df.columns:
            df[col].to_csv(os.path.join(output_dir, f"{col}_{rank}_relabund.csv"), index_label=f"{rank}")

amos_path = "/Volumes/TBHD_share/valencia/pipelines/amos/nibsc/hilo/jams/beta_output/hilo_Relabund_PPM.xlsx"

clean_jams_beta(amos_path, rank="genus", output_dir="pipelines/amos/hilo/jams")
