In [11]:
import sys
sys.path.append("../../")

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from utils.ncbi.names import split_bio, map_and_add_tax_ids, standardize_core, generate_names_df, names_db_path, find_tax_id
from typing import Tuple
from utils.data_paths import *

# Maps the one-letter rank prefix used in taxonomy strings (e.g. "g" in "g__Bacteroides")
# to the full rank name. save_to_csv uses the full name in output file names and as the
# index label of the written CSV.
taxonomy_dict = {"s": "species", "g": "genus", "f": "family", "o": "order", "c": "class", "p": "phylum", "k": "kingdom"}

In [12]:
# Taxon names to rewrite before standardization: standardize_merged replaces any index
# label found as a key here with the mapped value.
replacement_dict = {"Clostridium_clostridioforme": "Clostridium_clostridiiforme"}

# Second-stage corrections: standardize_merged applies these to the "split_name" index
# (after standardize_core has run) and before tax IDs are attached.
split_name_replacement_dict = {
    "LACHNOSPIRACEAE_UNCLASSIFIED": "UNCLASSIFIED_LACHNOSPIRACEAE",
    "CLOSTRIDIALES_FAMILY_XIII-INCERTAE-SEDIS-UNCLASSIFIED":"CLOSTRIDIALES_FAMILY_XIII--INCERTAE-SEDIS",
    "ACTINOMYCES_ODONTOLYTICUS": "SCHAALIA_ODONTOLYTICA",
}

In [13]:
# We want a csv file with genus, relative abundance.
def clean_biobakery_with_taxid(df, rank="g") -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Collapse a taxonomic profile that carries a "TAX_ID" column to a single rank.

    Parameters
    ----------
    df : pd.DataFrame
        Indexed by "|"-separated lineage strings, e.g.
        "...|g__Bacteroides|s__Bacteroides_vulgatus", with a "TAX_ID" column
        holding the matching "|"-separated ids ("#|#|#"). The remaining numeric
        columns are summed per taxon. The input frame is not modified.
    rank : str
        One-letter rank prefix to extract ("g" selects the "g__..." segment).

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        ``grouped``: numeric columns summed per taxon name, column names cut at
        the first "_" and all values divided by 100.
        ``taxid_df``: a single "TAX_ID" column indexed by taxon name, keeping
        the first id seen for each name.

    Raises
    ------
    ValueError
        If any lineage has no segment for ``rank``.
    """
    prefix = f"{rank}__"
    data = df.copy()

    new_index = []
    new_ids = []
    missing = []
    # Walk lineages and their id strings together; the id of a taxon sits at the
    # same position in the id string as the taxon does in the lineage.
    for lineage, id_lineage in zip(data.index.to_list(), data["TAX_ID"].to_list()):
        id_parts = id_lineage.split("|")
        for position, part in enumerate(lineage.split("|")):
            if part.startswith(prefix):
                # Strip only the leading marker: str.replace would also delete
                # any later occurrence inside the name (e.g. "g__Bug__x").
                new_index.append(part[len(prefix):])
                new_ids.append(id_parts[position])
                break
        else:
            missing.append(lineage)

    # Without this check the index assignment below fails with an opaque
    # length-mismatch error that does not say which rows are at fault.
    if missing:
        raise ValueError(
            f"{len(missing)} row(s) have no '{prefix}' segment, e.g.: {missing[:5]}"
        )

    data.index = new_index

    # Kept separate from the abundances because the ids must not be summed.
    taxid_df = pd.DataFrame(index=new_index, data=new_ids, columns=["TAX_ID"])

    # Sum the rows where the genus/species is the same.
    grouped = data.groupby(data.index).sum(numeric_only=True)

    # Column names are of the form sampleid_###; keep only the sample id.
    grouped.columns = [name.split("_")[0] for name in grouped.columns.to_list()]

    # Divide all the values by 100.
    grouped = grouped / 100

    # The same taxon can appear on several rows; keep the first id for each name.
    taxid_df = taxid_df[~taxid_df.index.duplicated(keep="first")]

    return grouped, taxid_df

# We want to save to csv, but we want a csv for each column.
def save_to_csv(df, taxid_df, output_path, rank="s"):
    """
    Write one annotated CSV per column of ``df``.

    Each column is left-joined with ``taxid_df`` on the index and written to
    ``<output_path>/<COLUMN>_<rank name>_relabund_annotated.csv``, where the
    column name is upper-cased and the rank name comes from ``taxonomy_dict``.
    The rank name is also used as the label of the index column.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose columns are written out one file each.
    taxid_df : pd.DataFrame
        Columns to attach to every output file, aligned on the index.
    output_path : str
        Destination directory; created if it does not exist yet.
    rank : str
        One-letter rank key of ``taxonomy_dict``.

    Raises
    ------
    KeyError
        If ``rank`` is not a key of ``taxonomy_dict``.
    """
    # Resolve the rank once; it is the same for every file.
    rank_name = taxonomy_dict[rank]

    # Not every caller creates the destination first, and to_csv does not.
    os.makedirs(output_path, exist_ok=True)

    for sample in df.columns.to_list():
        # Double brackets keep a one-column DataFrame so it can be joined.
        annotated_column = df[[sample]].join(taxid_df, how="left")

        # TODO: an earlier version rescaled each column by the reciprocal of its
        # minimum non-zero value and rounded to integers. The reason is unknown,
        # so that step stays disabled.

        file_name = f"{sample.upper()}_{rank_name}_relabund_annotated.csv"
        annotated_column.to_csv(os.path.join(output_path, file_name), index_label=rank_name)

In [14]:
# In their great wisdom, they decided that the taxid should not be included in the file.
# Therefore, we will need to standardize the names and attach the taxids.
def clean_biobakery_merged(df: pd.DataFrame, rank="g"):
    """
    Collapse a merged biobakery table (after cutting into species_relab.txt) to one rank.

    Parameters
    ----------
    df : pd.DataFrame
        Indexed by "|"-separated lineage strings, e.g.
        "...|g__Bacteroides|s__Bacteroides_vulgatus". Numeric columns are summed
        per taxon. The input frame is not modified.
    rank : str
        One-letter rank prefix to extract ("g" selects the "g__..." segment).

    Returns
    -------
    pd.DataFrame
        Numeric columns summed per taxon name, column names cut at the first
        "_" and all values divided by 100. No tax ids are attached here.

    Raises
    ------
    ValueError
        If any lineage has no segment for ``rank``.
    """
    prefix = f"{rank}__"
    data = df.copy()

    new_index = []
    missing = []
    for lineage in data.index.to_list():
        # Strip only the leading marker: str.replace would also delete any
        # later occurrence inside the name (e.g. "g__Bug__x").
        label = next(
            (part[len(prefix):] for part in lineage.split("|") if part.startswith(prefix)),
            None,
        )
        if label is None:
            missing.append(lineage)
        else:
            new_index.append(label)

    # Without this check the index assignment below fails with an opaque
    # length-mismatch error that does not say which rows are at fault.
    if missing:
        raise ValueError(
            f"{len(missing)} row(s) have no '{prefix}' segment, e.g.: {missing[:5]}"
        )

    data.index = new_index

    # Sum the rows where the genus/species is the same.
    grouped = data.groupby(data.index).sum(numeric_only=True)

    # Column names are of the form sampleid_###; keep only the sample id.
    grouped.columns = [name.split("_")[0] for name in grouped.columns.to_list()]

    # Divide all the values by 100.
    grouped = grouped / 100

    return grouped

In [15]:
def standardize_merged(df: pd.DataFrame):
    """
    Standardize the taxon names of a merged table and attach tax ids.

    Takes the merged dataframe (converted to RA with clean_biobakery_merged),
    applies ``replacement_dict`` to its index, runs it through ``split_bio`` and
    ``standardize_core``, applies ``split_name_replacement_dict`` to the
    resulting "split_name" index, and finally calls ``map_and_add_tax_ids``
    against the names table loaded from ``names_db_path``.

    The caller's ``df`` is left untouched; all relabelling is done on a copy.

    Parameters
    ----------
    df : pd.DataFrame
        Table indexed by taxon name.

    Returns
    -------
    (annotated, taxid_df)
        ``annotated`` is the result of ``map_and_add_tax_ids`` without its
        "tax_id" column; ``taxid_df`` holds just that "tax_id" column.

    Raises
    ------
    ValueError
        If any "tax_id" is NaN. The offending rows are displayed first.
    """
    names_df = generate_names_df(names_db_path, load_pickle=True)

    # Relabel on a copy so the frame passed in by the caller is not altered.
    renamed = df.copy()
    renamed.index = [replacement_dict.get(name, name) for name in renamed.index.to_list()]

    split_names = split_bio(renamed)

    standard_df = standardize_core(input_df=renamed, split_names=split_names)

    # Index by split_name so the second round of replacements can be applied.
    standard_df = standard_df.set_index("split_name")
    standard_df.index = [split_name_replacement_dict.get(name, name) for name in standard_df.index]

    annotated = map_and_add_tax_ids(df=standard_df, names_df=names_df)

    # Split the tax_id column off; list-indexing yields an independent frame.
    taxid_df = annotated[["tax_id"]]

    # A missing tax id means a name could not be mapped: show the rows and stop.
    if taxid_df.isna().any().any():
        display(taxid_df[taxid_df.isna().any(axis=1)])
        raise ValueError("There are nan values in the tax_id column.")

    annotated = annotated.drop(columns=["tax_id"])

    return annotated, taxid_df

### Main Code
Run the function below to clean the biobakery output. Change `rank` to select the desired taxonomic rank.

In [16]:
# data_path = hmpTongue.biobakery4
# # data_path = "pipelines/bmock12/biobakery4/species_relab.txt"
# output_dir = "pipelines/hmp/tongue/bio4/"
# if not os.path.exists(data_path):
#     raise Exception("Data file does not exist!")
# # data_path = "pipelines/bmock12/biobakery4/species_relab.txt"
# data = pd.read_csv(data_path, sep="\t", index_col=0)
# # display(data.head())
# # print(data.shape)

In [17]:
# rank = "s"
# names, taxids = clean_biobakery_with_taxid(data, rank=rank)
# save_to_csv(names, taxids, output_dir, rank=rank)

In [18]:
# Dataset descriptors to process; clean_all reads .path, .biobakery4 and .biobakery3
# from each entry.
# NOTE(review): make_data_list is not defined in this notebook — presumably it comes
# from the `utils.data_paths` star import; confirm.
dpths = make_data_list()

# We have an issue with the HMP. Let's make a replacement dictionary to fix this for now.
# clean_all applies it to datasets whose path contains "hmp"; every label listed here
# is relabelled to the single name "Unclassified".
hmp_dict = {
    "Clostridiales_unclassified": "Unclassified",
    "Firmicutes_unclassified": "Unclassified",
    "Lachnospiraceae_unclassified": "Unclassified",
    "Ruminococcaceae_unclassified": "Unclassified",
}

# clean_all applies this to datasets whose path contains "mixed" or "hilo".
amos_dict = {
    "Prevotella_copri_clade_A": "Prevotella_copri",
}

def main(data_path: str, output_dir: str, rank: str, replace_dict: dict = None) -> None:
    """
    Run the whole cleaning pipeline for one merged profile at one rank.

    Reads the tab-separated file at ``data_path`` (first column as index),
    collapses it to ``rank`` with clean_biobakery_merged, optionally relabels
    index entries via ``replace_dict``, standardizes names and attaches tax ids
    with standardize_merged, then writes one CSV per column via save_to_csv.

    Parameters
    ----------
    data_path : str
        Path of the merged profile; also echoed to stdout.
    output_dir : str
        Directory handed to save_to_csv.
    rank : str
        One-letter rank prefix ("g", "s", ...).
    replace_dict : dict, optional
        Index label -> replacement label; skipped when None.
    """
    print(data_path)
    raw_profile = pd.read_csv(data_path, sep="\t", index_col=0)

    rank_table = clean_biobakery_merged(raw_profile, rank=rank)

    # NOTE(review): mapping several labels to the same name leaves duplicate index
    # labels here; whether standardize_merged merges them is not visible from this
    # cell — confirm.
    if replace_dict is not None:
        rank_table = rank_table.rename(index=replace_dict)

    annotated, ids = standardize_merged(rank_table)

    save_to_csv(annotated, ids, output_path=output_dir, rank=rank)

def clean_all():
    """
    Clean every dataset in ``dpths`` for biobakery4 and then biobakery3.

    For each pipeline and each dataset that has a profile path, the output
    directory is ``<dataset.path>/bio4`` (or ``bio3``) when that already exists,
    otherwise ``<dataset.path>/biobakery4`` (or ``biobakery3``), which is created
    on demand. ``main`` is then run at genus ("g") and species ("s") rank.

    Datasets whose profile path is the empty string are skipped before any
    directory is created, so no empty output folders are left behind.
    """
    def _replacements_for(dataset_path):
        # Pick the dataset-specific relabelling table from the path, if any.
        if "hmp" in dataset_path:
            return hmp_dict
        if "mixed" in dataset_path or "hilo" in dataset_path:
            return amos_dict
        return None

    # (preferred existing dir, fallback dir to create, attribute holding the profile path)
    pipelines = (
        ("bio4", "biobakery4", "biobakery4"),
        ("bio3", "biobakery3", "biobakery3"),
    )

    # All datasets are processed for biobakery4 first, then for biobakery3.
    for short_dir, long_dir, profile_attr in pipelines:
        for dataset in dpths:
            profile_path = getattr(dataset, profile_attr)
            if profile_path == "":
                continue

            output_path = os.path.join(dataset.path, short_dir)
            if not os.path.exists(output_path):
                output_path = os.path.join(dataset.path, long_dir)
                os.makedirs(output_path, exist_ok=True)
            print(output_path)

            replace_dict = _replacements_for(dataset.path)
            for rank in ("g", "s"):
                main(profile_path, output_dir=output_path, rank=rank, replace_dict=replace_dict)

# Run the cleaning for every dataset in dpths; this executes when the cell is run.
clean_all()


/Users/valenciaem/coding/pipelines/pipelines/bmock12/biobakery4
/Volumes/TBHD_share/valencia/pipelines/bmock12/biobakery4/metaphlan/merged/species_relab.txt
The pkl file was last modified (and hopefully generated) on 2023-02-14 18:37:13.183117+00:00
/Volumes/TBHD_share/valencia/pipelines/bmock12/biobakery4/metaphlan/merged/species_relab.txt
The pkl file was last modified (and hopefully generated) on 2023-02-14 18:37:13.183117+00:00
/Users/valenciaem/coding/pipelines/pipelines/camisimGI/biobakery4
/Volumes/TBHD_share/cami_data/bio4/metaphlan/merged/species_relab.txt
The pkl file was last modified (and hopefully generated) on 2023-02-14 18:37:13.183117+00:00
/Volumes/TBHD_share/cami_data/bio4/metaphlan/merged/species_relab.txt
The pkl file was last modified (and hopefully generated) on 2023-02-14 18:37:13.183117+00:00
/Users/valenciaem/coding/pipelines/pipelines/tourlousse/biobakery4
/Volumes/TBHD_share/valencia/pipelines/microbio_spectrum/CLEANED/pipelines/bio4/metaphlan/merged/species_

In [19]:
def clean_eg(
    bio_eg_path="/Volumes/TBHD_share/valencia/pipelines/NIST/pipelines/bio3_eg/metaphlan/merged/species_relab.txt",
    output_eg="../../pipelines/nist/biobakery3/",
):
    """
    Clean the extra "bio_eg" profile, which is handled separately from clean_all.

    Runs ``main`` at genus ("g") and species ("s") rank with no replacement
    dictionary.

    Parameters
    ----------
    bio_eg_path : str
        Path of the merged profile to read. Defaults to the location that was
        previously hard-coded.
    output_eg : str
        Directory for the generated CSV files; created if missing. Defaults to
        the location that was previously hard-coded.
    """
    # We also have to add the bio_eg.
    # clean_all creates its output directories; do the same here so writing
    # does not fail on a fresh checkout.
    os.makedirs(output_eg, exist_ok=True)

    for rank in ("g", "s"):
        main(bio_eg_path, output_eg, rank=rank)

# Process the extra bio_eg profile; this executes when the cell is run.
clean_eg()

/Volumes/TBHD_share/valencia/pipelines/NIST/pipelines/bio3_eg/metaphlan/merged/species_relab.txt
The pkl file was last modified (and hopefully generated) on 2023-02-14 18:37:13.183117+00:00
/Volumes/TBHD_share/valencia/pipelines/NIST/pipelines/bio3_eg/metaphlan/merged/species_relab.txt
The pkl file was last modified (and hopefully generated) on 2023-02-14 18:37:13.183117+00:00
