In [5]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from utils.ncbi.names import split_bio, map_and_add_tax_ids, standardize_core, generate_names_df, names_db_path, find_tax_id
from typing import Tuple

# Maps the one-letter rank prefix found in the taxonomy strings (ex: "g" in g__Bacteroides)
# to the full rank name used when naming the output files.
taxonomy_dict = dict(
    s="species",
    g="genus",
    f="family",
    o="order",
    c="class",
    p="phylum",
    k="kingdom",
)

In [13]:
# We want a csv file with genus, relative abundance.
def clean_biobakery_with_taxid(df, rank="g") -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Collapse a lineage-indexed abundance table to a single rank and extract the matching tax ids.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose index holds pipe-separated lineages
        (ex: ...|g__Bacteroides|s__Bacteroides_vulgatus) and which has a "TAX_ID" column
        holding the pipe-separated ids (#|#|# ...) in the same order as the lineage.
        The input is not modified.
    rank : str
        One-letter rank prefix to collapse to (the part in front of "__").

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        grouped  : numeric columns summed per clade name and divided by 100; each column is
                   renamed to the text before its first underscore (ex: s1_profile -> s1).
        taxid_df : one "TAX_ID" value per clade name (first occurrence wins).

    Raises
    ------
    KeyError
        If `df` has no "TAX_ID" column.

    Notes
    -----
    Rows whose lineage has no clade at `rank` (for example an UNCLASSIFIED row) are skipped so
    that names, ids and values stay aligned. Every remaining row is summed, so the table should
    hold a single taxonomic depth; rows at several depths would be counted once per depth.
    """
    prefix = f"{rank}__"

    row_positions = []  # positional row numbers of the rows that carry the requested rank
    new_index = []
    new_ids = []
    for row_pos, (lineage, lineage_ids) in enumerate(zip(df.index.to_list(), df["TAX_ID"].to_list())):
        clades = lineage.split("|")
        clade_pos = next((p for p, clade in enumerate(clades) if clade.startswith(prefix)), None)
        if clade_pos is None:
            # No clade at this rank: skip the row instead of breaking the index assignment below.
            continue

        row_positions.append(row_pos)
        # Strip only the leading marker; the name itself may contain the same characters.
        new_index.append(clades[clade_pos][len(prefix):])
        # The taxid sits at the same position as the clade inside its lineage.
        new_ids.append(lineage_ids.split("|")[clade_pos])

    # Positional selection keeps this safe even when index labels repeat.
    data = df.iloc[row_positions].copy()
    data.index = new_index

    # Kept separate from the numeric table because tax ids must not be summed.
    taxid_df = pd.DataFrame({"TAX_ID": new_ids}, index=new_index)

    # Sum the rows where the genus/species is the same (the text TAX_ID column is dropped here).
    grouped = data.groupby(data.index).sum(numeric_only=True)

    # Columns are of the form sampleid_###; keep only the sample id.
    grouped.columns = [name.split("_")[0] for name in grouped.columns.to_list()]

    # Divide all the values by 100.
    grouped = grouped / 100

    # The same clade can appear on several rows; keep the first tax id seen for it.
    taxid_df = taxid_df[~taxid_df.index.duplicated(keep="first")]

    return grouped, taxid_df

# We want to save to csv, but we want a csv for each column.
def save_to_csv(df, taxid_df, output_path, rank="s"):
    """
    Write one annotated CSV per column of `df`.

    Parameters
    ----------
    df : pd.DataFrame
        Abundance table; each column is written to its own file.
    taxid_df : pd.DataFrame
        Frame sharing `df`'s index; it is left-joined onto every column before saving.
    output_path : path-like
        Directory receiving the files. It is created if it does not exist yet.
    rank : str
        Key of `taxonomy_dict`; the full rank name is used in the file name and as the
        label of the index column.

    Raises
    ------
    KeyError
        If `rank` is not a key of `taxonomy_dict`.
    """
    # Resolve the rank first so an invalid rank fails before anything touches the disk.
    rank_name = taxonomy_dict[rank]

    # An empty path means "current directory"; os.makedirs("") would raise, so guard it.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    for sample in df.columns.to_list():
        # Double brackets keep a DataFrame (not a Series) so the tax ids can be joined on.
        col = df[[sample]].join(taxid_df, how="left")

        display(col.head())

        # str() lets non-string column labels (ex: integers) be used in the file name.
        file_name = f"{str(sample).upper()}_{rank_name}_relabund_annotated.csv"
        col.to_csv(os.path.join(output_path, file_name), index_label=f"{rank_name}")

In [9]:
# The merged file does not include the taxids.
# Therefore, we will need to standardize the names and attach the taxids afterwards.
def clean_biobakery_merged(df: pd.DataFrame, rank="g"):
    """
    Collapse a lineage-indexed abundance table (without tax ids) to a single rank.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose index holds pipe-separated lineages
        (ex: ...|g__Bacteroides|s__Bacteroides_vulgatus). The input is not modified.
    rank : str
        One-letter rank prefix to collapse to (the part in front of "__").

    Returns
    -------
    pd.DataFrame
        Numeric columns summed per clade name and divided by 100; each column is renamed to
        the text before its first underscore (columns are of the form sampleid_###).

    Notes
    -----
    Rows whose lineage has no clade at `rank` are skipped so that names and values stay
    aligned. Every remaining row is summed, so the table should hold a single taxonomic
    depth; rows at several depths would be counted once per depth.
    """
    prefix = f"{rank}__"

    row_positions = []  # positional row numbers of the rows that carry the requested rank
    new_index = []
    for row_pos, lineage in enumerate(df.index.to_list()):
        # First clade of the lineage that sits at the requested rank, if there is one.
        clade = next((part for part in lineage.split("|") if part.startswith(prefix)), None)
        if clade is None:
            # No clade at this rank: skip the row instead of breaking the index assignment below.
            continue

        row_positions.append(row_pos)
        # Strip only the leading marker; the name itself may contain the same characters.
        new_index.append(clade[len(prefix):])

    # Positional selection keeps this safe even when index labels repeat.
    data = df.iloc[row_positions].copy()
    data.index = new_index

    # Sum the rows where the genus/species is the same.
    grouped = data.groupby(data.index).sum(numeric_only=True)

    # Columns are of the form sampleid_###; keep only the sample id.
    grouped.columns = [name.split("_")[0] for name in grouped.columns.to_list()]

    # Divide all the values by 100.
    grouped = grouped / 100

    return grouped

In [10]:
def standardize_merged(df: pd.DataFrame):
    """
    Standardize the names of a merged table and attach the taxids.

    Takes the merged dataframe (converted to RA with clean_biobakery_merged),
    standardizes the names and adds the taxids.

    Parameters
    ----------
    df : pd.DataFrame
        Output of clean_biobakery_merged.

    Returns
    -------
    (annotated, taxid_df)
        annotated : the frame returned by map_and_add_tax_ids without its "tax_id" column.
        taxid_df  : the "tax_id" column on its own, sharing annotated's index.
    """
    # NOTE(review): load_pickle=True presumably reads a pickled names table; make sure that
    # file comes from a trusted location.
    names_df = generate_names_df(names_db_path, load_pickle=True)

    split_names = split_bio(df)

    # set_index returns a new frame, so the object handed back by standardize_core is left untouched.
    standard_df = standardize_core(input_df=df, split_names=split_names).set_index("split_name")

    annotated = map_and_add_tax_ids(df=standard_df, names_df=names_df)

    # We want to split the tax_id column off the dataframe.
    taxid_df = annotated[["tax_id"]]
    annotated = annotated.drop(columns=["tax_id"])

    return annotated, taxid_df

### Main Code
Run the cells below to clean the biobakery output. Set `rank` to the desired taxonomic rank (one of the keys of `taxonomy_dict`, e.g. `"g"` for genus or `"s"` for species).

In [6]:
# Input table with the lineages, their tax ids and one sample column.
data_path = "/Volumes/TBHD_share/valencia/pipelines/bmock12/biobakery4/metaphlan/main/species_relab.txt"
# Relative-path alternative: "pipelines/bmock12/biobakery4/species_relab.txt"
output_dir = "pipelines/bmock12/biobakery4"

# Fail early, naming the missing file, rather than letting read_csv raise further down.
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Data file does not exist: {data_path}")

# Column 0 (the lineage) becomes the index; columns 1 and 2 are TAX_ID and the sample.
data = pd.read_csv(data_path, sep="\t", index_col=0, usecols=[0,1,2])
display(data.head())

Unnamed: 0_level_0,TAX_ID,s1_profile
# taxonomy,Unnamed: 1_level_1,Unnamed: 2_level_1
k__Bacteria|p__Bacteroidetes|c__Flavobacteriia|o__Flavobacteriales|f__Flavobacteriaceae|g__Muricauda|s__Muricauda_sp_ES_050,2|976|117743|200644|49546|111500|1798204,74.33332
k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Cohaesibacteraceae|g__Cohaesibacter|s__Cohaesibacter_sp_ES_047,2|1224|28211|356|655351|655352|1798205,19.36211
k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhodobacterales|f__Rhodobacteraceae|g__Thioclava|s__Thioclava_sediminum,2|1224|28211|204455|31989|285107|1915319,2.05531
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Micromonosporales|f__Micromonosporaceae|g__Micromonospora|s__Micromonospora_echinofusca,2|201174|1760|85008|28056|1873|47858,2.01187
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Micromonosporales|f__Micromonosporaceae|g__Micromonospora|s__Micromonospora_echinaurantiaca,2|201174|1760|85008|28056|1873|47857,1.6624


In [15]:
# Collapse the loaded table to the chosen rank, then write one annotated csv per sample.
rank = "s"
names, taxids = clean_biobakery_with_taxid(df=data, rank=rank)
save_to_csv(df=names, taxid_df=taxids, output_path=output_dir, rank=rank)

Unnamed: 0,s1,TAX_ID
Cohaesibacter_sp_ES_047,0.193621,1798205
Halomonas_sp_HL_48,0.000401,1479235
Micromonospora_echinaurantiaca,0.016624,47857
Micromonospora_echinofusca,0.020119,47858
Muricauda_lutimaris,0.005349,475082


In [19]:
# Merged table (no tax ids in the file): clean it, standardize the names, attach the
# tax ids and write one annotated csv per sample.
data_path_merged = "/Volumes/TBHD_share/valencia/pipelines/bmock12/NEPHELE/bio/outputs/metaphlan/merged/species_abundance.txt"
data_merged_output = "pipelines/bmock12/biobakery3"
rank = "s"

# NOTE: `data` and `rank` are rebound here, replacing the values set by the earlier cells.
data = pd.read_csv(data_path_merged, sep="\t", index_col=0)
cleaned = clean_biobakery_merged(df=data, rank=rank)

annotated, ids = standardize_merged(df=cleaned)
save_to_csv(df=annotated, taxid_df=ids, output_path=data_merged_output, rank=rank)

['Cohaesibacter_sp._ES_047', 'Halomonas_sp._HL_48', 'Marinobacter_sp._ELB17', 'Micromonospora_echinaurantiaca', 'Micromonospora_echinofusca', 'Muricauda_lutimaris', 'Muricauda_sp._ES_050', 'Thioclava_nitratireducens', 'Thioclava_sediminum']


Unnamed: 0_level_0,1,tax_id
split_name,Unnamed: 1_level_1,Unnamed: 2_level_1
COHAESIBACTER_SP._ES-047,0.204007,1798205
HALOMONAS_SP._HL-48,0.00057,1479235
MARINOBACTER_SP._ELB17,2e-06,270374
MICROMONOSPORA_ECHINAURANTIACA,0.017007,47857
MICROMONOSPORA_ECHINOFUSCA,0.020925,47858
