In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from utils.ncbi.names import split_bio, map_and_add_tax_ids, standardize_core, generate_names_df, names_db_path, find_tax_id
from typing import Tuple
from utils.data_paths import *

# Maps the one-letter rank prefix found in lineage tokens (e.g. the "g" in
# "g__Bacteroides") to the full rank name used in output file names.
taxonomy_dict = {
    "s": "species",
    "g": "genus",
    "f": "family",
    "o": "order",
    "c": "class",
    "p": "phylum",
    "k": "kingdom",
}

In [2]:
# Index names to rewrite before standardization: each key is a name as it
# appears in the input table, each value is the name that replaces it.
replacement_dict = {
    "Clostridium_clostridioforme": "Clostridium_clostridiiforme",
}

In [3]:
# We want a csv file with genus, relative abundance.
def clean_biobakery_with_taxid(df, rank="g") -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Collapse a lineage-indexed abundance table to a single taxonomic rank.

    Args:
        df: Table whose index holds "|"-separated lineage strings
            (e.g. ``...|g__Bacteroides|s__Bacteroides_vulgatus``) and which has
            a ``TAX_ID`` column holding "|"-separated ids aligned position by
            position with the lineage tokens. The remaining numeric columns are
            summed per taxon. ``df`` itself is not modified.
        rank: One-letter rank prefix to extract (the token starting with
            ``f"{rank}__"``).

    Returns:
        A ``(grouped, taxid_df)`` tuple. ``grouped`` is indexed by the taxon
        name at ``rank``, holds the numeric columns summed per taxon and divided
        by 100, and has its column names truncated at the first underscore.
        ``taxid_df`` is indexed by the same taxon names and has a single
        ``TAX_ID`` column (first id seen for each name).

    Rows whose lineage has no token for ``rank`` are dropped; previously such a
    row made the index assignment fail with a length mismatch.

    NOTE(review): every row that carries the requested rank is summed, so an
    input mixing several lineage depths (e.g. genus-level and species-level
    rows) would count abundance more than once - confirm the input is
    restricted to a single depth.
    """
    prefix = f"{rank}__"

    kept_rows = []  # positional indices of the rows that contain the rank
    new_index = []  # taxon name at the requested rank, one per kept row
    new_ids = []    # matching tax id, one per kept row

    # Lineages are of the form ...|g__Bacteroides|s__Bacteroides_vulgatus and
    # tax ids of the form #|#|#..., so both are split on "|".
    for row_pos, (lineage, lineage_ids) in enumerate(zip(df.index, df["TAX_ID"])):
        ids = lineage_ids.split("|")
        for clade_pos, clade in enumerate(lineage.split("|")):
            if clade.startswith(prefix):
                kept_rows.append(row_pos)
                # Strip only the leading prefix, not later occurrences.
                new_index.append(clade[len(prefix):])
                # The tax id sits at the same position as the lineage token.
                new_ids.append(ids[clade_pos])
                break

    # Keep only the rows for which a name was found so the lengths agree.
    data = df.iloc[kept_rows].copy()
    data.index = new_index

    # Tax ids are kept in a separate frame because they must not be summed.
    taxid_df = pd.DataFrame(index=new_index, data=new_ids, columns=["TAX_ID"])

    # Sum the rows where the genus/species is the same.
    grouped = data.groupby(data.index).sum(numeric_only=True)

    # Columns are of the form sampleid_###; keep only the sample id.
    grouped.columns = [col.split("_")[0] for col in grouped.columns]

    # Divide all the values by 100.
    grouped = grouped / 100

    # The same taxon name can appear on several rows; keep its first tax id.
    taxid_df = taxid_df[~taxid_df.index.duplicated(keep="first")]

    return grouped, taxid_df

# We want to save to csv, but we want a csv for each column.
def save_to_csv(df, taxid_df, output_path, rank="s"):
    """Write one annotated CSV per column of ``df``.

    Args:
        df: Table with one column per sample, indexed by taxon name.
        taxid_df: Table sharing ``df``'s index whose columns are left-joined
            onto each sample column before writing.
        output_path: Directory that receives the files. It is created if it
            does not exist yet.
        rank: One-letter rank key into ``taxonomy_dict``; the full rank name is
            used in the file name and as the CSV index label.

    Each file is named ``<COLUMN UPPERCASED>_<rank name>_relabund_annotated.csv``.
    The head of every joined table is displayed as a side effect.
    """
    # to_csv does not create missing directories. An empty path means the
    # current directory, which os.makedirs would reject.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    rank_name = taxonomy_dict[rank]

    for sample in df.columns.to_list():
        # Single-column frame for this sample, with the tax ids joined on the index.
        sample_df = df[[sample]].join(taxid_df, how="left")

        display(sample_df.head())

        file_name = f"{sample.upper()}_{rank_name}_relabund_annotated.csv"
        sample_df.to_csv(os.path.join(output_path, file_name), index_label=f"{rank_name}")

In [4]:
# The merged file does not include the taxid.
# Therefore, we will need to standardize the names and attach the taxids.
def clean_biobakery_merged(df: pd.DataFrame, rank="g"):
    """Collapse a lineage-indexed abundance table to a single taxonomic rank.

    Args:
        df: Table whose index holds "|"-separated lineage strings
            (e.g. ``...|g__Bacteroides|s__Bacteroides_vulgatus``). Its numeric
            columns are summed per taxon. ``df`` itself is not modified.
        rank: One-letter rank prefix to extract (the token starting with
            ``f"{rank}__"``).

    Returns:
        A DataFrame indexed by the taxon name at ``rank`` holding the numeric
        columns summed per taxon and divided by 100, with column names
        truncated at the first underscore.

    Rows whose lineage has no token for ``rank`` are dropped; previously such a
    row made the index assignment fail with a length mismatch.

    NOTE(review): every row that carries the requested rank is summed, so an
    input mixing several lineage depths (e.g. genus-level and species-level
    rows) would count abundance more than once - confirm the input is
    restricted to a single depth.
    """
    prefix = f"{rank}__"

    kept_rows = []  # positional indices of the rows that contain the rank
    new_index = []  # taxon name at the requested rank, one per kept row

    for row_pos, lineage in enumerate(df.index):
        for clade in lineage.split("|"):
            if clade.startswith(prefix):
                kept_rows.append(row_pos)
                # Strip only the leading prefix, not later occurrences.
                new_index.append(clade[len(prefix):])
                break

    # Keep only the rows for which a name was found so the lengths agree.
    data = df.iloc[kept_rows].copy()
    data.index = new_index

    # Sum the rows where the genus/species is the same.
    grouped = data.groupby(data.index).sum(numeric_only=True)

    # Columns are of the form sampleid_###; keep only the sample id.
    grouped.columns = [col.split("_")[0] for col in grouped.columns]

    # Divide all the values by 100.
    grouped = grouped / 100

    return grouped

In [5]:
def standardize_merged(df: pd.DataFrame):
    """Standardize the taxon names of a relative-abundance table and attach tax ids.

    Args:
        df: Table indexed by taxon name, as returned by
            ``clean_biobakery_merged``. The caller's frame is not modified.

    Returns:
        A ``(annotated, taxid_df)`` tuple: ``annotated`` is the result of
        ``map_and_add_tax_ids`` without its ``tax_id`` column, and ``taxid_df``
        holds only that ``tax_id`` column.

    Raises:
        ValueError: If any ``tax_id`` is NaN after the mapping. The offending
            rows are printed first.
    """
    names_df = generate_names_df(names_db_path, load_pickle=True)

    # Apply the manual renames from replacement_dict on a copy, so the
    # caller's index is left untouched.
    renamed = df.copy()
    renamed.index = [replacement_dict.get(name, name) for name in df.index.to_list()]

    split_names = split_bio(renamed)

    standard_df = standardize_core(input_df=renamed, split_names=split_names)

    # Index by the standardized name column before mapping to tax ids.
    standard_df = standard_df.set_index("split_name")

    annotated = map_and_add_tax_ids(df=standard_df, names_df=names_df)

    # We want to split the tax_id column off the dataframe.
    taxid_df = annotated[["tax_id"]]

    # If any rows contain nan, then we raise an error.
    missing = taxid_df.isna().any(axis=1)
    if missing.any():
        # Print the rows that contain nan.
        print(taxid_df[missing])
        raise ValueError("There are nan values in the tax_id column.")

    annotated = annotated.drop(columns=["tax_id"])

    return annotated, taxid_df

### Main Code
Run the function below to clean the bioBakery output. Pass the desired taxonomic rank as `rank`.

In [6]:
# Input table and output directory used by the cells below.
data_path = amos_hilo.biobakery4
# data_path = "pipelines/bmock12/biobakery4/species_relab.txt"
output_dir = "pipelines/amos/hilo/bio4"

# Fail early, naming the missing path, instead of a less specific read error.
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Data file does not exist: {data_path}")

# Tab-separated table; the first column becomes the index.
data = pd.read_csv(data_path, sep="\t", index_col=0)
# display(data.head())
# print(data.shape)

In [7]:
# Alternative path for input tables that already carry a TAX_ID column (currently disabled):
# rank = "s"
# names, taxids = clean_biobakery_with_taxid(data, rank=rank)
# save_to_csv(names, taxids, output_dir, rank=rank)

In [8]:
def main(rank):
    """Run the pipeline for one rank: read, collapse to ``rank``, annotate, save.

    Reads the table at the module-level ``data_path`` and writes the per-sample
    CSV files into the module-level ``output_dir``.
    """
    raw_table = pd.read_csv(data_path, sep="\t", index_col=0)
    rank_table = clean_biobakery_merged(raw_table, rank=rank)

    annotated_table, taxid_table = standardize_merged(rank_table)
    save_to_csv(annotated_table, taxid_table, output_path=output_dir, rank=rank)

# Produce the genus-level outputs first, then the species-level outputs.
main("g")
main("s")

['Akkermansia', 'Alistipes', 'Anaerostipes', 'Bacteroides', 'Bifidobacterium', 'Blautia', 'Clostridium', 'Collinsella', 'Escherichia', 'Eubacterium', 'Faecalibacterium', 'Lactobacillus', 'Parabacteroides', 'Prevotella', 'Roseburia']


Unnamed: 0_level_0,SRR11487931,tax_id
split_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AKKERMANSIA,0.001403,239934
ALISTIPES,0.000103,239759
ANAEROSTIPES,0.012009,207244
BACTEROIDES,0.078641,816
BIFIDOBACTERIUM,0.40544,1678


Unnamed: 0_level_0,SRR11487932,tax_id
split_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AKKERMANSIA,0.001057,239934
ALISTIPES,0.000223,239759
ANAEROSTIPES,0.011465,207244
BACTEROIDES,0.082481,816
BIFIDOBACTERIUM,0.388847,1678


Unnamed: 0_level_0,SRR11487933,tax_id
split_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AKKERMANSIA,0.001553,239934
ALISTIPES,3.6e-05,239759
ANAEROSTIPES,0.01404,207244
BACTEROIDES,0.084279,816
BIFIDOBACTERIUM,0.375105,1678


Unnamed: 0_level_0,SRR11487934,tax_id
split_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AKKERMANSIA,0.0015,239934
ALISTIPES,5.4e-05,239759
ANAEROSTIPES,0.012488,207244
BACTEROIDES,0.08639,816
BIFIDOBACTERIUM,0.372791,1678


Unnamed: 0_level_0,SRR11487935,tax_id
split_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AKKERMANSIA,0.000937,239934
ALISTIPES,0.000118,239759
ANAEROSTIPES,0.014956,207244
BACTEROIDES,0.087929,816
BIFIDOBACTERIUM,0.3723,1678


['Akkermansia_muciniphila', 'Alistipes_finegoldii', 'Anaerostipes_hadrus', 'Bacteroides_thetaiotaomicron', 'Bacteroides_uniformis', 'Bifidobacterium_longum', 'Blautia_wexlerae', 'Clostridium_butyricum', 'Collinsella_aerofaciens', 'Escherichia_coli', 'Eubacterium_hallii', 'Faecalibacterium_prausnitzii', 'Lactobacillus_paragasseri', 'Parabacteroides_distasonis', 'Prevotella_copri', 'Prevotella_melaninogenica', 'Roseburia_hominis', 'Roseburia_intestinalis']


Unnamed: 0_level_0,SRR11487931,tax_id
split_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AKKERMANSIA_MUCINIPHILA,0.001403,239935
ALISTIPES_FINEGOLDII,0.000103,214856
ANAEROSTIPES_HADRUS,0.012009,649756
BACTEROIDES_THETAIOTAOMICRON,0.076616,818
BACTEROIDES_UNIFORMIS,0.002025,820


Unnamed: 0_level_0,SRR11487932,tax_id
split_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AKKERMANSIA_MUCINIPHILA,0.001057,239935
ALISTIPES_FINEGOLDII,0.000223,214856
ANAEROSTIPES_HADRUS,0.011465,649756
BACTEROIDES_THETAIOTAOMICRON,0.08053,818
BACTEROIDES_UNIFORMIS,0.001951,820


Unnamed: 0_level_0,SRR11487933,tax_id
split_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AKKERMANSIA_MUCINIPHILA,0.001553,239935
ALISTIPES_FINEGOLDII,3.6e-05,214856
ANAEROSTIPES_HADRUS,0.01404,649756
BACTEROIDES_THETAIOTAOMICRON,0.082011,818
BACTEROIDES_UNIFORMIS,0.002268,820


Unnamed: 0_level_0,SRR11487934,tax_id
split_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AKKERMANSIA_MUCINIPHILA,0.0015,239935
ALISTIPES_FINEGOLDII,5.4e-05,214856
ANAEROSTIPES_HADRUS,0.012488,649756
BACTEROIDES_THETAIOTAOMICRON,0.084528,818
BACTEROIDES_UNIFORMIS,0.001863,820


Unnamed: 0_level_0,SRR11487935,tax_id
split_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AKKERMANSIA_MUCINIPHILA,0.000937,239935
ALISTIPES_FINEGOLDII,0.000118,214856
ANAEROSTIPES_HADRUS,0.014956,649756
BACTEROIDES_THETAIOTAOMICRON,0.085548,818
BACTEROIDES_UNIFORMIS,0.00238,820
