In [33]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from utils.ncbi.names import split_bio, map_and_add_tax_ids, standardize_core, generate_names_df, names_db_path, find_tax_id
from typing import Tuple

# One-letter rank prefix (the "g" in "g__Bacteroides") -> full rank name used in output file names.
taxonomy_dict = {
    "s": "species",
    "g": "genus",
    "f": "family",
    "o": "order",
    "c": "class",
    "p": "phylum",
    "k": "kingdom",
}

In [34]:
# Names to rewrite before standardization: spelling found in the input -> spelling to use instead.
replacement_dict = {
    "Clostridium_clostridioforme": "Clostridium_clostridiiforme",
}

In [35]:
# We want a table of (name at the requested rank, relative abundance) plus the matching tax IDs.
def clean_biobakery_with_taxid(df, rank="g") -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Collapse a lineage-indexed abundance table to a single taxonomic rank.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose index holds lineages such as
        ``...|g__Bacteroides|s__Bacteroides_vulgatus`` and whose ``TAX_ID`` column holds
        the matching ``#|#|#`` tax IDs, one field per lineage level. The remaining
        numeric columns are the per-sample abundances. ``df`` is not modified.
    rank : str
        One-letter rank prefix to keep (``"g"`` selects the ``g__...`` level).

    Returns
    -------
    grouped : pd.DataFrame
        Abundances summed per name at ``rank`` and divided by 100 (the input is assumed
        to be in percent). Column names are cut at their first underscore.
    taxid_df : pd.DataFrame
        One ``TAX_ID`` per name at ``rank`` (first occurrence wins).

    Notes
    -----
    Rows whose lineage has no ``rank`` level (for example an ``UNCLASSIFIED`` row) are
    skipped; previously they caused an index length mismatch. Every row that does
    contain the rank contributes to the sum, so a table mixing e.g. genus-level and
    species-level rows would count the same abundance twice.
    """
    prefix = f"{rank}__"

    kept_rows = []   # positional row numbers that contain the requested rank
    new_index = []   # name at the requested rank, prefix removed
    new_ids = []     # tax ID found at the same position as that name
    for row, (lineage, lineage_ids) in enumerate(zip(df.index.to_list(), df["TAX_ID"].to_list())):
        levels = lineage.split("|")
        level = next((pos for pos, name in enumerate(levels) if name.startswith(prefix)), None)
        if level is None:
            continue

        kept_rows.append(row)
        # Strip only the leading prefix; the rest of the name is kept verbatim.
        new_index.append(levels[level][len(prefix):])
        # The tax ID sits at the same position as the taxonomy level.
        new_ids.append(lineage_ids.split("|")[level])

    # Keep the matching rows only (positional, so duplicate lineages are safe) and relabel them.
    data = df.iloc[kept_rows].copy()
    data.index = new_index

    # Tax IDs are kept in their own frame because they must not be summed with the abundances.
    taxid_df = pd.DataFrame(index=new_index, data=new_ids, columns=["TAX_ID"])

    # Sum the rows that share a name; non-numeric columns (TAX_ID) are dropped here.
    grouped = data.groupby(data.index).sum(numeric_only=True)

    # Columns are of the form sampleid_###; keep the sample id only.
    grouped.columns = [column.split("_")[0] for column in grouped.columns.to_list()]

    # Divide all the values by 100.
    grouped = grouped / 100

    # Several rows can map to the same name; keep the first tax ID seen for each.
    taxid_df = taxid_df[~taxid_df.index.duplicated(keep="first")]

    return grouped, taxid_df

# We want to save to csv, but we want a csv for each column.
def save_to_csv(df, taxid_df, output_path, rank="s"):
    """
    Write one annotated CSV per column of ``df``.

    For every column, the single-column frame is left-joined with ``taxid_df`` on the
    index, previewed with ``display`` and written to
    ``<output_path>/<COLUMN>_<rank name>_relabund_annotated.csv``.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose columns are written out one file at a time.
    taxid_df : pd.DataFrame
        Frame joined onto each column by index.
    output_path : str
        Directory to write into; it is created if it does not exist yet.
    rank : str
        Key of ``taxonomy_dict``; its full name is used in the file name and as the
        label of the index column.

    Raises
    ------
    KeyError
        If ``rank`` is not a key of ``taxonomy_dict``.
    """
    # Resolve the rank once, before anything is written, so a bad rank fails early.
    rank_name = taxonomy_dict[rank]

    # to_csv does not create missing directories.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    for sample in df.columns.to_list():
        # Keep the column as a DataFrame so the other frame can be joined on the index.
        col = df[[sample]].join(taxid_df, how="left")

        display(col.head())

        target = os.path.join(output_path, f"{sample.upper()}_{rank_name}_relabund_annotated.csv")
        col.to_csv(target, index_label=rank_name)

In [36]:
# This input does not include the tax IDs, so only the names are extracted here;
# the names are standardized and the tax IDs attached in a later step.
def clean_biobakery_merged(df: pd.DataFrame, rank="g") -> pd.DataFrame:
    """
    Collapse a lineage-indexed abundance table (without tax IDs) to a single rank.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose index holds lineages such as
        ``...|g__Bacteroides|s__Bacteroides_vulgatus`` and whose numeric columns are the
        per-sample abundances. ``df`` is not modified.
    rank : str
        One-letter rank prefix to keep (``"g"`` selects the ``g__...`` level).

    Returns
    -------
    pd.DataFrame
        Abundances summed per name at ``rank`` and divided by 100 (the input is assumed
        to be in percent). Column names are cut at their first underscore.

    Notes
    -----
    Rows whose lineage has no ``rank`` level (for example an ``UNCLASSIFIED`` row) are
    skipped; previously they caused an index length mismatch. Every row that does
    contain the rank contributes to the sum, so a table mixing e.g. genus-level and
    species-level rows would count the same abundance twice.
    """
    prefix = f"{rank}__"

    kept_rows = []   # positional row numbers that contain the requested rank
    new_index = []   # name at the requested rank, prefix removed
    for row, lineage in enumerate(df.index.to_list()):
        match = next((name for name in lineage.split("|") if name.startswith(prefix)), None)
        if match is None:
            continue

        kept_rows.append(row)
        # Strip only the leading prefix; the rest of the name is kept verbatim.
        new_index.append(match[len(prefix):])

    # Keep the matching rows only (positional, so duplicate lineages are safe) and relabel them.
    data = df.iloc[kept_rows].copy()
    data.index = new_index

    # Sum the rows that share a name.
    grouped = data.groupby(data.index).sum(numeric_only=True)

    # Columns are of the form sampleid_###; keep the sample id only.
    grouped.columns = [column.split("_")[0] for column in grouped.columns.to_list()]

    # Divide all the values by 100.
    return grouped / 100

In [37]:
def standardize_merged(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Standardize the names of a merged relative-abundance table and attach tax IDs.

    The input is the table produced by ``clean_biobakery_merged`` (names in the index,
    one column per sample). Index entries listed in ``replacement_dict`` are renamed
    first; the table is then passed through the project helpers ``split_bio``,
    ``standardize_core`` and ``map_and_add_tax_ids``.

    Parameters
    ----------
    df : pd.DataFrame
        Name-indexed abundance table. It is not modified; the renaming is done on a copy.

    Returns
    -------
    annotated : pd.DataFrame
        The annotated table, indexed by ``split_name``, without the ``tax_id`` column.
    taxid_df : pd.DataFrame
        The ``tax_id`` column on its own, with the same index.

    Raises
    ------
    ValueError
        If any name could not be mapped to a tax ID; the offending rows are printed first.
    """
    names_df = generate_names_df(names_db_path, load_pickle=True)

    # Apply the manual name corrections on a copy so the caller's frame keeps its index.
    renamed = df.copy()
    renamed.index = [replacement_dict.get(name, name) for name in renamed.index.to_list()]

    split_names = split_bio(renamed)

    standard_df = standardize_core(input_df=renamed, split_names=split_names)

    # The standardized name becomes the index used for the tax ID mapping.
    standard_df = standard_df.set_index("split_name")

    annotated = map_and_add_tax_ids(df=standard_df, names_df=names_df)

    # Split the tax_id column off into its own frame.
    taxid_df = annotated[["tax_id"]]

    # Every name must have a tax ID; show the ones that do not before failing.
    unmapped = taxid_df[taxid_df.isna().any(axis=1)]
    if not unmapped.empty:
        print(unmapped)
        raise ValueError("There are nan values in the tax_id column.")

    return annotated.drop(columns=["tax_id"]), taxid_df

### Main Code
Run the cells below to clean the bioBakery output. Set `rank` to the one-letter code of the desired taxonomic rank (e.g. `"g"` for genus, `"s"` for species).

In [38]:
# Input table (tab-separated, lineage strings in the first column) and output directory.
data_path = "/Volumes/TBHD_share/valencia/pipelines/microbio_spectrum/bio4/metaphlan/merged/species_relabund.txt"
# data_path = "pipelines/bmock12/biobakery4/species_relab.txt"
output_dir = "pipelines/tourlousse/bio4"

# Fail early, naming the missing file, instead of letting read_csv fail later.
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Data file does not exist: {data_path}")

data = pd.read_csv(data_path, sep="\t", index_col=0)
display(data.head())
print(data.shape)

Unnamed: 0_level_0,SRR17380241_taxonomic_profile,SRR17380242_taxonomic_profile,SRR17380243_taxonomic_profile,SRR17380244_taxonomic_profile,SRR17380245_taxonomic_profile,SRR17380246_taxonomic_profile
# taxonomy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillaceae|g__Bacillus|s__Bacillus_murimartini,3.78091,3.80262,3.79379,3.73562,3.98963,3.69453
k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillaceae|g__Bacillus|s__Bacillus_intestinalis,0.46177,0.57957,0.46383,0.48681,0.4829,0.49157
k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Blautia|s__Blautia_producta,6.04723,5.82678,5.80916,5.99614,5.84795,6.00332
k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Blautia|s__Blautia_coccoides,0.03649,0.03751,0.04457,0.03894,0.03718,0.03775
k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Blautia|s__Ruminococcus_gnavus,7.20723,7.26855,7.16951,7.32413,7.32548,7.26494


(24, 6)


In [39]:
# Alternative workflow for input tables that already carry a TAX_ID column (currently disabled):
# rank = "s"
# names, taxids = clean_biobakery_with_taxid(data, rank=rank)
# save_to_csv(names, taxids, output_dir, rank=rank)

In [40]:
# Rank to aggregate to, given as its one-letter prefix ("g" selects the "g__..." level).
rank = "g"

# Read the table again from data_path, then collapse it to the chosen rank.
data = pd.read_csv(data_path, sep="\t", index_col=0)
cleaned = clean_biobakery_merged(data, rank=rank)

# Standardize the names and split off the tax IDs, then write one CSV per sample column.
annotated, ids = standardize_merged(cleaned)
save_to_csv(annotated, ids, output_path=output_dir, rank=rank)

['Akkermansia', 'Anaerostipes', 'Bacillus', 'Bacteroides', 'Bifidobacterium', 'Blautia', 'Collinsella', 'Cutibacterium', 'Escherichia', 'Flavonifractor', 'Lachnoclostridium', 'Lactobacillus', 'Megamonas', 'Megasphaera', 'Parabacteroides', 'Pseudomonas', 'Staphylococcus', 'Streptococcus']


Unnamed: 0_level_0,SRR17380241,tax_id
split_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AKKERMANSIA,0.048541,239934
ANAEROSTIPES,0.066303,207244
BACILLUS,0.042427,1386
BACTEROIDES,0.059015,816
BIFIDOBACTERIUM,0.105952,1678


Unnamed: 0_level_0,SRR17380242,tax_id
split_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AKKERMANSIA,0.049393,239934
ANAEROSTIPES,0.06469,207244
BACILLUS,0.043822,1386
BACTEROIDES,0.059418,816
BIFIDOBACTERIUM,0.106186,1678


Unnamed: 0_level_0,SRR17380243,tax_id
split_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AKKERMANSIA,0.048761,239934
ANAEROSTIPES,0.066043,207244
BACILLUS,0.042576,1386
BACTEROIDES,0.060159,816
BIFIDOBACTERIUM,0.104451,1678


Unnamed: 0_level_0,SRR17380244,tax_id
split_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AKKERMANSIA,0.047648,239934
ANAEROSTIPES,0.067423,207244
BACILLUS,0.042224,1386
BACTEROIDES,0.059721,816
BIFIDOBACTERIUM,0.103746,1678


Unnamed: 0_level_0,SRR17380245,tax_id
split_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AKKERMANSIA,0.04864,239934
ANAEROSTIPES,0.066216,207244
BACILLUS,0.044725,1386
BACTEROIDES,0.058457,816
BIFIDOBACTERIUM,0.10541,1678


Unnamed: 0_level_0,SRR17380246,tax_id
split_name,Unnamed: 1_level_1,Unnamed: 2_level_1
AKKERMANSIA,0.048716,239934
ANAEROSTIPES,0.066708,207244
BACILLUS,0.041861,1386
BACTEROIDES,0.05925,816
BIFIDOBACTERIUM,0.10475,1678
