In [2]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

# Maps the one-letter rank prefix found in lineage tokens (e.g. the "g" in
# "g__Muricauda") to the full rank name used when naming output files.
taxonomy_dict = dict(
    s="species",
    g="genus",
    f="family",
    o="order",
    c="class",
    p="phylum",
    k="kingdom",
)

In [10]:
# Tab-separated relative-abundance table read by this notebook.
# NOTE(review): this is an absolute path on a mounted volume, so the cell only
# runs on machines where that share is available -- consider a configurable
# data directory instead.
data_path = "/Volumes/TBHD_share/valencia/pipelines/bmock12/biobakery4/metaphlan/main/species_relab.txt"
if not os.path.exists(data_path):
    # Fail early with a specific exception type and name the missing path so the
    # error is actionable (FileNotFoundError is still an Exception subclass).
    raise FileNotFoundError(f"Data file does not exist: {data_path}")
# data_path = "pipelines/bmock12/biobakery4/species_relab.txt"
# The first column (the taxonomy lineage string) becomes the row index.
data = pd.read_csv(data_path, sep="\t", index_col=0)
display(data.head())

Unnamed: 0_level_0,s1
taxonomy,Unnamed: 1_level_1
k__Bacteria|p__Bacteroidetes|c__Flavobacteriia|o__Flavobacteriales|f__Flavobacteriaceae|g__Muricauda|s__Muricauda_sp_ES_050,74.33332
k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Cohaesibacteraceae|g__Cohaesibacter|s__Cohaesibacter_sp_ES_047,19.36211
k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhodobacterales|f__Rhodobacteraceae|g__Thioclava|s__Thioclava_sediminum,2.05531
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Micromonosporales|f__Micromonosporaceae|g__Micromonospora|s__Micromonospora_echinofusca,2.01187
k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Micromonosporales|f__Micromonosporaceae|g__Micromonospora|s__Micromonospora_echinaurantiaca,1.6624


In [11]:
# We want a csv file with genus, relative abundance.
def clean_biobakery(df, rank="g"):
    """Collapse a lineage-indexed abundance table to a single taxonomic rank.

    Each index entry of ``df`` is expected to be a ``|``-separated lineage such
    as ``...|g__Bacteroides|s__Bacteroides_vulgatus``. The token starting with
    ``f"{rank}__"`` supplies the new row label (prefix removed), rows sharing a
    label are summed, and every value is divided by 100.

    Parameters
    ----------
    df : pandas.DataFrame
        Table indexed by lineage string, one column per sample. It is not
        modified.
    rank : str, default "g"
        One-letter rank prefix to collapse to (e.g. ``"g"`` or ``"s"``).

    Returns
    -------
    pandas.DataFrame
        One row per distinct label at ``rank`` (sorted by ``groupby``), columns
        renamed to the text before their first underscore, values divided by
        100 (presumably converting percentages to fractions -- confirm against
        the input units).

    Notes
    -----
    Rows whose lineage has no token for ``rank`` (for example an
    ``UNCLASSIFIED`` row) are dropped. Previously such rows made the rebuilt
    index shorter than the table, so assigning it raised ``ValueError``.
    """
    prefix = f"{rank}__"

    def _label_at_rank(lineage):
        # Return the first lineage token at the requested rank without its
        # prefix, or None when the lineage does not reach that rank.
        for token in str(lineage).split("|"):
            if token.startswith(prefix):
                # Slice rather than str.replace so only the leading prefix is
                # removed, never a later occurrence inside the name itself.
                return token[len(prefix):]
        return None

    labels = [_label_at_rank(lineage) for lineage in df.index]
    has_rank = [label is not None for label in labels]

    # Keep only the rows that resolved to a label, so the new index always
    # lines up with the remaining rows.
    data = df.loc[has_rank].copy()
    data.index = [label for label in labels if label is not None]

    # Sum the rows that share the same genus/species label.
    grouped = data.groupby(data.index).sum()

    # Rename the columns to the sample names (text before the first underscore).
    grouped.columns = [column.split("_")[0] for column in grouped.columns]

    # Divide all the values by 100.
    return grouped / 100

# We want to save to csv, but we want a csv for each column.
def save_to_csv(df, output_path, rank="g"):
    """Write each column of ``df`` to its own CSV file.

    For every column a file named
    ``f"{column.lower()}_{taxonomy_dict[rank]}_relabund.csv"`` is written, with
    the frame's index as the first CSV column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are written out one file at a time.
    output_path : str
        Prefix that the file name is appended to by plain string
        concatenation, so a directory must include its trailing separator.
    rank : str, default "g"
        One-letter rank key looked up in ``taxonomy_dict``. It determines both
        the rank name in the file name and the header of the index column.

    Raises
    ------
    KeyError
        If ``rank`` is not a key of ``taxonomy_dict``.
    """
    rank_name = taxonomy_dict[rank]

    for column in df.columns:
        target = output_path + f"{column.lower()}_{rank_name}_relabund.csv"
        # Label the index column with the actual rank; it used to be the fixed
        # string "genus", which mislabelled files written for any other rank.
        df[column].to_csv(target, index_label=rank_name)

In [12]:
# Collapse the loaded table to genus level (the default rank="g"); the returned
# values are the input values divided by 100.
output = clean_biobakery(data)
# Write one "<column>_genus_relabund.csv" file per column. The path is relative,
# so the files land under the notebook's current working directory.
save_to_csv(output, "pipelines/bmock12/biobakery4/")