In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

# Maps the one-letter rank prefix used in lineage tokens (e.g. "g" in
# "g__Bacillus") to the full rank name used in output file names.
taxonomy_dict = {
    "s": "species",
    "g": "genus",
    "f": "family",
    "o": "order",
    "c": "class",
    "p": "phylum",
    "k": "kingdom",
}

In [2]:
# Tab-separated relative-abundance table; judging by the path this is the
# merged MetaPhlAn species-level output.
data_path = "/Volumes/TBHD_share/valencia/pipelines/microbio_spectrum/bio4/metaphlan/merged/species_relabund.txt"
# data_path = "pipelines/bmock12/biobakery4/species_relab.txt"
if not os.path.isfile(data_path):
    # Fail early and name the offending path (the share may simply not be
    # mounted). FileNotFoundError is still an Exception subclass.
    raise FileNotFoundError(f"Data file does not exist: {data_path}")
# The first column holds the lineage strings and becomes the row index.
data = pd.read_csv(data_path, sep="\t", index_col=0)
display(data.head())

Unnamed: 0_level_0,SRR17380241_taxonomic_profile,SRR17380242_taxonomic_profile,SRR17380243_taxonomic_profile,SRR17380244_taxonomic_profile,SRR17380245_taxonomic_profile,SRR17380246_taxonomic_profile
# taxonomy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillaceae|g__Bacillus|s__Bacillus_murimartini,3.78091,3.80262,3.79379,3.73562,3.98963,3.69453
k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillaceae|g__Bacillus|s__Bacillus_intestinalis,0.46177,0.57957,0.46383,0.48681,0.4829,0.49157
k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Blautia|s__Blautia_producta,6.04723,5.82678,5.80916,5.99614,5.84795,6.00332
k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Blautia|s__Blautia_coccoides,0.03649,0.03751,0.04457,0.03894,0.03718,0.03775
k__Bacteria|p__Firmicutes|c__Clostridia|o__Clostridiales|f__Lachnospiraceae|g__Blautia|s__Ruminococcus_gnavus,7.20723,7.26855,7.16951,7.32413,7.32548,7.26494


In [3]:
# Goal: a CSV file that maps each taxon (genus by default) to its relative abundance.
def clean_biobakery(df, rank="g"):
    """Collapse a lineage-indexed abundance table to a single taxonomic rank.

    The index of ``df`` is expected to hold ``|``-separated lineage strings
    such as ``...|g__Bacteroides|s__Bacteroides_vulgatus``. Each row is
    relabelled with the name found in its ``<rank>__`` token, rows sharing a
    label are summed, column names are cut at the first underscore, and all
    values are divided by 100.

    Parameters
    ----------
    df : pandas.DataFrame
        Abundance table with lineage strings as the index and one column per
        sample. It is not modified.
    rank : str, default "g"
        One-letter rank prefix to collapse to (the part before ``__``).

    Returns
    -------
    pandas.DataFrame
        One row per distinct taxon name at ``rank`` (sorted by ``groupby``),
        one column per sample.

    Notes
    -----
    Rows whose lineage has no ``<rank>__`` token (for example an
    ``UNCLASSIFIED`` row) are dropped; previously they caused a length
    mismatch when the new index was assigned.
    NOTE(review): if the input mixes several taxonomic levels, rows deeper
    than ``rank`` and the ``rank``-level row itself would both be summed into
    the same label -- confirm the input holds a single level only.
    """
    prefix = f"{rank}__"

    def _taxon_at_rank(lineage):
        # Return the name carried by the first token with the rank prefix,
        # or None when the lineage does not reach that rank.
        for token in lineage.split("|"):
            if token.startswith(prefix):
                # Strip only the leading prefix; str.replace would also
                # remove any later occurrence inside the name.
                return token[len(prefix):]
        return None

    labels = [_taxon_at_rank(lineage) for lineage in df.index]
    has_rank = [label is not None for label in labels]

    # Keep only rows that carry the requested rank, on a copy so the caller's
    # frame is left untouched.
    data = df.loc[has_rank].copy()
    data.index = [label for label in labels if label is not None]

    # Sum the rows that share the same taxon name.
    grouped = data.groupby(data.index).sum()

    # Rename columns to the sample name: the text before the first underscore
    # (e.g. "SRR17380241_taxonomic_profile" -> "SRR17380241").
    grouped.columns = [col.split("_")[0] for col in grouped.columns]

    # Divide by 100 (presumably percent -> fraction; confirm the input units).
    return grouped / 100

# We want to save to csv, but we want a csv for each column.
def save_to_csv(df, output_path, rank="g"):
    """Write one CSV file per column of ``df``.

    Each file is named ``<column name lower-cased>_<rank name>_relabund.csv``
    and holds the row labels of ``df`` together with that column's values.
    The row-label column is headed with the full rank name, so it matches the
    rank in the file name ("genus" for the default ``rank="g"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one column per sample.
    output_path : str
        Directory to write the files into. It is created if it does not
        exist; an empty string means the current working directory.
    rank : str, default "g"
        One-letter rank key looked up in ``taxonomy_dict``.

    Raises
    ------
    KeyError
        If ``rank`` is not a key of ``taxonomy_dict``.
    """
    rank_name = taxonomy_dict[rank]

    # os.makedirs("") raises, so only create a directory when one was given.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    for sample in df.columns:
        file_name = f"{sample.lower()}_{rank_name}_relabund.csv"
        # os.path.join works whether or not output_path ends with a separator.
        df[sample].to_csv(os.path.join(output_path, file_name), index_label=rank_name)

In [5]:
# Collapse the loaded table to genus level, then write one CSV per sample.
output = clean_biobakery(data, rank="g")
save_to_csv(output, "pipelines/tourlousse/bio4/", rank="g")