In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re

In [67]:
# Taxonomic rank column that get_expected_df() groups by (read as a module global).
rank = "Genus"
# Root directory that get_expected() walks looking for taxonomic profile files.
expected_path = "/Volumes/TBHD_share/cami_data/gitract/expected/"


# Fail fast, and name the missing location, instead of letting the later
# os.walk() silently yield nothing when the share is not mounted.
if not os.path.exists(expected_path):
    raise FileNotFoundError(f"Expected path does not exist: {expected_path}")


def get_expected(path=None):
    """Yield the paths of taxonomic-profile files found under a directory tree.

    A file is yielded when its name contains ``"taxonomic_profile"`` and does
    not end with ``"_4krona.txt"``.

    Args:
        path: Directory to walk. Defaults to the module-level ``expected_path``
            when omitted, which matches the original zero-argument behaviour.

    Yields:
        str: ``os.path.join(root, name)`` for every matching file, visiting
        directories and files in sorted order.
    """
    base = expected_path if path is None else path
    for root, dirs, files in os.walk(base):
        # os.walk returns entries in an arbitrary, filesystem-dependent order.
        # Sorting dirs in place steers the walk; sorting files fixes the order
        # within a directory, so repeated runs process samples identically.
        dirs.sort()
        for name in sorted(files):
            if "taxonomic_profile" in name and not name.endswith("_4krona.txt"):
                yield os.path.join(root, name)


In [68]:
def get_expected_df():
    """Write one CSV per expected profile with abundance summed by ``rank``.

    For every path yielded by ``get_expected()`` the tab-separated file is
    read, its ``"Unnamed: 0"`` column is renamed to ``"RA"``, the ``"RA"``
    values are summed per value of the module-level ``rank`` column, divided
    by 100 and sorted in descending order. The result is written to
    ``s<sample>_expected.csv`` in the current working directory, where
    ``<sample>`` is the run of digits between ``"_"`` and ``"."`` in the file
    name (e.g. ``taxonomic_profile_12.txt`` -> ``12``).

    Raises:
        ValueError: If a file name contains no ``_<digits>.`` sample marker.
    """
    # Compiled once; "\d+" (rather than a single "\d") also accepts sample ids >= 10.
    sample_pattern = re.compile(r"_(\d+)\.")

    for f in get_expected():
        df = pd.read_csv(f, sep="\t")
        print(f)

        # pandas labels a column whose header is empty "Unnamed: 0"; it is
        # used here as the abundance column and renamed to "RA".
        ra_by_rank = (
            df.rename(columns={"Unnamed: 0": "RA"})[[rank, "RA"]]
            .groupby(rank)
            .sum()
        )

        # Scale by 1/100 (presumably percent -> fraction; confirm against the
        # input files), then list the most abundant taxa first.
        ra_by_rank["RA"] = ra_by_rank["RA"] / 100
        ra_by_rank = ra_by_rank.sort_values(by="RA", ascending=False)

        # Match against the file name only, so digits in a parent directory
        # name can never be mistaken for the sample id.
        match = sample_pattern.search(os.path.basename(f))
        if match is None:
            raise ValueError(f"Cannot determine sample id from file name: {f}")
        sample = match.group(1)

        output_file = f"s{sample}_expected.csv"
        ra_by_rank.to_csv(output_file, index_label=rank)

# get_expected_df()

In [138]:
def clean_taxonomic_profile(path: str, rank: str, prefix="S"):
    """Extract one rank from a taxonomic profile and save it as a CSV.

    The profile is read as a tab-separated table whose first three lines are
    skipped and whose next line is the header. ``TAXPATH`` and ``TAXPATHSN``
    are reduced to their last ``"|"``-separated element, rows whose ``RANK``
    equals ``rank.lower()`` are kept, and ``PERCENTAGE`` is divided by 100.

    The output CSV is written to the current working directory as
    ``<prefix><sample>_<rank.lower()>_relabund_annotated.csv`` with columns
    ``<rank>`` (from ``TAXPATHSN``), ``RA`` (from ``PERCENTAGE``) and
    ``TAX_ID`` (from ``@@TAXID``, cast to int64), sorted by ``RA`` descending.
    ``<sample>`` is the run of digits between ``"_"`` and ``"."`` in the file
    name. The head of the filtered table and of the final table are shown via
    the notebook's ``display``.

    Args:
        path: Path to the profile file.
        rank: Rank to keep; compared lower-cased against ``RANK`` and used
            verbatim as the name of the output index column.
        prefix: Leading text of the output file name.

    Raises:
        ValueError: If the file name contains no ``_<digits>.`` sample marker.
    """
    print(path)
    # Only the read needs the handle; close it before the processing starts.
    with open(path, "r") as f:
        df = pd.read_csv(f, sep="\t", skiprows=3, header=0)

    # Split TAXPATH and TAXPATHSN on the "|", then take the last element.
    df["TAXPATH"] = df["TAXPATH"].str.split("|").str[-1]
    df["TAXPATHSN"] = df["TAXPATHSN"].str.split("|").str[-1]

    # .copy() makes the filtered rows an independent frame, so the column
    # assignment below does not raise SettingWithCopyWarning.
    ranked = df.loc[df["RANK"] == rank.lower()].copy()
    display(ranked.head())

    # Scale the percentage column by 1/100.
    ranked["PERCENTAGE"] = ranked["PERCENTAGE"] / 100

    final = (
        ranked[["TAXPATHSN", "PERCENTAGE", "@@TAXID"]]
        .rename(columns={"TAXPATHSN": f"{rank}", "PERCENTAGE": "RA", "@@TAXID": "TAX_ID"})
        .set_index(rank)
        .sort_values("RA", ascending=False)
        .astype({"TAX_ID": "int64"})
    )

    # "\d+" accepts multi-digit sample ids; matching the file name only keeps
    # digits in parent directory names from being picked up instead.
    match = re.search(r"_(\d+)\.", os.path.basename(path))
    if match is None:
        raise ValueError(f"Cannot determine sample id from file name: {path}")
    sample = match.group(1)

    output_path = f"{prefix}{sample}_{rank.lower()}_relabund_annotated.csv"

    final.to_csv(output_path, index=True, index_label=rank)

    display(final.head())

In [139]:
# Run clean_taxonomic_profile on every profile path yielded by get_expected(),
# keeping the "Genus" rank and the default "S" output-file prefix.
for file in get_expected():
    clean_taxonomic_profile(file, "Genus")

/Volumes/TBHD_share/cami_data/gitract/expected/taxonomic_profile_2.txt


Unnamed: 0,@@TAXID,RANK,TAXPATH,TAXPATHSN,PERCENTAGE,_CAMI_GENOMEID
118,32257.0,genus,32257,Kingella,0.0,
119,2050.0,genus,2050,Mobiluncus,0.0,
120,1784836.0,genus,1784836,Fermentimonas,0.0,
121,517.0,genus,517,Bordetella,36.6534,
122,29580.0,genus,29580,Janthinobacterium,0.0,


Unnamed: 0_level_0,RA,TAX_ID
Genus,Unnamed: 1_level_1,Unnamed: 2_level_1
Bordetella,0.366534,517
Clostridium,0.290837,1485
Achromobacter,0.14741,222
Roseburia,0.087649,841
Bacteroides,0.079681,816


/Volumes/TBHD_share/cami_data/gitract/expected/taxonomic_profile_1.txt


Unnamed: 0,@@TAXID,RANK,TAXPATH,TAXPATHSN,PERCENTAGE,_CAMI_GENOMEID
118,32257.0,genus,32257,Kingella,0.0,
119,2050.0,genus,2050,Mobiluncus,0.0,
120,1784836.0,genus,1784836,Fermentimonas,1.3274,
121,517.0,genus,517,Bordetella,56.1947,
122,29580.0,genus,29580,Janthinobacterium,0.0,


Unnamed: 0_level_0,RA,TAX_ID
Genus,Unnamed: 1_level_1,Unnamed: 2_level_1
Bordetella,0.561947,517
Clostridium,0.225664,1485
Bacteroides,0.123894,816
Fermentimonas,0.013274,1784836
Ruminiclostridium,0.00885,1508657
