In [None]:
import numpy as np
from biom.table import Table
from biom import load_table
import pandas as pd
import os

In [None]:
# Earlier single-report input, kept for reference only; nothing below reads `data_path` from here.
# data_path = "/Volumes/TBHD_share/valencia/bmock12/NEPHELE/wgsa2/subset_bmock12/outputs/TAXprofiles/TEDreadsTAX/reports/1_taxREPORT.txt"
# Directory tree that is walked for taxonomy report files (absolute path on a mounted share).
cami_sim_data = "/Volumes/TBHD_share/valencia/pipelines/microbio_spectrum/wgsa/outputs/TAXprofiles/TEDreadsTAX/reports"
# Directory the relative-abundance CSVs are written to; relative, so it resolves against the
# kernel's current working directory.
output_path = "pipelines/toulousse/wgsa"

In [None]:
# Maps the rank names accepted by clean_and_parse_wgsa to the one-letter codes
# that are matched against the rank column (column 3) of a report.
tax_dict = {"Genus": "G", "Species": "S", "Family": "F", "Order": "O", "Class": "C", "Phylum": "P", "Kingdom": "K"}


def clean_and_parse_wgsa(data_path, output_dir, rank="Genus"):
    """Extract one taxonomic rank from a taxonomy report and save it as a CSV.

    The report is read as a header-less, tab-separated table of which only
    three columns are used: column 0 (a percentage), column 3 (the rank code)
    and column 5 (the taxon name, possibly left-padded with whitespace).
    NOTE(review): this looks like a Kraken-style report layout -- confirm.

    Rows whose rank code matches ``rank`` are kept, sorted by percentage in
    descending order, converted from percent to a fraction (divided by 100)
    and written to ``<output_dir>/s<prefix>_<rank>_relabund.csv`` where
    ``<prefix>`` is the part of the report's file name before the first
    underscore and ``<rank>`` is ``rank`` lower-cased. The CSV has two
    columns: the rank name (taxon names) and ``RA``.

    Parameters
    ----------
    data_path : str
        Path to a single report file.
    output_dir : str
        Directory to write the CSV into. It is created when missing; an
        empty string means the current working directory.
    rank : str, default "Genus"
        One of the keys of ``tax_dict``.

    Raises
    ------
    KeyError
        If ``rank`` is not a key of ``tax_dict``.
    """
    # Resolve the rank first so an invalid rank fails before any file I/O.
    rank_code = tax_dict[rank]

    report = pd.read_csv(data_path, sep="\t", header=None, usecols=[0, 3, 5])

    # Plain boolean indexing selects the requested rank; dropna() is kept so
    # rows with a missing percentage or name are still discarded.
    at_rank = (
        report[report[3] == rank_code]
        .dropna()
        .sort_values(by=0, ascending=False)
    )

    # Build a fresh frame through a method chain instead of assigning into a
    # column slice, which is what triggered SettingWithCopyWarning.
    rel_abund = (
        at_rank[[5, 0]]
        .rename(columns={5: rank, 0: "RA"})
        .assign(RA=lambda frame: frame["RA"] / 100)  # percent -> fraction
        .set_index(rank)
    )
    # Taxon names carry leading padding in the report; strip it from the left only.
    rel_abund.index = [name.lstrip() for name in rel_abund.index]

    prefix = os.path.basename(data_path).split("_")[0]
    output_file = os.path.join(output_dir, f"s{prefix}_{rank.lower()}_relabund.csv")

    # to_csv does not create parent directories, so make sure the target exists.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    rel_abund.to_csv(output_file, sep=",", header=True, index_label=rank)

# clean_and_parse_wgsa(cami_sim_data, output_path)

In [None]:
# A run may leave more than one report file behind, so every report found
# under the directory is converted in turn (one CSV per report; nothing is merged).
def combine_files(data_path: str, rank: str, output_dir=None):
    """Convert every report file under ``data_path`` for the given rank.

    The directory tree is walked recursively and each file whose name
    contains ``"REPORT"`` is passed to ``clean_and_parse_wgsa``. Directories
    and files are visited in sorted order so repeated runs process (and
    print) the reports in the same sequence.

    Parameters
    ----------
    data_path : str
        Root directory to search for report files.
    rank : str
        Rank name forwarded to ``clean_and_parse_wgsa``.
    output_dir : str, optional
        Directory the CSVs are written to. Defaults to the notebook-level
        ``output_path`` when omitted, which is what the function always used
        before this parameter existed.

    Raises
    ------
    FileNotFoundError
        If no report file is found anywhere under ``data_path`` (this
        includes a ``data_path`` that does not exist, because ``os.walk``
        yields nothing for it).
    """
    report_paths = []
    for root, dirs, files in os.walk(data_path):
        # Sorting `dirs` in place steers os.walk's top-down traversal order.
        dirs.sort()
        report_paths.extend(
            os.path.join(root, name) for name in sorted(files) if "REPORT" in name
        )

    # Check once, after the whole walk: a directory that only holds
    # sub-directories is not an error as long as a report exists somewhere.
    if not report_paths:
        raise FileNotFoundError(f"No REPORT files found under {data_path!r}.")

    if output_dir is None:
        output_dir = output_path

    for report_path in report_paths:
        print(report_path)
        clean_and_parse_wgsa(report_path, output_dir, rank=rank)

# Generate the per-report relative-abundance tables at both ranks of interest.
for rank_name in ("Genus", "Species"):
    combine_files(cami_sim_data, rank_name)

In [None]:
# Disabled legacy workflow: the whole cell is a single bare triple-quoted string, so none of
# the code inside it runs. It is kept only as a record of the earlier BIOM-based approach.
# If it is ever re-enabled, note that it calls `display` and imports plotly inside the body.
# NOTE(review): two comments inside the string disagree with the code they describe --
# "Sort the values ... in ascending order" sits above a sort with ascending=False, and
# "Plot with seaborn" sits above a plotly.express call.
"""
Old way of doing it.
# t = load_table("/Volumes/TBHD/Valencia/Microbiome_Analysis/Nephele_cloud_play_project/outputs/for_analyze_with_microbiomedb.biom")
# t = load_table("/Volumes/TBHD_share/valencia/3sample/no_merge_trim+trim_output/for_analyze_with_microbiomedb.biom")
# t = load_table("/Volumes/TBHD_share/valencia/3sample/batch2/wgsa2/outputs/for_analyze_with_microbiomedb.biom")
# t = load_table("/Volumes/TBHD_share/valencia/3sample/batch2/wgsa2/outputs/TAXprofiles/MAGs_TAX/MAG-based_Counts+TAX.biom")
# t = load_table("/Volumes/TBHD_share/walitt_sample/3sample/outputs/for_analyze_with_microbiomedb.biom")
t = load_table("/Volumes/TBHD_share/valencia/bmock12/NEPHELE/wgsa2/subset_bmock12/outputs/for_analyze_with_microbiomedb.biom")
# print(t)
t.ids(axis='observation')
# Use 6 for species, 5 for genus.

phylum_idx = 6
# Somtimes taxonomy is capitalized, sometimes not.
collapse_f = lambda id_, md: '; '.join(md['Taxonomy'][phylum_idx:phylum_idx+1])
collapsed = t.collapse(collapse_f, axis='observation')

df = collapsed.to_dataframe()
display(df)

col1 = pd.DataFrame(df.iloc[:, :])
display(col1)

col1 = col1.sparse.to_dense()
col1.rename(index={'':'Unclassified'},inplace=True)

print(col1.sum(axis=0))
# col1.to_csv('test.csv')
# col1.drop('Unclassified_sp', inplace=True)
# Percent abundance operation.
pct = col1.apply(lambda x: x / x.sum(), axis=0)
display(pct)
pct.to_csv("pipelines/bmock12/wgsa2/fullpct_genus.csv", header=["Count"], index_label="Genus")

# Drop features with less than x percent abundance.
filtered_pct = pct.where(pct >= 0.001).dropna()

# Add other category to account for dropped samples.
filtered_pct.loc['Rare Taxa']= 1.0 - filtered_pct.sum(numeric_only=True, axis=0)
display(pct)
# pct.to_csv("pipelines/bmock12/wgsa2/0-001pct.csv")

# import matplotlib
# matplotlib.style.use('ggplot') 
filtered_pct.T.plot.bar(stacked=True, figsize=(10, 10), ylabel="Fraction", xlabel="Sample", title="Alignment Genus").legend(loc='center left', bbox_to_anchor=(1.0, 0.5), title="Genus")
indices = pct.index
cleaned_indices = [i.split("_")[0] for i in indices]
cleaned_indices = [i.replace("[" , "") for i in cleaned_indices]
cleaned_indices = [i.replace("]" , "") for i in cleaned_indices]

pct.index = cleaned_indices
pct = pct.groupby(pct.index).sum()

pct.to_csv("pipelines/bmock12/wgsa2/cleaned_pct.csv")
# Sort the values by the first sample in ascending order.
pct.T
pct.sort_values(axis=0, ascending=False, by=pct.columns[0], inplace=True)
# Plot with seaborn.
import plotly.express as px
fig = px.bar(pct.T, x=pct.index, y=pct.columns, title="Alignment Species", labels={'index': 'Sample', 'value': 'Fraction', "variable": "Species"})
fig.show()
# Sanity check to make sure my operations are correct.

test_df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=['a', 'b', 'c'])

pct_test = test_df.apply(lambda x: x / x.sum(), axis=0)
display(pct_test)

pct_test = pct_test.where(pct_test >= 0.50).dropna()
display(pct_test)

pct_test.loc['Column_Total']= 1.0 - pct_test.sum(numeric_only=True, axis=0)
display(pct_test)
"""