In [2]:
import sys
sys.path.append('../../')

import pandas as pd
from python_src.figures_utils import get_all_expected, generate_experimental_df, get_relabund_files, fully_combined, generate_cb
from dataclasses import dataclass
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [3]:
@dataclass
class Pipeline:
    """
    This class will hold the parameters for each pipeline.
    Variables:
        root: str
            The root directory of the pipeline.
        inset: bool
            Whether or not to include an inset plot.
    """
    root: str
    inset: bool

    def __init__(self, root: str, inset: bool):
        self.root = root
        self.inset = inset

tourlousse = Pipeline("../../pipelines/tourlousse", False)
amos_hilo = Pipeline("../../pipelines/amos/hilo", False)
amos_mixed = Pipeline("../../pipelines/amos/mixed", False)

experiments = [tourlousse, amos_hilo, amos_mixed]

In [4]:
def fix_x_labels(ax, df, rank):
    xticks = ax.get_xticklabels()
    # print(xticks)
    new_labels = []
    for x in xticks:
        # res = df.loc[int(x.get_text()), rank]
        res = df.loc[df["TAX_ID"] == int(x.get_text()), rank]
        # Get only the first row from the series.
        # This is necessary because if it is unique, it will return a string, but if it is not unique, it will return a series.
        if isinstance(res, pd.Series):
            res = res.iloc[0]
        new_labels.append(res)

    return new_labels

In [11]:
cb_palette = generate_cb()

def make_title(rank: str, exp_name: str, thresh: float):
    return f"Average Relative Abundance of {rank.capitalize()} in Experiement {exp_name.capitalize()} at Threshold {thresh}"

def plot_bars(thresh: float, rank: str = "genus"):
    for e in experiments:
        # Initialize the dataframe to hold the data.
        plt_df = pd.DataFrame()
        # Make the bigger figure.
        fig = plt.figure(figsize=(15, 12))

        exp_name = e.root.split("/")[-1]

        # Get the data for the experiment.
        fc = fully_combined(e.root, rank)

        # Only select where Source is "expected", "bio4", or "jams202212".
        fc = fc.loc[fc["Source"].isin(["Expected", "biobakery4", "jams202212"])]
        fc.to_csv(f"{rank}_{exp_name}_fc.csv", index=True)

        continue

        for pl, pl_df in fc.groupby("Source"):
            # Average the abundances on the same index values. This keeps the names of the taxa.
            averaged_df = pl_df.groupby(["TAX_ID", rank, "Source"]).mean(numeric_only=True)

            # Filter out the values that are below the threshold.
            averaged_df = averaged_df.loc[averaged_df["RA"] > thresh]

            # Add the data to the plot dataframe.
            plt_df = pd.concat([plt_df, averaged_df], axis=0)

            # pl_df.to_csv(f"genus_{pl}.csv", index=True)

        plt_df.reset_index(inplace=True)        

        # We want only bio4 and jams.
        plt_df = plt_df.loc[plt_df["Source"].isin(["Expected", "bio4", "jams"])]

        # Plot the data.
        ax = sns.barplot(x="TAX_ID", y='RA', hue="Source", data=plt_df, errorbar=None, log=True, palette=cb_palette)
        ax.set_xticklabels(fix_x_labels(ax=ax, df=plt_df, rank=rank), rotation=45, horizontalalignment='right')

        # Make the title and add the axes labels.
        ax.set_title(make_title(rank, exp_name, thresh))
        ax.set_xlabel(rank.capitalize())
        ax.set_ylabel('Average Relative Abundance')

        # Save the plt_df.
        # plt_df.to_csv(f"{rank}_{exp_name}_{thresh}.csv", index=False)

plot_bars(0.001, "species")
plot_bars(0.001, "genus")

<Figure size 1500x1200 with 0 Axes>

<Figure size 1500x1200 with 0 Axes>

<Figure size 1500x1200 with 0 Axes>

<Figure size 1500x1200 with 0 Axes>

<Figure size 1500x1200 with 0 Axes>

<Figure size 1500x1200 with 0 Axes>

In [6]:
# Creation of Statistical Analysis Plots
stats_path_genus = os.path.abspath("../../utils/analysis/all_stats_genus.csv")
stats_path_species = os.path.abspath("../../utils/analysis/all_stats_species.csv")
stats_paths = [stats_path_genus, stats_path_species]


for stats_path in stats_paths:
    stats_df = pd.DataFrame()
    print(stats_path)
    df = pd.read_csv(stats_path)
    df = df.loc[(df["Source"] == "tourlousse") | (df["Source"] == "mixed") | (df["Source"] == "hilo")]
    df = df.loc[(df["Pipeline"] == "biobakery4") | (df["Pipeline"] == "jams")]
    df = df.loc[(df["threshold"] == 0.0001)]
    for src, src_df in df.groupby("Source"):
        # display(src_df)
        # fig = plt.figure(figsize=(15, 12))
        # g = sns.catplot(x="SampleID", y="AD", hue="Pipeline", data=src_df, kind="box", palette=cb_palette)
        # plt.show()
        # sns.barplot(x="SampleID", y="AD", hue="Pipeline", data=src_df, palette=cb_palette)
        for pl, pl_df in src_df.groupby("Pipeline"):
            # pl_df.loc["Average"] = pl_df.mean(numeric_only=True)
            avg = pl_df.mean(numeric_only=True)
            std = pl_df.std(numeric_only=True)

            avg_cols = avg.keys()
            avg_cols = ["Pipeline", "Community"] + avg_cols.to_list()

            new_line = [f"{val:.2f} ± {std[c]:.2f}" for c, val in enumerate(avg)]
            new_line.insert(0, src)
            new_line.insert(0, pl)
            # print(new_line)
            stats_df = pd.concat([stats_df, pd.DataFrame(new_line, index=avg_cols).T])

            # stats_df = pd.concat([stats_df, avg])
            # stats_df = pd.concat([stats_df, std])
            # pl_df.loc["Stddev"] = pl_df.std(numeric_only=True)
            # pl_df.loc["Stddev"] = pl_df.std(numeric_only=True)
            # display(pl_df)
    # display(stats_df)
    rank = stats_path.split("/")[-1].split("_")[2].split(".")[0]
    stats_df.to_csv(f"{rank}_stats.csv", index=False)


/Users/valenciaem/coding/pipelines/utils/analysis/all_stats_genus.csv
/Users/valenciaem/coding/pipelines/utils/analysis/all_stats_species.csv
