In [175]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import os

In [176]:
# Base colours; the mapping below picks from them by position.
cb_palette = sns.color_palette(as_cmap=True)

# Fixed colour per pipeline label. Labels grouped in the same tuple are
# aliases of one another and share the colour at that tuple's position.
color_palette = {
    label: cb_palette[slot]
    for slot, aliases in enumerate(
        [
            ("Expected", "expected"),
            ("woltka", "wol"),
            ("jams",),
            ("wgsa", "wgsa2"),
            ("biobakery3", "bio3"),
            ("biobakery4", "bio4"),
        ]
    )
    for label in aliases
}


In [177]:
# Multi-page PDF that collects every figure produced below. make_catplot()
# appends pages to it and plot_stats() closes it when it finishes, so this
# cell has to be re-run to get a fresh handle before plotting again.
pdf_output = PdfPages("stats_summary.pdf")

In [178]:
# Directory tree searched for stats CSVs; the relative path is resolved
# against the current working directory at the time this cell runs.
project_root = os.path.abspath("../../pipelines/")
# NOTE(review): not referenced by any of the cells visible here — confirm it
# is still needed.
threshold = 0.0

# First, find every stats CSV under the search directory and load each one.
def find_stats_files(root_dir=None):
    """Yield ``(path, DataFrame)`` for every stats CSV under a directory tree.

    A file qualifies when its name contains ``"stats"`` and ends with
    ``.csv``.

    Args:
        root_dir: Directory to search recursively. Defaults to the
            module-level ``project_root``.

    Yields:
        Tuples of the CSV's path and its contents as loaded by
        ``pd.read_csv``.
    """
    search_root = project_root if root_dir is None else root_dir
    for root, dirs, files in os.walk(search_root):
        # os.walk gives no ordering guarantee. Sorting ``dirs`` in place also
        # steers the descent, so files come back in the same order on every
        # run and on every filesystem.
        dirs.sort()
        for file in sorted(files):
            if "stats" in file and file.endswith('.csv'):
                stats_path = os.path.join(root, file)
                yield stats_path, pd.read_csv(stats_path)

In [179]:
def combine_stats(stats_files=None):
    """Concatenate all stats tables into one frame tagged with their origin.

    Each table gains a ``Source`` column holding the name of the directory
    that directly contains its CSV. The input frames are not modified.

    Args:
        stats_files: Optional iterable of ``(path, DataFrame)`` pairs.
            Defaults to whatever ``find_stats_files()`` yields.

    Returns:
        A single DataFrame with the rows of every table (original row indices
        are kept), or an empty DataFrame when there are no tables.
    """
    if stats_files is None:
        stats_files = find_stats_files()

    tagged = [
        # basename(dirname(...)) rather than split("/"), so paths built with
        # os.path.join also resolve where the separator is not "/".
        df.assign(Source=os.path.basename(os.path.dirname(path)))
        for path, df in stats_files
    ]

    # pd.concat rejects an empty list, so the "no files" case is explicit.
    if not tagged:
        return pd.DataFrame()

    # A single concat avoids re-copying the accumulated frame for every file.
    return pd.concat(tagged)

In [180]:
def make_catplot(df: pd.DataFrame, id_var: str, src: str, plot_type: str):
    """Draw one faceted categorical figure and append it to the PDF.

    The frame is melted to long form so that every column other than
    ``SampleID``, ``id_var`` and ``Source`` becomes its own facet ("Metric"),
    with ``id_var`` on the x axis and the metric value on the y axis.

    Args:
        df: Wide table holding ``SampleID``, ``id_var`` and ``Source`` columns;
            all remaining columns are treated as metrics.
        id_var: Column used for the x axis and for looking up colours in the
            module-level ``color_palette``.
        src: Source name shown in the figure title.
        plot_type: Forwarded to ``sns.catplot`` as ``kind`` (for example
            ``"bar"`` or ``"box"``).

    Side effects:
        Saves the figure as a new page of the module-level ``pdf_output`` and
        then closes the figure.
    """
    # Long form: one row per (sample, metric). Rows with any missing value
    # are discarded.
    melted = df.melt(
        id_vars=["SampleID", id_var, "Source"],
        var_name="Metric",
        value_name="Value",
    ).dropna()

    grid = sns.catplot(
        data=melted,
        x=id_var,
        y="Value",
        # Recent seaborn deprecates `palette` without `hue`, so colour by the
        # x variable explicitly.
        hue=id_var,
        palette=color_palette,
        dodge=False,   # keep one full-width element centred on each tick
        legend=False,  # the x tick labels already identify each colour
        col="Metric",
        col_wrap=3,
        kind=plot_type,
        sharey=False,
    )
    # `.figure` is the supported accessor; `.fig` is a deprecated alias.
    figure = grid.figure
    figure.suptitle(f'Summary of Statistics for {src}', y=1.05)

    pdf_output.savefig(figure, bbox_inches='tight', dpi=300)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(figure)

def plot_stats(df: pd.DataFrame):
    """Write one summary figure per ``Source`` group, then close the PDF.

    Groups named ``bmock12`` or ``camisimGI`` are drawn as bar plots against
    the ``Source/Pipeline`` column using all of their rows. Every other group
    is drawn as box plots against ``Pipeline`` after dropping rows whose
    ``SampleID`` is ``'Average'``.

    Args:
        df: Combined stats table containing at least ``Source`` and
            ``SampleID`` columns, plus the id column used by each group.

    Side effects:
        Closes the module-level ``pdf_output``, even when plotting fails, so
        the handle has to be recreated before this is called again.
    """
    bar_sources = {"bmock12", "camisimGI"}

    try:
        for src, group in df.groupby("Source"):
            if src in bar_sources:
                make_catplot(group, "Source/Pipeline", src, "bar")
            else:
                no_average = group.loc[group['SampleID'] != 'Average']
                make_catplot(no_average, "Pipeline", src, "box")
    finally:
        # Always finalise the PDF so an error in one plot does not leave the
        # file open and incomplete.
        pdf_output.close()

In [181]:
# Load and combine every stats CSV, then write all figures to the PDF.
# plot_stats() closes pdf_output, so re-create it before re-running this cell.
plot_stats(combine_stats())