# Summary Tables
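This notebook builds one summary table per metric (FPRA, Sens, AD), with communities as rows and pipelines as columns, and writes each table to `<metric>.csv`. The tables follow this format: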

![wanted_output](format2.png)

In [182]:
import pandas as pd
import numpy as np
import seaborn as sns
from typing import Tuple, List

# Display names for the replicate sources (raw "Source" value -> table label).
rename_dict = {
    "hilo": "Amos HiLo",
    "mixed": "Amos Mixed",
    "tourlousse": "Tourlousse",
}

# Metric columns that get one summary table each.
metrics = ["FPRA", "Sens", "AD"]

# Pipelines that are kept in the summary tables.
wanted_pipelines = [
    "biobakery3",
    "biobakery4",
    "jams",
    "wgsa2",
    "woltka",
]

def initialize_datasets(
    path: str = '../results/all_stats_species.csv',
    threshold: float = 0.0001,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the species-level statistics and split them into two tables.

    Rows are first restricted to the requested ``threshold`` and to the
    pipelines listed in the module-level ``wanted_pipelines``.

    Args:
        path: CSV file to read. It must provide the columns ``threshold``,
            ``Source``, ``Pipeline`` and every name in ``metrics``.
        threshold: Value of the ``threshold`` column to keep.

    Returns:
        A ``(replicates, one_to_one)`` tuple.

        ``replicates`` has one row per (Source, Pipeline) pair for the sources
        "mixed", "hilo" and "tourlousse". Its columns are two-level: for every
        metric there are ``count``, ``mean`` and ``std`` sub-columns, followed
        by the ``Pipeline`` and ``Source`` key columns.

        ``one_to_one`` holds the unaggregated rows of every other source.
    """
    df = pd.read_csv(path)

    # The threshold column is parsed from text, so compare with a tight relative
    # tolerance rather than relying on exact float equality.
    df = df.loc[np.isclose(df["threshold"], threshold, rtol=1e-9, atol=0.0)]

    # Sources whose samples are summarised together (count / mean / std).
    replicate_comm = ["mixed", "hilo", "tourlousse"]
    is_replicate = df["Source"].isin(replicate_comm)
    is_wanted = df["Pipeline"].isin(wanted_pipelines)

    one_to_one = df.loc[~is_replicate & is_wanted]

    # A single group-by is enough: each (Source, Pipeline) pair yields exactly
    # one summary row, ordered by Source and then Pipeline.
    replicates = (
        df.loc[is_replicate & is_wanted]
        .groupby(["Source", "Pipeline"])
        .agg({metric: ["count", "mean", "std"] for metric in metrics})
        .reset_index()
    )
    # Metrics first, key columns last (selection is on the first column level).
    replicates = replicates.loc[:, metrics + ["Pipeline", "Source"]]

    return replicates, one_to_one

# Build both tables once; the cells below read `replicates` and `one_to_one`
# as module-level names.
replicates, one_to_one = initialize_datasets()

def make_replicate_df(replicates: pd.DataFrame, metric: str) -> pd.DataFrame:
    """Build the per-pipeline summary table of one metric for the replicate sources.

    Args:
        replicates: Frame with two-level columns: ``Pipeline`` and ``Source``
            key columns plus, under ``metric``, three sub-columns. The
            sub-columns are relabelled by position, so they are assumed to be
            in the order count, mean, std.
        metric: First-level column name of the metric to summarise.

    Returns:
        A frame indexed by "Community" (source names mapped through the
        module-level ``rename_dict``) whose columns are a two-level index of
        (pipeline, statistic), with the statistics "n", "Mean" and "Stdev"
        for every pipeline and pipelines in sorted order.
    """
    # Selecting on the first column level keeps the two key columns and the
    # three statistics of the metric; dropping that level leaves only the
    # second-level labels, which are then replaced by position.
    stats = replicates[["Pipeline", "Source", metric]].droplevel(0, axis=1)
    stats.columns = ["Pipeline", "Source", "n", "Mean", "Stdev"]

    # Wide layout: one row per source, columns are (statistic, pipeline).
    wide = stats.pivot(index="Source", columns="Pipeline", values=["n", "Mean", "Stdev"])
    wide = wide.rename(index=rename_dict)

    # Regroup the columns as (pipeline, statistic). This is done explicitly
    # because DataFrame.groupby(axis=1) is deprecated in recent pandas.
    pipelines = sorted(wide.columns.get_level_values(1).unique())
    reorganized = pd.concat(
        {pipeline: wide.xs(pipeline, axis=1, level=1) for pipeline in pipelines},
        axis=1,
        names=[wide.columns.names[1]],
    )
    reorganized.index.name = "Community"

    return reorganized



In [183]:
# Display names for the single-sample communities, keyed by "<SampleID> <Source>".
one_to_one_rename = {
    "EG nist": "NIST EG",
    "MIX-A nist": "NIST MIX-A",
    "MIX-B nist": "NIST MIX-B",
    "MIX-C nist": "NIST MIX-C",
    "MIX-D nist": "NIST MIX-D",
    "S1 bmock12": "BMock12",
    "S1 camisimGI": "CamiSim S1",
    "S2 camisimGI": "CamiSim S2",
}
def make_one_to_one_dfs(df: pd.DataFrame) -> pd.DataFrame:
    """Pivot the single-sample rows into a community-by-pipeline table.

    Args:
        df: Frame with the columns ``SampleID``, ``Source``, ``Pipeline``,
            ``FPRA``, ``Sens`` and ``AD``. It is not modified.

    Returns:
        A frame indexed by "Community" (``"<SampleID> <Source>"``, mapped
        through ``one_to_one_rename`` where a display name exists) with
        two-level columns (value, pipeline). The values are "FPRA", "Sens",
        "AD", "n" (always 1) and "Stdev" (always NaN).

    Raises:
        ValueError: Raised by ``DataFrame.pivot`` when a (Community, Pipeline)
            pair occurs more than once.
    """
    # The community label is built from the frame that was passed in, not from
    # a notebook-level variable, so the function works on any input.
    # ``assign`` returns a new frame, so the caller's data stays untouched and
    # no chained-assignment warning is triggered.
    long_df = df[["FPRA", "Sens", "AD", "Pipeline"]].assign(
        Community=df["SampleID"] + " " + df["Source"],
        # Each row is a single sample: n is 1 and there is no spread to report.
        n=1,
        Stdev=np.nan,
    )

    # We want the pipelines as the columns.
    piv_df = long_df.pivot(
        index="Community",
        columns="Pipeline",
        values=["FPRA", "Sens", "AD", "n", "Stdev"],
    )
    piv_df = piv_df.rename(index=one_to_one_rename)

    return piv_df


def reorganize_df(df: pd.DataFrame) -> pd.DataFrame:
    """Regroup two-level columns so that the second level becomes the outer one.

    Args:
        df: Frame whose columns are a two-level index, e.g. (value, pipeline).

    Returns:
        A frame with the same rows whose columns are (second level, first
        level). The new outer keys appear in sorted order and carry the name
        of the original second level; within each key the original column
        order is kept.
    """
    # Done explicitly with xs/concat because DataFrame.groupby(axis=1) is
    # deprecated in recent pandas.
    outer_keys = sorted(df.columns.get_level_values(1).unique())
    per_key = {key: df.xs(key, axis=1, level=1) for key in outer_keys}
    reorganized = pd.concat(per_key, axis=1, names=[df.columns.names[1]])
    return reorganized


# one_to_one_copy.to_csv("one_to_one.csv", index=True)

In [196]:
piv_df = make_one_to_one_dfs(one_to_one)

# In the single-sample table the metric value itself fills the "Mean" column.
rename_cols = {"FPRA": "Mean", "Sens": "Mean", "AD": "Mean"}

for metric in metrics:
    print(metric)
    reorganized_replicate = make_replicate_df(replicates, metric)

    # Per-pipeline layout of the single-sample rows, sorted alphabetically by
    # community and with the metric column relabelled as "Mean".
    reorganized_one_to_one = (
        reorganize_df(piv_df[["n", metric, "Stdev"]])
        .sort_index()
        .rename(columns=rename_cols)
    )

    # Single-sample communities on top, replicate communities underneath.
    stacked = pd.concat([reorganized_one_to_one, reorganized_replicate], axis=0)

    # Keep only the first "n" column; drop the ones of the remaining pipelines.
    n_mask = stacked.columns.get_level_values(1) == "n"
    stacked = stacked.drop(stacked.columns[n_mask][1:], axis=1)
    display(stacked)

    stacked.to_csv(f"{metric}.csv")

# dfs = [piv_df[["n", i, "Stdev"]] for i in metrics]

# for df in dfs:
#     reorganized = reorganize_df(df)
#     display(reorganized)


FPRA


Pipeline,biobakery3,biobakery3,biobakery3,biobakery4,biobakery4,jams,jams,wgsa2,wgsa2,woltka,woltka
Unnamed: 0_level_1,n,Mean,Stdev,Mean,Stdev,Mean,Stdev,Mean,Stdev,Mean,Stdev
Community,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
BMock12,1.0,2.6303,,17.9266,,50.0438,,37.3442,,11.9647,
CamiSim S1,1.0,0.5265,,3.0566,,1.4388,,2.0432,,8.5091,
CamiSim S2,1.0,0.0586,,3.8344,,0.2055,,3.9102,,30.6668,
NIST EG,1.0,2.6645,,0.0,,0.2104,,1.1601,,26.4802,
NIST MIX-A,1.0,0.8464,,0.0,,0.052,,0.6781,,30.9152,
NIST MIX-B,1.0,11.4154,,0.0141,,0.2655,,1.0866,,31.0568,
NIST MIX-C,1.0,32.0567,,32.3094,,22.3335,,9.162,,69.1682,
NIST MIX-D,1.0,0.0607,,0.1024,,0.0194,,0.2506,,13.1948,
Amos HiLo,5.0,0.00954,0.021332,0.0,0.0,3.2694,0.115418,4.06548,0.09676,22.23968,0.205998
Amos Mixed,5.0,3.86412,0.220681,0.0,0.0,9.75528,1.431971,14.06352,0.115006,27.3918,0.30741


Sens


Pipeline,biobakery3,biobakery3,biobakery3,biobakery4,biobakery4,jams,jams,wgsa2,wgsa2,woltka,woltka
Unnamed: 0_level_1,n,Mean,Stdev,Mean,Stdev,Mean,Stdev,Mean,Stdev,Mean,Stdev
Community,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
BMock12,1.0,33.3333,,58.3333,,50.0,,50.0,,8.3333,
CamiSim S1,1.0,100.0,,89.4737,,97.3684,,97.3684,,86.8421,
CamiSim S2,1.0,100.0,,95.2381,,100.0,,100.0,,80.9524,
NIST EG,1.0,85.7143,,92.8571,,92.8571,,100.0,,85.7143,
NIST MIX-A,1.0,81.8182,,90.9091,,72.7273,,100.0,,90.9091,
NIST MIX-B,1.0,72.7273,,72.7273,,81.8182,,100.0,,90.9091,
NIST MIX-C,1.0,63.6364,,63.6364,,90.9091,,100.0,,81.8182,
NIST MIX-D,1.0,72.7273,,72.7273,,81.8182,,100.0,,81.8182,
Amos HiLo,5.0,89.4737,0.0,100.0,0.0,98.94736,2.353775,89.4737,0.0,94.7368,0.0
Amos Mixed,5.0,94.7368,0.0,100.0,0.0,100.0,0.0,89.4737,0.0,94.7368,0.0


AD


Pipeline,biobakery3,biobakery3,biobakery3,biobakery4,biobakery4,jams,jams,wgsa2,wgsa2,woltka,woltka
Unnamed: 0_level_1,n,Mean,Stdev,Mean,Stdev,Mean,Stdev,Mean,Stdev,Mean,Stdev
Community,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
BMock12,1.0,22.1457,,16.8327,,33.7177,,44.16,,42.1407,
CamiSim S1,1.0,8.6188,,12.7647,,15.0361,,13.4764,,30.7596,
CamiSim S2,1.0,1.2308,,7.1138,,4.8525,,6.3786,,25.8766,
NIST EG,1.0,10.1826,,8.9446,,12.2531,,10.6638,,22.5953,
NIST MIX-A,1.0,9.5852,,2.8426,,12.3527,,6.9975,,19.242,
NIST MIX-B,1.0,13.6522,,9.814,,13.5504,,13.0792,,22.8751,
NIST MIX-C,1.0,14.1087,,11.569,,17.1601,,14.702,,24.1026,
NIST MIX-D,1.0,4.1345,,4.1098,,11.2166,,8.9316,,21.1155,
Amos HiLo,5.0,8.40252,1.069797,2.71252,0.040068,12.79564,0.341404,12.43696,0.28674,16.3845,0.11232
Amos Mixed,5.0,8.76744,0.774772,1.99916,0.055893,10.42296,0.358741,14.81574,0.051594,20.62992,0.216708
