In [53]:
import numpy as np
import pandas as pd
import seaborn as sns

from functools import reduce

import sys
sys.path.append("../../") # noqa

from typing import Generator, Tuple

from python_src.figures_utils import get_all_expected, generate_experimental_df, get_relabund_files, fully_combined

# NIST Sample Sensitivity Analysis

Our goal is to use the five NIST samples to count, for each bacterial species and each pipeline, the number of true positives (TP), true negatives (TN), false positives (FP), and false negatives (FN).

In [47]:
# Read the expected data (rank="species") into one long dataframe; the "Source" column is not needed here.
expected_df = get_all_expected("../../expected_pipelines/nist/", rank="species").drop(columns=["Source"])
display(expected_df)

# Move the index into ordinary columns, then spread the SampleID values out into one RA column each.
expected_df = expected_df.reset_index()
df = expected_df.pivot(index=["species", "TAX_ID"], columns="SampleID", values="RA")
df

Unnamed: 0_level_0,RA,TAX_ID,SampleID
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ESCHERICHIA_COLI,0.0880,562,EG
STAPHYLOCOCCUS_AUREUS,0.0880,1280,EG
NEISSERIA_MENINGITIDIS,0.0880,487,EG
SALMONELLA_ENTERICA,0.0760,28901,EG
ACINETOBACTER_BAUMANNII,0.0760,470,EG
...,...,...,...
ENTEROCOCCUS_FAECALIS,0.0390,1351,MIX-C
VIBRIO_FURNISSII,0.0390,29494,MIX-C
LISTERIA_MONOCYTOGENES,0.0390,1639,MIX-C
LEGIONELLA_PNEUMOPHILA,0.0054,446,MIX-C


Unnamed: 0_level_0,SampleID,EG,MIX-A,MIX-B,MIX-C,MIX-D
species,TAX_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ACHROMOBACTER_XYLOSOXIDANS,85698,0.072,0.0,0.029,0.291,0.00023
ACINETOBACTER_BAUMANNII,470,0.076,0.026,0.301,0.00031,0.0
AEROMONAS_HYDROPHILA,644,0.0014,0.0047,0.0053,0.0054,0.0043
ENTEROCOCCUS_FAECALIS,1351,0.096,0.00033,0.0,0.039,0.3
ESCHERICHIA_COLI,562,0.088,0.304,0.00035,0.0,0.02758
KLEBSIELLA_PNEUMONIAE,573,0.076,0.026,0.301,0.00031,0.0
LEGIONELLA_PNEUMOPHILA,446,0.0014,0.0047,0.0053,0.0054,0.0043
LISTERIA_MONOCYTOGENES,1639,0.096,0.00033,0.0,0.039,0.3
NEISSERIA_MENINGITIDIS,487,0.088,0.304,0.00035,0.0,0.02758
SALMONELLA_ENTERICA,28901,0.076,0.026,0.301,0.00031,0.0


In [48]:
# Build the combined table from the NIST pipeline and expected directories.
# `fc` is read as a global by make_dataframes() further down.
fc = fully_combined(root_dir="../../pipelines/nist", expected_root="../../expected_pipelines/nist", rank="species")
display(fc)

# Load the relative-abundance files for the "negatives"; in the visible cells this table is only displayed.
negatives = get_relabund_files("negatives", rank="species")
display(negatives)

Unnamed: 0_level_0,species,RA,Source,SampleID
TAX_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
562,ESCHERICHIA_COLI,0.088000,Expected,EG
1280,STAPHYLOCOCCUS_AUREUS,0.088000,Expected,EG
487,NEISSERIA_MENINGITIDIS,0.088000,Expected,EG
28901,SALMONELLA_ENTERICA,0.076000,Expected,EG
470,ACINETOBACTER_BAUMANNII,0.076000,Expected,EG
...,...,...,...,...
487,NEISSERIA_MENINGITIDIS,0.314772,biobakery4,MIX-A
28901,SALMONELLA_ENTERICA,0.023447,biobakery4,MIX-A
1280,STAPHYLOCOCCUS_AUREUS,0.295014,biobakery4,MIX-A
1314,STREPTOCOCCUS_PYOGENES,0.000000,biobakery4,MIX-A


Unnamed: 0_level_0,RA,TAX_ID,Source,SampleID
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Staphylococcus cohnii,0.064487,29382,wgsa2,NEG
unclassified,0.039964,12908,wgsa2,NEG
Escherichia coli,0.027248,562,wgsa2,NEG
Enterococcus faecalis,0.019982,1351,wgsa2,NEG
Vibrio furnissii,0.011807,29494,wgsa2,NEG
Listeria monocytogenes,0.007266,1639,wgsa2,NEG
Streptococcus pyogenes,0.00545,1314,wgsa2,NEG
Achromobacter xylosoxidans,0.004541,85698,wgsa2,NEG
Klebsiella pneumoniae,0.003633,573,wgsa2,NEG
Homo sapiens,0.003633,9606,wgsa2,NEG


In [54]:
def make_dataframes(combined: pd.DataFrame = None) -> Generator[pd.DataFrame, None, None]:
    """
    Yield one wide expected-vs-observed table per SampleID.

    Parameters:
        combined: pd.DataFrame, optional
            Long table indexed by TAX_ID with the columns "species", "RA", "Source" and
            "SampleID". Rows whose "Source" is "Expected" are the reference; every other
            "Source" value is treated as a pipeline. When omitted, the notebook-level
            `fc` table is used, so existing zero-argument calls keep working.

    Yields:
        pd.DataFrame
            Indexed by TAX_ID (the expected rows of that sample) with the columns
            "species", "RA_exp", one "<pipeline>_obs" column per pipeline found in that
            sample, and "SampleID". A species the pipeline has no row for is NaN.
    """
    source = fc if combined is None else combined

    for smpl, smpl_df in source.groupby("SampleID"):
        exp = smpl_df.loc[smpl_df["Source"] == "Expected"]
        obs = smpl_df.loc[smpl_df["Source"] != "Expected"]

        # Start from the expected rows: same index, species name and expected RA.
        final_df = pd.DataFrame({"species": exp["species"], "RA_exp": exp["RA"]})

        obs_columns = []
        for pl, pl_df in obs.groupby("Source"):
            # Left join on the TAX_ID index level so every expected species keeps a row,
            # with NaN where this pipeline has no entry for it.
            merged = exp.merge(pl_df, on="TAX_ID", how="left", suffixes=("_exp", "_obs"))
            obs_columns.append(merged["RA_obs"].rename(f"{pl}_obs"))

        # Attach all pipeline columns in one go instead of re-concatenating per pipeline.
        final_df = pd.concat([final_df, *obs_columns], axis=1, sort=False)

        final_df.index.name = "TAX_ID"
        final_df["SampleID"] = smpl
        yield final_df

# NOTE: the string literal below is a disabled draft of `calculate_stats`. As a bare
# expression statement it defines nothing and has no effect when the cell runs.
"""
def calculate_stats(df: pd.DataFrame, pl: str):
    df = df.copy()
    df.fillna(0, inplace=True)
    # TP condition: species is in expected and observed = 0
    # FP condition: species is in observed but not expected = 1
    # FN condition: species is in expected but not observed = 2
    # TN condition: species is not in expected or observed = 3

    conditions = [
        (df["RA_exp"] > 0) & (df[f"{pl}_obs"] > 0),
        (df["RA_exp"] == 0) & (df[f"{pl}_obs"] > 0),
        (df["RA_exp"] > 0) & (df[f"{pl}_obs"] == 0),
        (df["RA_exp"] == 0) & (df[f"{pl}_obs"] == 0),
    ]

    choices = [0, 1, 2, 3]

    df["score"] = np.select(conditions, choices)

    # Now, summarize the number of each condition

    summary = df["score"].value_counts()
    print(summary)

    display(df.head()) 
"""

def pivot_df(
    dataframes: Generator[pd.DataFrame, None, None],
    pipelines: Tuple[str, ...] = ("biobakery3", "biobakery4", "jams", "wgsa2", "woltka"),
) -> Generator[Tuple[pd.DataFrame, str], None, None]:
    """
    Takes the dataframes from make_dataframes and pivots them so that the SampleID values are the columns.

    Parameters:
        dataframes: Generator[pd.DataFrame]
            The dataframes from make_dataframes. Each one must have the columns
            "species", "RA_exp", "SampleID" and "<pipeline>_obs" for every requested pipeline.
        pipelines: Tuple[str, ...]
            Pipeline names to produce a table for, in the order they are yielded.

    Returns:
        A generator of (pivoted pd.DataFrame, pipeline name) tuples. Each pivoted table is
        indexed by (species, RA_exp), has one column per SampleID and holds that
        pipeline's "<pipeline>_obs" values.

    Raises:
        ValueError: if `dataframes` yields nothing.
        KeyError: if a requested "<pipeline>_obs" column is missing from a dataframe.
    """
    # The input may be a one-shot generator but is needed once per pipeline, so
    # materialise it up front instead of rebuilding the tables for every pipeline.
    frames = list(dataframes)
    if not frames:
        raise ValueError("pivot_df received no dataframes to pivot")

    for pl in pipelines:
        obs_col = f"{pl}_obs"

        # Stack the per-sample tables for this pipeline with a single concat.
        merged = pd.concat(
            [frame[["species", "RA_exp", obs_col, "SampleID"]] for frame in frames],
            axis=0,
            sort=False,
        )

        # Make the sampleID values the columns
        print(f"Pipeline: {pl}")
        piv = merged.pivot(index=["species", "RA_exp"], columns="SampleID", values=obs_col)

        yield (piv, pl)

In [57]:
def confusion_matrix(df: pd.DataFrame, pl: str, expected_total: int = 5) -> pd.DataFrame:
    """
    Count TP / FP / FN / TN per species from one pipeline's pivoted table.

    Parameters:
        df: pd.DataFrame
            Table as yielded by pivot_df: the index has a "species" level and its second
            level is read as the expected relative abundance; each column is one sample
            and holds the observed relative abundance (NaN where there is no value).
        pl: str
            Pipeline name. Not used in the computation; kept so existing callers keep working.
        expected_total: int
            Number of classifications every species must add up to (5 by default).

    Returns:
        pd.DataFrame with the columns "species", "tp", "fp", "fn", "tn", one row per species.

    Raises:
        ValueError: if a row holds more than one observed value, or if a species'
            counts do not add up to `expected_total`.
    """
    output_list = []
    for spec, spec_df in df.groupby("species"):
        tp, fp, fn, tn = 0, 0, 0, 0

        for ind, row in spec_df.iterrows():
            ra_exp = ind[1]

            # Each row is expected to carry at most one sample's observation.
            observed = row.dropna()
            if len(observed) > 1:
                raise ValueError("More than one value in row")

            # The pipeline did not report the species at all: a false negative if it
            # was expected to be present, otherwise a true negative.
            if observed.empty:
                if ra_exp > 0:
                    fn += 1
                else:
                    tn += 1
                continue

            ra_obs = observed.iloc[0]

            # TP: expected and observed.
            if ra_exp > 0 and ra_obs > 0:
                tp += 1
            # FP: observed but not expected.
            elif ra_exp == 0 and ra_obs > 0:
                fp += 1
            # FN: expected but observed at zero abundance.
            elif ra_exp > 0 and ra_obs == 0:
                fn += 1
            # TN: neither expected nor observed.
            elif ra_exp == 0 and ra_obs == 0:
                tn += 1

        # Every species must have been classified exactly `expected_total` times.
        if tp + fp + fn + tn != expected_total:
            raise ValueError(f"Sum of confusion matrix is not {expected_total}")

        output_list.append([spec, tp, fp, fn, tn])

    output_df = pd.DataFrame(output_list, columns=["species", "tp", "fp", "fn", "tn"])
    return output_df


# Show the per-species confusion table for each pipeline in turn.
for piv in pivot_df(make_dataframes()):
    # Each item is a (pivoted table, pipeline name) pair, passed straight through.
    pipeline_name = piv[-1]
    cf_df = confusion_matrix(*piv)
    display(cf_df)

Pipeline: biobakery3


Unnamed: 0,species,tp,fp,fn,tn
0,ACHROMOBACTER_XYLOSOXIDANS,3,0,1,1
1,ACINETOBACTER_BAUMANNII,3,0,1,1
2,AEROMONAS_HYDROPHILA,4,0,1,0
3,ENTEROCOCCUS_FAECALIS,4,0,0,1
4,ESCHERICHIA_COLI,4,1,0,0
5,KLEBSIELLA_PNEUMONIAE,3,0,1,1
6,LEGIONELLA_PNEUMOPHILA,5,0,0,0
7,LISTERIA_MONOCYTOGENES,3,0,1,1
8,NEISSERIA_MENINGITIDIS,3,0,1,1
9,SALMONELLA_ENTERICA,3,0,1,1


Pipeline: biobakery4


Unnamed: 0,species,tp,fp,fn,tn
0,ACHROMOBACTER_XYLOSOXIDANS,3,0,1,1
1,ACINETOBACTER_BAUMANNII,3,0,1,1
2,AEROMONAS_HYDROPHILA,5,0,0,0
3,ENTEROCOCCUS_FAECALIS,4,0,0,1
4,ESCHERICHIA_COLI,4,1,0,0
5,KLEBSIELLA_PNEUMONIAE,3,0,1,1
6,LEGIONELLA_PNEUMOPHILA,5,0,0,0
7,LISTERIA_MONOCYTOGENES,3,0,1,1
8,NEISSERIA_MENINGITIDIS,3,0,1,1
9,SALMONELLA_ENTERICA,3,1,1,0


Pipeline: jams


Unnamed: 0,species,tp,fp,fn,tn
0,ACHROMOBACTER_XYLOSOXIDANS,3,0,1,1
1,ACINETOBACTER_BAUMANNII,3,0,1,1
2,AEROMONAS_HYDROPHILA,4,0,1,0
3,ENTEROCOCCUS_FAECALIS,3,0,1,1
4,ESCHERICHIA_COLI,4,1,0,0
5,KLEBSIELLA_PNEUMONIAE,4,0,0,1
6,LEGIONELLA_PNEUMOPHILA,5,0,0,0
7,LISTERIA_MONOCYTOGENES,3,0,1,1
8,NEISSERIA_MENINGITIDIS,3,0,1,1
9,SALMONELLA_ENTERICA,4,1,0,0


Pipeline: wgsa2


Unnamed: 0,species,tp,fp,fn,tn
0,ACHROMOBACTER_XYLOSOXIDANS,4,1,0,0
1,ACINETOBACTER_BAUMANNII,4,1,0,0
2,AEROMONAS_HYDROPHILA,5,0,0,0
3,ENTEROCOCCUS_FAECALIS,4,1,0,0
4,ESCHERICHIA_COLI,4,1,0,0
5,KLEBSIELLA_PNEUMONIAE,4,1,0,0
6,LEGIONELLA_PNEUMOPHILA,5,0,0,0
7,LISTERIA_MONOCYTOGENES,4,1,0,0
8,NEISSERIA_MENINGITIDIS,4,1,0,0
9,SALMONELLA_ENTERICA,4,1,0,0


Pipeline: woltka


Unnamed: 0,species,tp,fp,fn,tn
0,ACHROMOBACTER_XYLOSOXIDANS,4,1,0,0
1,ACINETOBACTER_BAUMANNII,4,1,0,0
2,AEROMONAS_HYDROPHILA,5,0,0,0
3,ENTEROCOCCUS_FAECALIS,4,1,0,0
4,ESCHERICHIA_COLI,4,1,0,0
5,KLEBSIELLA_PNEUMONIAE,4,1,0,0
6,LEGIONELLA_PNEUMOPHILA,5,0,0,0
7,LISTERIA_MONOCYTOGENES,4,1,0,0
8,NEISSERIA_MENINGITIDIS,4,1,0,0
9,SALMONELLA_ENTERICA,4,1,0,0
