## Notebook to plot Aitchison Distance versus Read Length

In [None]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

In [None]:
# Load the per-community read statistics and key the table by community name.
reads_df = pd.read_csv('../../paper/read_stats.csv', encoding='utf-8').set_index("Community")

# Old index label -> new index label. Labels that are not listed stay as-is.
reads_recode = {
    "sub_bmock12": "S1_bmock12",
    "amos_hilo": "hilo",
    "amos_mixed": "mixed",
    "S1_camisim": "S1_camisimGI",
    "S2_camisim": "S2_camisimGI",
    "Mix-A_nist": "MIX-A_nist",
    "Mix-B_nist": "MIX-B_nist",
    "Mix-C_nist": "MIX-C_nist",
    "Mix-D_nist": "MIX-D_nist",
}

# Relabel the index (row labels), not the columns.
reads_df = reads_df.rename(index=reads_recode)
display(reads_df)

In [None]:
def make_joined(
    df_left: pd.DataFrame,
    df_right: pd.DataFrame,
    first_split_col: int = 13,
    sep: str = "+-",
) -> pd.DataFrame:
    """Left-join ``df_right`` onto ``df_left`` and split the trailing columns.

    Rows of ``df_left`` are matched through their ``combined_name`` column
    against the index of ``df_right``. Left rows without a match are kept
    (left join). Every column of the merged frame from position
    ``first_split_col`` onwards is then passed through ``split_cols``, which
    separates each value at ``sep`` into the column itself and a sibling
    ``<column>_stddev`` column.

    Parameters:
        df_left: pd.DataFrame
            Frame holding a ``combined_name`` column used as the join key.
        df_right: pd.DataFrame
            Frame whose index holds the values to match ``combined_name`` against.
        first_split_col: int
            Positional index of the first merged column to split. The default
            (13) is the offset this notebook was written with.
            NOTE(review): presumably the number of columns in the stats table,
            so that only columns coming from ``df_right`` are split -- confirm.
        sep: str
            Separator between a value and its standard deviation.
    Returns:
        pd.DataFrame
            The merged frame with the selected columns split in two.
    """
    joined = pd.merge(df_left, df_right, how="left", left_on="combined_name", right_index=True)

    # Snapshot the column names first: split_cols appends new "<col>_stddev"
    # columns, which must not be revisited by this loop.
    for col in joined.columns.to_list()[first_split_col:]:
        joined = split_cols(joined, col, sep=sep)

    return joined


def split_cols(df: pd.DataFrame, col: str, sep: str = ',') -> pd.DataFrame:
    """Split a string column into a value column and a ``<col>_stddev`` column.

    Each value of ``df[col]`` is split at the first occurrence of ``sep``
    (taken literally, not as a regular expression). The text before the
    separator stays in ``col``; the text after it goes to ``f"{col}_stddev"``.
    Rows without the separator (or missing values) get NaN in the stddev column.

    Parameters:
        df: pd.DataFrame
            Input frame; it is not modified.
        col: str
            Name of the string column to split.
        sep: str
            Literal separator between the two parts.
    Returns:
        pd.DataFrame
            A copy of ``df`` with ``col`` replaced and ``<col>_stddev`` added.
    """
    df = df.copy()

    # n=1 caps the result at two parts even when a value contains the
    # separator more than once; regex=False makes escaping unnecessary.
    parts = df[col].str.split(sep, n=1, expand=True, regex=False)

    # When no row contains the separator (e.g. a column that is entirely NaN
    # after an unmatched left merge) the split yields fewer than two columns.
    # Force exactly two so the assignment below cannot fail on a shape mismatch.
    parts = parts.reindex(columns=[0, 1])

    df[col] = parts[0]
    df[f'{col}_stddev'] = parts[1]
    return df

In [None]:
stats_df = pd.read_csv("../results/all_stats_species_2023-03-14.csv")
# Join key used by make_joined to look rows up in reads_df.
stats_df["combined_name"] = stats_df["SampleID"] + "_" + stats_df["Source"]

# We only want thresholds == 0.0001
stats_df = stats_df[stats_df["threshold"] == 0.0001]
stats_df = stats_df[stats_df["Pipeline"] != "jams202212"]

# Sources whose rows are collapsed into a single mean row per pipeline and
# looked up in reads_df under the bare source name.
averaged_sources = {"hilo", "mixed", "tourlousse"}

# Collect the per-group frames and concatenate once at the end, instead of
# growing a DataFrame with pd.concat on every iteration.
joined_frames = []

# Only one threshold is left after the filter above, so grouping by
# (Pipeline, Source) visits the same groups, in the same order, as nesting
# the groupbys under a per-threshold loop.
for (pl, src), src_df in stats_df.groupby(["Pipeline", "Source"]):
    if src in averaged_sources:
        # Work on a copy so the appended summary row never touches the frame
        # handed out by groupby.
        averaged = src_df.copy()
        averaged.loc['mean'] = averaged.mean(numeric_only=True, axis=0)
        mean = averaged.loc[['mean']].copy()
        # The mean row has NaN in every non-numeric column; restore the
        # identifying fields needed for the join and for later grouping.
        mean["Source"] = src
        mean["Pipeline"] = pl
        mean["combined_name"] = src
        joined_frames.append(make_joined(mean, reads_df))
    else:
        joined_frames.append(make_joined(src_df, reads_df))

# pd.concat raises on an empty list, so fall back to an empty frame.
combined_df = pd.concat(joined_frames) if joined_frames else pd.DataFrame()

combined_df.to_csv("stats_reads_combined.csv", index=False)

In [None]:
# Cast the columns used for plotting and correlation to float (values that
# came out of a string split upstream would otherwise remain strings).
combined_df = combined_df.astype(
    {column: float for column in ("AD", "avg_len", "num_seqs", "sum_len")}
)

def decorate_sns() -> None:
    """Configure seaborn's global plot appearance for this notebook.

    Calls, in order: ``set_theme`` with the "whitegrid" style, ``set_context``
    with the "paper" context and a 1.5 font scale, and ``set_style`` with
    "ticks". Takes no arguments and returns nothing; the effect is on
    seaborn's global settings.
    """
    # Start from seaborn's theme, then adjust context and style on top of it.
    sns.set_theme(style="whitegrid")
    sns.set_context("paper", font_scale=1.5)
    # NOTE(review): this runs after set_theme(style="whitegrid"), so "ticks" is
    # presumably the style that ends up active -- confirm that is intended.
    sns.set_style("ticks")

def pearson_df(df: pd.DataFrame, x: str) -> pd.DataFrame:
    """
    Run a Pearson correlation test between column ``x`` and ``AD`` for each pipeline.

    The input is grouped by its ``Pipeline`` column and ``scipy.stats.pearsonr``
    is evaluated once per group.

    Parameters:
        df: pd.DataFrame
            The dataframe to test; must contain ``Pipeline``, ``AD`` and ``x``.
        x: str
            Name of the column to correlate with ``AD``.
    Returns:
        pd.DataFrame
            One row per pipeline with the columns ``Pipeline``, ``Pearson``
            (correlation coefficient) and ``p`` (p-value). The index is a plain
            0..n-1 range. Empty input yields an empty frame that still carries
            the three columns.
    """
    records = []
    for pipeline, group in df.groupby("Pipeline"):
        coeff, p_value = pearsonr(group[x], group['AD'])
        records.append({"Pipeline": pipeline, "Pearson": coeff, "p": p_value})

    # Build the frame once: this gives every row a unique index label and
    # keeps the column layout stable even when there are no groups.
    return pd.DataFrame(records, columns=["Pipeline", "Pearson", "p"])


# Use avg_len, num_seqs, sum_len
def make_plots():
    """Draw one regression plot of AD against each read statistic.

    Reads the module-level ``combined_df`` and plots ``AD`` against
    ``avg_len``, ``num_seqs`` and ``sum_len`` in turn, coloured by pipeline.

    For ``avg_len`` the per-pipeline Pearson statistics are also written to
    ``pearson_avg_len.tex`` and the titled, labelled figure is saved as
    ``ad_vs_readlength.pdf``. The other two statistics are fitted with
    ``logx=True`` and shown on a logarithmic x-axis; they are displayed but
    not saved.
    """
    # Arguments common to every lmplot call below.
    shared_kwargs = dict(
        data=combined_df,
        y="AD",
        hue="Pipeline",
        height=5,
        aspect=1.5,
        ci=None,
        fit_reg=True,
    )

    for stat in ("avg_len", "num_seqs", "sum_len"):
        if stat != "avg_len":
            # Regress on log(x) and draw the x-axis on a log scale to match.
            grid = sns.lmplot(x=stat, logx=True, **shared_kwargs)
            grid.set(xscale="log")
        else:
            grid = sns.lmplot(x=stat, **shared_kwargs)

            # The correlation table is exported to LaTeX rather than drawn on
            # the figure; the Styler is what allows hiding the index column.
            correlations = pearson_df(combined_df, stat)
            correlations.style.hide(axis="index").to_latex("pearson_avg_len.tex")

            grid.fig.suptitle("Aitchison Distance vs Average Read Length by Pipeline", y=1.05)
            grid.set_ylabels("Aitchison Distance")
            grid.set_xlabels("Average Read Length (bp)")

            grid.fig.savefig("ad_vs_readlength.pdf", bbox_inches='tight', dpi=300)

        plt.show()

# Apply the seaborn styling first so that it is in effect when the figures
# are drawn, then render the plots.
decorate_sns()
make_plots()