# Read Stats
This notebook will take the directory of trimmed read stats, concatenate for an overall dataframe, then generate the wanted table.

In [35]:
import pandas as pd
import numpy as np
import os

In [36]:
trim_reads_path = "/Volumes/TBHD_share/valencia/pipelines/read_stats/trimmed"
files = os.listdir(trim_reads_path)
files = [f for f in files if f.endswith(".out")]
print(files)

['camisim_trim.out', 'tourlousse_trim.out', 'amos_hilo_trim.out', 'bmock12_trim.out', 'nist_trim.out', 'amos_mixed_trim.out', 'hmp_gut_trim.out']


In [91]:
def make_df(file: str, name: str) -> pd.DataFrame:
    df = pd.read_csv(os.path.join(trim_reads_path, file), delim_whitespace=True, header=None, names=col_names)
    df["Community"] = name

    # We want to split the file column on the final / and take the last element.
    df["file"] = df["file"].apply(lambda x: x.split("/")[-1])

    # Sort the dataframe by the file column.
    df = df.sort_values("file")

    # Convert num_seqs and sum_len to integers.
    def convert_column(col: str) -> None:
        df[col] = df[col].apply(lambda x: int(x.replace(",", "")))
        df[col].astype(int)

    convert_column("num_seqs")
    convert_column("sum_len")

    df.set_index("file", inplace=True)

    return df

col_names = ["file", "format", "type", "num_seqs", "sum_len", "min_len", "avg_len", "max_len"]

def parse_files() -> pd.DataFrame:
    replicates_df = pd.DataFrame()
    one_to_one_df = pd.DataFrame()

    for f in files:
        if "bmock12" in f or "camisim" in f or "nist" in f:
            name = f.split("_")[0]
            df = make_df(f, name)

            one_to_one_df = pd.concat([one_to_one_df, df])

        else:
            # These are the replicate studies.
            split = f.split("_")
            split.pop()
            name = "_".join(split)

            df = make_df(f, name)

            # We want to add a row for the average of the replicates.
            average = df.mean(numeric_only=True)
            stddev = df.std(numeric_only=True)

            col_labels = ["Community"] + average.index.to_list()
            print(col_labels)

            # How many decimals do we want to show?
            new_row = [f"{i:.2f} ± {stddev[c]:.2f}" for c, i in enumerate(average)]
            new_row.insert(0, name)

            replicates_df = pd.concat([replicates_df, pd.DataFrame([new_row], columns=col_labels)])

    return one_to_one_df, replicates_df

one_to_one, replicates_df = parse_files()
one_to_one_stats = pd.DataFrame()

one_to_one = one_to_one.loc[(one_to_one.index != "Neg_S6_L001_R1.fastq") & (one_to_one.index != "Neg_S6_L001_R2.fastq")]
for comm, pl_df in one_to_one.groupby("Community"):
    pl_df = pl_df[["num_seqs", "sum_len", "min_len", "avg_len", "max_len"]]
    pl_df.reset_index(inplace=True)
    # We want to average every two rows
    pl_df_avg_grp = pl_df.groupby(pl_df.index // 2)
    for name, group in pl_df_avg_grp:
        new_name = group.iloc[0]["file"].split("_")[0] + f"_{comm}"
        grp_avg = group.mean(numeric_only=True)
        grp_std = group.std(numeric_only=True)

        col_labels = ["Community"] + grp_avg.index.to_list()

        new_row = [f"{i:.2f} ± {grp_std[c]:.2f}" for c, i in enumerate(grp_avg)]
        new_row.insert(0, new_name)

        one_to_one_stats = pd.concat([one_to_one_stats, pd.DataFrame([new_row], columns=col_labels)])

final_df = pd.concat([one_to_one_stats, replicates_df])
final_df.set_index("Community", inplace=True)
display(final_df)

final_df.to_csv("read_stats.csv", index=True)

['Community', 'num_seqs', 'sum_len', 'min_len', 'avg_len', 'max_len']
['Community', 'num_seqs', 'sum_len', 'min_len', 'avg_len', 'max_len']
['Community', 'num_seqs', 'sum_len', 'min_len', 'avg_len', 'max_len']
['Community', 'num_seqs', 'sum_len', 'min_len', 'avg_len', 'max_len']


Unnamed: 0_level_0,num_seqs,sum_len,min_len,avg_len,max_len
Community,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
sub_bmock12,100500000.00 ± 0.00,15175500000.00 ± 0.00,151.00 ± 0.00,151.00 ± 0.00,151.00 ± 0.00
S1_camisim,16666158.00 ± 0.00,2473504212.50 ± 12668039.31,31.00 ± 0.00,148.45 ± 0.78,150.00 ± 0.00
S2_camisim,16666291.00 ± 0.00,2473723888.00 ± 12664634.59,31.00 ± 0.00,148.45 ± 0.78,150.00 ± 0.00
EG_nist,3353278.00 ± 0.00,282025983.00 ± 3833426.68,15.00 ± 0.00,84.10 ± 1.13,151.00 ± 0.00
Mix-A_nist,3473553.00 ± 0.00,365925482.50 ± 2529868.96,19.00 ± 5.66,105.35 ± 0.78,151.00 ± 0.00
Mix-B_nist,3583192.00 ± 0.00,376371686.50 ± 2769876.56,18.50 ± 4.95,105.05 ± 0.78,151.00 ± 0.00
Mix-C_nist,2974354.00 ± 0.00,325816856.00 ± 2268975.55,16.50 ± 2.12,109.55 ± 0.78,151.00 ± 0.00
Mix-D_nist,3278203.00 ± 0.00,373347674.50 ± 2103394.48,20.00 ± 7.07,113.85 ± 0.64,151.00 ± 0.00
tourlousse,5897627.83 ± 378678.85,855639825.08 ± 51884672.55,15.00 ± 0.00,145.12 ± 1.31,151.00 ± 0.00
amos_hilo,1899005.80 ± 172132.60,380831440.00 ± 34742124.05,100.00 ± 0.00,200.54 ± 1.93,205.00 ± 0.00
