# Analyze fastq and bam files

In [None]:
import pandas as pd
from tqdm.notebook import tqdm

tqdm.pandas()
from pathlib import Path
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns

## Read the data

In [None]:
from zipfile import ZipFile


def parse_fastqc(zip_path):
    """
    Parse a FastQC zip report and return one DataFrame per module.

    The first archive member whose name contains "fastqc_data.txt" is read.
    Each ">>Module name" section becomes one entry of the returned dict,
    keyed by module name. Modules without any data rows are omitted.

    Parameters
    ----------
    zip_path : path to the FastQC ``*.zip`` report.

    Returns
    -------
    dict mapping module name -> DataFrame. Tables with a "Measure" column
    are indexed by it. Columns that parse as numbers are converted; other
    columns are left as strings. "Length" bins such as "30-39" are reduced
    to their lower bound.

    Raises
    ------
    FileNotFoundError
        If the archive contains no "fastqc_data.txt" member.
    """

    # get data from zip
    lines = None
    with ZipFile(zip_path, "r") as z:
        for file in z.namelist():
            if "fastqc_data.txt" in file:
                with z.open(file) as f:
                    lines = f.read().decode("utf-8").splitlines()
                break
    if lines is None:
        raise FileNotFoundError(f"no fastqc_data.txt found in {zip_path}")

    # Create a dictionary to hold data for each module
    data, headers = {}, {}
    module = None
    for line in lines:
        if line.startswith(">>"):
            if "END_MODULE" in line:
                # close the section so stray lines are not attached to it
                module = None
            else:
                # ">>Module name\tstatus" -> "Module name"
                module = line[2:].split("\t")[0]
                data[module] = []
        elif module is None or not line.strip():
            continue
        elif line.startswith("#"):
            # a later "#" line in the same module replaces an earlier one
            headers[module] = line.strip().lstrip("#").split("\t")
        else:
            data[module].append(line.strip().split("\t"))

    # Convert each module's data to a DataFrame
    dfs = {}
    for module, content in data.items():
        if not content:
            continue

        # a module without a "#" header line gets default integer column names
        columns = headers.get(module)
        df = pd.DataFrame(content, columns=columns)
        if columns and "Measure" in columns:
            df = df.set_index("Measure")

        # try converting each column to numeric
        for c in df.columns:
            if c == "Length":
                # binned lengths look like "30-39"; keep the lower bound
                df[c] = df[c].str.split("-").str[0].astype(int)

            try:
                df[c] = pd.to_numeric(df[c])
            except (ValueError, TypeError):
                # not a numeric column: keep the original strings
                pass

        dfs[module] = df

    return dfs


def parse_flagstat(file):
    """
    Extract read counts from a text-format ``samtools flagstat`` report.

    Lines are matched by their label rather than by position, because the
    number and order of lines differs between samtools versions (newer
    versions add "primary" and "primary duplicates" lines).

    Parameters
    ----------
    file : path to the flagstat text file. Lines are expected to look like
        "<passed> + <failed> <label ...>".

    Returns
    -------
    dict with two integer entries, both taken from the first (QC-passed)
    number on the line:
        "mapped"          count on the "in total" line
        "pcr_duplicates"  count on the "duplicates" line

    Raises
    ------
    ValueError
        If either line cannot be found in the file.
    """
    counts = {}
    with open(file) as f:
        for line in f:
            fields = line.strip().split(None, 3)
            if len(fields) < 4 or fields[1] != "+":
                continue
            label = fields[3]
            if label.startswith("in total"):
                counts.setdefault("mapped", int(fields[0]))
            elif label.startswith("duplicates"):
                # "primary duplicates" starts with "primary", so it is skipped
                counts.setdefault("pcr_duplicates", int(fields[0]))

    missing = [key for key in ("mapped", "pcr_duplicates") if key not in counts]
    if missing:
        raise ValueError(f"{file}: flagstat line(s) not found for {missing}")

    return {
        "mapped": counts["mapped"],
        "pcr_duplicates": counts["pcr_duplicates"],
    }

In [None]:
# Long-format table of read counts: one row per (sample, read, stage).
res = defaultdict(list)

# fastq metrics
print("Reading fastqc")
for f in tqdm(snakemake.input.fastqc):
    # the file name is split as "<sample>_<read>.<stage>.<...>"
    name_parts = Path(f).name.split("_")
    sample = "_".join(name_parts[:-1])
    suffix_parts = name_parts[-1].split(".")
    read = suffix_parts[0]
    stage = suffix_parts[1]

    dfs = parse_fastqc(f)
    # count-weighted means; computed but currently not stored in `res`
    df = dfs["Sequence Length Distribution"]
    avg_length = (df["Length"] * df["Count"]).sum() / df["Count"].sum()
    df = dfs["Per sequence quality scores"]
    avg_qual = (df["Quality"] * df["Count"]).sum() / df["Count"].sum()

    res["sample"].append(sample)
    res["read"].append(read)
    res["stage"].append(stage)
    res["n_reads"].append(dfs["Basic Statistics"].loc["Total Sequences", "Value"])
    # res["avg_length"].append(avg_length)
    # res["avg_quality"].append(avg_qual)


# bam metrics
print("Reading flagstats")
flagstat_suffix = ".tagged.sorted.flagstat.txt"
for f in tqdm(snakemake.input.flagstat):
    # str.rstrip() strips a *set of characters*, which would also eat the end
    # of the sample name; remove the exact suffix instead
    name = Path(f).name
    sample = name[: -len(flagstat_suffix)] if name.endswith(flagstat_suffix) else name
    flagstat = parse_flagstat(f)
    # add total reads from sample
    res["sample"].append(sample)
    res["stage"].append("mapped")
    res["read"].append("NA")
    res["n_reads"].append(flagstat["mapped"])

    # add deduplicated reads from sample; the stage label must be "pcr_dedup"
    # because that is the name the downstream cells select and plot
    res["sample"].append(sample)
    res["stage"].append("pcr_dedup")
    res["read"].append("NA")
    res["n_reads"].append(flagstat["mapped"] - flagstat["pcr_duplicates"])

res = pd.DataFrame(res)
# fastqc counts may arrive as strings; flagstat counts are ints
res["n_reads"] = pd.to_numeric(res["n_reads"])

## Inspect Number of reads per cell at fastq level

In [None]:
def summarize_nreads(df):
    """
    Reduce the rows of one (sample, stage) group to a single read count.

    For the fastq-level stages ("raw", "trimmed", "filtered") the counts of
    the rows whose read is "R1", "R2" or "merged" are summed. For any other
    stage the first distinct ``n_reads`` value is returned.
    """
    fastq_stages = ("raw", "trimmed", "filtered")
    counted_reads = ["R1", "merged", "R2"]

    # the stage of the group is taken from its first distinct value
    group_stage = df["stage"].unique()[0]
    if group_stage not in fastq_stages:
        return df["n_reads"].unique()[0]

    is_counted = df["read"].isin(counted_reads)
    return df.loc[is_counted, "n_reads"].sum()


# one read count per (sample, stage)
nreads = (
    res.groupby(["sample", "stage"]).apply(summarize_nreads).reset_index(name="nreads")
)

# remove bulk samples, i.e. samples whose name contains "gDNA".
# `in` / `not in` inside DataFrame.query() test membership, not substrings,
# so the filter is written with str.contains. The copy lets later cells
# assign columns on `nreads` without writing into a slice of another frame.
nreads = nreads[~nreads["sample"].str.contains("gDNA", regex=False)].copy()

In [None]:
g, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# options common to both panels: reads per sample on a log-scaled x axis,
# coloured by processing stage in pipeline order
shared_kwargs = dict(
    x="nreads",
    hue="stage",
    hue_order=["raw", "trimmed", "filtered", "mapped", "pcr_dedup"],
    log_scale=(True, False),
)

# left panel: histogram of reads per sample
sns.histplot(nreads, alpha=0.5, bins=70, ax=ax1, **shared_kwargs)
ax1.set_ylabel("# cells")

# right panel: cumulative count of samples
sns.ecdfplot(nreads, stat="count", ax=ax2, **shared_kwargs)
ax2.set_ylabel("# cells")

In [None]:
def stage_diff(df):
    """
    Compute the fraction of reads lost at each processing step of one sample.

    Parameters
    ----------
    df : rows of a single sample with columns "stage" and "nreads"; must
        contain the stages "raw", "trimmed", "filtered", "mapped" and
        "pcr_dedup".

    Returns
    -------
    pd.Series indexed by "trimmed", "filtered", "mapped", "pcr_dedup"; each
    value is (previous stage - this stage) / previous stage.

    Raises
    ------
    IndexError
        If one of the required stages is missing from ``df``.
    """

    def first_count(stage):
        # first distinct read count recorded for this stage
        return df.loc[df["stage"] == stage, "nreads"].unique()[0]

    raw = first_count("raw")
    trimmed = first_count("trimmed")
    filtered = first_count("filtered")
    mapped = first_count("mapped")
    pcr_dedup = first_count("pcr_dedup")

    return pd.Series(
        {
            "trimmed": (raw - trimmed) / raw,
            "filtered": (trimmed - filtered) / trimmed,
            "mapped": (filtered - mapped) / filtered,
            "pcr_dedup": (mapped - pcr_dedup) / mapped,
        }
    )


# per-sample loss fractions, reshaped to long format: one row per
# (sample, stage) with the fraction in the "value" column
sdiff = pd.melt(
    nreads.groupby("sample").apply(stage_diff).reset_index(),
    id_vars="sample",
    var_name="stage",
)

In [None]:
g, (ax1, ax2) = plt.subplots(1, 2, figsize=(11, 5), sharey=True)

# Define the order of the stages so both panels share the same y axis.
# CategoricalDtype is reached through the pandas namespace because it is
# not imported by name in this notebook.
cat_type = pd.CategoricalDtype(
    categories=["raw", "trimmed", "filtered", "mapped", "pcr_dedup"], ordered=True
)
sdiff["stage"] = sdiff["stage"].astype(cat_type)
nreads["stage"] = nreads["stage"].astype(cat_type)

# lineplot: one semi-transparent line per sample
for sample, df in tqdm(nreads.groupby("sample")):
    sns.lineplot(
        df,
        x="nreads",
        y="stage",
        alpha=0.3,
        ax=ax1,
        c="blue",
    )

ax1.set_xscale("log")

# boxenplot of the per-stage loss fractions
sns.boxenplot(sdiff, y="stage", x="value", ax=ax2)
ax2.set_xlabel("fraction of reads lost from previous step")

## Inspect sequencing saturation

In [None]:
# compute duplication rate for library saturation
def saturation(df):
    """
    Return the duplicated fraction of one sample's reads.

    Uses the first distinct "nreads" value of the "mapped" and "pcr_dedup"
    stages: (mapped - pcr_dedup) / mapped.
    """
    stages = df["stage"]
    counts = df["nreads"]

    n_mapped = counts[stages == "mapped"].unique()[0]
    n_dedup = counts[stages == "pcr_dedup"].unique()[0]

    n_duplicated = n_mapped - n_dedup
    return n_duplicated / n_mapped


# duplicated fraction per sample
sat = nreads.groupby(by="sample").apply(saturation)

# distribution of the duplicated fraction across samples
g = sns.histplot(data=sat, bins=70)
g.set_xlabel("Sequencing Saturation (Duplicated / Total Reads)")
g.set_ylabel("# cells")

TODO:
1. show contig stats
2. look at correlations with clinical features