In [None]:
from pathlib import Path
from collections import defaultdict
from zipfile import ZipFile
from tqdm import tqdm
from itertools import product
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pysam
from myutils.rmsk import read_rmsk

## Sequencing depth before/after trimming and filtering

Read-pair counts are parsed from the cutadapt logs (`*qc.txt`) written at the trimming and filtering stages.

In [None]:
# Gather read-pair counts per sample from the per-stage QC logs.
p = Path("/iblm/logglun02/mcuoco/workflows/sz_slavseq/results/fastq")

res = defaultdict(dict)
for qc_log in p.rglob("*/*qc.txt"):
    # bulk gDNA samples are left out
    is_bulk = "gDNA" in qc_log.name or "CommonBrain" in qc_log.parent.name
    if is_bulk:
        continue

    # file name is "<sample>.<stage>...."; the parent directory is the donor
    name_parts = qc_log.name.split(".")
    sample, stage = name_parts[0], name_parts[1]
    log_lines = qc_log.read_text().splitlines()

    counts = res[sample]
    counts["donor_id"] = qc_log.parent.name

    # first line mentioning "Total read pairs"; its last token is the count
    total_line = [ln for ln in log_lines if "Total read pairs" in ln][0]
    if stage == "trimmed":
        # pairs entering the trimming step are stored as the raw count
        counts["Raw"] = int(total_line.split()[-1].replace(",", ""))
    elif stage == "filtered":
        # pairs entering the filtering step are what trimming produced
        counts["After trimming"] = int(total_line.split()[-1].replace(",", ""))
        # first line mentioning "Pairs written"; the count is its fifth token
        written_line = [ln for ln in log_lines if "Pairs written" in ln][0]
        counts["After filtering"] = int(written_line.split()[4].replace(",", ""))

# one row per sample, one column per recorded field
res = pd.DataFrame(res).T

In [None]:
# Histogram of read pairs per cell at each processing stage.
stages = ["Raw", "After trimming", "After filtering"]

g = sns.histplot(
    data=res.melt(id_vars="donor_id", value_vars=stages),
    x="value",
    hue="variable",
    # Pin the hue order: the column order of `res` depends on which log file
    # was visited first, so it must not decide the legend/colour order.
    hue_order=stages,
    element="step",
)
g.set(xlabel="# Read pairs", ylabel="# Cells")
g.legend_.set_title(None)
sns.despine()

# Append the mean to every legend entry. The stage is taken from the entry's
# own text rather than from its position, so a mean can never be attached to
# the wrong stage.
for t in g.legend_.texts:
    stage_name = t.get_text()
    m = res[stage_name].mean()
    t.set_text(f"{stage_name} (mean: {m/1e6:.1f}e6)")

# save as svg
plt.savefig("npairs_after_trimming.svg", bbox_inches="tight")

In [None]:
# Empirical CDF of read pairs per cell at each processing stage.
stages = ["Raw", "After trimming", "After filtering"]

g = sns.ecdfplot(
    data=res.melt(id_vars="donor_id", value_vars=stages),
    x="value",
    hue="variable",
    # Pin the hue order: the column order of `res` depends on which log file
    # was visited first, so it must not decide the legend/colour order.
    hue_order=stages,
)
g.set(xlabel="# Read pairs")
g.legend_.set_title(None)  # remove legend title
sns.despine()

# Append the mean to every legend entry, looking the stage up from the
# entry's own text instead of relying on positional order.
for t in g.legend_.texts:
    stage_name = t.get_text()
    m = res[stage_name].mean()
    t.set_text(f"{stage_name} (mean: {m/1e6:.1f}e6)")

# add dotted line at 1M reads
plt.axvline(1e6, ls="--", color="black")

# save as svg; uses its own file name so the histogram written to
# "npairs_after_trimming.svg" is not overwritten
plt.savefig("npairs_after_trimming_ecdf.svg", bbox_inches="tight")

## Read length after trimming

Read-length distributions are parsed from the FastQC archives (`fastqc_data.txt` inside each `*fastqc.zip`).

In [None]:
# Load the sample sheet and reduce it to a TISSUE_ID -> SEQUENCING lookup.
metadata = pd.read_csv(
    "/iblm/logglun02/mcuoco/workflows/sz_slavseq/config/slavseq_metadata.tsv", sep="\t"
)

# rewrite "D0" as "D" and "H0" as "H" inside the tissue IDs
for padded, short in (("D0", "D"), ("H0", "H")):
    metadata["TISSUE_ID"] = metadata["TISSUE_ID"].str.replace(padded, short)

# from here on `metadata` is a plain dict keyed by tissue ID
metadata = metadata.set_index("TISSUE_ID")["SEQUENCING"].to_dict()

In [None]:
# Collect the read-length distribution that FastQC reports for every file.
p = Path("/iblm/logglun02/mcuoco/workflows/sz_slavseq/results/qc/fastqc")

res = []
for f in p.rglob("*/*fastqc.zip"):
    # skip bulk gDNA samples
    if ("gDNA" in f.name) or ("CommonBrain" in f.parent.name):
        continue

    # file name is "<sample>_<read>.<stage>...."; the parent directory is the donor
    prefix, stage = f.name.split(".")[:2]
    read = prefix[-2:]
    # Remove only a literal "_R1"/"_R2" suffix. str.rstrip("_R[12]") treats its
    # argument as a set of characters and would also eat trailing "_", "R",
    # "1", "2", "[" or "]" that belong to the sample name ("A1_R1" -> "A").
    sample = prefix[:-3] if prefix.endswith(("_R1", "_R2")) else prefix
    donor = f.parent.name
    tissue_id = "USH" if "USH" in sample.upper() else "USD"
    instrument = metadata[tissue_id + donor]

    # get length distribution: the text between the module header and the
    # next ">>END_MODULE" marker in fastqc_data.txt
    with ZipFile(f) as z:
        target = [i for i in z.namelist() if "fastqc_data.txt" in i][0]
        m = z.read(target).decode("utf-8")
    sqlen_ind = m.find(">>Sequence Length Distribution")
    endmod_ind = m[sqlen_ind:].find(">>END_MODULE")
    lines = m[sqlen_ind:][:endmod_ind].splitlines()[1:]  # drop the ">>" header line
    lines = [l.split("\t") for l in lines]

    # save to DataFrame; the first remaining row is skipped as well
    # (presumably the "#Length<TAB>Count" column header - confirm)
    df = pd.DataFrame(lines, columns=["length", "count"])[1:].copy()
    # for "a-b" entries keep the part before "-"; for "x.y" counts keep the part before "."
    df["length"] = df["length"].apply(lambda x: x.split("-")[0]).astype(int)
    df["count"] = df["count"].apply(lambda x: x.split(".")[0]).astype(int)
    df = df.set_index("length")
    df["stage"] = stage
    df["read"] = read
    df["instrument"] = instrument
    df["sample"] = sample
    res.append(df)


res = pd.concat(res)

# mean of the length values listed for each sample's raw-stage files
sample_lengths = (
    res[res["stage"] == "raw"]
    .reset_index()
    .groupby("sample")["length"]
    .mean()
    .to_dict()
)
res["raw_length"] = res["sample"].map(sample_lengths).astype(int)

In [None]:
# Length distribution of trimmed reads, one panel per read, coloured by raw read length.
trimmed = res[res["stage"] == "trimmed"]
g = sns.relplot(
    data=trimmed,
    kind="line",
    x="length",
    y="count",
    hue="raw_length",
    col="read",
    col_order=["R1", "R2"],
    palette="Set1",
)
g = g.set(xlabel="Read length after trimming (bp)", ylabel="# Reads")

# panel titles, in the same order as col_order
axes = g.axes.flatten()
for ax, panel_title in zip(axes, ["Read 1", "Read 2"]):
    ax.set_title(panel_title)

# legend goes above the panels, with its own title
sns.move_legend(g, "upper center", bbox_to_anchor=(0.5, 1.1), ncol=3)
g.legend.set_title("Raw read length (bp)")

# save as svg
plt.savefig("trimmed_read_length.svg", bbox_inches="tight")

## Mapping statistics

In [None]:
# Per-sample alignment counts taken from the genome flagstat reports.
flagstat_dir = Path("/iblm/logglun02/mcuoco/workflows/sz_slavseq/results/qc/flagstat")

res = defaultdict(dict)
for f in flagstat_dir.rglob("*/*.genome.flagstat"):
    donor = f.parent.name
    sample = f.name.split(".")[0]
    report = f.read_text().splitlines()

    # samples whose first reported count is below one million are dropped
    if int(report[0].split()[0]) < 1e6:
        continue

    # NOTE(review): rows 7 and 5 are assumed to be the "primary mapped" and
    # "primary duplicates" lines of the flagstat output - confirm for the
    # samtools version that produced these files.
    primary_mapped = int(report[7].split()[0])
    primary_duplicates = int(report[5].split()[0])

    res[sample]["donor_id"] = donor
    res[sample]["Primary mapped"] = primary_mapped
    res[sample]["Primary mapped non-duplicated"] = primary_mapped - primary_duplicates

# one row per sample
res = pd.DataFrame(res).T

In [None]:
# Histogram of alignment counts per cell.
aln_long = res.melt(id_vars="donor_id")
g = sns.histplot(data=aln_long, x="value", hue="variable")
g.set(xlabel="# Alignments", ylabel="# Cells")
g.legend_.set_title(None)
sns.despine()

# extend each legend entry with the mean of its column
legend_names = ["Primary mapped", "Primary mapped non-duplicated"]
for text, name in zip(g.legend_.texts, legend_names):
    mean_millions = res[name].mean() / 1e6
    text.set_text(f"{name} (mean: {mean_millions:.1f}e6)")

# save as svg
plt.savefig("nalignments.svg", bbox_inches="tight")

In [None]:
# Empirical CDF of alignment counts per cell on a log-scaled x axis.
aln_long = res.melt(id_vars="donor_id")
g = sns.ecdfplot(data=aln_long, x="value", hue="variable")
g.set(xlabel="# Alignments", xscale="log")
g.legend_.set_title(None)  # remove legend title
sns.despine()

## Coverage

In [None]:
# Load the hg38 RepeatMasker output from UCSC.
RMSK_URL = "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.out.gz"
rmsk = read_rmsk(RMSK_URL)

# keep the rows whose repName is L1HS, L1PA2 or L1PA3
l1_subfamilies = ["L1HS", "L1PA2", "L1PA3"]
l1hs = rmsk[rmsk["repName"].isin(l1_subfamilies)]

In [None]:
# Per-locus L1 coverage for every sample, normalised to reads per million.
p = "/iblm/logglun02/mcuoco/workflows/sz_slavseq/results"
samples = pd.read_csv(
    "/iblm/logglun02/mcuoco/workflows/sz_slavseq/config/7donor_samples.tsv", sep="\t"
)

LOCUS_COLS = ["Chromosome", "Start", "End", "label"]


def _read_locus_coverage(path, coverage_col, label=None):
    """Read one headerless, tab-separated coverage file into a Series.

    Parameters
    ----------
    path : str
        File to read. Columns 0-2 are used as Chromosome/Start/End.
    coverage_col : int
        Zero-based position of the coverage column.
    label : str, optional
        When given, every row gets this label; otherwise the label is read
        from column 3 of the file.

    Returns
    -------
    pandas.Series
        Coverage values indexed by (Chromosome, Start, End, label).
    """
    if label is None:
        cov = pd.read_csv(
            path,
            sep="\t",
            header=None,
            usecols=[0, 1, 2, 3, coverage_col],
            names=LOCUS_COLS + ["coverage"],
        )
    else:
        cov = pd.read_csv(
            path,
            sep="\t",
            header=None,
            usecols=[0, 1, 2, coverage_col],
            names=["Chromosome", "Start", "End", "coverage"],
        )
        cov["label"] = label
    return cov.set_index(LOCUS_COLS)["coverage"]


donors = ["CommonBrain", 1, 3, 4, 5, 8, 27]
res = {}
for d in donors:
    donor_samples = samples.loc[samples["donor_id"] == str(d), "sample_id"].tolist()
    for s in tqdm(donor_samples):
        # get sample total reads
        total_aln = int(
            pysam.view(f"{p}/align/{d}/{s}.tagged.sorted.bam", "-c").rstrip()
        )
        cov_prefix = f"{p}/qc/l1_coverage/{d}/{s}"

        # Sum read 1 and read 2 coverage per locus. Named `rmsk_cov` so the
        # RepeatMasker table held in `rmsk` is not overwritten.
        rmsk_cov = pd.concat(
            [_read_locus_coverage(f"{cov_prefix}.rmsk.{r}.bed", 6) for r in ("r1", "r2")],
            axis=1,
        ).sum(axis=1)
        # Read 2 now comes from the ".r2.bed" file; the ".r1.bed" file used
        # to be read twice, which doubled read 1 and ignored read 2.
        knrgl_cov = pd.concat(
            [
                _read_locus_coverage(f"{cov_prefix}.knrgl.{r}.bed", 44, label="KNRGL")
                for r in ("r1", "r2")
            ],
            axis=1,
        ).sum(axis=1)

        comb = pd.concat([rmsk_cov, knrgl_cov], axis=0)
        res[s] = comb / (total_aln / 1e6)  # convert to RPM
    # only the first entry of `donors` is processed; remove to load every donor
    break

In [None]:
# Wide table (one row per locus, one column per sample) plus a long version for plotting.
PRIMARY_CHROMS = [f"chr{i}" for i in range(1, 23)] + ["chrX", "chrY"]
# chr1..chr22 -> 1..22, chrX -> 23, chrY -> 24
chrom_rank = {chrom: rank for rank, chrom in enumerate(PRIMARY_CHROMS, start=1)}

df = (
    pd.DataFrame(res)
    # name the index levels explicitly so reset_index yields the expected columns
    .rename_axis(index=["Chromosome", "Start", "End", "label"])
    .reset_index()
)

# keep primary assembly; range(1, 22) used to stop at chr21 and drop chr22.
# .copy() so the column assignments below act on an independent frame.
df = df[df["Chromosome"].isin(PRIMARY_CHROMS)].copy()
df["chr_num"] = df["Chromosome"].map(chrom_rank)
df = df.sort_values(["chr_num", "Start"])
df["ind"] = range(len(df))

# melt dataframe; chr_num is an identifier, not a sample, so it must be in
# id_vars - otherwise it is melted into rows with sample_id == "chr_num"
plot_df = df.melt(
    id_vars=["Chromosome", "Start", "End", "label", "chr_num", "ind"],
    value_name="RPM",
    var_name="sample_id",
)

In [None]:
# Two stacked panels sharing the x axis: chr1 coverage for L1HS and L1PA2.
fig, axs = plt.subplots(2, 1, figsize=(10, 5), sharex=True)

# restrict the long table to chromosome 1
plot_df = plot_df[plot_df["Chromosome"] == "chr1"]

for ax, subfamily in zip(axs, ["L1HS", "L1PA2"]):
    sns.despine()
    panel = sns.lineplot(
        data=plot_df[plot_df["label"] == subfamily],
        x="Start",
        y="RPM",
        ax=ax,
        errorbar="sd",
    )
    panel.set(title=subfamily)
    panel.set(xlabel="Chromosome 1", ylabel="reads-per-million")

# save as png
plt.savefig("l1hs_l1pa2_chr1_commonbrain.png", bbox_inches="tight")

In [None]:
# Clustered heatmap of between-sample Pearson correlation, one per subfamily.
locus_cols = ["Chromosome", "Start", "End", "label", "chr_num", "ind"]

for label in ["L1HS", "L1PA2"]:
    # after moving the locus columns into the index only the per-sample
    # columns remain; correlate them on a log2 scale
    per_sample = df[df["label"] == label].set_index(locus_cols)
    sample_corr = per_sample.transform("log2").corr("pearson")
    g = sns.clustermap(sample_corr)

    # no tick marks or tick labels on the heatmap
    heatmap = g.ax_heatmap
    heatmap.set_xticklabels([])
    heatmap.set_xticks([])
    heatmap.set_yticklabels([])
    heatmap.set_yticks([])

    # dendrograms are hidden
    for dendrogram_ax in (g.ax_row_dendrogram, g.ax_col_dendrogram):
        dendrogram_ax.set_visible(False)

    # titles for the heatmap and for the colour bar
    heatmap.set_title(label)
    g.cax.set_title("Pearson correlation")
    plt.show()

In [None]:
# Index the coverage files of each donor by annotation, window type and read.
donors = ["CommonBrain", 1, 3, 4, 5, 8, 27]
# donors = ["CommonBrain"]
donor_files = {}
for d in donors:
    donor_files[d] = defaultdict(list)
    p = Path(f"/iblm/netapp/data4/mcuoco/sz_slavseq/results/qc/l1_coverage/{d}")
    for r, a in product(["r1", "r2"], ["xtea", "rmsk"]):
        # dictionary key -> glob pattern, searched in this order
        patterns = {
            f"{a}_{r}": f"*{a}.{r}.bed",
            f"{a}_1kb_3end_{r}": f"*{a}_1kb_3end.{r}.txt",
            f"{a}_20kb_{r}": f"*{a}_20kb.{r}.txt",
        }
        for key, pattern in patterns.items():
            # a key is only created once a matching file is found
            for f in p.rglob(pattern):
                donor_files[d][key].append(str(f))

In [None]:
# Display the donor -> {file group -> list of paths} mapping built in the previous cell.
donor_files

In [None]:
# Complementary ECDF of read 1 coverage (1 kb 3' end windows) for donor 1:
# one panel per label, one curve per sample.
fig, axes = plt.subplots(nrows=1, ncols=6, figsize=(18, 3), sharex=True)

subfams = ["KNRGL", "L1HS", "L1PA2", "L1PA3", "L1PA4", "L1PA5"]

# rglob returns files in arbitrary order; sort both lists so the i-th
# RepeatMasker file and the i-th KNRGL file are paired consistently.
rmsk_files = sorted(donor_files[1]["rmsk_1kb_3end_r1"])
knrgl_files = sorted(donor_files[1]["xtea_1kb_3end_r1"])

for r, k in zip(rmsk_files, knrgl_files):
    # RepeatMasker loci: label from column 3, coverage from column 6.
    # This table is now read from `r`; previously `k` was read twice and the
    # concat below used a stale `rmsk` left over from another cell.
    rmsk_cov = pd.read_csv(
        r, sep="\t", header=None, usecols=[3, 6], names=["label", "coverage"]
    )
    # KNRGL loci: coverage from column 6, all labelled "KNRGL"
    knrgl = pd.read_csv(k, sep="\t", header=None, usecols=[6], names=["coverage"])
    knrgl["label"] = "KNRGL"
    df = pd.concat([knrgl, rmsk_cov])
    df["coverage"] += 1  # shift by one so zero coverage fits on the log axis
    for j, l in enumerate(subfams):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            sns.ecdfplot(
                data=df[df.label == l],
                x="coverage",
                stat="count",
                alpha=0.3,
                color=sns.color_palette()[0],
                complementary=True,
                hue=None,
                ax=axes[j],
            ).set(xscale="log", ylabel="", xlabel="", xlim=(1, 4000))
            sns.despine()

for ax, col in zip(axes, subfams):
    ax.set_title(col)
    ax.set_xlabel("Read 1 Coverage")
    ax.set_ylabel("# Loci", rotation=90, size="large")

fig.tight_layout()

# add space on left
fig.subplots_adjust(left=0.1)

# save as svg
# plt.savefig("l1_coverage_commonbrain.svg", bbox_inches="tight")
# save as png
# plt.savefig("l1_coverage_commonbrain.png", bbox_inches="tight")

In [None]:
# Preview the first rows of `df` as left by the previous cell (the table of the last file pair plotted).
df.head()