In [13]:
import datetime as dt
import pathlib as pl

import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpat

# Font type 42 for both vector backends (PDF and PostScript), so that
# text in the exported figures is embedded as TrueType.
for _fonttype_key in ('pdf.fonttype', 'ps.fonttype'):
    mpl.rcParams[_fonttype_key] = 42

# Timestamp of this notebook run; optionally used in the figure title
# (TS_PLOT) and appended to output file names (TS_FILE).
datetime_now = dt.datetime.now()
ts = dt.datetime.strftime(datetime_now, "D%y%m%dT%H%M")

TS_PLOT, TS_FILE = False, True

# Master switches: build the figures at all / write them to disk.
create_plots = False
save_plots = False

desc="""
This notebook uses the output of the rule
--- 63_eval_read_align.smk::combine_read_depth_stats
as input, and produces a supplementary figure
summarizing long-read coverage in the de novo
assemblies for autosomes, X and Y at different
MAPQ thresholds.
"""

print(desc)

# All locations are resolved strictly, i.e., a missing directory
# raises immediately instead of failing later during I/O.
repo_dir = pl.Path(
    '/home/ebertp/work/code/marschall-lab/project-male-assembly'
).resolve(strict=True)
exec_dir = pl.Path('.').resolve(strict=True)
wd_dir = pl.Path(
    '/home/ebertp/work/projects/sig_chry/paper'
).resolve(strict=True)
out_dir = pl.Path(
    '/home/ebertp/work/projects/sig_chry/paper/output/figures'
).resolve(strict=True)

for _caption, _directory in (
    ('Repository directory: ', repo_dir),
    ('Execution directory: ', exec_dir),
    ('Working directory: ', wd_dir),
    ('Output directory: ', out_dir),
):
    print(_caption, _directory)
print('=================================')

# Read-depth statistics table (tab-separated, first row is the header).
# The five descriptor columns become a MultiIndex; the remaining
# columns (at least "sample" and "value" are used below) stay as data.
cov_stats_file = wd_dir.joinpath(
    "stats/read_cov",
    "ALL-SAMPLES.READS_aln-to_HIFIRW.ONTUL.na.wg.cov-stats.tsv"
)
table = pd.read_csv(
    cov_stats_file,
    sep="\t",
    header=0,
    index_col=["location", "statistic", "reads", "min_mapq", "input_set"]
)

# Samples excluded from the figure and the summary table.
drop_samples = [
    "HG02666",
    "NA18989",
    "HG01457",
    "NA19384",
    "NA24385"
]

is_dropped = table["sample"].isin(drop_samples)
table = table.loc[~is_dropped, :].copy()

# Per-figure accumulators; filled while iterating over the genomic
# locations and emptied again after each read set has been plotted.
box_plots = []      # one series of coverage values per violin
box_positions = []  # x position of each violin

labels = []         # one tick label per location group
label_pos = []      # x position of each tick label
quantiles = []      # quantile markers per violin

box_pos = 1

# Tick label for each (location, input set) combination.
group_labels = {
    ("auto", "wg"): "Other\n(not X or Y)",
    ("chrX", "wg"): "chrX",
    ("chrY", "wg"): "chrY\n(full)",
    ("chrY", "noYHET"): "chrY\n(no HET)",
}

if create_plots:
    # One figure per read set.
    for reads in ["HIFIRW", "ONTUL"]:
        for loc, input_set in [("auto", "wg"), ("chrX", "wg"), ("chrY", "wg"), ("chrY", "noYHET")]:
            # Leave a gap on the x axis before every group but the first.
            if box_pos > 1:
                box_pos += 2
            # Each group is a pair of violins: MAPQ >= 0, then MAPQ >= 10.
            for mapq in [0, 10]:
                values = table.xs(
                    (loc, input_set, mapq, reads, "median_cov"),
                    level=["location", "input_set", "min_mapq", "reads", "statistic"]
                )
                box_plots.append(values["value"])
                box_positions.append(box_pos)
                if mapq < 10:
                    # Add the group label once, centered between the
                    # two violins of the pair.
                    label_pos.append(box_pos + 0.5)
                    labels.append(group_labels[(loc, input_set)])
                box_pos += 1
                # Mark the 1st and 99th percentile on each violin.
                quantiles.append([0.01, 0.99])

        fig, ax = plt.subplots(figsize=(10, 8))
        violins = ax.violinplot(
            box_plots,
            positions=box_positions,
            showmedians=True,
            showextrema=False,
            quantiles=quantiles
        )
        # Violins were appended in (MAPQ 0, MAPQ 10) pairs, hence odd
        # indices are the MAPQ >= 10 violins; colors match the legend.
        for idx, violin in enumerate(violins["bodies"]):
            if idx % 2 == 1:
                violin.set_facecolor("cornflowerblue")
            else:
                violin.set_facecolor("salmon")
        violins["cmedians"].set_color("grey")
        violins["cquantiles"].set_color("grey")

        ax.set_xticks(label_pos, labels, fontsize=12)
        readset = "HiFi" if reads == "HIFIRW" else "ONT-UL"
        ax.set_ylabel(f"Median read depth per assembled contig\n({readset} reads)", fontsize=14)
        ax.set_xlabel("Genomic location", fontsize=14)
        # Fixed y range per read set; values beyond it are cut off.
        if reads == "HIFIRW":
            ax.set_ylim(-1, 50)
        else:
            ax.set_ylim(-1, 70)

        custom_legend = [
            mpat.Patch(facecolor="salmon", label="MAPQ >= 0", edgecolor="white"),
            mpat.Patch(facecolor="cornflowerblue", label="MAPQ >= 10", edgecolor="white"),
        ]
        ax.legend(handles=custom_legend, fontsize=14)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        if TS_PLOT:
            # The run timestamp is stored in `ts`; the previously used
            # name `timestamp` is not defined and raised a NameError.
            ax.set_title(ts, pad=40, fontsize=10)

        if save_plots:
            out_base_name = f"FigSX_{readset}-read-depth_auto-X-Y"
            if TS_FILE:
                out_base_name += f".{ts}"
            plt.savefig(
                out_dir / pl.Path(out_base_name + '.pdf'),
                bbox_inches='tight',
                facecolor="w", transparent=None,
            )
            plt.savefig(
                out_dir / pl.Path(out_base_name + '.png'),
                bbox_inches='tight',
                facecolor="w", transparent=None,
                dpi=150
            )

        # reset everything for the next read set
        labels = []
        label_pos = []
        box_plots = []
        box_pos = 1
        box_positions = []
        quantiles = []
    
# reduce table to a few data points
# for inclusion in the supplement

# Turn the index levels back into regular columns for filtering.
table.reset_index(drop=False, inplace=True)

# Keep only median coverage values at a MAPQ threshold above zero,
# and skip rows whose location is "wg".
selector = (table["min_mapq"] > 0) & (table["location"] != "wg") & (table["statistic"] == "median_cov")

table = table.loc[selector, :].copy()

# Combined key, e.g., "chrY_noYHET"; becomes the column header below.
table["loc_spec"] = table["location"] + "_" + table["input_set"]

# One wide table (samples x locations) per read set.
per_readset_tables = []

for readset in ["HIFIRW", "ONTUL"]:
    # Several rows for the same sample and location (presumably one
    # per contig; verify against the input table) are collapsed to
    # their median.
    reduced = table.loc[table["reads"] == readset, :].pivot_table(
        index="sample",
        columns="loc_spec",
        values="value",
        aggfunc="median"
    )
    reduced["reads"] = "HiFi" if readset == "HIFIRW" else "ONT-UL"
    per_readset_tables.append(reduced)

joined_table = pd.concat(per_readset_tables, axis=0, ignore_index=False)
joined_table.reset_index(drop=False, inplace=True)
joined_table.sort_values(["sample", "reads"], inplace=True)

joined_table = joined_table[
    ["sample", "reads", "auto_wg", "chrX_wg", "chrY_wg", "chrY_noYHET"]
]

joined_table.columns = ["sample", "reads", "autosomes", "chrX", "chrY", "chrY_noHET"]

table_out_name = "TableSX_assm-read-cov.median.tsv"
table_out_path = out_dir.parent.joinpath("tables", table_out_name)
# The file handle gets its own name so that `table` keeps referring to
# the (filtered) DataFrame after this block has run.
with open(table_out_path, "w") as table_file:
    joined_table.to_csv(table_file, sep="\t", header=True, index=False)
    # Provenance footer: run timestamp and the rule that produced the input.
    table_file.write(f"\n## {ts}\n## 63_eval_read_align.smk::combine_read_depth_stats\n")




This notebook uses the output of the rule
--- 63_eval_read_align.smk::combine_read_depth_stats
as input, and produces a supplementary figure
summarizing long-read coverage in the de novo
assemblies for autosomes, X and Y at different
MAPQ thresholds.

Repository directory:  /home/ebertp/work/code/marschall-lab/project-male-assembly
Execution directory:  /home/ebertp/work/code/marschall-lab/project-male-assembly/notebooks/plotting/readcov
Working directory:  /home/ebertp/work/projects/sig_chry/paper
Output directory:  /home/ebertp/work/projects/sig_chry/paper/output/figures
