# Test SLAV-calling model on 7 individuals

In [1]:
# basic
import os

print(f"Number of CPUs in this system: {os.cpu_count()}")

from pathlib import Path
from collections import defaultdict
from tqdm import tqdm

# data
import numpy as np

print(f"numpy: {np.__version__}")

import pandas as pd

print(f"pandas: {pd.__version__}")
import pyranges as pr

import pyarrow
import pyarrow.parquet as pq

print(f"pyarrow: {pyarrow.__version__}")

# ML
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.metrics import PrecisionRecallDisplay, RocCurveDisplay
from flaml import AutoML

# plotting
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# custom
from scripts.fit import Model

Number of CPUs in this system: 48
numpy: 1.23.5
pandas: 1.5.3
pyarrow: 10.0.1


In [5]:
# Load every donor ID listed in the workflow config, then drop the
# "CommonBrain" entry (list.remove raises ValueError if it is absent).
donors = pd.read_csv(
    "/iblm/logglun02/mcuoco/workflows/sz_slavseq/config/all_donors.tsv", sep="\t"
)["donor_id"].to_list()
donors.remove("CommonBrain")

In [None]:
# Read each donor's labelled-window parquet file and stack the tables row-wise.
window_paths = [f"../results/model/labelled_windows/{d}.pqt" for d in donors]
data = pd.concat([pq.read_table(path).to_pandas() for path in window_paths])

## Examine class distributions

In [None]:
# Grid of plots: one row per donor, one column per annotation key. Each panel
# overlays one complementary ECDF of the "nreads" column per cell.
# NOTE(review): `anno` is not defined in any cell shown in this notebook; it
# must already exist in the kernel — confirm where it comes from.
anno_labels = list(anno.keys())
fig, axes = plt.subplots(
    len(donors),
    len(anno_labels),
    figsize=(4 * len(anno_labels), 4 * len(donors)),
    squeeze=False,  # always a 2-D array, so axes[i, j] also works for a single row/column
)
fig.subplots_adjust(hspace=0.4, wspace=0.4)

line_color = sns.color_palette()[0]  # every curve uses the first palette colour
for i, d in enumerate(data["donor_id"].unique()):
    for j, anno_label in enumerate(anno_labels):
        # windows of this donor that are flagged for this annotation
        ddf = data.loc[(data["donor_id"] == d) & (data[anno_label] == True), :]
        for c in ddf["cell_id"].unique():
            sns.ecdfplot(
                data=ddf[ddf["cell_id"] == c],
                x="nreads",
                ax=axes[i, j],
                legend=False,
                stat="count",
                complementary=True,
                alpha=0.3,
                c=line_color,
            ).set(
                xscale="log",
                yscale="log",
                title=f"{d}: {anno_label}",
                xlabel="# Read 1",
            )

## Remove RMSK and blacklist classes

In [None]:
# NOTE(review): `df` is not created in any cell shown above (the load cell
# builds `data`) — confirm how `df` is derived before running top-to-bottom.
# Blacklist filtering is currently disabled:
# df = df.loc[~df["label"].isin(["blacklist"]), :]
# Keep only windows without reference reads, then replace missing values with 0.
no_ref_reads = df["ref_reads"] == 0
df = df.loc[no_ref_reads, :].fillna(0)

In [None]:
# Collapse the labels to two classes: "knrgl_1kb_3end" is folded into "knrgl",
# and every other label becomes "unknown".
merged_labels = df["label"].replace({"knrgl_1kb_3end": "knrgl"})
df["label"] = merged_labels.apply(
    lambda lab: lab if lab == "knrgl" else "unknown"
)

In [None]:
# Complementary ECDFs of the "nr1" column for donor "1": one set of curves per
# cell, split by label, all drawn on a single log-log panel.
fig, axes = plt.subplots(1, 1, figsize=(5, 5))
ddf = df[df["donor_id"] == "1"]

ecdf_options = dict(
    x="nr1",
    hue="label",
    ax=axes,
    legend=False,
    stat="count",
    complementary=True,
    alpha=0.3,
)
for cell_id in ddf["cell_id"].unique():
    cell_windows = ddf[ddf["cell_id"] == cell_id]
    cell_ax = sns.ecdfplot(data=cell_windows, **ecdf_options)
    cell_ax.set(xscale="log", yscale="log", xlabel="# Read 1")

## Build model

In [None]:
# Select model features: every column whose name contains at least one of the
# key substrings. Each column is listed exactly once, in column order, even
# when its name matches several keys, so no duplicate feature columns arise.
keys = ["_q", "frac", "gini", "bias"]
features = [c for c in df.columns if any(k in c for k in keys)]

# encode labels as integers: knrgl -> 1, unknown -> 0
df["label_encoded"] = df["label"].map({"knrgl": 1, "unknown": 0})

features

In [None]:
# FLAML settings shared by every AutoML fit in this notebook.
# NOTE: Don't try logistic regression, it's too slow, doesn't converge, and doesn't perform well
flaml_settings = {
    "task": "classification",
    "n_jobs": 16,
    "estimator_list": ["xgboost", "rf"],
    "early_stop": True,
    "skip_transform": True,  # don't preprocess data
    "auto_augment": False,  # don't augment rare classes
    # "starting_points": "static",  # use data-independent hyperparameter starting points
    "log_training_metric": True,
}

In [None]:
# Load the rmsk_1kb_3end BED file; as_df=True asks pyranges for a DataFrame.
rmsk_bed_path = "/iblm/netapp/data4/mcuoco/sz_slavseq/resources/rmsk_1kb_3end.bed"
rmsk = pr.read_bed(rmsk_bed_path, as_df=True)

# define evaluation function
def precision_recall(pred: pd.DataFrame, knrgl: pd.DataFrame):
    """
    Calculate precision and recall of window-level calls against insertions.

    Counting scheme (interval overlap via pyranges):
      tp: insertions overlapped by at least one predicted-positive window
      fn: insertions covered by some window but by no predicted-positive window
      fp: predicted-positive windows that overlap no insertion

    pred: genomic windows with columns Chromosome, Start, End and a binary
        column 'pred' that contains both 0 and 1
    knrgl: insertion annotations with columns Chromosome, Start, End

    The module-level `rmsk` table is only checked for the same three columns;
    it does not enter the counts.

    Returns (precision, recall). A value is NaN when its denominator is zero.
    """
    assert "pred" in pred.columns, "pred must have column 'pred'"
    assert set(pred.pred.unique()) == set([0, 1]), "pred must be binary"

    for col in ["Chromosome", "Start", "End"]:
        assert col in pred.columns, f"pred must have column {col}"
        assert col in rmsk.columns, f"rmsk must have column {col}"
        assert col in knrgl.columns, f"knrgl must have column {col}"

    # only consider insertions that have windows
    insertions = pr.PyRanges(knrgl).overlap(pr.PyRanges(pred))

    # windows the classifier called positive
    y_pos = pr.PyRanges(pred.loc[pred["pred"] == 1, :])

    # how many insertions were detected?
    tp = len(insertions.overlap(y_pos))

    # how many insertions were missed (no positive window on them)?
    fn = len(insertions.overlap(y_pos, invert=True))

    # how many positive windows hit no insertion?
    fp = len(y_pos.overlap(insertions, invert=True))

    # guard the ratios so an empty denominator yields NaN instead of raising
    precision = tp / (tp + fp) if (tp + fp) else float("nan")
    recall = tp / (tp + fn) if (tp + fn) else float("nan")

    return precision, recall

### Use cross-validation

In [None]:
def train_model(
    tune_data,
    eval_data,
    *,
    time_budget=600,
    n_splits=5,
    log_file_name="flaml_cv.log",
):
    """
    Tune a FLAML AutoML classifier with chromosome-grouped cross-validation.

    tune_data: windows used for tuning; must contain the columns listed in the
        module-level `features`, plus "label_encoded" and "Chromosome"
    eval_data: held-out windows; only its "Chromosome" column is read, to
        check that it shares no chromosome with tune_data
    time_budget: value passed to AutoML.fit as time_budget
    n_splits: number of StratifiedGroupKFold folds
    log_file_name: file FLAML writes its search log to

    Remaining fit options come from the module-level `flaml_settings`.
    Returns the fitted AutoML object.
    Raises AssertionError if tune_data and eval_data share a chromosome.
    """
    # tuning and evaluation data must not share any chromosome
    shared_chromosomes = np.intersect1d(
        tune_data["Chromosome"].unique(), eval_data["Chromosome"].unique()
    )
    assert (
        shared_chromosomes.size == 0
    ), "Train and test chromosomes must be mutually exclusive"

    # folds are stratified on the label and grouped by chromosome (see groups=)
    sgkf = StratifiedGroupKFold(n_splits=n_splits)

    # fit with cross-validation on the tuning data, optimising F1
    clf = AutoML()
    clf.fit(
        X_train=tune_data[features],
        y_train=tune_data["label_encoded"],
        metric="f1",
        time_budget=time_budget,
        eval_method="cv",
        split_type=sgkf,
        groups=tune_data["Chromosome"],
        log_file_name=log_file_name,
        **flaml_settings,
    )

    return clf

In [None]:
# Fit one model per minimum-read threshold; chr1 is held out of tuning.
res = {"min_reads": [], "model": []}
for mr in [100, 200]:
    eval_chr = "chr1"
    enough_reads = df["nr1"] >= mr
    on_eval_chr = df["Chromosome"] == eval_chr
    off_eval_chr = df["Chromosome"] != eval_chr

    # held-out chromosome vs. everything else, both filtered by read count
    eval_data = df.loc[on_eval_chr & enough_reads, :].reset_index()
    tune_data = df.loc[off_eval_chr & enough_reads, :].reset_index()

    clf = train_model(tune_data, eval_data)
    res["min_reads"].append(mr)
    res["model"].append(clf)
    # stop after the first threshold: only mr=100 is actually trained
    break

In [None]:
from flaml.automl.data import get_output_from_log

# Read the FLAML search log and scatter validation and training loss over time.
log_output = get_output_from_log("flaml_cv.log", time_budget=600)
time, best_valid_loss, valid_loss, config, metric = log_output

train_loss = [entry["train_loss"] for entry in metric]
plot_df = pd.DataFrame(
    {"time": time, "valid_loss": valid_loss, "train_loss": train_loss}
)
# long format: one row per (time, metric) pair
plot_df = plot_df.melt(id_vars="time", var_name="metric", value_name="loss")
sns.scatterplot(data=plot_df, x="time", y="loss", hue="metric")

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# left panel: precision-recall curve, right panel: ROC curve
curve_displays = (PrecisionRecallDisplay, RocCurveDisplay)
for mr, clf in zip(res["min_reads"], res["model"]):
    eval_chr = "chr1"
    held_out = (df["Chromosome"] == eval_chr) & (df["nr1"] >= mr)
    eval_data = df.loc[held_out, :].reset_index()
    for display_cls, panel in zip(curve_displays, ax):
        display_cls.from_estimator(
            clf,
            eval_data[features],
            eval_data["label_encoded"],
            name=f"min reads: {mr}",
            ax=panel,
        )

# remove legend from first ax

# save as png
# fig.savefig("flaml_holdout.png", dpi=300, bbox_inches="tight")

## Make predictions

In [None]:
# Predict with the most recently trained model on the windows that meet its
# read threshold, then keep the positive calls still labelled "unknown".
min_reads = res["min_reads"][-1]
final_model = res["model"][-1]
enough_reads = df["nr1"] >= min_reads  # same >= cutoff as the tuning/evaluation filters

# Write through .loc on df itself: a chained df[mask]["pred"] = ... assigns to
# a temporary copy and leaves df without a "pred" column.
# Windows below the threshold are not predicted and hold NaN in "pred".
df.loc[enough_reads, "pred"] = final_model.predict(df.loc[enough_reads, features])

calls = df[enough_reads & (df["pred"] == 1) & (df["label"] == "unknown")]

In [None]:
# Donor metadata table, indexed by its first column.
donor_meta_path = "/iblm/logglun02/mcuoco/workflows/sz_slavseq/config/7donor_donors.tsv"
meta = pd.read_csv(donor_meta_path, sep="\t", index_col=0)

# Number of calls per donor. The assignment aligns on meta's index, so a donor
# whose index label has no matching "donor_id" group gets NaN.
# NOTE(review): presumably meta's index holds donor IDs of the same dtype as
# calls["donor_id"] — confirm, otherwise every count is NaN.
calls_per_donor = calls.groupby("donor_id").size()
meta["calls"] = calls_per_donor

summary_columns = ["sex", "age", "race", "diagnosis", "calls"]
meta[summary_columns].sort_values("calls")