# Optimize SLAV-seq classification model

In [49]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import product
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    PrecisionRecallDisplay,
    RocCurveDisplay,
    make_scorer,
    f1_score,
    average_precision_score,
)
from sklearn.model_selection import GridSearchCV
from time import time

In [4]:
# read in labels
# Every file under the results directory whose name ends in "pqt" is loaded and
# the frames are stacked row-wise into a single table.
labels_dir = Path(
    "/iblm/logglun02/mcuoco/workflows/sz_slavseq/results/model/get_labels"
)

# rglob yields paths in filesystem-dependent order; sorting them keeps the row
# order of `data` (and therefore the positional train/test indices and the
# seeded model fits derived from it) identical from run to run.
files = sorted(labels_dir.rglob("*pqt"))

# Fail early with a clear message instead of pd.concat's generic
# "No objects to concatenate" when the directory is missing or empty.
if not files:
    raise FileNotFoundError(f"No '*pqt' label files found under {labels_dir}")

data = pd.concat([pd.read_parquet(f) for f in files])

In [5]:
# save all features for model to list
# A column is kept as a model feature unless its name ends with one of the
# suffixes below.
# NOTE(review): "mapq" only matches names that end in exactly that suffix, so
# columns such as r1_mean_mapq_bg are still kept as features - confirm that
# this asymmetry is intended.
non_feature_suffixes = (
    "Chromosome",
    "Start",
    "End",
    "cell_id",
    "donor_id",
    "label",
    "blacklist",
    "index",
    "reads",
    "fwd",
    "rev",
    "reads_bg",
    "fwd_bg",
    "rev_bg",
    "mapq",
)

# str.endswith accepts a tuple and returns True if any suffix matches.
features = [
    col for col in data.columns if not col.endswith(non_feature_suffixes)
]
features

['r1_orientation_entropy',
 'r1_starts_gini',
 'mean_template_length',
 'sd_template_length',
 'YG_mean',
 'YA_mean',
 'YS_mean',
 'YA_YG_ratio',
 'r2_orientation_entropy',
 'r2_starts_gini',
 'r1_orientation_entropy_bg',
 'r1_mean_mapq_bg',
 'r1_sd_mapq_bg',
 'r1_starts_gini_bg',
 'mean_template_length_bg',
 'sd_template_length_bg',
 'YG_mean_bg',
 'YA_mean_bg',
 'YS_mean_bg',
 'YA_YG_ratio_bg',
 'r2_orientation_entropy_bg',
 'r2_mean_mapq_bg',
 'r2_sd_mapq_bg',
 'r2_starts_gini_bg']

In [40]:
# setup splits for CV, Train = CommonBrain, Test = each other donor
# Both splits are stored as 1-D arrays of row positions (for use with
# data.iloc), so `train` and every entry of `tests` have the same type.
train = np.where(data["donor_id"] == "CommonBrain")[0]

# Guard against a misspelled / missing training donor, which would otherwise
# only surface later as an error while fitting on an empty frame.
if train.size == 0:
    raise ValueError("No rows with donor_id == 'CommonBrain' to train on")

# One held-out set per remaining donor, keyed by donor id. np.where returns a
# tuple of arrays; [0] extracts the index array, mirroring `train`.
tests = {
    d: np.where(data["donor_id"] == d)[0]
    for d in data["donor_id"].unique()
    if d != "CommonBrain"
}

In [54]:
# define model and parameter grid
# Base estimator reused for every parameter combination; the grid-search loop
# overrides the tuned hyper-parameters through set_params before each fit.
# NOTE(review): oob_score_ is computed on every fit but is never read in the
# cells visible here - confirm it is needed.
rfc = RandomForestClassifier(
    n_jobs=8,  # number of parallel workers
    oob_score=True,  # also compute the out-of-bag score on each fit
    random_state=0,  # fixed seed
)

# Candidate values for each tuned hyper-parameter.
# NOTE(review): scikit-learn only considers a split that leaves at least
# min_samples_leaf samples on each side, so min_samples_split values at or
# below 2 * min_samples_leaf should have no additional effect; many of the
# combinations below are therefore likely to fit identical forests - confirm
# before spending compute on all of them.
params = {
    "n_estimators": [50, 100, 200],
    "max_depth": [5, 25, 50],
    "min_samples_split": [5, 10, 15],
    "min_samples_leaf": [5, 10, 15],
}

# Parameter names and their candidate lists, in matching order.
keys = tuple(params)
values = tuple(params.values())

# Cartesian product of all candidate values -> one dict per combination.
grid = [dict(zip(keys, combo)) for combo in product(*values)]
print(f"Generated {len(grid)} combinations of parameters")

Generated 81 combinations of parameters


In [55]:
# run custom grid search CV
# For every parameter combination: fit on the training rows, then record the
# fit time plus average-precision and F1 scores on the training rows and on
# each held-out donor. Results are written back into the matching `grid` entry.


def _score_split(model, X, y, pos_idx, pos_label="KNRGL"):
    """Score a fitted classifier on one split.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X : feature table for the split.
    y : true labels for the split.
    pos_idx : column of ``predict_proba`` output belonging to ``pos_label``.
    pos_label : label treated as the positive class.

    Returns
    -------
    (average_precision, f1) : tuple of the two scores for the positive class.
    """
    y_pred = model.predict(X)
    y_proba = model.predict_proba(X)
    ap = average_precision_score(y, y_proba[:, pos_idx], pos_label=pos_label)
    f1 = f1_score(y, y_pred, pos_label=pos_label)
    return ap, f1


# The training slice does not depend on the hyper-parameters, so build it once
# instead of re-slicing `data` several times per combination.
X_train = data.iloc[train][features]
y_train = data.iloc[train]["label"]

for i, p in enumerate(grid):
    # Only pass the tuned hyper-parameters to the model. The grid entries also
    # receive "time"/"ap_score"/"f1_score" below, so forwarding the whole dict
    # would make set_params fail when this cell is run a second time.
    hyper = {k: p[k] for k in params}
    print(f"[CV {i+1}/{len(grid)}] Training model with params: {hyper}")

    # train model with params
    rfc.set_params(**hyper)
    start = time()
    rfc.fit(X_train, y_train)
    grid[i]["time"] = round(time() - start, 2)
    print(f"[CV {i+1}/{len(grid)}] Model trained in {grid[i]['time']} seconds")

    # get which label is KNRGL (column of predict_proba for the positive class)
    knrgl = np.where(rfc.classes_ == "KNRGL")[0][0]

    # scores on the training set
    ap, f1 = _score_split(rfc, X_train, y_train, knrgl)
    grid[i]["ap_score"] = {"train": ap}
    grid[i]["f1_score"] = {"train": f1}

    for d, t in tests.items():
        # Slice each donor once per iteration; the slices are not cached
        # across iterations to avoid holding a second copy of the data.
        donor_rows = data.iloc[t]

        # scores on this donor's held-out set
        ap, f1 = _score_split(rfc, donor_rows[features], donor_rows["label"], knrgl)
        grid[i]["ap_score"][f"{d}_test"] = ap
        grid[i]["f1_score"][f"{d}_test"] = f1

    print(f"[CV {i+1}/{len(grid)}] scores = {grid[i]}")

[CV 1/81] Training model with params: {'n_estimators': 50, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 5}
[CV 1/81] Model trained in 38.39 seconds
[CV 1/81] scores = {'n_estimators': 50, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 5, 'time': 38.39, 'ap_score': {'train': 0.5849863572920627, '35_test': 0.3932386746910518, '27_test': 0.5401161270218057, '28_test': 0.625394471283909, '24_test': 0.3734786868149597, '46_test': 0.6000764919548705, '32_test': 0.6522330585901319, '8_test': 0.6383230594084612}, 'f1_score': {'train': 0.06521832837622311, '35_test': 0.0054810447203421505, '27_test': 0.031324966791570014, '28_test': 0.03571805981189897, '24_test': 0.0017202821262687082, '46_test': 0.05392446633825944, '32_test': 0.04690034091285208, '8_test': 0.03957322987390882}}
[CV 2/81] Training model with params: {'n_estimators': 50, 'max_depth': 5, 'min_samples_split': 5, 'min_samples_leaf': 10}
[CV 2/81] Model trained in 40.66 seconds
[CV 2/81] scores = {'n_es

In [57]:
# Persist the search results: one row per parameter combination.
# NOTE(review): the "ap_score" and "f1_score" entries are nested dicts, so each
# ends up in the CSV as the string form of a dict rather than as one column
# per split - confirm downstream readers expect that.
grid_results = pd.DataFrame(grid)
grid_results.to_csv("grid_search.csv", index=False)