# Classify WT and Null Genotypes Random Forest
Plates 3, 3p, and 5 are used in all splits to classify genotypes as either WT or Null.
The feature selected data is used in all data splits.
Pre-evaluation metrics are stored from all splits and these plates.
This notebook is a Random Forest version of the original classifier. 

In [1]:
import pathlib
import random
import warnings
from collections import defaultdict

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [2]:
# Identifiers passed to setup_logger below; they end up in the logger name
# and (except ROLE) in the log-file name.
MODEL_ID = "randomforest"
ROLE = "train"

import logging
from datetime import datetime
import pathlib

# ============================================
# 1) Choose a RUN_ID
# ============================================
# Timestamp-based run id in month_day_hour_minute form
RUN_ID = datetime.now().strftime("%m_%d_%H_%M")

# NOTE(review): this hard-coded value immediately replaces the timestamp above,
# presumably to pin the notebook to an existing run — confirm, and delete this
# line when a fresh RUN_ID is wanted.
RUN_ID = "12_08_16_45"
ANALYSIS_TYPE = "train"


def setup_logger(
    run_id: str,
    model_id: str,
    role: str,
    log_dir: str = "logs",
    analysis_type: str = ANALYSIS_TYPE,
) -> logging.Logger:
    """
    Create a logger that writes to both a stream handler and a log file.

    - Logger name:  "<analysis_type>_<run_id>_<model_id>_<role>"
    - Log file:     "log_<analysis_type>_<run_id>_<model_id>.log" in `log_dir`
      (the file name does not include `role`, so loggers that differ only by
      role write to the same file).

    Parameters
    ----------
    run_id: str
        Identifier of the run; part of the logger name and the log-file name.
    model_id: str
        Identifier of the model; part of the logger name and the log-file name.
    role: str
        Role of the calling notebook; part of the logger name only.
    log_dir: str
        Directory that receives the log file. It is created, together with any
        missing parent directories, when it does not exist yet.
    analysis_type: str
        Analysis label; part of the logger name and the log-file name.

    Returns
    -------
    logging.Logger
        The configured logger (level INFO, not propagating to the root logger).
    """
    log_path = pathlib.Path(log_dir)
    # parents=True so that a nested log_dir such as "out/logs" also works
    log_path.mkdir(parents=True, exist_ok=True)

    logger_name = f"{analysis_type}_{run_id}_{model_id}_{role}"
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)
    logger.propagate = False  # don't duplicate logs to root logger

    # Avoid adding handlers multiple times if the cell is re-run
    # (logging.getLogger returns the same object for the same name)
    if not logger.handlers:
        # Common formatter for both handlers
        formatter = logging.Formatter(
            fmt="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
            datefmt="%Y-%m-%dT%H:%M:%S",
        )

        # Stream handler (logging.StreamHandler's default stream)
        stream_handler = logging.StreamHandler()
        stream_handler.setLevel(logging.INFO)
        stream_handler.setFormatter(formatter)
        logger.addHandler(stream_handler)

        # File handler (one file per analysis_type + run_id + model_id)
        log_file = log_path / f"log_{analysis_type}_{run_id}_{model_id}.log"
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger



# Notebook-wide logger built from the run/model/role identifiers; log_dir and
# analysis_type fall back to setup_logger's defaults.
logger = setup_logger(RUN_ID, MODEL_ID, ROLE)
logger.info("Initialized logger.")

2025-12-06T16:51:22 [train_12_08_16_45_randomforest_train] INFO: Initialized logger.


## Find the root of the git repo on the host system

In [3]:
# Start the search from the directory the notebook is running in
cwd = pathlib.Path.cwd()

# Check the working directory first, then each ancestor from nearest to
# farthest, and keep the first one that contains a ".git" directory.
# root_dir is None when no such directory exists.
root_dir = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / ".git").is_dir()
    ),
    None,
)

# Fail early when the notebook is not run from inside a Git checkout
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

## Define paths

### Input

In [4]:
# OPTIONAL: If the data (within the cell painting directory) is stored in a different location, add location here
# NOTE(review): this is an absolute, machine-specific path, and the Git root
# (root_dir) located earlier is not used in this cell — adjust before running
# on another host.
repo_dir = pathlib.Path(
    "/Users/marktalbot/Documents/VC Studio Homework Folders/HighRisk/nf1_schwann_cell_painting_data"
)

# Set data level
# ("cleaned" selects the cleaned_sc_profiles sub-directory; any other value
# selects the parent single_cell_profiles directory)
data_level = "cleaned"

# Main directory path (converted or cleaned data)
if data_level == "cleaned":
    data_dir = pathlib.Path(
        repo_dir / "3.processing_features/data/single_cell_profiles/cleaned_sc_profiles"
    )
else:
    data_dir = pathlib.Path(
        repo_dir / "3.processing_features/data/single_cell_profiles"
    )

# resolve(strict=True) makes a missing parquet file fail here, before any read
plate3df_path = pathlib.Path(data_dir / "Plate_3_sc_feature_selected.parquet").resolve(
    strict=True
)
plate3pdf_path = pathlib.Path(
    data_dir / "Plate_3_prime_sc_feature_selected.parquet"
).resolve(strict=True)
plate5df_path = pathlib.Path(data_dir / "Plate_5_sc_feature_selected.parquet").resolve(
    strict=True
)

# Load the feature-selected single-cell profiles of each plate
plate3df = pd.read_parquet(plate3df_path)
plate3pdf = pd.read_parquet(plate3pdf_path)
plate5df = pd.read_parquet(plate5df_path)

# Row counts per plate (one row per single cell), before any filtering
logger.info("Number of single-cells total per plate:")
logger.info(f"Plate 3: {plate3df.shape[0]}")
logger.info(f"Plate 3 prime: {plate3pdf.shape[0]}")
logger.info(f"Plate 5: {plate5df.shape[0]}")

# Set the seed
# (this generator is the one shuffle_data draws its permutations from)
rng = np.random.default_rng(0)


2025-12-06T16:51:23 [train_12_08_16_45_randomforest_train] INFO: Number of single-cells total per plate:
2025-12-06T16:51:23 [train_12_08_16_45_randomforest_train] INFO: Plate 3: 10206
2025-12-06T16:51:23 [train_12_08_16_45_randomforest_train] INFO: Plate 3 prime: 5126
2025-12-06T16:51:23 [train_12_08_16_45_randomforest_train] INFO: Plate 5: 5348


### Outputs

In [5]:
# Directory (relative to the working directory) that receives the trained
# model, the label encoder and the pre-evaluation results; created if missing.
data_path = pathlib.Path("data_randomforest")
data_path.mkdir(parents=True, exist_ok=True)

## Splitting and Processing
Functions to split and process data

In [6]:
# Name of the metadata column holding each cell's genotype; read as a global
# by down_sample_by_genotype and process_plates.
gene_column = "Metadata_genotype"


def down_sample_by_genotype(_df):
    """
    Return an equal number of cells from each genotype.
    The number of cells kept per genotype is the size of the smallest genotype group.

    Parameters
    ----------
    _df: Pandas Dataframe
        The data to be downsampled by the column named by the module-level
        `gene_column` variable.

    Returns
    -------
    The dataframe down-sampled by genotype. Rows keep their original index
    labels and all columns (including the genotype column); genotype groups
    appear in sorted genotype order. When there is no genotype to balance
    (no rows, or only missing genotypes) an empty frame with the same columns
    is returned.
    """

    genotype_counts = _df[gene_column].value_counts()
    if genotype_counts.empty:
        # min() of no counts would be NaN and cannot be used as a sample size
        return _df.iloc[0:0].copy()

    min_gene = genotype_counts.min()

    # Sample every genotype group explicitly instead of groupby(...).apply(...):
    # apply() on a frame that still contains the grouping column is deprecated
    # in recent pandas and newer versions drop that column from the result.
    # Each group is sampled with the same fixed seed, as before.
    sampled_groups = [
        genotype_cells.sample(n=min_gene, random_state=0)
        for _, genotype_cells in _df.groupby(gene_column)
    ]
    return pd.concat(sampled_groups)


def process_plates(_df):
    """
    Drop rows with nans from the single cell data and remove HET cells.

    The input dataframe is not modified; a filtered copy is returned.

    Parameters
    ----------
    _df: Pandas Dataframe
        Uncleaned plate data with nans and HET cells to be removed. Must contain
        the column named by the module-level `gene_column` variable.

    Returns
    -------
    Pandas Dataframe
        Cleaned single cell data: rows containing any nan and rows whose
        genotype equals "HET" are removed.
    """

    # dropna() without inplace=True leaves the caller's dataframe untouched
    without_nans = _df.dropna()
    return without_nans.loc[without_nans[gene_column] != "HET"]


def shuffle_data(_X):
    """
    Permute the values of every column of the input dataframe, in place.

    Each column gets its own permutation drawn from the module-level `rng`
    generator, so values are no longer aligned across columns afterwards.
    Nothing is returned; the dataframe passed in is modified.

    Parameters
    ----------
    _X: Pandas Dataframe
        Input feature data whose columns are shuffled independently.
    """

    for feature_name in _X.columns:
        permuted_values = rng.permutation(_X[feature_name])
        _X[feature_name] = permuted_values


def store_pre_evaluation_data(_X, _y, _metadata, _datasplit):
    """
    Append predictions, labels and metadata for one data split to `eval_data`.

    Relies on three module-level names: `model` (the classifier fitted most
    recently, either a CV-fold model or the final one), `eval_data` (a mapping
    from column name to list) and `probability_class` (used in the name of the
    probability column).

    Parameters
    ----------
    _X: pandas.DataFrame
        Feature matrix.
    _y: array-like
        True labels.
    _metadata: pandas.DataFrame
        Metadata (one row per sample) to carry through.
    _datasplit: str
        Name of the datasplit (e.g., 'train', 'val', 'test', 'shuffled_*').
    """
    # Probability taken from the second column of predict_proba
    class_probabilities = model.predict_proba(_X)[:, 1]
    eval_data[f"probability_{probability_class}"].extend(class_probabilities.tolist())

    # One datasplit label per row of the feature matrix
    n_rows = _X.shape[0]
    eval_data["datasplit"].extend([_datasplit] * n_rows)

    predicted_labels = model.predict(_X)
    eval_data["predicted_genotype"].extend(predicted_labels.tolist())
    eval_data["true_genotype"].extend(_y.tolist())

    # Carry every metadata column through unchanged
    for meta_col in _metadata.columns:
        column_values = _metadata[meta_col].tolist()
        eval_data[meta_col].extend(column_values)


## Split and process plates

In [7]:
def create_splits(_wells, _plate):
    """
    Split one plate into a rest split (train and validation) and a test split by well.

    Parameters
    ----------
    _wells: List(String)
        The well names whose single cells go into the test split.

    _plate: Pandas Dataframe
        Single cell data from one plate, containing a "Metadata_Well" column.

    Returns
    -------
    Tuple of two dataframes: (rest, test). `test` holds the cells whose well is
    listed in `_wells`; `rest` holds all other cells.
    """

    # Boolean mask that is True for cells belonging to a held-out well
    in_test_wells = _plate["Metadata_Well"].isin(_wells)

    rest_cells = _plate[~in_test_wells]
    test_cells = _plate[in_test_wells]
    return rest_cells, test_cells

In [8]:
# For every plate: clean the cells, hold out the listed wells as the test
# split, then balance genotypes within the rest split and the test split
# separately (rest first, then test).

# Plate 3
plate3df = process_plates(plate3df)
p3_wells = ["C11", "E11", "C3", "F3"]
rest3df, test3df = map(down_sample_by_genotype, create_splits(p3_wells, plate3df))

# Plate 3 prime
plate3pdf = process_plates(plate3pdf)
p3p_wells = ["F11", "G11", "C3", "F3"]
rest3pdf, test3pdf = map(
    down_sample_by_genotype, create_splits(p3p_wells, plate3pdf)
)

# Plate 5
plate5df = process_plates(plate5df)
p5_wells = ["C9", "E11", "E3", "G3"]
rest5df, test5df = map(down_sample_by_genotype, create_splits(p5_wells, plate5df))

## Combine plate columns across each data split

In [9]:
# Columns common to all plates.
# The order is taken from plate 5's columns instead of from the set itself:
# iterating a set of strings can give a different order on every interpreter
# run, which would change the feature order seen by the model between runs.
shared_cols = set(plate5df.columns) & set(plate3df.columns) & set(plate3pdf.columns)
plate_cols = [col for col in plate5df.columns if col in shared_cols]

# Stack the three plates per split; ignore_index=True already yields a fresh
# 0..n-1 index, so no further reset is needed.
restdf = pd.concat(
    [rest3df[plate_cols], rest3pdf[plate_cols], rest5df[plate_cols]], ignore_index=True
)

testdf = pd.concat(
    [test3df[plate_cols], test3pdf[plate_cols], test5df[plate_cols]], ignore_index=True
)

## Encode genotypes and extract feature data

In [10]:
# Metadata columns are those whose name contains "Metadata"; every remaining
# column is treated as a feature column.
meta_cols = testdf.filter(like="Metadata").columns
feat_cols = testdf.drop(columns=meta_cols).columns

In [11]:
le = LabelEncoder()

# Fit the genotype -> integer mapping on the rest (train + validation) labels
y = le.fit_transform(restdf["Metadata_genotype"])
X = restdf.drop(columns=meta_cols)

# Encode the test labels with the mapping fitted above. Refitting on the test
# labels could silently produce a different mapping if the test split did not
# contain exactly the same genotypes; transform() raises instead when it meets
# a label it was not fitted on.
y_test = le.transform(testdf["Metadata_genotype"])
X_test = testdf.drop(columns=meta_cols)

# Class for saving probabilities
# (the genotype name that the encoder maps to integer label 1)
probability_class = le.inverse_transform([1])[0]

# Train Models

## Specify parameters for training

In [None]:
from datetime import datetime
import time

# Base Random Forest parameters (not searched); merged with every sampled
# hyperparameter set before a model is built
rf_params = {
    "random_state": 0,
    "n_jobs": -1,
    "class_weight": "balanced",  # handle class imbalance
}

# Refined search space centered around the previous best:
# best_hp ≈ {'n_estimators': 400, 'max_depth': 40, 'min_samples_leaf': 1, 'max_features': 'sqrt'}
# Every hyperparameter is drawn from the discrete list given here.
param_choices = {
    "n_estimators": [300, 600, 900],     # allow more trees than before
    "max_depth": [20, 40, 60, None],     # include deeper / unbounded trees
    "min_samples_leaf": [1, 2, 4],       # finer control near 1
    "max_features": ["sqrt", "log2"],    # good defaults for RF
}

# Target *total* number of hyperparameter configurations / trials
rand_iter = 50

# Number of CV folds
n_splits = 5     # keep reasonable for runtime

# Track best performance (using ROC AUC)
# NOTE: despite its name, best_acc holds a mean cross-validated ROC AUC,
# not an accuracy.
best_acc = -np.inf   # best mean CV AUC
best_hp = None
best_eval_data = None


## Hyperparameter search

In [None]:
from collections import defaultdict
import os
import optuna
from optuna.trial import TrialState
from sklearn.metrics import roc_auc_score

# Ensure tuning directory exists (relative to 1.train_models/)
os.makedirs("tuning", exist_ok=True)

# Hyperparameter search for Random Forest (with timing logs, using ROC AUC)
overall_start = time.time()

# Create/load persistent Optuna study
# (stored in a SQLite file so that re-running the cell continues the same study)
study = optuna.create_study(
    study_name="rf_cv",
    direction="maximize",
    storage="sqlite:///tuning/optuna_rf.sqlite3",
    load_if_exists=True,
)

# How many trials have already successfully finished?
completed_trials = [t for t in study.trials if t.state == TrialState.COMPLETE]
n_complete = len(completed_trials)

# Initialize best from existing study, if any
if n_complete > 0:
    best_trial_so_far = study.best_trial
    best_acc = float(best_trial_so_far.value)
    best_hp = best_trial_so_far.params.copy()
else:
    best_acc = -np.inf
    best_hp = None

# Decide how many *new* trials to run this time
n_to_run = max(0, rand_iter - n_complete)

logger.info(
    f"{datetime.now().isoformat(timespec='seconds')} "
    f"[INFO] Loaded Optuna study 'rf_cv' with "
    f"{len(study.trials)} total trials ({n_complete} completed). "
    f"Target total trials = {rand_iter}. "
    f"Running {n_to_run} new trials in this session."
)

for i in range(n_to_run):
    trial = study.ask()

    # Draw hyperparameters from the discrete search space
    rparams = {
        "n_estimators": trial.suggest_categorical("n_estimators", param_choices["n_estimators"]),
        "max_depth": trial.suggest_categorical("max_depth", param_choices["max_depth"]),
        "min_samples_leaf": trial.suggest_categorical("min_samples_leaf", param_choices["min_samples_leaf"]),
        "max_features": trial.suggest_categorical("max_features", param_choices["max_features"]),
    }

    # Global index for logging (across reruns)
    global_idx = n_complete + i  # 0-based index among desired trials

    iter_start = time.time()
    # Sent through the logger (not print) so the hyperparameters tried in each
    # iteration are also recorded in the log file
    logger.info(
        f"{datetime.now().isoformat(timespec='seconds')} "
        f"[INFO] Iteration {global_idx + 1}/{rand_iter}: trying params = {rparams} "
        f"(Optuna trial {trial.number})"
    )

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

    # Combine fixed RF params with the sampled hyperparameters
    comb_params = rf_params | rparams

    # Reset eval data for this hyperparameter setting
    eval_data = defaultdict(list)
    auc = 0.0

    try:
        # Loop through the folds
        for fold, (train_index, val_index) in enumerate(skf.split(X, y), start=1):
            fold_start = time.time()

            X_train, X_val = X.iloc[train_index], X.iloc[val_index]
            y_train, y_val = y[train_index], y[val_index]

            # Create a shuffled version of the validation features for the null model
            X_val_shuf = X_val.copy()
            shuffle_data(X_val_shuf)

            # Train RF model on this fold
            model = RandomForestClassifier(**comb_params)
            model.fit(X_train, y_train)

            # Evaluate on validation fold using ROC AUC
            # (score = predicted probability of the class encoded as 1)
            y_proba = model.predict_proba(X_val)[:, 1]
            auc += roc_auc_score(y_val, y_proba)

            # Store predictions for evaluation (real and shuffled)
            store_pre_evaluation_data(
                X_val, y_val, restdf.iloc[val_index][meta_cols], "val"
            )
            store_pre_evaluation_data(
                X_val_shuf, y_val, restdf.iloc[val_index][meta_cols], "shuffled_val"
            )

            fold_time = (time.time() - fold_start) / 60
            logger.info(
                f"{datetime.now().isoformat(timespec='seconds')} "
                f"[INFO]    Fold {fold}/{n_splits} finished in {fold_time:.2f} min"
            )
    except (Exception, KeyboardInterrupt):
        # Do not leave the trial in the RUNNING state inside the persistent
        # study when the evaluation crashes or the cell is interrupted
        study.tell(trial, state=TrialState.FAIL)
        raise

    # Average AUC across folds
    auc = auc / n_splits

    # Keep best-performing hyperparameters + their eval data (within this run).
    # best_acc starts from the study's best value, so eval data is only kept
    # when this session beats every previously completed trial.
    if auc > best_acc:
        best_acc = auc
        best_hp = rparams
        best_eval_data = {k: v.copy() for k, v in eval_data.items()}

    # Attach some metadata to the trial (optional but handy)
    trial.set_user_attr("params", rparams)
    trial.set_user_attr("comb_params", comb_params)
    trial.set_user_attr("mean_cv_auc", float(auc))
    trial.set_user_attr("n_splits", n_splits)

    # Tell Optuna about this trial's result
    study.tell(trial, auc)

    iter_time = (time.time() - iter_start) / 60
    logger.info(
        f"{datetime.now().isoformat(timespec='seconds')} "
        f"[INFO] Iteration {global_idx + 1} finished in {iter_time:.2f} min "
        f"(AUC={auc:.4f}, best_AUC={best_acc:.4f})"
    )

total_time = (time.time() - overall_start) / 60
logger.info(
    f"{datetime.now().isoformat(timespec='seconds')} "
    f"[INFO] Total search time this session: {total_time:.2f} min"
)

# After running any new trials (or if n_to_run == 0), sync to *global* best over the study
completed_trials = [t for t in study.trials if t.state == TrialState.COMPLETE]
if not completed_trials:
    raise RuntimeError("No completed Optuna trials in the study. Nothing to select as best.")

best_trial = study.best_trial
best_acc = float(best_trial.value)
best_hp = best_trial.params.copy()

logger.info(
    f"{datetime.now().isoformat(timespec='seconds')} "
    f"[INFO] Best average validation AUC (all completed trials) = {best_acc:.4f}"
)
logger.info(f"Best hyperparameters (global over study) = {best_hp}")

# Set eval_data to the best-hyperparameter results *from this run* for saving later, if available
eval_data = defaultdict(list)
if best_eval_data is not None:
    for k, v in best_eval_data.items():
        eval_data[k].extend(v)
else:
    logger.info(
        f"{datetime.now().isoformat(timespec='seconds')} "
        "[INFO] No best_eval_data from this session (e.g., no new trials ran). "
        "eval_data left empty; global best_hp is taken from the Optuna study."
    )


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-12-06 16:51:24,746] A new study created in RDB with name: rf_cv
2025-12-06T16:51:24 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:51:24 [INFO] Loaded Optuna study 'rf_cv' with 0 total trials (0 completed). Target total trials = 15. Running 15 new trials in this session.


2025-12-06T16:51:24 [INFO] Iteration 1/15: trying params = {'n_estimators': 300, 'max_depth': 40, 'min_samples_leaf': 4, 'max_features': 'log2'} (Optuna trial 0)


2025-12-06T16:51:38 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:51:38 [INFO]    Fold 1/5 finished in 0.23 min
2025-12-06T16:51:49 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:51:49 [INFO]    Fold 2/5 finished in 0.18 min
2025-12-06T16:51:59 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:51:59 [INFO]    Fold 3/5 finished in 0.17 min
2025-12-06T16:52:10 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:52:10 [INFO]    Fold 4/5 finished in 0.18 min
2025-12-06T16:52:20 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:52:20 [INFO]    Fold 5/5 finished in 0.17 min
2025-12-06T16:52:20 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:52:20 [INFO] Iteration 1 finished in 0.94 min (AUC=0.8296, best_AUC=0.8296)


2025-12-06T16:52:20 [INFO] Iteration 2/15: trying params = {'n_estimators': 300, 'max_depth': 60, 'min_samples_leaf': 1, 'max_features': 'log2'} (Optuna trial 1)


2025-12-06T16:52:32 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:52:32 [INFO]    Fold 1/5 finished in 0.19 min
2025-12-06T16:52:43 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:52:43 [INFO]    Fold 2/5 finished in 0.19 min
2025-12-06T16:52:54 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:52:54 [INFO]    Fold 3/5 finished in 0.18 min
2025-12-06T16:53:06 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:53:06 [INFO]    Fold 4/5 finished in 0.20 min
2025-12-06T16:53:18 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:53:18 [INFO]    Fold 5/5 finished in 0.19 min
2025-12-06T16:53:18 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:53:18 [INFO] Iteration 2 finished in 0.95 min (AUC=0.8292, best_AUC=0.8296)


2025-12-06T16:53:18 [INFO] Iteration 3/15: trying params = {'n_estimators': 600, 'max_depth': 40, 'min_samples_leaf': 4, 'max_features': 'log2'} (Optuna trial 2)


2025-12-06T16:53:37 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:53:37 [INFO]    Fold 1/5 finished in 0.32 min
2025-12-06T16:53:57 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:53:57 [INFO]    Fold 2/5 finished in 0.34 min
2025-12-06T16:54:18 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:54:18 [INFO]    Fold 3/5 finished in 0.34 min
2025-12-06T16:54:38 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:54:38 [INFO]    Fold 4/5 finished in 0.33 min
2025-12-06T16:54:58 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:54:58 [INFO]    Fold 5/5 finished in 0.33 min
2025-12-06T16:54:58 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:54:58 [INFO] Iteration 3 finished in 1.67 min (AUC=0.8316, best_AUC=0.8316)


2025-12-06T16:54:58 [INFO] Iteration 4/15: trying params = {'n_estimators': 300, 'max_depth': 60, 'min_samples_leaf': 4, 'max_features': 'log2'} (Optuna trial 3)


2025-12-06T16:55:10 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:55:10 [INFO]    Fold 1/5 finished in 0.20 min
2025-12-06T16:55:20 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:55:20 [INFO]    Fold 2/5 finished in 0.17 min
2025-12-06T16:55:31 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:55:31 [INFO]    Fold 3/5 finished in 0.18 min
2025-12-06T16:55:41 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:55:41 [INFO]    Fold 4/5 finished in 0.18 min
2025-12-06T16:55:52 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:55:52 [INFO]    Fold 5/5 finished in 0.17 min
2025-12-06T16:55:52 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:55:52 [INFO] Iteration 4 finished in 0.90 min (AUC=0.8296, best_AUC=0.8316)


2025-12-06T16:55:52 [INFO] Iteration 5/15: trying params = {'n_estimators': 600, 'max_depth': 40, 'min_samples_leaf': 1, 'max_features': 'log2'} (Optuna trial 4)


2025-12-06T16:56:14 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:56:14 [INFO]    Fold 1/5 finished in 0.37 min
2025-12-06T16:56:36 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:56:36 [INFO]    Fold 2/5 finished in 0.37 min
2025-12-06T16:56:59 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:56:59 [INFO]    Fold 3/5 finished in 0.38 min
2025-12-06T16:57:21 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:57:21 [INFO]    Fold 4/5 finished in 0.37 min
2025-12-06T16:57:44 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:57:44 [INFO]    Fold 5/5 finished in 0.37 min
2025-12-06T16:57:44 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:57:44 [INFO] Iteration 5 finished in 1.87 min (AUC=0.8328, best_AUC=0.8328)


2025-12-06T16:57:44 [INFO] Iteration 6/15: trying params = {'n_estimators': 300, 'max_depth': 40, 'min_samples_leaf': 4, 'max_features': 'sqrt'} (Optuna trial 5)


2025-12-06T16:58:12 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:58:12 [INFO]    Fold 1/5 finished in 0.48 min
2025-12-06T16:58:40 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:58:40 [INFO]    Fold 2/5 finished in 0.46 min
2025-12-06T16:59:08 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:59:08 [INFO]    Fold 3/5 finished in 0.46 min
2025-12-06T16:59:36 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T16:59:36 [INFO]    Fold 4/5 finished in 0.47 min
2025-12-06T17:00:03 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:00:03 [INFO]    Fold 5/5 finished in 0.46 min
2025-12-06T17:00:03 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:00:03 [INFO] Iteration 6 finished in 2.33 min (AUC=0.8432, best_AUC=0.8432)


2025-12-06T17:00:03 [INFO] Iteration 7/15: trying params = {'n_estimators': 600, 'max_depth': 20, 'min_samples_leaf': 2, 'max_features': 'sqrt'} (Optuna trial 6)


2025-12-06T17:01:00 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:01:00 [INFO]    Fold 1/5 finished in 0.94 min
2025-12-06T17:01:56 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:01:56 [INFO]    Fold 2/5 finished in 0.94 min
2025-12-06T17:02:53 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:02:53 [INFO]    Fold 3/5 finished in 0.95 min
2025-12-06T17:03:50 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:03:50 [INFO]    Fold 4/5 finished in 0.95 min
2025-12-06T17:04:47 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:04:47 [INFO]    Fold 5/5 finished in 0.95 min
2025-12-06T17:04:47 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:04:47 [INFO] Iteration 7 finished in 4.72 min (AUC=0.8458, best_AUC=0.8458)


2025-12-06T17:04:47 [INFO] Iteration 8/15: trying params = {'n_estimators': 900, 'max_depth': 60, 'min_samples_leaf': 4, 'max_features': 'log2'} (Optuna trial 7)


2025-12-06T17:05:14 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:05:14 [INFO]    Fold 1/5 finished in 0.46 min
2025-12-06T17:05:41 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:05:41 [INFO]    Fold 2/5 finished in 0.44 min
2025-12-06T17:06:09 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:06:09 [INFO]    Fold 3/5 finished in 0.47 min
2025-12-06T17:06:36 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:06:36 [INFO]    Fold 4/5 finished in 0.45 min
2025-12-06T17:07:05 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:07:05 [INFO]    Fold 5/5 finished in 0.48 min
2025-12-06T17:07:05 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:07:05 [INFO] Iteration 8 finished in 2.30 min (AUC=0.8318, best_AUC=0.8458)


2025-12-06T17:07:05 [INFO] Iteration 9/15: trying params = {'n_estimators': 600, 'max_depth': 20, 'min_samples_leaf': 1, 'max_features': 'sqrt'} (Optuna trial 8)


2025-12-06T17:08:06 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:08:06 [INFO]    Fold 1/5 finished in 1.03 min
2025-12-06T17:09:06 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:09:06 [INFO]    Fold 2/5 finished in 0.99 min
2025-12-06T17:10:23 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:10:23 [INFO]    Fold 3/5 finished in 1.29 min
2025-12-06T17:11:28 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:11:28 [INFO]    Fold 4/5 finished in 1.09 min
2025-12-06T17:12:32 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:12:32 [INFO]    Fold 5/5 finished in 1.07 min
2025-12-06T17:12:33 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:12:33 [INFO] Iteration 9 finished in 5.46 min (AUC=0.8460, best_AUC=0.8460)


2025-12-06T17:12:33 [INFO] Iteration 10/15: trying params = {'n_estimators': 600, 'max_depth': None, 'min_samples_leaf': 1, 'max_features': 'log2'} (Optuna trial 9)


2025-12-06T17:12:56 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:12:56 [INFO]    Fold 1/5 finished in 0.39 min
2025-12-06T17:13:21 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:13:21 [INFO]    Fold 2/5 finished in 0.42 min
2025-12-06T17:13:46 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:13:46 [INFO]    Fold 3/5 finished in 0.40 min
2025-12-06T17:14:09 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:14:09 [INFO]    Fold 4/5 finished in 0.39 min
2025-12-06T17:14:31 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:14:31 [INFO]    Fold 5/5 finished in 0.37 min
2025-12-06T17:14:31 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:14:31 [INFO] Iteration 10 finished in 1.98 min (AUC=0.8328, best_AUC=0.8460)


2025-12-06T17:14:31 [INFO] Iteration 11/15: trying params = {'n_estimators': 900, 'max_depth': 20, 'min_samples_leaf': 2, 'max_features': 'sqrt'} (Optuna trial 10)


2025-12-06T17:15:59 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:15:59 [INFO]    Fold 1/5 finished in 1.46 min
2025-12-06T17:17:36 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:17:36 [INFO]    Fold 2/5 finished in 1.62 min
2025-12-06T17:19:21 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:19:21 [INFO]    Fold 3/5 finished in 1.75 min
2025-12-06T17:21:01 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:21:01 [INFO]    Fold 4/5 finished in 1.67 min
2025-12-06T17:22:40 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:22:40 [INFO]    Fold 5/5 finished in 1.64 min
2025-12-06T17:22:40 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:22:40 [INFO] Iteration 11 finished in 8.14 min (AUC=0.8464, best_AUC=0.8464)


2025-12-06T17:22:40 [INFO] Iteration 12/15: trying params = {'n_estimators': 900, 'max_depth': 20, 'min_samples_leaf': 2, 'max_features': 'sqrt'} (Optuna trial 11)


2025-12-06T17:24:11 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:24:11 [INFO]    Fold 1/5 finished in 1.52 min
2025-12-06T17:25:45 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:25:45 [INFO]    Fold 2/5 finished in 1.56 min
2025-12-06T17:27:16 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:27:16 [INFO]    Fold 3/5 finished in 1.53 min
2025-12-06T17:28:51 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:28:51 [INFO]    Fold 4/5 finished in 1.57 min
2025-12-06T17:30:36 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:30:36 [INFO]    Fold 5/5 finished in 1.75 min
2025-12-06T17:30:36 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:30:36 [INFO] Iteration 12 finished in 7.94 min (AUC=0.8464, best_AUC=0.8464)


2025-12-06T17:30:36 [INFO] Iteration 13/15: trying params = {'n_estimators': 900, 'max_depth': 20, 'min_samples_leaf': 2, 'max_features': 'sqrt'} (Optuna trial 12)


2025-12-06T17:32:26 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:32:26 [INFO]    Fold 1/5 finished in 1.83 min
2025-12-06T17:33:56 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:33:56 [INFO]    Fold 2/5 finished in 1.49 min
2025-12-06T17:35:21 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:35:21 [INFO]    Fold 3/5 finished in 1.43 min
2025-12-06T17:36:48 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:36:48 [INFO]    Fold 4/5 finished in 1.44 min
2025-12-06T17:38:13 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:38:13 [INFO]    Fold 5/5 finished in 1.41 min
2025-12-06T17:38:13 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:38:13 [INFO] Iteration 13 finished in 7.61 min (AUC=0.8464, best_AUC=0.8464)


2025-12-06T17:38:13 [INFO] Iteration 14/15: trying params = {'n_estimators': 900, 'max_depth': 20, 'min_samples_leaf': 2, 'max_features': 'sqrt'} (Optuna trial 13)


2025-12-06T17:39:38 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:39:38 [INFO]    Fold 1/5 finished in 1.42 min
2025-12-06T17:41:13 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:41:13 [INFO]    Fold 2/5 finished in 1.58 min
2025-12-06T17:43:13 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:43:13 [INFO]    Fold 3/5 finished in 2.01 min
2025-12-06T17:45:05 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:45:05 [INFO]    Fold 4/5 finished in 1.86 min
2025-12-06T17:46:45 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:46:45 [INFO]    Fold 5/5 finished in 1.68 min
2025-12-06T17:46:45 [train_12_08_16_45_randomforest_train] INFO: 2025-12-06T17:46:45 [INFO] Iteration 14 finished in 8.55 min (AUC=0.8464, best_AUC=0.8464)


2025-12-06T17:46:45 [INFO] Iteration 15/15: trying params = {'n_estimators': 900, 'max_depth': 20, 'min_samples_leaf': 2, 'max_features': 'sqrt'} (Optuna trial 14)


## Retrain model

In [None]:
# Retrain Random Forest on all training data using best hyperparameters
# (fixed rf_params merged with best_hp; keys in best_hp win on conflict)
comb_params = rf_params | best_hp

# `model` is rebound here, so later calls to store_pre_evaluation_data use
# this final model rather than a cross-validation fold model
model = RandomForestClassifier(**comb_params)
model.fit(X, y)


## Shuffle train and test data

In [None]:
# shuffle_data works in place, so shuffle copies and keep X / X_test intact.
# Column-wise shuffled copy of the training features
X_shuf = X.copy()
shuffle_data(X_shuf)

# Column-wise shuffled copy of the test features
X_test_shuf = X_test.copy()
shuffle_data(X_test_shuf)

# Save models and model data

## Store pre-evaluation split data

In [None]:
# Append the retrained model's predictions for each split to eval_data.
# Train split: real and shuffled features, same labels and metadata
store_pre_evaluation_data(X, y, restdf[meta_cols], "train")
store_pre_evaluation_data(X_shuf, y, restdf[meta_cols], "shuffled_train")

# Test split: real and shuffled features, same labels and metadata
store_pre_evaluation_data(X_test, y_test, testdf[meta_cols], "test")
store_pre_evaluation_data(X_test_shuf, y_test, testdf[meta_cols], "shuffled_test")

In [None]:
# Output files get a "_qc" suffix when the cleaned profiles were used
suffix = "_qc" if data_level == "cleaned" else ""

# Persist the final model, the fitted label encoder, and the collected
# per-cell evaluation records (one column per eval_data key)
dump(model, f"{data_path}/trained_nf1_model{suffix}.joblib")
dump(le, f"{data_path}/trained_nf1_model_label_encoder{suffix}.joblib")
pd.DataFrame(eval_data).to_parquet(
    f"{data_path}/nf1_model_pre_evaluation_results{suffix}.parquet"
)
