# Calculate evaluation results for each plate and data split
Evaluation results include confusion matrices, precision-recall (PR) curves, precision, recall, F1 score, and accuracy.
These are computed for every plate (and for all plates pooled together) within each data split.

In [1]:
import pathlib
from collections import defaultdict

import pandas as pd
from joblib import load
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    precision_score,
    recall_score,
)

In [2]:
# Identifiers passed to setup_logger() below, which combines them into the
# logger name and the log file name.
MODEL_ID = "randomforest"
ROLE = "eval"

import logging
from datetime import datetime
import pathlib

# ============================================
# 1) Choose a RUN_ID
# ============================================
# Default: a timestamp of the current run, formatted month_day_hour_minute.
RUN_ID = datetime.now().strftime("%m_%d_%H_%M")

# NOTE(review): the hard-coded assignments below override the timestamp above,
# and only the last one takes effect. Presumably they pin this notebook to an
# earlier run so that it writes to that run's log file — confirm before reuse.
RUN_ID = "12_08_07_53"

RUN_ID = "12_08_08_12"

ANALYSIS_TYPE = "evaluate"


def setup_logger(
    run_id: str,
    model_id: str,
    role: str,
    log_dir: str = "logs",
    analysis_type: str = ANALYSIS_TYPE,
) -> logging.Logger:
    """
    Create (or fetch) a logger that writes to both stdout and a log file.

    - Logger name:  "<analysis_type>_<run_id>_<model_id>_<role>"
    - Log file:     "log_<analysis_type>_<run_id>_<model_id>.log" in `log_dir`
      (shared by all notebooks for the same model & run & analysis_type).

    Parameters
    ----------
    run_id: str
        Identifier of the run; part of the logger name and the log file name.
    model_id: str
        Identifier of the model; part of the logger name and the log file name.
    role: str
        Role of the calling notebook; part of the logger name only.
    log_dir: str
        Directory that receives the log file. It is created (including any
        missing parent directories) when it does not exist yet.
    analysis_type: str
        Prefix of the logger name and the log file name.

    Returns
    -------
    logging.Logger
        The configured logger. Calling this function again with the same
        arguments returns the same logger without attaching more handlers.
    """
    import sys

    log_path = pathlib.Path(log_dir)
    # parents=True so that a nested log_dir (e.g. "out/logs") can be created too
    log_path.mkdir(parents=True, exist_ok=True)

    logger_name = f"{analysis_type}_{run_id}_{model_id}_{role}"
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)
    # Do not hand records to ancestor loggers; this logger's own handlers
    # are the only outputs.
    logger.propagate = False

    # Avoid adding handlers multiple times if the cell is re-run
    if not logger.handlers:
        # Common formatter for both handlers
        formatter = logging.Formatter(
            fmt="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
            datefmt="%Y-%m-%dT%H:%M:%S",
        )

        # Stream handler: pass sys.stdout explicitly, because a bare
        # StreamHandler() writes to sys.stderr
        stream_handler = logging.StreamHandler(sys.stdout)
        stream_handler.setLevel(logging.INFO)
        stream_handler.setFormatter(formatter)
        logger.addHandler(stream_handler)

        # File handler (one file per analysis_type + run_id + model_id)
        log_file = log_path / f"log_{analysis_type}_{run_id}_{model_id}.log"
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

        # Show the log directory once, when the handlers are first attached
        print(log_path)

    return logger

# Build the notebook-wide logger from the run configuration defined above
logger = setup_logger(run_id=RUN_ID, model_id=MODEL_ID, role=ROLE)
logger.info("Initialized logger.")


2025-12-06T08:15:18 [evaluate_12_08_08_12_randomforest_eval] INFO: Initialized logger.


logs


## Find the root of the git repo on the host system

In [3]:
# Get the current working directory
cwd = pathlib.Path.cwd()

# Search the working directory first, then each of its parents, and keep the
# first directory that contains a `.git` entry. `.exists()` is used instead of
# `.is_dir()` because `.git` is a regular file (a "gitdir: ..." pointer) in
# linked worktrees and submodules, which `.is_dir()` would skip.
root_dir = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / ".git").exists()
    ),
    None,
)

# Check if a Git root directory was found
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")

## Define paths

### Input

In [4]:
# Data type the model evaluation is run on
data_type = "cleaned"

# Directory holding the trained model, its label encoder and the
# pre-evaluation results
data_path = pathlib.Path(f"{root_dir}/1.train_models/data_randomforest")

# Files for the "cleaned" data type carry a "_qc" suffix; any other data type
# uses no suffix
suffix = "_qc" if data_type == "cleaned" else ""

# Per-cell pre-evaluation results table
evaldf = pd.read_parquet(f"{data_path}/nf1_model_pre_evaluation_results{suffix}.parquet")

# Trained model and the label encoder stored alongside it
model = load(f"{data_path}/trained_nf1_model{suffix}.joblib")
le = load(f"{data_path}/trained_nf1_model_label_encoder{suffix}.joblib")

In [5]:
# Preview the first rows of the loaded pre-evaluation results
evaldf.head()

Unnamed: 0,probability_WT,datasplit,predicted_genotype,true_genotype,Metadata_Cells_Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Cells_Location_Center_X,Metadata_WellCol,Metadata_Site,...,Metadata_Cytoplasm_Parent_Nuclei,Metadata_Cytoplasm_Parent_Cells,Metadata_Nuclei_Number_Object_Number,Metadata_ImageNumber,Metadata_gene_name,Metadata_Nuclei_Location_Center_Y,Metadata_number_of_singlecells,Metadata_Nuclei_Location_Center_X,Metadata_genotype,Metadata_Cells_Number_Object_Number
0,0.61,val,1,0,788.367226,Plate_3,G10,866.549627,10,14,...,3,3,3,1352,NF1,815.455336,117,863.960187,Null,3
1,0.243333,val,0,0,514.961459,Plate_3,E9,886.483979,9,10,...,2,2,2,1042,NF1,492.275679,42,904.761211,Null,2
2,0.241667,val,0,0,666.626255,Plate_3,G11,677.158197,11,14,...,8,5,8,1377,NF1,690.372655,250,678.619842,Null,5
3,0.435,val,0,0,466.239187,Plate_3,E12,267.364716,12,22,...,14,8,14,912,NF1,463.921004,607,281.340784,Null,8
4,0.376667,val,0,0,614.35672,Plate_3,F11,618.250909,11,23,...,10,6,10,1098,NF1,613.041494,329,638.97666,Null,6


### Outputs

In [6]:
# Directory that receives the evaluation tables. The path is relative, so it
# resolves against the current working directory.
eval_path = pathlib.Path("model_evaluation_data_randomforest")
# Create it (and any missing parents); no error if it already exists
eval_path.mkdir(parents=True, exist_ok=True)


In [7]:
gene_column = "true_genotype"


def down_sample_by_genotype(_df: pd.DataFrame) -> pd.DataFrame:
    """
    Down-sample the data so every genotype has the same number of rows.

    Each genotype group is randomly reduced to the size of the smallest
    group. The sampling is seeded, so repeated calls on the same data
    return the same rows.

    Parameters
    ----------
    _df: Pandas Dataframe
        The data to be downsampled by the gene_column column.

    Returns
    -------
    Pandas Dataframe
        The data down-sampled by genotype, with all original columns
        (including gene_column) and the original index labels. An empty
        input is returned as an empty copy.
    """

    # Nothing to balance; the group minimum below would be NaN
    if _df.empty:
        return _df.copy()

    min_gene = _df[gene_column].value_counts().min()

    # Sample each group explicitly instead of using groupby(...).apply(...):
    # apply's handling of the grouping column is deprecated and newer pandas
    # drops that column from the result, which would break later lookups of
    # gene_column. Every group is sampled with the same seed.
    sampled_groups = [
        group.sample(n=min_gene, random_state=0)
        for _, group in _df.groupby(gene_column)
    ]

    # No groups at all (e.g. gene_column holds only missing values)
    if not sampled_groups:
        return _df.iloc[0:0].copy()

    return pd.concat(sampled_groups)

## Calculate evaluation metrics

In [8]:
# Containers for the evaluation results, one per output table. Each maps a
# column name to a list of values and is filled by compute_metrics().
# "metrics" holds the precision, recall, accuracy, and f1 scores.
eval_mets = {
    "metrics": defaultdict(list),
    "precision_recall": defaultdict(list),
    "confusion_matrix": defaultdict(list),
}

# Genotype labels for the four cells of a flattened 2x2 confusion matrix:
# the true label changes every two cells, the predicted label alternates.
cm_true_labels = [le.classes_[idx] for idx in (0, 0, 1, 1)]

cm_pred_labels = [le.classes_[idx] for idx in (0, 1, 0, 1)]

def compute_metrics(_df: pd.DataFrame, _plate: str, _split: str) -> None:
    """
    Compute evaluation metrics for one subset of data and store them.

    Results are appended to the module-level `eval_mets` dictionary:
    one row of scalar scores in "metrics", one row per curve point in
    "precision_recall", and four rows (one per confusion-matrix cell) in
    "confusion_matrix". Nothing is returned.

    Parameters
    ----------
    _df: Pandas Dataframe
        Model data to be evaluated. Must contain the gene_column column,
        "predicted_genotype" and "probability_WT".

    _plate: String
        Name of the plate for storing the metrics

    _split: String
        Name of the data split for storing the metric
    """

    y_true = _df[gene_column]
    y_pred = _df["predicted_genotype"]
    # NOTE(review): used as the score of the positive class in the PR curve;
    # assumes WT is the class encoded as 1 — confirm against the label encoder.
    y_proba = _df["probability_WT"]

    # Store metrics
    eval_mets["metrics"]["f1_score"].append(f1_score(y_true, y_pred))
    eval_mets["metrics"]["precision"].append(precision_score(y_true, y_pred))
    eval_mets["metrics"]["recall"].append(recall_score(y_true, y_pred))
    eval_mets["metrics"]["accuracy"].append(accuracy_score(y_true, y_pred))
    eval_mets["metrics"]["plate"].append(_plate)
    eval_mets["metrics"]["datasplit"].append(_split)

    # Store precision and recall data (the thresholds are not kept)
    precision, recall, _ = precision_recall_curve(y_true, y_proba)
    pr_size = precision.shape[0]
    eval_mets["precision_recall"]["precision"].extend(precision.tolist())
    eval_mets["precision_recall"]["recall"].extend(recall.tolist())
    eval_mets["precision_recall"]["plate"].extend([_plate] * pr_size)
    eval_mets["precision_recall"]["datasplit"].extend([_split] * pr_size)

    # Store confusion matrices.
    # The labels are pinned to the two encoded classes so the matrix is always
    # 2x2. Without them, a subset in which only one class occurs yields a 1x1
    # matrix, while four label entries are still appended below, leaving the
    # "confusion_matrix" columns with different lengths.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    cm = cm.flatten()
    cm_size = cm.shape[0]
    eval_mets["confusion_matrix"]["confusion_values"].extend(cm.tolist())
    eval_mets["confusion_matrix"]["true_genotype"].extend(cm_true_labels)
    eval_mets["confusion_matrix"]["predicted_genotype"].extend(cm_pred_labels)
    eval_mets["confusion_matrix"]["plate"].extend([_plate] * cm_size)
    eval_mets["confusion_matrix"]["datasplit"].extend([_split] * cm_size)

In [9]:
# The set of plates does not depend on the split, so look it up once
plates = evaldf["Metadata_Plate"].unique()

# Iterate through each data split
for split in evaldf["datasplit"].unique():
    in_split = evaldf["datasplit"] == split

    # Calculate metrics for all plates pooled together.
    # NOTE(review): unlike the per-plate data below, the pooled data is not
    # down-sampled by genotype — confirm this is intended.
    df_temp = evaldf.loc[in_split].copy()
    compute_metrics(df_temp, "all_plates", split)

    # Calculate metrics for each plate
    for plate in plates:
        df_temp = evaldf.loc[(evaldf["Metadata_Plate"] == plate) & in_split].copy()

        # A plate may have no cells in this split; there is nothing to
        # down-sample or score in that case, so skip it instead of passing
        # an empty table to the metric functions.
        if df_temp.empty:
            logger.warning(
                "No data for plate %s in split %s; skipping.", plate, split
            )
            continue

        df_temp = down_sample_by_genotype(df_temp)
        compute_metrics(df_temp, plate, split)

### Save evaluation metrics and model feature importances for plotting

In [10]:
# Write every evaluation table to its own parquet file,
# e.g. metrics_final_rf_qc_model.parquet, confusion_matrix_final_rf_qc_model.parquet, ...
# NOTE(review): "qc" is fixed in these file names and does not follow the
# data-type suffix chosen when loading the inputs — confirm this is intended.
for met, table in eval_mets.items():
    out_file = f"{eval_path}/{met}_final_rf_qc_model.parquet"
    pd.DataFrame(table).to_parquet(out_file)

# Random Forest feature importances, one row per input feature
importances_df = pd.DataFrame(
    {
        "feature_names": model.feature_names_in_,
        "feature_importances": model.feature_importances_.reshape(-1),
    }
)
importances_df.to_parquet(f"{eval_path}/feature_importances_rf_qc.parquet")
