In [1]:
# Identifier passed to setup_logger as `model_id`; it becomes part of both the
# logger name and the log file name.
MODEL_ID = "ensemble"   # or "comparison"
# Passed to setup_logger as `role`; it appears in the logger name only.
ROLE = "eval"

import logging
import pathlib
import sys
from datetime import datetime

import pandas as pd

# ============================================
# 1) Choose a RUN_ID
# ============================================
# RUN_ID ends up in the logger name and the log file name.
# - Keep RUN_ID_OVERRIDE set to attach this notebook to an existing run.
# - Set RUN_ID_OVERRIDE = None to start a fresh run stamped with the current
#   time, formatted as month_day_hour_minute.
RUN_ID_OVERRIDE = "12_08_08_12"

RUN_ID = RUN_ID_OVERRIDE or datetime.now().strftime("%m_%d_%H_%M")

ANALYSIS_TYPE = "evaluate"


def setup_logger(
    run_id: str,
    model_id: str,
    role: str,
    log_dir: str = "logs",
    analysis_type: str = ANALYSIS_TYPE,
) -> logging.Logger:
    """
    Create (or fetch) a logger that writes to both stdout and a log file.

    - Logger name:  "<analysis_type>_<run_id>_<model_id>_<role>"
    - Log file:     "log_<analysis_type>_<run_id>_<model_id>.log" in `log_dir`
      (shared by all notebooks for the same model & run & analysis_type;
      note that `role` is not part of the file name).

    Args:
        run_id: Run identifier embedded in the logger name and file name.
        model_id: Model identifier embedded in the logger name and file name.
        role: Label embedded in the logger name only.
        log_dir: Directory for the log file; created (including missing
            parent directories) if it does not exist.
        analysis_type: Prefix for the logger name and file name.

    Returns:
        The configured `logging.Logger`. Calling this again with the same
        arguments returns the same logger without adding handlers a second
        time.
    """
    log_path = pathlib.Path(log_dir)
    # parents=True so a nested log_dir such as "out/logs" can be created.
    log_path.mkdir(parents=True, exist_ok=True)

    logger_name = f"{analysis_type}_{run_id}_{model_id}_{role}"
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)
    # Do not hand records on to the handlers of ancestor loggers.
    logger.propagate = False

    # Avoid adding handlers multiple times if the cell is re-run
    if logger.handlers:
        return logger

    # Common formatter for both handlers
    formatter = logging.Formatter(
        fmt="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
        datefmt="%Y-%m-%dT%H:%M:%S",
    )

    # Stream handler (stdout). The stream is passed explicitly because
    # logging.StreamHandler() without arguments writes to sys.stderr.
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setLevel(logging.INFO)
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    # File handler (one file per analysis_type + run_id + model_id)
    log_file = log_path / f"log_{analysis_type}_{run_id}_{model_id}.log"
    file_handler = logging.FileHandler(log_file)
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    return logger

# Notebook-wide logger, keyed by the run, model and role configured above.
logger = setup_logger(run_id=RUN_ID, model_id=MODEL_ID, role=ROLE)
logger.info("Initialized logger.")


2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: Initialized logger.


logs


In [2]:
# Cell 2: load metrics parquet files for LR, XGBoost, Random Forest

# One evaluation directory per model, relative to this notebook (2.evaluation),
# each followed by the metrics file read from it.
# If needed, change filenames here to match your RF eval notebook.
lr_eval_path = pathlib.Path("model_evaluation_data")
lr_metrics_file = lr_eval_path.joinpath("metrics_final_qc_model.parquet")

xgb_eval_path = pathlib.Path("model_evaluation_data_xgboost")
xgb_metrics_file = xgb_eval_path.joinpath("metrics_final_xgb_qc_model.parquet")

rf_eval_path = pathlib.Path("model_evaluation_data_randomforest")
rf_metrics_file = rf_eval_path.joinpath("metrics_final_rf_qc_model.parquet")

logger.info("Loading metrics parquet files for LR, XGBoost, and Random Forest...")

# Read the three files in LR, XGB, RF order.
metrics_lr, metrics_xgb, metrics_rf = map(
    pd.read_parquet,
    (lr_metrics_file, xgb_metrics_file, rf_metrics_file),
)

# Log where each table came from and how large it is ...
logger.info(f"LR metrics shape: {metrics_lr.shape} from {lr_metrics_file}")
logger.info(f"XGB metrics shape: {metrics_xgb.shape} from {xgb_metrics_file}")
logger.info(f"RF metrics shape: {metrics_rf.shape} from {rf_metrics_file}")

# ... and which datasplit labels each one contains.
logger.info(f"LR datasplits: {sorted(metrics_lr['datasplit'].unique().tolist())}")
logger.info(f"XGB datasplits: {sorted(metrics_xgb['datasplit'].unique().tolist())}")
logger.info(f"RF datasplits: {sorted(metrics_rf['datasplit'].unique().tolist())}")


2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: Loading metrics parquet files for LR, XGBoost, and Random Forest...
2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: LR metrics shape: (24, 6) from model_evaluation_data/metrics_final_qc_model.parquet
2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: XGB metrics shape: (24, 6) from model_evaluation_data_xgboost/metrics_final_xgb_qc_model.parquet
2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: RF metrics shape: (24, 6) from model_evaluation_data_randomforest/metrics_final_rf_qc_model.parquet
2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: LR datasplits: ['shuffled_test', 'shuffled_train', 'shuffled_val', 'test', 'train', 'val']
2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: XGB datasplits: ['shuffled_test', 'shuffled_train', 'shuffled_val', 'test', 'train', 'val']
2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: RF datasplits: ['shuffled_test', 'shuffled_train', 'shuffled_val', 'test', 'train', 'val']

In [3]:
# Cell 3: helper functions for slicing and summarizing metrics

def get_test_allplates(df, model_name):
    """Return the 'test' / 'all_plates' rows of `df`, tagged with `model_name`.

    The input frame is not modified. The result keeps the original row index
    and has the columns: model, accuracy, f1_score, precision, recall.
    """
    wanted = (df["datasplit"] == "test") & (df["plate"] == "all_plates")
    tagged = df.loc[wanted].assign(model=model_name)
    return tagged.loc[:, ["model", "accuracy", "f1_score", "precision", "recall"]]

def get_test_by_plate(df, model_name, include_all_plates=True):
    """Non-shuffled test rows, one per plate value, tagged with `model_name`.

    Rows are selected by exact match on datasplit == 'test', which already
    excludes every 'shuffled_*' split. Rows whose datasplit is missing are
    simply not selected.

    Args:
        df: Metrics frame with the columns datasplit, plate, accuracy,
            f1_score, precision and recall. It is not modified.
        model_name: Value written to the added 'model' column.
        include_all_plates: When True (the default, matching the previous
            behavior) the aggregate 'all_plates' row is returned together
            with the individual plates. Pass False for individual plates only.

    Returns:
        A new DataFrame with the columns model, plate, accuracy, f1_score,
        precision, recall, keeping the original row index.
    """
    keep = df["datasplit"] == "test"
    if not include_all_plates:
        keep = keep & (df["plate"] != "all_plates")
    df_plate = df.loc[keep].copy()
    df_plate["model"] = model_name
    return df_plate[["model", "plate", "accuracy", "f1_score", "precision", "recall"]]

def summarize_split(df, split_name, model_name):
    """Return the 'all_plates' rows of one datasplit (e.g. train/val/test).

    The selected rows are tagged with `model_name`, and their 'datasplit'
    column is set to `split_name`. The input frame is not modified.
    """
    in_split = df["datasplit"] == split_name
    is_aggregate = df["plate"] == "all_plates"
    picked = df.loc[in_split & is_aggregate].assign(
        model=model_name,
        datasplit=split_name,
    )
    columns = ["model", "datasplit", "accuracy", "f1_score", "precision", "recall"]
    return picked[columns]

def summarize_shuffled(df, model_name):
    """Collect every row whose datasplit contains 'shuffled' (null baseline).

    Rows for all plates are kept and tagged with `model_name`; the input
    frame is not modified.
    """
    is_shuffled = df["datasplit"].str.contains("shuffled")
    null_rows = df.loc[is_shuffled].assign(model=model_name)
    return null_rows.loc[:, ["model", "datasplit", "plate", "accuracy", "f1_score"]]


In [4]:
# Cell 4: test-set performance comparison

logger.info("Computing test-set comparison (all plates and per plate)...")

# Aggregate ('all_plates') test rows, one frame per model.
test_lr = get_test_allplates(metrics_lr, "logistic_regression")
test_xgb = get_test_allplates(metrics_xgb, "xgboost")
test_rf = get_test_allplates(metrics_rf, "random_forest")

test_comparison = pd.concat((test_lr, test_xgb, test_rf), ignore_index=True)

logger.info("=== Test-set performance: all plates ===")
display(test_comparison)

# Mirror the table into the log, one line per model.
logger.info("Test-set (all plates) comparison:")
for row in test_comparison.to_dict("records"):
    logger.info(
        f"{row['model']}: accuracy={row['accuracy']:.4f}, "
        f"f1={row['f1_score']:.4f}, precision={row['precision']:.4f}, "
        f"recall={row['recall']:.4f}"
    )

# Per-plate test comparison
plate_lr = get_test_by_plate(metrics_lr, "logistic_regression")
plate_xgb = get_test_by_plate(metrics_xgb, "xgboost")
plate_rf = get_test_by_plate(metrics_rf, "random_forest")

plate_comparison = pd.concat((plate_lr, plate_xgb, plate_rf), ignore_index=True)

logger.info("=== Test-set performance: per plate ===")
display(plate_comparison.sort_values(["plate", "model"]))

logger.info("Computed per-plate test performance for all models.")


2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: Computing test-set comparison (all plates and per plate)...
2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: === Test-set performance: all plates ===


Unnamed: 0,model,accuracy,f1_score,precision,recall
0,logistic_regression,0.788747,0.789863,0.785714,0.794055
1,xgboost,0.815817,0.81708,0.811518,0.822718
2,random_forest,0.764862,0.769151,0.755374,0.783439


2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: Test-set (all plates) comparison:
2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: logistic_regression: accuracy=0.7887, f1=0.7899, precision=0.7857, recall=0.7941
2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: xgboost: accuracy=0.8158, f1=0.8171, precision=0.8115, recall=0.8227
2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: random_forest: accuracy=0.7649, f1=0.7692, precision=0.7554, recall=0.7834
2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: === Test-set performance: per plate ===


Unnamed: 0,model,plate,accuracy,f1_score,precision,recall
1,logistic_regression,Plate_3,0.844622,0.84585,0.839216,0.85259
9,random_forest,Plate_3,0.839641,0.842002,0.829787,0.854582
5,xgboost,Plate_3,0.891434,0.890672,0.89697,0.884462
2,logistic_regression,Plate_3_prime,0.813364,0.817978,0.798246,0.83871
10,random_forest,Plate_3_prime,0.764977,0.777293,0.738589,0.820276
6,xgboost,Plate_3_prime,0.836406,0.851153,0.780769,0.935484
3,logistic_regression,Plate_5,0.639013,0.631579,0.64486,0.618834
11,random_forest,Plate_5,0.596413,0.59276,0.598174,0.587444
7,xgboost,Plate_5,0.625561,0.605201,0.64,0.573991
0,logistic_regression,all_plates,0.788747,0.789863,0.785714,0.794055


2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: Computed per-plate test performance for all models.


In [5]:
# Cell 5: train / val / test comparison to check overfitting

logger.info("Summarizing train/val/test performance for all models...")

splits = ["train", "val", "test"]

# One 'all_plates' row per split for each model.
summary_lr = pd.concat(
    [summarize_split(metrics_lr, s, "logistic_regression") for s in splits],
    ignore_index=True,
)

summary_xgb = pd.concat(
    [summarize_split(metrics_xgb, s, "xgboost") for s in splits],
    ignore_index=True,
)

summary_rf = pd.concat(
    [summarize_split(metrics_rf, s, "random_forest") for s in splits],
    ignore_index=True,
)

summary_all = pd.concat([summary_lr, summary_xgb, summary_rf], ignore_index=True)

print("=== Train / Val / Test comparison (all plates) ===")
display(summary_all.sort_values(["model", "datasplit"]))

# Train-minus-test accuracy per model; a large positive gap points at overfitting.
for model_name in ["logistic_regression", "xgboost", "random_forest"]:
    df_m = summary_all.query("model == @model_name")
    train_rows = df_m.loc[df_m["datasplit"] == "train", "accuracy"]
    test_rows = df_m.loc[df_m["datasplit"] == "test", "accuracy"]
    # A metrics file without a train or test 'all_plates' row would make
    # .iloc[0] raise IndexError; report it and move on to the next model.
    if train_rows.empty or test_rows.empty:
        logger.warning(
            f"{model_name}: missing train or test row for all plates; "
            "skipping train-test gap."
        )
        continue
    train_acc = train_rows.iloc[0]
    test_acc = test_rows.iloc[0]
    gap = train_acc - test_acc
    logger.info(
        f"{model_name}: train_acc={train_acc:.4f}, "
        f"test_acc={test_acc:.4f}, train-test gap={gap:.4f}"
    )


2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: Summarizing train/val/test performance for all models...


=== Train / Val / Test comparison (all plates) ===


Unnamed: 0,model,datasplit,accuracy,f1_score,precision,recall
2,logistic_regression,test,0.788747,0.789863,0.785714,0.794055
0,logistic_regression,train,0.847305,0.847625,0.845851,0.849407
1,logistic_regression,val,0.81304,0.813808,0.81048,0.817164
8,random_forest,test,0.764862,0.769151,0.755374,0.783439
6,random_forest,train,1.0,1.0,1.0,1.0
7,random_forest,val,0.763971,0.763063,0.76601,0.760138
5,xgboost,test,0.815817,0.81708,0.811518,0.822718
3,xgboost,train,1.0,1.0,1.0,1.0
4,xgboost,val,0.85429,0.855159,0.850092,0.860287


2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: logistic_regression: train_acc=0.8473, test_acc=0.7887, train-test gap=0.0586
2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: xgboost: train_acc=1.0000, test_acc=0.8158, train-test gap=0.1842
2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: random_forest: train_acc=1.0000, test_acc=0.7649, train-test gap=0.2351


In [6]:
# Cell 6: shuffled performance (null baseline)

logger.info("Summarizing shuffled (null) performance for all models...")

# Shuffled-split rows for each model.
shuf_lr = summarize_shuffled(metrics_lr, "logistic_regression")
shuf_xgb = summarize_shuffled(metrics_xgb, "xgboost")
shuf_rf = summarize_shuffled(metrics_rf, "random_forest")

shuffled_summary = pd.concat((shuf_lr, shuf_xgb, shuf_rf), ignore_index=True)

print("=== Shuffled (null) performance ===")
display(shuffled_summary.sort_values(["model", "datasplit", "plate"]))

# Average the shuffled metrics per model across all shuffled rows.
for model_name in ("logistic_regression", "xgboost", "random_forest"):
    df_m = shuffled_summary[shuffled_summary["model"] == model_name]
    if len(df_m) == 0:
        logger.info(f"{model_name}: no shuffled rows found.")
    else:
        avg_acc = df_m["accuracy"].mean()
        avg_f1 = df_m["f1_score"].mean()
        logger.info(
            f"{model_name}: mean shuffled accuracy={avg_acc:.4f}, "
            f"mean shuffled f1={avg_f1:.4f} over {len(df_m)} rows"
        )


2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: Summarizing shuffled (null) performance for all models...


=== Shuffled (null) performance ===


Unnamed: 0,model,datasplit,plate,accuracy,f1_score
9,logistic_regression,shuffled_test,Plate_3,0.464143,0.469428
10,logistic_regression,shuffled_test,Plate_3_prime,0.481567,0.444444
11,logistic_regression,shuffled_test,Plate_5,0.517937,0.514673
8,logistic_regression,shuffled_test,all_plates,0.480892,0.474758
5,logistic_regression,shuffled_train,Plate_3,0.496141,0.495776
6,logistic_regression,shuffled_train,Plate_3_prime,0.509132,0.507508
7,logistic_regression,shuffled_train,Plate_5,0.509893,0.502831
4,logistic_regression,shuffled_train,all_plates,0.502658,0.500342
1,logistic_regression,shuffled_val,Plate_3,0.500511,0.500287
2,logistic_regression,shuffled_val,Plate_3_prime,0.499467,0.498488


2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: logistic_regression: mean shuffled accuracy=0.4970, mean shuffled f1=0.4924 over 12 rows
2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: xgboost: mean shuffled accuracy=0.5050, mean shuffled f1=0.5209 over 12 rows
2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: random_forest: mean shuffled accuracy=0.4957, mean shuffled f1=0.5109 over 12 rows


In [7]:
# Cell 7: optional persistence of comparison tables

out_dir = pathlib.Path("model_evaluation_data_ensemble")
out_dir.mkdir(exist_ok=True)

logger.info(f"Saving ensemble comparison tables to {out_dir}...")

# Output file name -> comparison table built in the cells above.
tables_to_save = {
    "ensemble_test_allplates.parquet": test_comparison,
    "ensemble_test_per_plate.parquet": plate_comparison,
    "ensemble_train_val_test.parquet": summary_all,
    "ensemble_shuffled_summary.parquet": shuffled_summary,
}
for file_name, table in tables_to_save.items():
    table.to_parquet(out_dir / file_name, index=False)

logger.info("Finished saving ensemble comparison tables.")


2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: Saving ensemble comparison tables to model_evaluation_data_ensemble...


2025-12-06T08:14:46 [evaluate_12_08_08_12_ensemble_eval] INFO: Finished saving ensemble comparison tables.
