In [None]:
%load_ext autoreload
%autoreload 2


In [1]:
import anndata

# Load the benchmark data as an AnnData object. Later cells read its `.obs`
# columns as prediction targets, `.X` as the raw feature matrix and `.obsm`
# entries as precomputed embeddings.
mlrepo = anndata.read_h5ad("../data/mlrepo5.h5ad")

In [8]:
import numpy as np
import pandas as pd
import torch
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import os
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

# Configuration
# TASK selects the benchmark mode; it drives the model class, the dataset list
# and the metric list below. Edit this one line to switch modes.
# TASK = "classification"  # or "regression"
TASK = "regression"
if TASK not in ("classification", "regression"):
    # Fail fast: the settings below compare TASK against different literals, so
    # a typo would otherwise silently mix classification and regression settings.
    raise ValueError(f"TASK must be 'classification' or 'regression', got {TASK!r}")
MODEL = RandomForestRegressor if TASK == "regression" else RandomForestClassifier
FOLDS = 5  # number of cross-validation splits

# Dataset and embedding configuration
# Every column of mlrepo.obs except "dataset" is treated as a prediction target.
DATASETS = mlrepo.obs.columns.drop("dataset")
REGRESSION_DATASETS = [
    "ravel_nugent-score",
    "ravel_ph",
    "gevers_pcdai-ileum",
    "gevers_pcdai-rectum",
    "yatsunenko_baby-age",
]
# Any target not listed as a regression dataset is handled as classification.
CLASSIFICATION_DATASETS = [d for d in DATASETS if d not in REGRESSION_DATASETS]
DATASETS_FINAL = REGRESSION_DATASETS if TASK == "regression" else CLASSIFICATION_DATASETS

# Feature sets to benchmark. "raw" is resolved to `.X` by get_embedding();
# every other name is used as a key into `.obsm`.
EMBEDDINGS = [
    "raw",
    "H2",
    "H4",
    "H8",
    "H16",
    "H32",
    "H64",
    "H128",
    "E2",
    "E4",
    "E8",
    "E16",
    "E32",
    "E64",
    "E128",
    "dnabert-s",
]
# Metric names understood by calculate_score() for the selected mode.
METRICS = ["accuracy", "f1", "auc"] if TASK == "classification" else ["r2", "mae", "rmse"]


def get_embedding(mlrepo_filtered, embedding_name):
    """Return the feature matrix for one feature set.

    ``"raw"`` selects ``mlrepo_filtered.X``; any other name is used as a key
    into ``mlrepo_filtered.obsm``.
    """
    if embedding_name != "raw":
        return mlrepo_filtered.obsm[embedding_name]
    return mlrepo_filtered.X


def calculate_score(y_true, y_pred, metric):
    """Calculate the specified evaluation metric.

    Parameters
    ----------
    y_true : array-like
        Ground-truth targets.
    y_pred : array-like
        Model predictions, passed unchanged to the metric function.
    metric : str
        One of "accuracy", "f1", "auc", "r2", "mae" or "rmse".

    Returns
    -------
    float
        The value returned by the corresponding scikit-learn metric
        ("rmse" is the square root of ``mean_squared_error``).

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names. Raising here keeps a
        typo from being silently recorded as a missing score.
    """
    # NOTE(review): f1_score and roc_auc_score are called with their default
    # arguments - confirm those defaults suit multi-class targets.
    if metric == "accuracy":
        return accuracy_score(y_true, y_pred)
    if metric == "f1":
        return f1_score(y_true, y_pred)
    if metric == "auc":
        return roc_auc_score(y_true, y_pred)
    if metric == "r2":
        return r2_score(y_true, y_pred)
    if metric == "mae":
        return mean_absolute_error(y_true, y_pred)
    if metric == "rmse":
        return np.sqrt(mean_squared_error(y_true, y_pred))
    raise ValueError(
        f"Unknown metric {metric!r}; expected one of "
        "'accuracy', 'f1', 'auc', 'r2', 'mae', 'rmse'"
    )


# One seed for the CV splitter and the forests, so a rerun reproduces the scores.
SEED = 42

# Result rows are collected in a plain list and converted to a DataFrame once
# at the end, which avoids enlarging a DataFrame one row at a time.
score_rows = []

# Calculate total iterations and initialize progress bar
total_iterations = len(DATASETS_FINAL) * len(EMBEDDINGS) * FOLDS * len(METRICS)
my_tqdm = tqdm(total=total_iterations)
my_tqdm.set_description(f"Completed: 0/{total_iterations}")

try:
    for task in DATASETS_FINAL:
        # Keep only samples that have a label for this task, then drop features
        # that are not > 0 in any of the remaining samples
        mlrepo_filtered = mlrepo[mlrepo.obs[task].notna()]
        mlrepo_filtered = mlrepo_filtered[:, (mlrepo_filtered.X > 0).sum(axis=0) > 0]

        # Get target values and set up cross-validation
        y = np.array(mlrepo_filtered.obs[task].values)
        if TASK == "classification":
            # Encode class labels as numbers; stratify folds on the encoded labels
            y = OrdinalEncoder().fit_transform(y.reshape(-1, 1)).flatten()
            kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
        else:
            y = y.flatten()
            kf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

        # Create folds once so every embedding is evaluated on identical splits
        folds = list(kf.split(np.zeros(len(y)), y))

        for embedding in EMBEDDINGS:
            # Get embedding data
            X = get_embedding(mlrepo_filtered, embedding)

            for fold_idx, (train_index, test_index) in enumerate(folds):
                # Slice this fold's train/test rows
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                # Train model (seeded so the forest itself is reproducible)
                model = MODEL(random_state=SEED)
                model.fit(X_train, y_train)
                # NOTE(review): these are predict() outputs; for the "auc" metric
                # confirm whether scores/probabilities should be passed instead.
                y_pred = model.predict(X_test)

                # Calculate and store scores for each metric
                for metric in METRICS:
                    score = calculate_score(y_test, y_pred, metric)
                    score_rows.append([task, embedding, fold_idx, metric, score])

                    # Update progress bar and keep the "Completed" counter current
                    my_tqdm.update(1)
                    my_tqdm.set_description(f"Completed: {my_tqdm.n}/{total_iterations}", refresh=False)
                    my_tqdm.set_postfix(task=task, embedding=embedding, fold=fold_idx, metric=metric, score=score)
finally:
    # Always release the progress bar and keep whatever was computed, even if
    # the run is interrupted or a fit fails part-way through.
    my_tqdm.close()
    scores = pd.DataFrame(score_rows, columns=["task", "embedding", "fold", "metric", "score"])

scores

  0%|          | 0/1200 [00:00<?, ?it/s]

Unnamed: 0,task,embedding,fold,metric,score
0,ravel_nugent-score,raw,0,r2,0.723556
1,ravel_nugent-score,raw,0,mae,1.155000
2,ravel_nugent-score,raw,0,rmse,1.750783
3,ravel_nugent-score,raw,1,r2,0.726074
4,ravel_nugent-score,raw,1,mae,1.216667
...,...,...,...,...,...
1195,yatsunenko_baby-age,dnabert-s,3,mae,0.183120
1196,yatsunenko_baby-age,dnabert-s,3,rmse,0.226213
1197,yatsunenko_baby-age,dnabert-s,4,r2,0.596286
1198,yatsunenko_baby-age,dnabert-s,4,mae,0.210222


In [9]:
# Persist the long-format scores. The output folder is created first so that a
# missing directory cannot fail the save after the benchmark has already run.
results_dir = "../results"
os.makedirs(results_dir, exist_ok=True)
scores.to_csv(f"{results_dir}/benchmark_scores_sklearn_rf_None_{TASK}.csv", index=False)

In [11]:
# Save as LaTeX table
# Headline metric for the table: AUC for classification, RMSE otherwise.
METRIC = "auc" if TASK == "classification" else "rmse"

# Despite the "_auc" in their names, these hold whichever METRIC is selected.
# Mean over folds per (embedding, task), pivoted to one row per embedding
# (in EMBEDDINGS order) and one column per task.
scores_auc = scores[scores["metric"] == METRIC]
scores_auc_mean = scores_auc.groupby(["embedding", "task"])["score"].mean()
scores_auc_mean_pivot = scores_auc_mean.unstack()
scores_auc_mean_pivot = scores_auc_mean_pivot.reindex(EMBEDDINGS)

# Create the output folder if needed so the export cannot fail on a missing
# directory; the table has one row per task and four selected feature sets.
figures_dir = "../figures"
os.makedirs(figures_dir, exist_ok=True)
scores_auc_mean_pivot.T[["raw", "H128", "E128", "dnabert-s"]].to_latex(
    buf=f"{figures_dir}/benchmark_scores_sklearn_rf_None_{TASK}.tex", index=True, float_format="%.3f"
)