In [1]:
%load_ext autoreload
%autoreload 2


In [2]:
import anndata

mlrepo = anndata.read_h5ad("../data/mlrepo6.h5ad")

In [59]:
import numpy as np
import pandas as pd
import torch
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import os
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

# Configuration
# TASK = "classification"  # or "regression"
TASK = "regression"
MODEL = RandomForestRegressor if TASK == "regression" else RandomForestClassifier
FOLDS = 5

# Dataset and embedding configuration
DATASETS = mlrepo.obs.columns.drop("dataset")
REGRESSION_DATASETS = [
    "ravel_nugent-score",
    "ravel_ph",
    "gevers_pcdai-ileum",
    "gevers_pcdai-rectum",
    "yatsunenko_baby-age",
]
CLASSIFICATION_DATASETS = [d for d in DATASETS if d not in REGRESSION_DATASETS]
DATASETS_FINAL = REGRESSION_DATASETS if TASK == "regression" else CLASSIFICATION_DATASETS

EMBEDDINGS = [
    "raw",
    # "H2",
    # "H4",
    # "H8",
    # "H16",
    # "H32",
    # "H64",
    "H128",
    # "E2",
    # "E4",
    # "E8",
    # "E16",
    # "E32",
    # "E64",
    "E128",
    "PCA128",
    "dnabert-s",
    "random",
    "random_mix",
]
METRICS = ["accuracy", "f1", "auc"] if TASK == "classification" else ["r2", "mae", "rmse"]


def get_embedding(mlrepo_filtered, embedding_name):
    """Get embedding data and product manifold"""
    if embedding_name == "raw":
        X = mlrepo_filtered.X
    elif embedding_name == "random":
        X = np.random.randn(mlrepo_filtered.X.shape[0], 128)
    elif embedding_name == "random_mix":
        X = mlrepo_filtered.X @ np.random.randn(mlrepo_filtered.n_vars, 128)
    else:
        X = mlrepo_filtered.obsm[embedding_name]

    return X


def calculate_score(y_true, y_pred, metric):
    """Calculate the specified evaluation metric"""
    if metric in ["accuracy", "f1"]:
        y_pred = np.argmax(y_pred, axis=1)
    elif metric == "auc":
        y_pred = y_pred[:, 1]

    if metric == "accuracy":
        return accuracy_score(y_true, y_pred)
    elif metric == "f1":
        return f1_score(y_true, y_pred)
    elif metric == "auc":
        return roc_auc_score(y_true, y_pred)
    elif metric == "r2":
        return r2_score(y_true, y_pred)
    elif metric == "mae":
        return mean_absolute_error(y_true, y_pred)
    elif metric == "rmse":
        return np.sqrt(mean_squared_error(y_true, y_pred))
    # elif metric == "percent_rmse":
    # return (np.sqrt(mean_squared_error(y_true, y_pred, multioutput="raw_values")) / y_true).mean()


scores = pd.DataFrame(columns=["task", "embedding", "fold", "metric", "score"])

# Calculate total iterations and initialize progress bar
total_iterations = len(DATASETS_FINAL) * len(EMBEDDINGS) * FOLDS * len(METRICS)
my_tqdm = tqdm(total=total_iterations)

for task in DATASETS_FINAL:
    # Filter mlrepo; drop empty columns
    mlrepo_filtered = mlrepo[mlrepo.obs[task].notna()]
    mlrepo_filtered = mlrepo_filtered[:, (mlrepo_filtered.X > 0).sum(axis=0) > 0]

    # Get target values
    y = np.array(mlrepo_filtered.obs[task].values)
    if TASK == "classification":
        y = OrdinalEncoder().fit_transform(y.reshape(-1, 1)).flatten()
    else:
        y = y.flatten()

    # Set up cross-validation
    if TASK == "classification":
        kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
    else:
        kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

    # Create folds once to ensure consistency
    folds = list(kf.split(np.zeros(len(y)), y))

    for embedding in EMBEDDINGS:
        # Get embedding data
        X = get_embedding(mlrepo_filtered, embedding)

        for fold_idx, (train_index, test_index) in enumerate(folds):
            # Convert data to tensors
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Train model
            model = MODEL()
            model.fit(X_train, y_train)
            if TASK == "classification":
                y_out = model.predict_proba(X_test)  # Need this for ROC-AUC calculations
            else:
                y_out = model.predict(X_test)

            # Calculate and store scores for each metric
            for metric in METRICS:
                score = calculate_score(y_test, y_out, metric)
                scores.loc[len(scores)] = [task, embedding, fold_idx, metric, score]

                # Update progress bar
                my_tqdm.update(1)
                my_tqdm.set_postfix(task=task, embedding=embedding, fold=fold_idx, metric=metric, score=score)

        # Save checkpoint after each embedding
        task_scores = scores[scores["task"] == task]

my_tqdm.close()

scores

  0%|          | 0/525 [00:00<?, ?it/s]

Unnamed: 0,task,embedding,fold,metric,score
0,ravel_nugent-score,raw,0,r2,0.737442
1,ravel_nugent-score,raw,0,mae,1.119231
2,ravel_nugent-score,raw,0,rmse,1.706244
3,ravel_nugent-score,raw,1,r2,0.725823
4,ravel_nugent-score,raw,1,mae,1.212308
...,...,...,...,...,...
520,yatsunenko_baby-age,random_mix,3,mae,0.308550
521,yatsunenko_baby-age,random_mix,3,rmse,0.341535
522,yatsunenko_baby-age,random_mix,4,r2,0.340589
523,yatsunenko_baby-age,random_mix,4,mae,0.287767


In [61]:
scores.to_csv(f"../results/benchmark_scores_sklearn_rf_None_{TASK}.csv", index=False)

In [62]:
# Save as LaTeX table
METRIC = "auc" if TASK == "classification" else "rmse"

scores_auc = scores[scores["metric"] == METRIC]
scores_auc_mean = scores_auc.groupby(["embedding", "task"])["score"].mean()
scores_auc_mean_pivot = scores_auc_mean.unstack()
scores_auc_mean_pivot = scores_auc_mean_pivot.reindex(EMBEDDINGS)

# Color each row based on the max score (if classification) or min score (if regression)
if TASK == "classification":
    scores_auc_mean_pivot_fancy = scores_auc_mean_pivot.T.style.apply(
        lambda x: ["background-color: darkgreen" if x.max() == x.values[i] else "" for i in range(len(x))],
        axis=1,
    ).apply(
        lambda x: [
            "background-color: green" if x.iloc[i] == x.nlargest(2).iloc[-1] and x.iloc[i] != x.max() else ""
            for i in range(len(x))
        ],
        axis=1,
    )
else:
    scores_auc_mean_pivot_fancy = scores_auc_mean_pivot.T.style.apply(
        lambda x: ["background-color: darkgreen" if x.min() == x.values[i] else "" for i in range(len(x))],
        axis=1,
    )

scores_auc_mean_pivot_fancy

embedding,raw,H128,E128,PCA128,dnabert-s,random,random_mix
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
gevers_pcdai-ileum,16.259076,15.64857,16.447425,15.511502,15.88075,16.518823,15.73162
gevers_pcdai-rectum,17.474227,17.346752,18.047597,15.803861,17.338155,13.999359,16.565879
ravel_nugent-score,1.700946,1.780565,1.799264,1.76501,1.83781,3.608426,1.833295
ravel_ph,0.51608,0.500351,0.483967,0.496704,0.486776,0.685133,0.512432
yatsunenko_baby-age,0.250692,0.310933,0.317515,0.24336,0.275145,0.430145,0.346892


In [63]:
def create_nice_latex_table(input_data, output_file=None, include_raw=True, find_maximum=True):
    # Parse the input data if it's a string
    if isinstance(input_data, str):
        # Extract data from the LaTeX table string
        lines = input_data.strip().split("\n")

        # Find header line and data lines
        header_line = None
        data_lines = []
        for i, line in enumerate(lines):
            if "embedding & raw" in line:
                header_line = i
            if header_line is not None and i > header_line + 2 and "\\bottomrule" not in line:
                data_lines.append(line)

        # Extract column names
        header = lines[header_line].strip().replace("\\\\", "")
        columns = [col.strip() for col in header.split("&")]

        # Extract data
        data = []
        for line in data_lines:
            if "\\bottomrule" in line or "\\midrule" in line:
                continue
            row_data = line.strip().replace("\\\\", "").split("&")
            row_data = [val.strip() for val in row_data]
            data.append(row_data)

        # Create DataFrame
        df = pd.DataFrame(data, columns=columns)
    else:
        # Assume input is already a DataFrame
        df = input_data.copy()

        # If the index contains the task names, reset the index to make it a column
        if df.index.name == "task" or (isinstance(df.index, pd.MultiIndex) and "task" in df.index.names):
            df = df.reset_index()

    # Check if we have a 'task' column, if not try to see if it's the first column
    if "task" not in df.columns and len(df.columns) > 0:
        # Assume first column is the task column
        df = df.rename(columns={df.columns[0]: "task"})

    # Convert numeric columns to float
    numeric_cols = ["raw", "random", "random_mix", "PCA128", "H128", "E128", "dnabert-s"]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = df[col].astype(float)

    # Extract dataset name and task from the task column
    df["dataset"] = df["task"].apply(lambda x: str(x).split("_")[0] if "_" in str(x) else "")
    df["task_name"] = df["task"].apply(lambda x: str(x).split("_")[1] if "_" in str(x) else str(x))

    # Count samples per task (dummy values for now, would be replaced with actual counts)
    # In a real scenario, you might have this data available
    sample_counts = {
        # Classification datasets
        "cho_control-ct-cecal": 17,
        "cho_control-ct-fecal": 18,
        "cho_penicillin-vancomycin-cecal": 20,
        "cho_penicillin-vancomycin-fecal": 19,
        "gevers_ileum": 140,
        "gevers_rectum": 160,
        "hmp_gastro-oral": 2070,
        "hmp_sex": 180,
        "hmp_stool-tongue-paired": 404,
        "hmp_sub-supragingivalplaque-paired": 408,
        "ravel_black-hispanic": 199,
        "ravel_nugent-category": 342,
        "ravel_white-black": 200,
        "sokol_healthy-cd": 74,
        "sokol_healthy-uc": 59,
        "turnbaugh_obese-lean-all": 142,
        "yatsunenko_malawi-venezuela": 54,
        "yatsunenko_sex": 129,
        "yatsunenko_usa-malawi": 150,
        # Regression datasets
        "gevers_pcdai-ileum": 67,
        "gevers_pcdai-rectum": 51,
        "ravel_nugent-score": 388,
        "ravel_ph": 388,
        "yatsunenko_baby-age": 49,
    }

    df["samples"] = df["task"].map(sample_counts)

    # Determine which columns to consider for scoring
    scoring_cols = numeric_cols if include_raw else [col for col in numeric_cols if col != "raw"]

    # Count top scores for each method (including ties)
    top_scores = {col: 0 for col in numeric_cols}

    # For each row, find methods that have the max score (could be multiple in case of ties)
    for idx, row in df.iterrows():
        row_scores = {col: row[col] for col in scoring_cols if col in row}
        if row_scores:
            best_score = max(row_scores.values()) if find_maximum else min(row_scores.values())
            for col in numeric_cols:
                if (
                    col in row_scores and abs(row_scores[col] - best_score) < 1e-6
                ):  # Using small epsilon for float comparison
                    top_scores[col] += 1

    # Calculate averages for each method
    averages = {col: df[col].mean() if col in df.columns else 0 for col in numeric_cols}

    # For averages, determine best based on find_maximum setting
    scoring_averages = {col: averages[col] for col in scoring_cols}
    best_avg = (
        max(scoring_averages.values()) if find_maximum else min(scoring_averages.values()) if scoring_averages else 0
    )

    # Start building the LaTeX table
    latex_output = []
    latex_output.append("\\begin{tabular}{rrcccccccc}")
    latex_output.append("\\toprule")
    latex_output.append("& & \\multicolumn{7}{c}{Embedding} \\\\")
    latex_output.append(
        "Dataset& Task& \\#Samples& Raw& Random & Random Mix &  PCA$_{128}$& $\\mathcal{H}_{128}$& $\\mathcal{E}_{128}$& DNABERT-S\\\\"
    )
    latex_output.append("\\midrule")

    # Group by dataset
    datasets = df["dataset"].unique()

    for i, dataset in enumerate(datasets):
        dataset_rows = df[df["dataset"] == dataset]
        n_rows = len(dataset_rows)

        # Add multirow for dataset
        if n_rows > 1:
            latex_output.append(f"\\multirow{{{n_rows}}}{{*}}{{{dataset}}}")
        else:
            latex_output.append(f"\\multirow{{1}}{{*}}{{{dataset}}}")

        # Add each task row
        for j, (_, row) in enumerate(dataset_rows.iterrows()):
            task_part = f"& {row['task_name']} & {row['samples']} "

            # Only add the dataset name for the first row of the dataset
            if j > 0:
                task_part = "& " + task_part[2:]

            # Add scores with bold for best score(s)
            scores_part = ""
            # Find max score for this row to handle ties
            row_scores = [row[col] for col in scoring_cols if col in row]
            if row_scores:
                max_score = max(row_scores)
                for col in numeric_cols:
                    if col in row and abs(row[col] - max_score) < 1e-2:  # Using small epsilon for float comparison
                        scores_part += f"& \\textbf{{{row[col]:.2f}}} "
                    else:
                        scores_part += f"& {row[col] if col in row else 0:.2f} "
            else:
                scores_part = "& " * len(numeric_cols)

            latex_output.append(task_part + scores_part + "\\\\")

        # Add midrule between datasets
        if i < len(datasets) - 1:
            latex_output.append("\\midrule")

    # Add average row
    latex_output.append("\\midrule")
    average_row = "\\multicolumn{3}{r}{Averages:} "
    for col in numeric_cols:
        if col in scoring_cols and abs(averages[col] - best_avg) < 1e-6:
            average_row += f"& \\textbf{{{averages[col]:.2f}}}"
        else:
            average_row += f"& {averages[col]:.2f}"
    latex_output.append(average_row + "\\\\")

    # Add top scores row
    top_score_row = "\\multicolumn{3}{r}{Top scores:} "
    scoring_top_scores = {col: top_scores[col] for col in scoring_cols}
    max_top_score = max(scoring_top_scores.values()) if scoring_top_scores else 0
    for col in numeric_cols:
        if col in scoring_cols and top_scores[col] == max_top_score:
            top_score_row += f"& \\textbf{{{top_scores[col]}}}"
        else:
            top_score_row += f"& {top_scores[col]}"
    latex_output.append(top_score_row + "\\\\")

    # Finish the table
    latex_output.append("\\bottomrule")
    latex_output.append("\\end{tabular}")

    # Join all lines
    latex_table = "\n".join(latex_output)

    # Write to file if output_file is provided
    if output_file:
        with open(output_file, "w") as f:
            f.write(latex_table)

    return latex_table


create_nice_latex_table(
    scores_auc_mean_pivot.T[["raw", "random", "random_mix", "PCA128", "H128", "E128", "dnabert-s"]],
    # include_raw=False,
    output_file=f"../figures/benchmark_scores_sklearn_rf_None_{TASK}.tex",
    find_maximum=True,
)

'\\begin{tabular}{rrcccccccc}\n\\toprule\n& & \\multicolumn{7}{c}{Embedding} \\\\\nDataset& Task& \\#Samples& Raw& Random & Random Mix &  PCA$_{128}$& $\\mathcal{H}_{128}$& $\\mathcal{E}_{128}$& DNABERT-S\\\\\n\\midrule\n\\multirow{2}{*}{gevers}\n& pcdai-ileum & 67 & 16.26 & \\textbf{16.52} & 15.73 & 15.51 & 15.65 & 16.45 & 15.88 \\\\\n& pcdai-rectum & 51 & 17.47 & 14.00 & 16.57 & 15.80 & 17.35 & \\textbf{18.05} & 17.34 \\\\\n\\midrule\n\\multirow{2}{*}{ravel}\n& nugent-score & 388 & 1.70 & \\textbf{3.61} & 1.83 & 1.77 & 1.78 & 1.80 & 1.84 \\\\\n& ph & 388 & 0.52 & \\textbf{0.69} & 0.51 & 0.50 & 0.50 & 0.48 & 0.49 \\\\\n\\midrule\n\\multirow{1}{*}{yatsunenko}\n& baby-age & 49 & 0.25 & \\textbf{0.43} & 0.35 & 0.24 & 0.31 & 0.32 & 0.28 \\\\\n\\midrule\n\\multicolumn{3}{r}{Averages:} & 7.24& 7.05& 7.00& 6.76& 7.12& \\textbf{7.42}& 7.16\\\\\n\\multicolumn{3}{r}{Top scores:} & 0& \\textbf{4}& 0& 0& 0& 1& 0\\\\\n\\bottomrule\n\\end{tabular}'

: 