## MuPPI Dataset Tutorial: Protein Classification with MuCombo

### Step 1: Install packages, import functions, and download MuPPI dataset.

In [None]:
# Install scikit multimodallearn  and summit-multi-learn module
!pip install scikit multimodallearn
!pip install summit-multi-learn
!pip install -e .

In [None]:
# Imports
import h5py
import numpy as np
from multimodal.boosting.combo import MuComboClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

In [None]:
# Download the MuPPI dataset (HDF5 file hosted on Hugging Face) and save it
# in the current working directory as MuPPI.hdf5
!wget -O MuPPI.hdf5 "https://huggingface.co/datasets/kossikevin/muppi/resolve/main/MuPPI_2025_14_fullview_EMF.hdf5"

### Step 2: Create a not full version
In the original dataset, some samples have missing values (encoded as -1) in one or more views. The function below keeps only the samples that have no missing value in any view and saves them to a new HDF5 file.

In [None]:
def create_not_full_version_from_full(full_path, output_path):
    """Write a copy of a multi-view HDF5 dataset restricted to complete samples.

    A sample is kept only if none of its views contains the value -1, which
    is treated as the missing-value marker.

    Parameters
    ----------
    full_path : str or path-like
        Input HDF5 file. It must contain a ``Labels`` dataset, one ``View{i}``
        dataset per view, and a ``Metadata`` group holding the ``nbView``
        attribute and the ``example_ids`` dataset.
    output_path : str or path-like
        Destination HDF5 file. It is opened in ``"w"`` mode, so an existing
        file at this path is overwritten.

    Returns
    -------
    None
        The result is written to ``output_path``; two progress messages are
        printed.
    """
    # Load everything in memory so the input file can be closed right away
    with h5py.File(full_path, "r") as f:
        labels = f["Labels"][:]
        nb_views = f["Metadata"].attrs["nbView"]
        example_ids = f["Metadata"]["example_ids"][:].astype(str)
        views = [f[f"View{i}"][:] for i in range(nb_views)]

    # A sample stays valid only while no view has a -1 (missing value) in its row
    n_samples = labels.shape[0]
    valid_mask = np.ones(n_samples, dtype=bool)
    for view in views:
        valid_mask &= ~(view == -1).any(axis=1)

    print(f"Valids samples : {valid_mask.sum()} / {n_samples}")

    filtered_labels = labels[valid_mask]
    filtered_ids = example_ids[valid_mask]
    filtered_views = [view[valid_mask] for view in views]

    # Save in a new file, using the same layout as the input
    with h5py.File(output_path, "w") as f_out:
        f_out.create_dataset("Labels", data=filtered_labels)

        for i, view_data in enumerate(filtered_views):
            dset = f_out.create_dataset(f"View{i}", data=view_data)
            # Generic name; use another value here to keep the original view names
            dset.attrs["name"] = f"View{i}"
            dset.attrs["sparse"] = False

        meta_grp = f_out.create_group("Metadata")
        meta_grp.attrs["nbView"] = len(filtered_views)
        # np.unique counts the distinct labels whatever the shape of the array
        meta_grp.attrs["nbClass"] = len(np.unique(filtered_labels))
        meta_grp.attrs["datasetLength"] = len(filtered_labels)
        # "S" lets NumPy size the byte strings from the data; a fixed "S10"
        # would silently truncate any identifier longer than 10 bytes.
        meta_grp.create_dataset("example_ids", data=filtered_ids.astype("S"))

    print(f"Dataset 'not full' version saved in: {output_path}")

# Build the version without missing values from the downloaded file
create_not_full_version_from_full("MuPPI.hdf5", "MuPPI_NoNA.hdf5")

### Step 3: Start classification
We will now test the performance of MuCombo, an algorithm designed for multi-view datasets with class imbalance.

In [None]:
# Load the dataset.
# The views are read in numeric order (View0, View1, ..., View10, ...) using the
# view count stored in the metadata. Iterating over f.keys() instead would give
# the names in alphabetical order ("View10" before "View2") and would also pick
# up any other key that merely contains "View".
with h5py.File("MuPPI_NoNA.hdf5", "r") as f:
    view_names = [f"View{i}" for i in range(f["Metadata"].attrs["nbView"])]
    X_views = [f[name][:] for name in view_names]
    y = f["Labels"][:]  # class labels, aligned with the rows of each view


In this tutorial, we focus on a binary classification task: distinguishing **multi_clustered** (label 1) from **EMF** (label 2).
The third class, **mono_clustered**, is also present in the dataset with label 0, but it will not be used in this task.

In [None]:
# Select the binary task: keep only multi_clustered (1) and EMF (2) samples
task_labels = [1, 2]
mask = np.isin(y, task_labels)

# Stack the selected rows of every view side by side into one feature matrix
masked_views = [view[mask] for view in X_views]
X = np.concatenate(masked_views, axis=1)
y = y[mask]

# Report the data shape and the class balance
print(f"X shape: {X.shape}")
classes, counts = np.unique(y, return_counts=True)
proportions = counts / counts.sum()
for label, share in zip(classes, proportions):
    print(f"Classe {label}: {share:.2%}")

In [None]:
# Column boundaries of the views inside the concatenated matrix, as required by
# the MuCombo fit method: the list starts at 0 and each following entry adds
# the number of columns of the next view.
view_widths = [view.shape[1] for view in X_views]
views_ind = [sum(view_widths[:k]) for k in range(len(view_widths) + 1)]
print(views_ind)

In [None]:
random_state = 42
n_runs = 10

scores = []
for run in range(n_runs):
    # Each run gets its own seed. With one fixed seed, every run would use the
    # same split and the same model, and all the scores would be identical.
    run_seed = random_state + run

    # Stratified train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=run_seed
    )

    # Fitting
    clf = MuComboClassifier(random_state=run_seed, n_estimators=200)
    clf.fit(X_train, y_train, views_ind=views_ind)

    # Evaluation: F1 score of the EMF class (label 2)
    y_pred = clf.predict(X_test)
    scores.append(f1_score(y_test, y_pred, average="binary", pos_label=2))

# Calculation of the confidence interval (95%) of the mean score by bootstrap
# SOURCE : https://medium.com/@ranjitmaity95/confidence-intervals-in-machinelearning-b727d9dbdfcd
n_iterations = 1000
# A single generator is shared by all the draws. Passing the same integer seed
# to every resample call would return the same sample each time and the
# interval would always have zero width.
rng = np.random.RandomState(random_state)
bootstrapped_scores = []
for _ in range(n_iterations):
    sample = resample(scores, replace=True, random_state=rng)
    bootstrapped_scores.append(np.mean(sample))
delta = (np.percentile(bootstrapped_scores, 97.5) - np.percentile(bootstrapped_scores, 2.5)) / 2
print(f"F1 score (EMF): {np.mean(bootstrapped_scores)} +/- {delta}")