In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

### 1. Read dataset and verify it

In [13]:
# Load the table of ChemBERTa sample embeddings and print a quick overview.
df = pd.read_parquet("ChemBERTa_sample_embeddings_classification.parquet")
print("Data shape:", df.shape)
print("Columns:", df.columns[:7], "...")

# Feature columns are the ones whose name begins with "emb_".
emb_cols = list(filter(lambda name: name.startswith("emb_"), df.columns))

# Feature matrix as floats, and the raw diagnosis labels as loaded.
# In the recorded run these had shapes (220, 768) and (220,).
X = df.loc[:, emb_cols].to_numpy(dtype=float)
y_diag = df.loc[:, "Diagnosis"].to_numpy()

# Sanity check: array shapes and the set of distinct labels.
print("X shape:", X.shape)
print("y shape:", y_diag.shape)
print("Unique diagnoses:", np.unique(y_diag))

Data shape: (220, 770)
Columns: Index(['SampleID', 'Diagnosis', 'emb_0', 'emb_1', 'emb_2', 'emb_3', 'emb_4'], dtype='object') ...
X shape: (220, 768)
y shape: (220,)
Unique diagnoses: ['CD' 'Control' 'UC']


### 2. Multiclass: UC vs CD vs Control

In [16]:
# Encode the diagnosis strings as integer class ids; le.classes_ keeps the
# mapping back to the original label names.
le = LabelEncoder()
y_multi = le.fit_transform(y_diag)
print("Classes:", le.classes_)

# Stratified 5-fold split, shuffled with a fixed seed so the fold
# assignment is the same on every run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# L2-regularised logistic regression (C=0.1).
# `multi_class` is intentionally not passed: it is deprecated since
# scikit-learn 1.5 (FutureWarning, scheduled for removal). With the default
# lbfgs solver and more than two classes the multinomial loss is already
# used, so the fitted model is unchanged.
clf_multi = LogisticRegression(
    penalty="l2",
    C=0.1,
    max_iter=1000,
)

# Per-fold accuracy, reported as mean ± standard deviation across folds.
acc_multi = cross_val_score(clf_multi, X, y_multi, cv=cv, scoring="accuracy")
print("Multiclass accuracy:", acc_multi.mean(), "±", acc_multi.std())


Classes: ['CD' 'Control' 'UC']




Multiclass accuracy: 0.5681818181818182 ± 0.07186994682200862


### 3. Binary: IBD (UC or CD) vs Control

In [17]:
# ---------- Binary IBD vs Control ----------
# Samples diagnosed as UC or CD form the positive class (1); every other
# label is coded 0.
ibd = {"UC", "CD"}
y_binary = np.fromiter(
    (label in ibd for label in y_diag),
    dtype=bool,
    count=len(y_diag),
).astype(int)

# L2-regularised logistic regression with C=0.1.
clf_bin = LogisticRegression(penalty="l2", C=0.1, max_iter=1000)

# ROC AUC per fold, using the `cv` splitter defined in the multiclass cell;
# reported as mean ± standard deviation across folds.
auc_bin = cross_val_score(clf_bin, X, y_binary, scoring="roc_auc", cv=cv)
print("Binary AUC:", np.mean(auc_bin), "±", np.std(auc_bin))

Binary AUC: 0.8393336776859505 ± 0.056683121156275704


### Test the model on population subgroups