In [26]:
import joblib
import pandas as pd

from rdkit import Chem
from sklearn.metrics import roc_auc_score
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool, global_add_pool
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn.functional as F

import numpy as np
from rdkit.Chem import rdFingerprintGenerator
import deepchem as dc
import random
import optuna

from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    log_loss,
    roc_auc_score,
    precision_score,
    recall_score,
    confusion_matrix,
    f1_score,
)

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split

In [2]:
# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# Load pretrain models

In [3]:
# Fingerprints

In [4]:
# load the results back
loaded_results = joblib.load("models/tpot_results.joblib")

# Convert results to DataFrame
tpot_df = pd.DataFrame(loaded_results)
tpot_df.head()

Unnamed: 0,Best model,PCA Components,Model Name,Parameters,AUC,Precision,Recall,Sensitivity,Specificity
0,"((ExtraTreeClassifier(max_features=0.55, min_s...",10,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.860355,0.911765,0.841629,0.841629,0.723077
1,"((ExtraTreeClassifier(criterion='entropy', max...",20,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.901636,0.930348,0.846154,0.846154,0.784615
2,"((ExtraTreeClassifier(criterion='entropy', max...",50,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.899617,0.919811,0.882353,0.882353,0.738462
3,"(MinMaxScaler(), (ExtraTreeClassifier(max_feat...",100,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.916881,0.92093,0.895928,0.895928,0.738462
4,"(KNeighborsClassifier(n_neighbors=4, weights='...",200,KNeighborsClassifier,"{'algorithm': 'auto', 'leaf_size': 30, 'metric...",0.89906,0.938144,0.823529,0.823529,0.815385


In [5]:
fingerprints_model = loaded_results[3]["Best model"]
fingerprints_model

In [6]:
# NLP

In [7]:
# load the results back
loaded_results = joblib.load("models/tpot_nlp_openai_pca.joblib")

# Convert results to DataFrame
tpot_df = pd.DataFrame(loaded_results)
tpot_df.head()

Unnamed: 0,Best model,PCA Components,Model Name,Parameters,AUC,Precision,Recall,Sensitivity,Specificity
0,"(ZeroCount(), (ExtraTreeClassifier(criterion='...",10,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.689871,0.82243,0.79638,0.79638,0.415385


In [8]:
nlp_model = loaded_results[0]["Best model"]
nlp_model

In [9]:
# Graph

In [10]:
class GCN(torch.nn.Module):
    def __init__(
        self,
        num_node_features,
        num_classes,
        num_layers=3,
        hidden_dim=64,
        dropout_prob=0.5,
        activation="relu",
    ):
        super(GCN, self).__init__()

        # Store activation function dynamically
        if activation == "relu":
            self.activation = F.relu
        elif activation == "tanh":
            self.activation = F.tanh
        else:
            raise ValueError("Unsupported activation function")

        self.dropout_prob = dropout_prob

        # Dynamically define the GCN layers
        self.convs = torch.nn.ModuleList()
        self.convs.append(GCNConv(num_node_features, hidden_dim))
        for _ in range(num_layers - 1):
            self.convs.append(GCNConv(hidden_dim, hidden_dim))

        # Final fully connected layer
        self.fc = torch.nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch

        # Apply GCN layers dynamically
        for conv in self.convs:
            x = conv(x, edge_index)
            x = self.activation(x)

        # Global pooling (combine different pooling methods)
        x = torch.cat([global_mean_pool(x, batch), global_add_pool(x, batch)], dim=1)

        # Apply dropout
        x = F.dropout(x, p=self.dropout_prob, training=self.training)

        # Final classification layer
        return F.log_softmax(self.fc(x), dim=1)

In [11]:
# load model from pth
model2_params = {
    "hidden_dim": 210,
    "dropout_prob": 0.4169127419744815,
    "learning_rate": 0.0007529508911659432,
    "weight_decay": 9.903281752862725e-06,
    "num_layers": 5,
    "activation": "relu",
}


graph_model = GCN(
    num_node_features=70,
    num_classes=2,
    num_layers=model2_params["num_layers"],
    hidden_dim=model2_params["hidden_dim"],
    dropout_prob=model2_params["dropout_prob"],
    activation=model2_params["activation"],
).to(device)

graph_model.load_state_dict(torch.load("models/graph_gcn_x20_filtered_outliers.pth"))
graph_model

  graph_model.load_state_dict(torch.load("models/graph_gcn_x20_filtered_outliers.pth"))


GCN(
  (convs): ModuleList(
    (0): GCNConv(70, 210)
    (1-4): 4 x GCNConv(210, 210)
  )
  (fc): Linear(in_features=420, out_features=2, bias=True)
)

# Data preparation

## Fingerprints

In [12]:
# Load pd_train_fp
pd_train_fp = pd.read_parquet("data/training_class.parquet")
pd_train_fp["label"] = pd_train_fp["Class"].apply(
    lambda x: 1 if x == "Hepatotoxicity" else 0
)
print(pd_train_fp.shape)

(1241, 16094)


In [13]:
# Load pd_test_fp
pd_test_fp = pd.read_parquet("data/testing_class.parquet")
pd_test_fp["label"] = pd_test_fp["Class"].apply(
    lambda x: 1 if x == "Hepatotoxicity" else 0
)
print(pd_test_fp.shape)

(286, 16094)


In [14]:
X_train_fp = pd_train_fp.drop(columns=["Class", "label"])
y_train_fp = pd_train_fp["label"]

X_test_fp = pd_test_fp.drop(columns=["Class", "label"])
y_test_fp = pd_test_fp["label"]

In [15]:
# get X PCA components 100
pca = PCA(n_components=100)
X_train_fp_pca = pca.fit_transform(X_train_fp)
X_test_fp_pca = pca.transform(X_test_fp)

## NLP

In [16]:
# load the embeddings
pd_train_nlp = joblib.load("data_smiles/Training_Group_openai_embeddings.pkl")
pd_test_nlp = joblib.load("data_smiles/Testing_Group_openai_embeddings.pkl")

In [17]:
# Prepare features and labels for the split
X_train_nlp = np.vstack(
    pd_train_nlp["Embeddings"].values
)  # Stack embeddings into a 2D array
y_train_nlp = pd_train_nlp["label"].values

print("X_train_nlp Shape:", X_train_nlp.shape)
print("y_train_nlp Shape:", y_train_nlp.shape)

X_train_nlp Shape: (1241, 1536)
y_train_nlp Shape: (1241,)


In [18]:
X_test_nlp = np.vstack(
    pd_test_nlp["Embeddings"].values
)  # Stack embeddings into a 2D array
y_test_nlp = pd_test_nlp["label"].values

print("X_test_nlp Shape:", X_test_nlp.shape)
print("y_test_nlp Shape:", y_test_nlp.shape)

X_test_nlp Shape: (286, 1536)
y_test_nlp Shape: (286,)


In [19]:
# get X PCA components 10
pca = PCA(n_components=10)
X_train_nlp_pca = pca.fit_transform(X_train_nlp)
X_test_nlp_pca = pca.transform(X_test_nlp)

## Graph

In [20]:
def featurize_smiles(smiles):
    featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
    graph_data = featurizer.featurize([smiles])[0]

    # Get DeepChem atom features
    atom_features_deepchem = graph_data.node_features

    return atom_features_deepchem


# Function to generate Morgan Fingerprints (ECFP)
def generate_ecfp(smiles):
    # Morgan fingerprint generator
    mfgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=4096)

    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None
    return mfgen.GetFingerprintAsNumPy(molecule)


# Function to convert SMILES to PyTorch Geometric Data object using DeepChem featurizer
def smiles_to_graph_featurizer(smiles):
    # Featurization using DeepChem
    featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)

    # Featurize the SMILES string using DeepChem
    graph_data = featurizer.featurize([smiles])[0]
    return graph_data.node_features, graph_data.edge_features, graph_data.edge_index


# Function to extract atom features
def atom_features(atom, ecfp):
    # Get the atom index for corresponding ECFP value
    atom_idx = atom.GetIdx()

    return [
        atom.GetAtomicNum(),  # Atomic number
        atom.GetDegree(),  # Number of bonds
        atom.GetTotalNumHs(),  # Total number of hydrogens
        atom.GetFormalCharge(),  # Formal charge of the atom
        atom.GetImplicitValence(),  # Implicit valence
        atom.GetNumRadicalElectrons(),  # Number of radical electrons
        int(atom.GetIsAromatic()),  # Is the atom aromatic?
        atom.GetMass(),  # Atomic mass
        atom.GetHybridization().real,  # Hybridization state (SP, SP2, SP3, etc.)
        ecfp[atom_idx],  # Morgan fingerprint (ECFP) for the atom
    ]


# Function to extract bond features
def bond_features(bond):
    bond_type = bond.GetBondTypeAsDouble()  # Bond type as a float
    is_aromatic = bond.GetIsAromatic()  # Aromatic bond
    is_conjugated = bond.GetIsConjugated()  # Conjugated bond
    is_in_ring = bond.IsInRing()  # Whether the bond is part of a ring
    stereo = bond.GetStereo()  # Bond stereochemistry

    # Convert stereo information to a one-hot encoded format
    stereo_one_hot = [0, 0, 0, 0]  # Stereo options: None, E, Z, Other
    if stereo == Chem.BondStereo.STEREONONE:
        stereo_one_hot[0] = 1
    elif stereo == Chem.BondStereo.STEREOE:
        stereo_one_hot[1] = 1
    elif stereo == Chem.BondStereo.STEREOZ:
        stereo_one_hot[2] = 1
    else:
        stereo_one_hot[3] = 1

    # Combine all features into a single tensor
    return [
        bond_type,
        float(is_aromatic),
        float(is_conjugated),
        float(is_in_ring),
    ] + stereo_one_hot


# Convert SMILES to PyTorch Geometric Data object
def smiles_to_graph(smiles, label):
    mol = Chem.MolFromSmiles(smiles)

    atom_features_list = []
    edge_index = []
    edge_attr = []

    # DeepChem features
    atom_features_deepchem = featurize_smiles(smiles)

    # Generate Morgan Fingerprint (ECFP)
    ecfp_features = generate_ecfp(smiles)

    # Generate Molecule Graph Convolution features
    mol_graph_node_features, mol_graph_edge_features, mol_graph_edge_index = (
        smiles_to_graph_featurizer(smiles)
    )

    # Nodes (atoms)
    for atom in mol.GetAtoms():
        atom_features_list.append(atom_features(atom, ecfp_features))

    atom_features_list = np.array(atom_features_list)

    # Edges (bonds)
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()

        # Append bidirectional edges for undirected graphs
        edge_index.append([i, j])
        edge_index.append([j, i])

        # Append bond features for both directions
        edge_attr.append(bond_features(bond))
        edge_attr.append(bond_features(bond))

    # Convert atom features to a tensor
    combined_features = np.concatenate(
        (atom_features_list, atom_features_deepchem, mol_graph_node_features), axis=1
    )
    x = torch.tensor(combined_features, dtype=torch.float)

    # Convert edge indices and features to tensors, handle empty edge case
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

    # combine edge features from ECFP and MolGraphConv
    edge_attr = np.array(edge_attr)
    edge_attr = np.concatenate((edge_attr, mol_graph_edge_features), axis=1)
    edge_attr = torch.tensor(edge_attr, dtype=torch.float)

    # Label (target)
    y = torch.tensor([label], dtype=torch.long)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)


# Function to load data from CSV and apply SMILES augmentation for training
def load_data_from_csv(file_path):
    df = pd.read_csv(file_path)

    smiles_list = df["Smiles"].values
    labels = df["Liver"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0).values

    data_list = []

    # Initialize the SmilesEnumerator for data augmentation
    for smiles, label in zip(smiles_list, labels):
        # For test data, no augmentation, just use canonical SMILES
        graph_data = smiles_to_graph(smiles, label)
        data_list.append(graph_data)

    return data_list

In [21]:
# Load training and testing data
training_data = load_data_from_csv("data_smiles/Training_Group.csv")
testing_data = load_data_from_csv("data_smiles/Testing_Group.csv")

In [22]:
# Create data loaders
train_graph_loader = DataLoader(training_data, batch_size=32, shuffle=True)
test_graph_loader = DataLoader(testing_data, batch_size=32, shuffle=False)

# Model training

In [27]:
def find_optimal_threshold(y_true, y_pred_proba):
    """
    Find optimal threshold based on sensitivity >= 0.7 or best F1 score.
    """
    best_threshold = 0.5
    best_metrics = {
        "accuracy": 0,
        "precision": 0,
        "recall": 0,
        "sensitivity": 0,
        "specificity": 0,
        "f1": 0,
    }

    for threshold in np.arange(0.0, 1.0, 0.01):
        y_pred = (y_pred_proba >= threshold).astype(int)

        # Calculate metrics
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        accuracy = accuracy_score(y_true, y_pred)
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        f1 = f1_score(y_true, y_pred)

        if sensitivity >= 0.7 or f1 > best_metrics["f1"]:
            best_threshold = threshold
            best_metrics = {
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall,
                "sensitivity": sensitivity,
                "specificity": specificity,
                "f1": f1,
            }

    return best_threshold, best_metrics

## Weighted Averaging Ensembles

In [30]:
# Initialize global variables to store the best model and its AUC
best_model_fp = None
best_model_gcn = None
best_model_nlp = None
best_model_auc = {"auc": 0.0}
best_weights = {"weight_etc": 0, "weight_gcn": 0, "weight_nlp": 0}


def objective(trial):
    # Hyperparameter Suggestions
    weight_etc = trial.suggest_float("weight_etc", 0.0, 1.0)
    weight_gcn = trial.suggest_float("weight_gcn", 0.0, 1.0 - weight_etc)
    weight_nlp = 1.0 - weight_etc - weight_gcn  # Ensures weights sum to 1

    # Device setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Ensure y_train_fp is aligned with indices
    y_train_fp_reset = (
        y_train_fp.reset_index(drop=True)
        if isinstance(y_train_fp, pd.Series)
        else y_train_fp
    )

    y_train_nlp_reset = (
        y_train_nlp.reset_index(drop=True)
        if isinstance(y_train_nlp, pd.Series)
        else y_train_fp
    )
    auc_scores = []

    for train_idx, test_idx in skf.split(X_train_fp_pca, y_train_fp_reset):
        # Use .iloc for Pandas or index directly for NumPy arrays
        X_test_fp_i = X_train_fp_pca[test_idx]
        y_test_fp_i = y_train_fp_reset.iloc[test_idx]

        # NLP
        X_test_nlp_i = X_train_nlp_pca[test_idx]
        y_test_nlp_i = y_train_nlp_reset.iloc[test_idx]

        # Prepare datasets for GCN
        test_dataset = [training_data[i] for i in test_idx]
        test_loader_i = DataLoader(test_dataset, batch_size=32, shuffle=False)

        # Get predictions from ExtraTreesClassifier
        fp_probs_i = fingerprints_model.predict_proba(X_test_fp_i)[:, 1]

        # Get predictions from ExtraTreesClassifier NLP
        nlp_probs_i = nlp_model.predict_proba(X_test_nlp_i)[:, 1]

        # Get predictions from GCN
        graph_model.eval()
        gcn_probs_i = []
        for batch in test_loader_i:
            batch = batch.to(device)
            with torch.no_grad():
                out = graph_model(batch)
            gcn_probs_i.append(out[:, 1].cpu().numpy())

        # Concatenate GCN probabilities
        gcn_probs_i = np.concatenate(gcn_probs_i)

        # Ensemble predictions with average weights
        final_probs_i = (
            weight_etc * fp_probs_i
            + weight_gcn * gcn_probs_i
            + weight_nlp * nlp_probs_i
        )

        # Evaluate AUC
        auc = roc_auc_score(y_test_fp_i, final_probs_i)
        auc_scores.append(auc)

    # Save the best model
    global best_model_fp, best_model_gcn, best_model_nlp, best_model_auc, best_weights
    mean_auc = np.mean(auc_scores)
    if mean_auc > best_model_auc["auc"]:
        best_model_auc["auc"] = mean_auc
        best_model_fp = fingerprints_model
        best_model_gcn = graph_model
        best_model_nlp = nlp_model
        best_weights["weight_etc"] = weight_etc
        best_weights["weight_gcn"] = weight_gcn
        best_weights["weight_nlp"] = weight_nlp

    # Return Mean AUC
    return mean_auc

In [31]:
# Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2024-12-20 22:14:35,034] A new study created in memory with name: no-name-4adb5746-a1b1-40d1-b183-3b842019d6a7
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_pr

In [32]:
# Output the Best Parameters and AUC
print("Best AUC Score:", best_model_auc["auc"])
print("Best Weights:", best_weights)

Best AUC Score: 0.9946895819138465
Best Weights: {'weight_etc': 0.9420651369083504, 'weight_gcn': 0.00607746467208521, 'weight_nlp': 0.051857398419564424}


In [33]:
etc_probs = best_model_fp.predict_proba(X_test_fp_pca)[:, 1]

# Get predictions from GCN
gcn_probs = []
for batch in test_graph_loader:
    batch = batch.to(device)
    with torch.no_grad():
        out = best_model_gcn(batch)
    gcn_probs.append(out[:, 1].cpu().numpy())  # Probabilities for class 1
gcn_probs = np.concatenate(gcn_probs)


# Get predictions from NLP
nlp_probs = best_model_nlp.predict_proba(X_test_nlp_pca)[:, 1]

In [34]:
# Optimal weights from the best trial
optimal_weight_etc = best_weights["weight_etc"]
optimal_weight_gcn = best_weights["weight_gcn"]
optimal_weight_nlp = best_weights["weight_nlp"]

# Combine predictions using optimal weights
final_probs = (
    optimal_weight_etc * etc_probs
    + optimal_weight_gcn * gcn_probs
    + optimal_weight_nlp * nlp_probs
)

# Evaluate using AUC
auc = roc_auc_score(y_test_fp, final_probs)
print(f"Test Set AUC: {auc}")

Test Set AUC: 0.9155586494953011


In [35]:
find_optimal_threshold(y_test_fp, final_probs)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.56,
 {'accuracy': 0.7937062937062938,
  'precision': 0.95,
  'recall': 0.7737556561085973,
  'sensitivity': 0.7737556561085973,
  'specificity': 0.8615384615384616,
  'f1': 0.8528678304239401})

## Meta model

In [104]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline

# Initialize global variables
best_meta_model = None
best_model_auc = {"auc": 0.0}


def objective(trial):
    # Suggest hyperparameters for the meta-model (MLP in this example)
    hidden_layer_sizes = trial.suggest_categorical(
        "hidden_layer_sizes", [(32,), (64,), (128,), (64, 32), (128, 64)]
    )
    activation = trial.suggest_categorical("activation", ["relu", "tanh", "logistic"])
    learning_rate_init = trial.suggest_loguniform("learning_rate_init", 1e-4, 1e-1)
    degree = trial.suggest_int("poly_degree", 1, 3)  # Degree of polynomial features

    # Device setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []

    for train_idx, test_idx in skf.split(X_train_fp_pca, y_train_fp):
        # Split the data for each fold
        X_test_fp_i = X_train_fp_pca[test_idx]
        X_train_fp_i = X_train_fp_pca[train_idx]
        y_test_fp_i = y_train_fp.iloc[test_idx]
        y_train_fp_i = y_train_fp.iloc[train_idx]

        X_test_nlp_i = X_train_nlp_pca[test_idx]
        X_train_nlp_i = X_train_nlp_pca[train_idx]

        test_dataset = [training_data[i] for i in test_idx]
        train_dataset = [training_data[i] for i in train_idx]
        test_loader_i = DataLoader(test_dataset, batch_size=32, shuffle=False)
        train_loader_i = DataLoader(train_dataset, batch_size=32, shuffle=True)

        # GCN predictions for train and test
        def get_gcn_predictions(loader):
            graph_model.eval()
            gcn_probs = []
            for batch in loader:
                batch = batch.to(device)
                with torch.no_grad():
                    out = graph_model(batch)
                gcn_probs.append(out[:, 1].cpu().numpy())
            return np.concatenate(gcn_probs)

        gcn_train_probs = get_gcn_predictions(train_loader_i)
        gcn_test_probs = get_gcn_predictions(test_loader_i)

        # Base model predictions
        fp_train_probs = fingerprints_model.predict_proba(X_train_fp_i)[:, 1]
        fp_test_probs = fingerprints_model.predict_proba(X_test_fp_i)[:, 1]

        nlp_train_probs = nlp_model.predict_proba(X_train_nlp_i)[:, 1]
        nlp_test_probs = nlp_model.predict_proba(X_test_nlp_i)[:, 1]

        # Create meta-model training and test sets
        meta_X_train_raw = np.column_stack(
            [fp_train_probs, gcn_train_probs, nlp_train_probs]
        )
        meta_X_test_raw = np.column_stack(
            [fp_test_probs, gcn_test_probs, nlp_test_probs]
        )

        # Add polynomial features
        poly = PolynomialFeatures(degree=degree, include_bias=False)
        meta_X_train_poly = poly.fit_transform(meta_X_train_raw)
        meta_X_test_poly = poly.transform(meta_X_test_raw)

        # Apply scalar normalization
        scaler = StandardScaler()
        meta_X_train = scaler.fit_transform(meta_X_train_poly)
        meta_X_test = scaler.transform(meta_X_test_poly)

        # Train the meta-model (MLP in this example with early stopping)
        meta_model = MLPClassifier(
            hidden_layer_sizes=hidden_layer_sizes,
            activation=activation,
            learning_rate_init=learning_rate_init,
            # alpha=trial.suggest_loguniform("alpha", 1e-6, 1e-2),  # L2 regularization
            alpha=0.001,  # L2 regularization
            max_iter=200,
            early_stopping=True,
            n_iter_no_change=10,
            random_state=42,
        )
        meta_model.fit(meta_X_train, y_train_fp_i)

        # Predict with the meta-model
        meta_probs = meta_model.predict_proba(meta_X_test)[:, 1]

        # Evaluate AUC
        auc = roc_auc_score(y_test_fp_i, meta_probs)
        auc_scores.append(auc)

    # Save the best meta-model
    global best_meta_model, best_model_auc
    mean_auc = np.mean(auc_scores)
    if mean_auc > best_model_auc["auc"]:
        best_model_auc["auc"] = mean_auc
        best_meta_model = meta_model

    # Return Mean AUC
    return mean_auc

In [105]:
# Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2024-11-21 21:21:56,502] A new study created in memory with name: no-name-bc25f1a2-f65b-4b3e-92ba-53192fcf3e6d
  learning_rate_init = trial.suggest_loguniform("learning_rate_init", 1e-4, 1e-1)
[I 2024-11-21 21:21:58,192] Trial 0 finished with value: 0.9953190750555498 and parameters: {'hidden_layer_sizes': (32,), 'activation': 'relu', 'learning_rate_init': 0.09272199466384129, 'poly_degree': 2}. Best is trial 0 with value: 0.9953190750555498.
  learning_rate_init = trial.suggest_loguniform("learning_rate_init", 1e-4, 1e-1)
[I 2024-11-21 21:22:00,345] Trial 1 finished with value: 0.9878894142486349 and parameters: {'hidden_layer_sizes': (128, 64), 'activation': 'tanh', 'learning_rate_init': 0.00018210306419258042, 'poly_degree': 2}. Best is trial 0 with value: 0.9953190750555498.
  learning_rate_init = trial.suggest_loguniform("learning_rate_init", 1e-4, 1e-1)
[I 2024-11-21 21:22:02,648] Trial 2 finished with value: 0.971024411441436 and parameters: {'hidden_layer_sizes': (32,), 'act

In [106]:
print("Best AUC Score:", best_model_auc["auc"])
print("Trial params:", study.best_params)
best_meta_model

Best AUC Score: 0.9960274550488967
Trial params: {'hidden_layer_sizes': (32,), 'activation': 'logistic', 'learning_rate_init': 0.07945015827969165, 'poly_degree': 1}


In [107]:
def get_predictions(X_fp_pca, X_nlp_pca, graph_loader):
    # Get predictions from ExtraTreesClassifier
    etc_probs = fingerprints_model.predict_proba(X_fp_pca)[:, 1]

    # Get predictions from GCN
    gcn_probs = []
    for batch in graph_loader:
        batch = batch.to(device)
        with torch.no_grad():
            out = graph_model(batch)
        gcn_probs.append(out[:, 1].cpu().numpy())  # Probabilities for class 1
    gcn_probs = np.concatenate(gcn_probs)

    # Get predictions from NLP
    nlp_probs = nlp_model.predict_proba(X_nlp_pca)[:, 1]

    # Combine predictions using optimal weights
    final_probs = np.column_stack([etc_probs, gcn_probs, nlp_probs])

    return final_probs

In [108]:
# Get raw predictions for training and testing data
meta_X_train_raw = get_predictions(X_train_fp_pca, X_train_nlp_pca, train_graph_loader)
meta_X_test_raw = get_predictions(X_test_fp_pca, X_test_nlp_pca, test_graph_loader)

# Get the best degree from the Optuna study
best_poly_degree = study.best_params["poly_degree"]

# Define a pipeline for polynomial features and scaling
pipeline = Pipeline(
    [
        (
            "polynomial_features",
            PolynomialFeatures(degree=best_poly_degree, include_bias=False),
        ),
        ("scaler", StandardScaler()),
    ]
)

# Transform the raw predictions using the pipeline
meta_X_train = pipeline.fit_transform(meta_X_train_raw)
meta_X_test = pipeline.transform(meta_X_test_raw)

# Evaluate using the best meta-model
meta_probs = best_meta_model.predict_proba(meta_X_test)[
    :, 1
]  # Probabilities for class 1

# Evaluate using AUC
auc = roc_auc_score(y_test_fp, meta_probs)
print(f"Test Set AUC: {auc}")

Test Set AUC: 0.9106856943961016


## Randomforest

In [None]:
def objective(trial):
    # Suggest hyperparameters for the meta-model (SVM in this example)
    meta_c = trial.suggest_loguniform("meta_c", 1e-4, 1e2)  # Regularization parameter
    meta_kernel = trial.suggest_categorical(
        "meta_kernel", ["linear", "rbf", "poly", "sigmoid"]
    )
    degree = trial.suggest_int("poly_degree", 1, 3)  # Degree of polynomial features

    # Device setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []

    for train_idx, test_idx in skf.split(X_train_fp_pca, y_train_fp):
        # Split the data for each fold
        X_test_fp_i = X_train_fp_pca[test_idx]
        X_train_fp_i = X_train_fp_pca[train_idx]
        y_test_fp_i = y_train_fp.iloc[test_idx]
        y_train_fp_i = y_train_fp.iloc[train_idx]

        X_test_nlp_i = X_train_nlp_pca[test_idx]
        X_train_nlp_i = X_train_nlp_pca[train_idx]

        test_dataset = [training_data[i] for i in test_idx]
        train_dataset = [training_data[i] for i in train_idx]
        test_loader_i = DataLoader(test_dataset, batch_size=32, shuffle=False)
        train_loader_i = DataLoader(train_dataset, batch_size=32, shuffle=True)

        # GCN predictions for train and test
        def get_gcn_predictions(loader):
            graph_model.eval()
            gcn_probs = []
            for batch in loader:
                batch = batch.to(device)
                with torch.no_grad():
                    out = graph_model(batch)
                gcn_probs.append(out[:, 1].cpu().numpy())
            return np.concatenate(gcn_probs)

        gcn_train_probs = get_gcn_predictions(train_loader_i)
        gcn_test_probs = get_gcn_predictions(test_loader_i)

        # Base model predictions
        fp_train_probs = fingerprints_model.predict_proba(X_train_fp_i)[:, 1]
        fp_test_probs = fingerprints_model.predict_proba(X_test_fp_i)[:, 1]

        nlp_train_probs = nlp_model.predict_proba(X_train_nlp_i)[:, 1]
        nlp_test_probs = nlp_model.predict_proba(X_test_nlp_i)[:, 1]

        # Create meta-model training and test sets
        meta_X_train_raw = np.column_stack(
            [fp_train_probs, gcn_train_probs, nlp_train_probs]
        )
        meta_X_test_raw = np.column_stack(
            [fp_test_probs, gcn_test_probs, nlp_test_probs]
        )

        # Add polynomial features
        poly = PolynomialFeatures(degree=degree, include_bias=False)
        meta_X_train = poly.fit_transform(meta_X_train_raw)
        meta_X_test = poly.transform(meta_X_test_raw)

        # Train the meta-model (SVM with probability=True for AUC evaluation)
        meta_model = SVC(C=meta_c, kernel=meta_kernel, probability=True)
        meta_model.fit(meta_X_train, y_train_fp_i)

        # Predict with the meta-model
        meta_probs = meta_model.predict_proba(meta_X_test)[:, 1]

        # Evaluate AUC
        auc = roc_auc_score(y_test_fp_i, meta_probs)
        auc_scores.append(auc)

    # Save the best meta-model
    global best_meta_model, best_model_auc
    mean_auc = np.mean(auc_scores)
    if mean_auc > best_model_auc["auc"]:
        best_model_auc["auc"] = mean_auc
        best_meta_model = meta_model

    # Return Mean AUC
    return mean_auc

In [None]:
# Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

In [58]:
print("Best AUC Score:", best_model_auc["auc"])
print("Best Parameters:", best_meta_model.get_params())
best_meta_model

Best AUC Score: 0.9957921837604642
Best Parameters: {'C': 5.650347638927031, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'linear', 'max_iter': -1, 'probability': True, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [59]:
etc_probs = fingerprints_model.predict_proba(X_test_fp_pca)[:, 1]

# Get predictions from GCN
gcn_probs = []
for batch in test_graph_loader:
    batch = batch.to(device)
    with torch.no_grad():
        out = graph_model(batch)
    gcn_probs.append(out[:, 1].cpu().numpy())  # Probabilities for class 1
gcn_probs = np.concatenate(gcn_probs)


# Get predictions from NLP
nlp_probs = nlp_model.predict_proba(X_test_nlp_pca)[:, 1]

In [61]:
# Combine predictions into meta-model input
meta_X_test_raw = np.column_stack([etc_probs, gcn_probs, nlp_probs])

# Apply polynomial features transformation
best_poly_degree = best_meta_model.get_params()[
    "degree"
]  # Get the best degree from the best trial
poly = PolynomialFeatures(
    degree=best_poly_degree, include_bias=False
)  # Use the degree from the best trial
meta_X_test = poly.fit_transform(meta_X_test_raw)

# Evaluate using the best meta-model
meta_probs = best_meta_model.predict_proba(meta_X_test)[
    :, 1
]  # Probabilities for class 1

# Evaluate using AUC
auc = roc_auc_score(y_test_fp, meta_probs)
print(f"Test Set AUC: {auc}")

Test Set AUC: 0.8913679081099896
