In [1]:
import joblib
import pandas as pd

from rdkit import Chem
from sklearn.metrics import roc_auc_score
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool, global_add_pool
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn.functional as F

import numpy as np
from rdkit.Chem import rdFingerprintGenerator
import deepchem as dc
import random
import optuna

from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    log_loss,
    roc_auc_score,
    precision_score,
    recall_score,
    confusion_matrix,
)

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier

  from .autonotebook import tqdm as notebook_tqdm
No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'dgl'
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'
Skipped loading some PyTorch models, missing a dependency. No module named 'tensorflow'


In [2]:
# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# Load pre-trained models

In [3]:
# load the results back
loaded_results = joblib.load("models/tpot_results.joblib")

# Convert results to DataFrame
tpot_df = pd.DataFrame(loaded_results)
tpot_df.head()

Unnamed: 0,Best model,PCA Components,Model Name,Parameters,AUC,Precision,Recall,Sensitivity,Specificity
0,"((ExtraTreeClassifier(max_features=0.55, min_s...",10,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.860355,0.911765,0.841629,0.841629,0.723077
1,"((ExtraTreeClassifier(criterion='entropy', max...",20,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.901636,0.930348,0.846154,0.846154,0.784615
2,"((ExtraTreeClassifier(criterion='entropy', max...",50,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.899617,0.919811,0.882353,0.882353,0.738462
3,"(MinMaxScaler(), (ExtraTreeClassifier(max_feat...",100,ExtraTreesClassifier,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.916881,0.92093,0.895928,0.895928,0.738462
4,"(KNeighborsClassifier(n_neighbors=4, weights='...",200,KNeighborsClassifier,"{'algorithm': 'auto', 'leaf_size': 30, 'metric...",0.89906,0.938144,0.823529,0.823529,0.815385


In [4]:
etc_model = loaded_results[3]["Best model"]
etc_model

In [5]:
model1_params = etc_model.steps[-1][1].get_params()  # Params of the final estimator
model1_params

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 0.35000000000000003,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 3,
 'min_samples_split': 13,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [6]:
class GCN(torch.nn.Module):
    def __init__(
        self,
        num_node_features,
        num_classes,
        num_layers=3,
        hidden_dim=64,
        dropout_prob=0.5,
        activation="relu",
    ):
        super(GCN, self).__init__()

        # Store activation function dynamically
        if activation == "relu":
            self.activation = F.relu
        elif activation == "tanh":
            self.activation = F.tanh
        else:
            raise ValueError("Unsupported activation function")

        self.dropout_prob = dropout_prob

        # Dynamically define the GCN layers
        self.convs = torch.nn.ModuleList()
        self.convs.append(GCNConv(num_node_features, hidden_dim))
        for _ in range(num_layers - 1):
            self.convs.append(GCNConv(hidden_dim, hidden_dim))

        # Final fully connected layer
        self.fc = torch.nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch

        # Apply GCN layers dynamically
        for conv in self.convs:
            x = conv(x, edge_index)
            x = self.activation(x)

        # Global pooling (combine different pooling methods)
        x = torch.cat([global_mean_pool(x, batch), global_add_pool(x, batch)], dim=1)

        # Apply dropout
        x = F.dropout(x, p=self.dropout_prob, training=self.training)

        # Final classification layer
        return F.log_softmax(self.fc(x), dim=1)

In [7]:
# load model from pth
model2_params = {
    "hidden_dim": 210,
    "dropout_prob": 0.4169127419744815,
    "learning_rate": 0.0007529508911659432,
    "weight_decay": 9.903281752862725e-06,
    "num_layers": 5,
    "activation": "relu",
}


gcn_model = GCN(
    num_node_features=70,
    num_classes=2,
    num_layers=model2_params["num_layers"],
    hidden_dim=model2_params["hidden_dim"],
    dropout_prob=model2_params["dropout_prob"],
    activation=model2_params["activation"],
).to(device)

gcn_model.load_state_dict(torch.load("models/graph_gcn_x20_filtered_outliers.pth"))

  gcn_model.load_state_dict(torch.load("models/graph_gcn_x20_filtered_outliers.pth"))


<All keys matched successfully>

# Data preparation

## PCA prep

In [8]:
# Load pd_train
pd_train = pd.read_parquet("data/training_class.parquet")
pd_train["label"] = pd_train["Class"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_train.shape)

(1241, 16094)


In [9]:
# Load pd_test
pd_test = pd.read_parquet("data/testing_class.parquet")
pd_test["label"] = pd_test["Class"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_test.shape)

(286, 16094)


In [10]:
X_train = pd_train.drop(columns=["Class", "label"])
y_train = pd_train["label"]

X_test = pd_test.drop(columns=["Class", "label"])
y_test = pd_test["label"]

In [11]:
# get X PCA components 100
pca = PCA(n_components=100)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

## Graph prep

In [12]:
def featurize_smiles(smiles):
    featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
    graph_data = featurizer.featurize([smiles])[0]

    # Get DeepChem atom features
    atom_features_deepchem = graph_data.node_features

    return atom_features_deepchem


# Function to generate Morgan Fingerprints (ECFP)
def generate_ecfp(smiles):
    # Morgan fingerprint generator
    mfgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=4096)

    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None
    return mfgen.GetFingerprintAsNumPy(molecule)


# Function to convert SMILES to PyTorch Geometric Data object using DeepChem featurizer
def smiles_to_graph_featurizer(smiles):
    # Featurization using DeepChem
    featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)

    # Featurize the SMILES string using DeepChem
    graph_data = featurizer.featurize([smiles])[0]
    return graph_data.node_features, graph_data.edge_features, graph_data.edge_index


# Function to extract atom features
def atom_features(atom, ecfp):
    # Get the atom index for corresponding ECFP value
    atom_idx = atom.GetIdx()

    return [
        atom.GetAtomicNum(),  # Atomic number
        atom.GetDegree(),  # Number of bonds
        atom.GetTotalNumHs(),  # Total number of hydrogens
        atom.GetFormalCharge(),  # Formal charge of the atom
        atom.GetImplicitValence(),  # Implicit valence
        atom.GetNumRadicalElectrons(),  # Number of radical electrons
        int(atom.GetIsAromatic()),  # Is the atom aromatic?
        atom.GetMass(),  # Atomic mass
        atom.GetHybridization().real,  # Hybridization state (SP, SP2, SP3, etc.)
        ecfp[atom_idx],  # Morgan fingerprint (ECFP) for the atom
    ]


# Function to extract bond features
def bond_features(bond):
    bond_type = bond.GetBondTypeAsDouble()  # Bond type as a float
    is_aromatic = bond.GetIsAromatic()  # Aromatic bond
    is_conjugated = bond.GetIsConjugated()  # Conjugated bond
    is_in_ring = bond.IsInRing()  # Whether the bond is part of a ring
    stereo = bond.GetStereo()  # Bond stereochemistry

    # Convert stereo information to a one-hot encoded format
    stereo_one_hot = [0, 0, 0, 0]  # Stereo options: None, E, Z, Other
    if stereo == Chem.BondStereo.STEREONONE:
        stereo_one_hot[0] = 1
    elif stereo == Chem.BondStereo.STEREOE:
        stereo_one_hot[1] = 1
    elif stereo == Chem.BondStereo.STEREOZ:
        stereo_one_hot[2] = 1
    else:
        stereo_one_hot[3] = 1

    # Combine all features into a single tensor
    return [
        bond_type,
        float(is_aromatic),
        float(is_conjugated),
        float(is_in_ring),
    ] + stereo_one_hot


# Convert SMILES to PyTorch Geometric Data object
def smiles_to_graph(smiles, label):
    mol = Chem.MolFromSmiles(smiles)

    atom_features_list = []
    edge_index = []
    edge_attr = []

    # DeepChem features
    atom_features_deepchem = featurize_smiles(smiles)

    # Generate Morgan Fingerprint (ECFP)
    ecfp_features = generate_ecfp(smiles)

    # Generate Molecule Graph Convolution features
    mol_graph_node_features, mol_graph_edge_features, mol_graph_edge_index = (
        smiles_to_graph_featurizer(smiles)
    )

    # Nodes (atoms)
    for atom in mol.GetAtoms():
        atom_features_list.append(atom_features(atom, ecfp_features))

    atom_features_list = np.array(atom_features_list)

    # Edges (bonds)
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()

        # Append bidirectional edges for undirected graphs
        edge_index.append([i, j])
        edge_index.append([j, i])

        # Append bond features for both directions
        edge_attr.append(bond_features(bond))
        edge_attr.append(bond_features(bond))

    # Convert atom features to a tensor
    combined_features = np.concatenate(
        (atom_features_list, atom_features_deepchem, mol_graph_node_features), axis=1
    )
    x = torch.tensor(combined_features, dtype=torch.float)

    # Convert edge indices and features to tensors, handle empty edge case
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

    # combine edge features from ECFP and MolGraphConv
    edge_attr = np.array(edge_attr)
    edge_attr = np.concatenate((edge_attr, mol_graph_edge_features), axis=1)
    edge_attr = torch.tensor(edge_attr, dtype=torch.float)

    # Label (target)
    y = torch.tensor([label], dtype=torch.long)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)


# Function to load data from CSV and apply SMILES augmentation for training
def load_data_from_csv(file_path):
    df = pd.read_csv(file_path)

    smiles_list = df["Smiles"].values
    labels = df["Liver"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0).values

    data_list = []

    # Initialize the SmilesEnumerator for data augmentation
    for smiles, label in zip(smiles_list, labels):
        # For test data, no augmentation, just use canonical SMILES
        graph_data = smiles_to_graph(smiles, label)
        data_list.append(graph_data)

    return data_list

In [13]:
# Load training and testing data
training_data = load_data_from_csv("data_smiles/Training_Group.csv")
testing_data = load_data_from_csv("data_smiles/Testing_Group.csv")

In [14]:
# Create data loaders
train_loader = DataLoader(training_data, batch_size=32, shuffle=True)
test_loader = DataLoader(testing_data, batch_size=32, shuffle=False)

# Model training

## Stacking

In [21]:
import optuna
import torch
import numpy as np
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader

In [31]:
from torch_geometric.data import DataLoader  # Use this DataLoader

# Initialize global variables to store the best model and its AUC
best_model = None
best_model_auc = {"auc": 0.0}


def objective(trial):
    # Device setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Ensure y_train is aligned with indices
    y_train_reset = (
        y_train.reset_index(drop=True) if isinstance(y_train, pd.Series) else y_train
    )
    auc_scores = []

    for train_idx, test_idx in skf.split(X_train_pca, y_train_reset):
        # Split data into training and testing folds
        X_train_i, X_test_i = X_train_pca[train_idx], X_train_pca[test_idx]
        y_train_i, y_test_i = (
            y_train_reset.iloc[train_idx],
            y_train_reset.iloc[test_idx],
        )

        # Prepare datasets for GCN
        train_dataset = [training_data[i] for i in train_idx]
        test_dataset = [training_data[i] for i in test_idx]

        # Use torch_geometric's DataLoader
        train_loader_i = DataLoader(train_dataset, batch_size=32, shuffle=True)
        test_loader_i = DataLoader(test_dataset, batch_size=32, shuffle=False)

        # Get predictions from ExtraTreesClassifier for training data
        etc_train_probs_i = etc_model.predict_proba(X_train_i)[:, 1]

        # Get predictions from GCN for training data
        gcn_train_probs_i = []
        gcn_model.eval()
        for batch in train_loader_i:
            batch = batch.to(device)
            with torch.no_grad():
                out = gcn_model(batch)
                gcn_train_probs_i.extend(out[:, 1].cpu().numpy())
        gcn_train_probs_i = np.array(gcn_train_probs_i)

        # Combine predictions as stacked features
        stacked_features_train = np.vstack([etc_train_probs_i, gcn_train_probs_i]).T

        # Train a Logistic Regression model on the stacked features
        lr = LogisticRegression(max_iter=1000, random_state=42)
        lr.fit(stacked_features_train, y_train_i)

        # Get predictions from ExtraTreesClassifier for testing data
        etc_test_probs_i = etc_model.predict_proba(X_test_i)[:, 1]

        # Get predictions from GCN for testing data
        gcn_test_probs_i = []
        for batch in test_loader_i:
            batch = batch.to(device)
            with torch.no_grad():
                out = gcn_model(batch)
                gcn_test_probs_i.extend(out[:, 1].cpu().numpy())
        gcn_test_probs_i = np.array(gcn_test_probs_i)

        # Combine testing data predictions as stacked features
        stacked_features_test = np.vstack([etc_test_probs_i, gcn_test_probs_i]).T

        # Get final ensemble predictions using the trained Logistic Regression model
        final_probs_i = lr.predict_proba(stacked_features_test)[:, 1]

        # Evaluate AUC
        auc = roc_auc_score(y_test_i, final_probs_i)
        auc_scores.append(auc)

    # Save the best model
    global best_model, best_model_auc
    mean_auc = np.mean(auc_scores)
    if mean_auc > best_model_auc["auc"]:
        best_model_auc["auc"] = mean_auc
        best_model = lr

    # Return Mean AUC
    return mean_auc

In [32]:
# Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2024-11-18 21:07:04,738] A new study created in memory with name: no-name-c6b63476-8389-4c94-87b7-0b0b1a30508d
[I 2024-11-18 21:07:06,016] Trial 0 finished with value: 0.9945926773377529 and parameters: {}. Best is trial 0 with value: 0.9945926773377529.
[I 2024-11-18 21:07:07,338] Trial 1 finished with value: 0.9945533176651011 and parameters: {}. Best is trial 0 with value: 0.9945926773377529.
[I 2024-11-18 21:07:08,691] Trial 2 finished with value: 0.9944941020154363 and parameters: {}. Best is trial 0 with value: 0.9945926773377529.
[I 2024-11-18 21:07:10,053] Trial 3 finished with value: 0.994572725519484 and parameters: {}. Best is trial 0 with value: 0.9945926773377529.
[I 2024-11-18 21:07:11,449] Trial 4 finished with value: 0.994527035575031 and parameters: {}. Best is trial 0 with value: 0.9945926773377529.
[I 2024-11-18 21:07:12,787] Trial 5 finished with value: 0.994415181489838 and parameters: {}. Best is trial 0 with value: 0.9945926773377529.
[I 2024-11-18 21:07:14,17

In [33]:
# Output the Best Parameters and AUC
print("Best AUC Score:", best_model_auc["auc"])

Best AUC Score: 0.9946253112854926


In [36]:
# Logistic Regression as meta-learner
lr_model = best_model

# Get predictions for ExtraTreesClassifier on the test set
etc_test_probs = etc_model.predict_proba(X_test_pca)[:, 1]

# GCN predictions on the test set
gcn_test_probs = []
gcn_model.eval()
for batch in test_loader:
    batch = batch.to(device)
    with torch.no_grad():
        out = gcn_model(batch)
        gcn_test_probs.extend(out[:, 1].cpu().numpy())
gcn_test_probs = np.array(gcn_test_probs)

# Combine predictions as stacked features
stacked_features_test = np.vstack([etc_test_probs, gcn_test_probs]).T

# Final ensemble predictions
final_probs = lr_model.predict_proba(stacked_features_test)[:, 1]

# Evaluate metrics
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report

auc = roc_auc_score(y_test, final_probs)
accuracy = accuracy_score(y_test, (final_probs > 0.5).astype(int))

print("Test AUC:", auc)
print("Test Accuracy:", accuracy)
print(
    "Classification Report:\n",
    classification_report(y_test, (final_probs > 0.2).astype(int)),
)

Test AUC: 0.9103376261747302
Test Accuracy: 0.8566433566433567
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.62      0.73        65
           1       0.90      0.98      0.94       221

    accuracy                           0.90       286
   macro avg       0.89      0.80      0.83       286
weighted avg       0.89      0.90      0.89       286



## Weighted Averaging Ensembles

In [15]:
# Initialize global variables to store the best model and its AUC
best_model_1 = None
best_model_2 = None
best_model_auc = {"auc": 0.0}
best_weight_etc = 0


def objective(trial):
    # Hyperparameter Suggestions
    weight_etc = trial.suggest_float("weight_etc", 0.0, 1.0)
    weight_gcn = 1.0 - weight_etc  # Ensures weights sum to 1

    # Device setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Ensure y_train is aligned with indices
    y_train_reset = (
        y_train.reset_index(drop=True) if isinstance(y_train, pd.Series) else y_train
    )
    auc_scores = []

    for train_idx, test_idx in skf.split(X_train_pca, y_train_reset):
        # Use .iloc for Pandas or index directly for NumPy arrays
        X_test_i = X_train_pca[test_idx]
        y_test_i = y_train_reset.iloc[test_idx]

        # Prepare datasets for GCN
        test_dataset = [training_data[i] for i in test_idx]
        test_loader_i = DataLoader(test_dataset, batch_size=32, shuffle=False)

        # Get predictions from ExtraTreesClassifier
        etc_probs_i = etc_model.predict_proba(X_test_i)[:, 1]

        # Get predictions from GCN
        gcn_model.eval()
        gcn_probs_i = []
        for batch in test_loader_i:
            batch = batch.to(device)
            with torch.no_grad():
                out = gcn_model(batch)
            gcn_probs_i.append(out[:, 1].cpu().numpy())

        # Concatenate GCN probabilities
        gcn_probs_i = np.concatenate(gcn_probs_i)

        # Ensemble predictions
        final_probs_i = weight_etc * etc_probs_i + weight_gcn * gcn_probs_i

        # Evaluate AUC
        auc = roc_auc_score(y_test_i, final_probs_i)
        auc_scores.append(auc)

    # Save the best model
    global best_model_1, best_model_2, best_model_auc, best_weight_etc
    mean_auc = np.mean(auc_scores)
    if mean_auc > best_model_auc["auc"]:
        best_model_auc["auc"] = mean_auc
        best_model_1 = etc_model
        best_model_2 = gcn_model
        best_weight_etc = weight_etc

    # Return Mean AUC
    return mean_auc

In [16]:
# Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2024-11-18 20:46:18,075] A new study created in memory with name: no-name-166f2008-e472-4bc3-ba35-a8bc710f0f44
[I 2024-11-18 20:46:18,662] Trial 0 finished with value: 0.983918090194255 and parameters: {'weight_etc': 0.8514459328069176}. Best is trial 0 with value: 0.983918090194255.
[I 2024-11-18 20:46:18,940] Trial 1 finished with value: 0.975626185323212 and parameters: {'weight_etc': 0.6195749928411297}. Best is trial 0 with value: 0.983918090194255.
[I 2024-11-18 20:46:19,222] Trial 2 finished with value: 0.9672795214209179 and parameters: {'weight_etc': 0.29024316239735437}. Best is trial 0 with value: 0.983918090194255.
[I 2024-11-18 20:46:19,507] Trial 3 finished with value: 0.982560474192143 and parameters: {'weight_etc': 0.8218229189541337}. Best is trial 0 with value: 0.983918090194255.
[I 2024-11-18 20:46:19,782] Trial 4 finished with value: 0.9710065910118238 and parameters: {'weight_etc': 0.44838926181505834}. Best is trial 0 with value: 0.983918090194255.
[I 2024-11-1

In [17]:
# Output the Best Parameters and AUC
print("Best AUC Score:", best_model_auc["auc"])
print("Best Weight for ETC:", best_weight_etc)

Best AUC Score: 0.9947969008080644
Best Weight for ETC: 0.9945807092980802


## Evaluate

In [18]:
# Get predictions from ExtraTreesClassifier
etc_probs = best_model_1.predict_proba(X_test_pca)[:, 1]  # Probabilities for class 1

In [19]:
# Get predictions from GCN
gcn_probs = []
for batch in test_loader:
    batch = batch.to(device)
    with torch.no_grad():
        out = best_model_2(batch)
    gcn_probs.append(out[:, 1].cpu().numpy())  # Probabilities for class 1
gcn_probs = np.concatenate(gcn_probs)

In [20]:
# Optimal weights from the best trial
optimal_weight_etc = best_weight_etc
optimal_weight_gcn = 1.0 - optimal_weight_etc

# Combine predictions
final_probs = optimal_weight_etc * etc_probs + optimal_weight_gcn * gcn_probs

# Evaluate using AUC
auc = roc_auc_score(y_test, final_probs)
print(f"Test Set AUC: {auc}")

Test Set AUC: 0.916533240515141
