In [1]:
import joblib
import pandas as pd

from rdkit import Chem
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, global_mean_pool, global_add_pool
import torch
import torch.nn.functional as F
import numpy as np
from rdkit.Chem import rdFingerprintGenerator
import deepchem as dc
import optuna
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from torch_geometric.loader import DataLoader

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'dgl'
Skipped loading modules with transformers dependency. No module named 'transformers'
cannot import name 'HuggingFaceModel' from 'deepchem.models.torch_models' (/home/m12gbs1/miniconda3/envs/dili/lib/python3.9/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'
Skipped loading some PyTorch models, missing a dependency. No module named 'tensorflow'


In [2]:
# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


# Load pre-trained models

In [3]:
# Load the best model
model1 = joblib.load("models_cv5/best_extratrees_model_pca_mixed.pkl")
model1

In [4]:
# Load the best hyperparameters (optional)
model1_params = joblib.load("models_cv5/best_extratrees_params_pca_mixed.pkl")
model1_params

{'n_estimators': 255,
 'max_depth': 16,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt'}

In [5]:
class GCN(torch.nn.Module):
    def __init__(
        self,
        num_node_features,
        num_classes,
        num_layers=3,
        hidden_dim=64,
        dropout_prob=0.5,
        activation="relu",
    ):
        super(GCN, self).__init__()

        # Store activation function dynamically
        if activation == "relu":
            self.activation = F.relu
        elif activation == "tanh":
            self.activation = F.tanh
        else:
            raise ValueError("Unsupported activation function")

        self.dropout_prob = dropout_prob

        # Dynamically define the GCN layers
        self.convs = torch.nn.ModuleList()
        self.convs.append(GCNConv(num_node_features, hidden_dim))
        for _ in range(num_layers - 1):
            self.convs.append(GCNConv(hidden_dim, hidden_dim))

        # Final fully connected layer
        self.fc = torch.nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch

        # Apply GCN layers dynamically
        for conv in self.convs:
            x = conv(x, edge_index)
            x = self.activation(x)

        # Global pooling (combine different pooling methods)
        x = torch.cat([global_mean_pool(x, batch), global_add_pool(x, batch)], dim=1)

        # Apply dropout
        x = F.dropout(x, p=self.dropout_prob, training=self.training)

        # Final classification layer
        return F.log_softmax(self.fc(x), dim=1)

In [6]:
# Load model parameters
model2_params = joblib.load("models_cv5/best_model_params_graph_gcn_mixed.pkl")
model2_params

{'hidden_dim': 92,
 'num_layers': 3,
 'learning_rate': 0.0007376684210691645,
 'weight_decay': 1.2322840596113695e-05,
 'activation': 'tanh',
 'dropout_prob': 0.15206713363798266}

In [7]:
model2 = GCN(
    num_node_features=70,
    num_classes=2,
    num_layers=model2_params["num_layers"],
    hidden_dim=model2_params["hidden_dim"],
    dropout_prob=model2_params["dropout_prob"],
    activation=model2_params["activation"],
).to(device)

model2.load_state_dict(torch.load("models_cv5/best_model_graph_gcn_mixed.pth"))
model2

  model2.load_state_dict(torch.load("models_cv5/best_model_graph_gcn_mixed.pth"))


GCN(
  (convs): ModuleList(
    (0): GCNConv(70, 92)
    (1-2): 2 x GCNConv(92, 92)
  )
  (fc): Linear(in_features=184, out_features=2, bias=True)
)

# Data preparation

## PCA prep

In [8]:
# Load pd_train
pd_train = pd.read_parquet("data/training_class_mixed.parquet")
pd_train["label"] = pd_train["Liver"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_train.shape)

(1221, 16095)


In [9]:
# Load pd_test
pd_test = pd.read_parquet("data/testing_class_mixed.parquet")
pd_test["label"] = pd_test["Liver"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_test.shape)

(306, 16095)


In [10]:
X_train = pd_train.drop(columns=["Liver", "label", "Smiles"])
y_train = pd_train["label"]

X_test = pd_test.drop(columns=["Liver", "label", "Smiles"])
y_test = pd_test["label"]

In [11]:
# get X PCA components 50
pca = PCA(n_components=50)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

## Graph prep

In [12]:
def featurize_smiles(smiles):
    featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
    graph_data = featurizer.featurize([smiles])[0]

    # Get DeepChem atom features
    atom_features_deepchem = graph_data.node_features

    return atom_features_deepchem


# Function to generate Morgan Fingerprints (ECFP)
def generate_ecfp(smiles):
    # Morgan fingerprint generator
    mfgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=4096)

    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None
    return mfgen.GetFingerprintAsNumPy(molecule)


# Function to convert SMILES to PyTorch Geometric Data object using DeepChem featurizer
def smiles_to_graph_featurizer(smiles):
    # Featurization using DeepChem
    featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)

    # Featurize the SMILES string using DeepChem
    graph_data = featurizer.featurize([smiles])[0]
    return graph_data.node_features, graph_data.edge_features, graph_data.edge_index


# Function to extract atom features
def atom_features(atom, ecfp):
    # Get the atom index for corresponding ECFP value
    atom_idx = atom.GetIdx()

    return [
        atom.GetAtomicNum(),  # Atomic number
        atom.GetDegree(),  # Number of bonds
        atom.GetTotalNumHs(),  # Total number of hydrogens
        atom.GetFormalCharge(),  # Formal charge of the atom
        atom.GetImplicitValence(),  # Implicit valence
        atom.GetNumRadicalElectrons(),  # Number of radical electrons
        int(atom.GetIsAromatic()),  # Is the atom aromatic?
        atom.GetMass(),  # Atomic mass
        atom.GetHybridization().real,  # Hybridization state (SP, SP2, SP3, etc.)
        ecfp[atom_idx],  # Morgan fingerprint (ECFP) for the atom
    ]


# Function to extract bond features
def bond_features(bond):
    bond_type = bond.GetBondTypeAsDouble()  # Bond type as a float
    is_aromatic = bond.GetIsAromatic()  # Aromatic bond
    is_conjugated = bond.GetIsConjugated()  # Conjugated bond
    is_in_ring = bond.IsInRing()  # Whether the bond is part of a ring
    stereo = bond.GetStereo()  # Bond stereochemistry

    # Convert stereo information to a one-hot encoded format
    stereo_one_hot = [0, 0, 0, 0]  # Stereo options: None, E, Z, Other
    if stereo == Chem.BondStereo.STEREONONE:
        stereo_one_hot[0] = 1
    elif stereo == Chem.BondStereo.STEREOE:
        stereo_one_hot[1] = 1
    elif stereo == Chem.BondStereo.STEREOZ:
        stereo_one_hot[2] = 1
    else:
        stereo_one_hot[3] = 1

    # Combine all features into a single tensor
    return [
        bond_type,
        float(is_aromatic),
        float(is_conjugated),
        float(is_in_ring),
    ] + stereo_one_hot


# Convert SMILES to PyTorch Geometric Data object
def smiles_to_graph(smiles, label):
    mol = Chem.MolFromSmiles(smiles)

    atom_features_list = []
    edge_index = []
    edge_attr = []

    # DeepChem features
    atom_features_deepchem = featurize_smiles(smiles)

    # Generate Morgan Fingerprint (ECFP)
    ecfp_features = generate_ecfp(smiles)

    # Generate Molecule Graph Convolution features
    mol_graph_node_features, mol_graph_edge_features, mol_graph_edge_index = (
        smiles_to_graph_featurizer(smiles)
    )

    # Nodes (atoms)
    for atom in mol.GetAtoms():
        atom_features_list.append(atom_features(atom, ecfp_features))

    atom_features_list = np.array(atom_features_list)

    # Edges (bonds)
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()

        # Append bidirectional edges for undirected graphs
        edge_index.append([i, j])
        edge_index.append([j, i])

        # Append bond features for both directions
        edge_attr.append(bond_features(bond))
        edge_attr.append(bond_features(bond))

    # Convert atom features to a tensor
    combined_features = np.concatenate(
        (atom_features_list, atom_features_deepchem, mol_graph_node_features), axis=1
    )
    x = torch.tensor(combined_features, dtype=torch.float)

    # Convert edge indices and features to tensors, handle empty edge case
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

    # combine edge features from ECFP and MolGraphConv
    edge_attr = np.array(edge_attr)
    edge_attr = np.concatenate((edge_attr, mol_graph_edge_features), axis=1)
    edge_attr = torch.tensor(edge_attr, dtype=torch.float)

    # Label (target)
    y = torch.tensor([label], dtype=torch.long)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)


# Function to load data from parquet and apply SMILES augmentation for training
def load_data_from_parquet(file_path):
    df = pd.read_parquet(file_path)

    smiles_list = df["Smiles"].values
    labels = df["Liver"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0).values

    data_list = []

    # Initialize the SmilesEnumerator for data augmentation
    for smiles, label in zip(smiles_list, labels):
        # For test data, no augmentation, just use canonical SMILES
        graph_data = smiles_to_graph(smiles, label)
        data_list.append(graph_data)

    return data_list

In [13]:
# Load training and testing data
training_data = load_data_from_parquet("data/training_class_mixed.parquet")
testing_data = load_data_from_parquet("data/testing_class_mixed.parquet")

In [19]:
# Create data loaders
train_loader = DataLoader(training_data, batch_size=32, shuffle=True)
test_loader = DataLoader(testing_data, batch_size=32, shuffle=False)

# Model training

## Weighted Averaging Ensembles

In [14]:
# Initialize global variables to store the best model and its AUC
best_model_1 = None
best_model_2 = None
best_model_auc = {"auc": 0.0}


def objective(trial):
    # Hyperparameter Suggestions
    weight_etc = trial.suggest_float("weight_etc", 0.0, 1.0)
    weight_gcn = 1.0 - weight_etc  # Ensures weights sum to 1

    # Device setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Ensure y_train is aligned with indices
    y_train_reset = (
        y_train.reset_index(drop=True) if isinstance(y_train, pd.Series) else y_train
    )
    auc_scores = []

    for train_idx, test_idx in skf.split(X_train_pca, y_train_reset):
        # Use .iloc for Pandas or index directly for NumPy arrays
        X_train_i, X_test_i = X_train_pca[train_idx], X_train_pca[test_idx]
        y_train_i, y_test_i = (
            y_train_reset.iloc[train_idx],
            y_train_reset.iloc[test_idx],
        )

        # Train ExtraTreesClassifier
        etc_model = ExtraTreesClassifier(**model1_params)
        etc_model.fit(X_train_i, y_train_i)

        # Prepare datasets for GCN
        train_dataset = [training_data[i] for i in train_idx]
        test_dataset = [training_data[i] for i in test_idx]

        train_loader_i = DataLoader(train_dataset, batch_size=32, shuffle=True)
        test_loader_i = DataLoader(test_dataset, batch_size=32, shuffle=False)

        # Define GCN model
        gcn_model = GCN(
            num_node_features=70,
            num_classes=2,
            num_layers=model2_params["num_layers"],
            hidden_dim=model2_params["hidden_dim"],
            dropout_prob=model2_params["dropout_prob"],
            activation=model2_params["activation"],
        ).to(device)

        optimizer = torch.optim.Adam(
            gcn_model.parameters(),
            lr=model2_params["learning_rate"],
            weight_decay=model2_params["weight_decay"],
        )
        criterion = torch.nn.CrossEntropyLoss()

        # Train GCN
        gcn_model.train()
        for epoch in range(50):  # Fixed number of epochs
            for batch in train_loader_i:
                batch = batch.to(device)
                optimizer.zero_grad()
                out = gcn_model(batch)
                loss = criterion(out, batch.y)
                loss.backward()
                optimizer.step()

        # Get predictions from ExtraTreesClassifier
        etc_probs_i = etc_model.predict_proba(X_test_i)[
            :, 1
        ]  # Probabilities for class 1

        # Get predictions from GCN
        gcn_model.eval()
        gcn_probs_i = []
        for batch in test_loader_i:
            batch = batch.to(device)
            with torch.no_grad():
                out = gcn_model(batch)
            gcn_probs_i.append(out[:, 1].cpu().numpy())  # Probabilities for class 1

        # Concatenate GCN probabilities
        gcn_probs_i = np.concatenate(gcn_probs_i)

        # Ensemble predictions
        final_probs_i = weight_etc * etc_probs_i + weight_gcn * gcn_probs_i

        # Evaluate AUC
        auc = roc_auc_score(y_test_i, final_probs_i)
        auc_scores.append(auc)

    # Save the best model
    global best_model_1, best_model_2, best_model_auc
    mean_auc = np.mean(auc_scores)
    if mean_auc > best_model_auc["auc"]:
        best_model_auc["auc"] = mean_auc
        best_model_1 = etc_model
        best_model_2 = gcn_model

    # Return Mean AUC
    return mean_auc

In [15]:
# Run Optuna
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

[I 2024-11-17 15:00:43,805] A new study created in memory with name: no-name-76158529-90da-4dca-aad6-79ec56222e06
[I 2024-11-17 15:01:01,538] Trial 0 finished with value: 0.6777503570184604 and parameters: {'weight_etc': 0.010677047934205985}. Best is trial 0 with value: 0.6777503570184604.
[I 2024-11-17 15:01:19,061] Trial 1 finished with value: 0.7822371821664925 and parameters: {'weight_etc': 0.8047690139030934}. Best is trial 1 with value: 0.7822371821664925.
[I 2024-11-17 15:01:36,458] Trial 2 finished with value: 0.7403012626262626 and parameters: {'weight_etc': 0.5245087668045538}. Best is trial 1 with value: 0.7822371821664925.
[I 2024-11-17 15:01:53,917] Trial 3 finished with value: 0.7371136102403344 and parameters: {'weight_etc': 0.389681542203992}. Best is trial 1 with value: 0.7822371821664925.
[I 2024-11-17 15:02:11,384] Trial 4 finished with value: 0.7774205416231278 and parameters: {'weight_etc': 0.7573005876123421}. Best is trial 1 with value: 0.7822371821664925.
[I 20

In [16]:
# Output the Best Parameters and AUC
print("Best AUC Score:", best_model_auc["auc"])

Best AUC Score: 0.7919899599442702


In [17]:
joblib.dump(best_model_1, "models_cv5/wa_model_pca_mixed.pkl")

# Save the best model as PyTorch and Joblib
if best_model_2 is not None:
    torch.save(best_model_2.state_dict(), "models_cv5/wa_graph_gcn_mixed.pth")  # Save PyTorch model


## Evaluate

In [18]:
# Get predictions from ExtraTreesClassifier
etc_probs = best_model_1.predict_proba(X_test_pca)[:, 1]  # Probabilities for class 1


In [20]:
# Get predictions from GCN
gcn_probs = []
for batch in test_loader:
    batch = batch.to(device)
    with torch.no_grad():
        out = best_model_2(batch)
    gcn_probs.append(out[:, 1].cpu().numpy())  # Probabilities for class 1
gcn_probs = np.concatenate(gcn_probs)

In [21]:
# Optimal weights from the best trial
optimal_weight_etc = 0.9891295210909025
optimal_weight_gcn = 1.0 - optimal_weight_etc

# Combine predictions
final_probs = optimal_weight_etc * etc_probs + optimal_weight_gcn * gcn_probs

# Evaluate using AUC
auc = roc_auc_score(y_test, final_probs)
print(f"Test Set AUC: {auc}")

Test Set AUC: 0.7852817679558012
