In [None]:
# Cell 1: Installations
# Environment setup cell: installs the packages the workflow cell imports.
# The `!` lines are IPython shell escapes, so this cell only runs inside a
# notebook kernel, not as a plain Python script.

# Install the condacolab helper quietly (-q) with pip.
!pip install -q condacolab
import condacolab
# NOTE(review): presumably bootstraps a conda distribution inside the Colab
# runtime (and may restart the kernel) - confirm against the condacolab docs.
condacolab.install()
# Install mamba from the conda-forge channel, auto-confirming prompts (-y).
!conda install -c conda-forge mamba -y
# Install the data handling, plotting and cheminformatics packages via mamba.
!mamba install -q -y -c conda-forge pandas matplotlib seaborn rdkit
# Upgrade keras and scikit-learn with pip.
# NOTE(review): the workflow cell imports `tensorflow.keras`, but tensorflow
# itself is not installed in this cell - confirm it is present in the runtime.
!pip install --upgrade keras scikit-learn

In [None]:
# Cell 2: Complete Workflow
from pathlib import Path
from warnings import filterwarnings
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import MACCSkeys, Draw, rdFingerprintGenerator
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import Sequential, load_model, model_from_json
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import ModelCheckpoint
%matplotlib inline

# Silence warnings
# NOTE: called without a category or module filter, this suppresses every
# warning issued through the `warnings` module for the rest of the session,
# including deprecation and pandas/Keras diagnostics.
filterwarnings("ignore")

# Function to convert SMILES to fingerprints
def smiles_to_fp(smiles, method="maccs", n_bits=2048):
    """Encode a SMILES string as a molecular fingerprint array.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.
    method : str, optional
        Fingerprint type: ``"maccs"`` (default), ``"morgan2"`` (Morgan count
        fingerprint, radius 2) or ``"morgan3"`` (Morgan count fingerprint,
        radius 3). Any other value prints a warning and falls back to MACCS.
    n_bits : int, optional
        Fingerprint size passed to the Morgan generator; unused for MACCS.

    Returns
    -------
    numpy.ndarray
        The RDKit fingerprint converted with ``np.array``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None; fail early with
    # a message that names the offending input instead of an opaque RDKit
    # argument error raised from inside the fingerprint call.
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    # Both Morgan variants differ only in their radius.
    if method == "morgan2":
        radius = 2
    elif method == "morgan3":
        radius = 3
    else:
        radius = None

    if radius is not None:
        fpg = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
        return np.array(fpg.GetCountFingerprint(mol))

    if method != "maccs":
        # print() rather than warnings.warn(): the notebook silences all
        # warnings globally, which would hide this message.
        print(f"Warning: Wrong method specified: {method}. Default used.")
    return np.array(MACCSkeys.GenMACCSKeys(mol))

# Load and prepare data
df = pd.read_csv("EGFR_compounds_new.csv").reset_index(drop=True)
# Take an explicit copy of the two needed columns: assigning a new column to a
# bare column-subset of `df` is chained assignment (SettingWithCopyWarning)
# and is not guaranteed to behave the same across pandas versions.
chembl_df = df[["smiles", "pIC50"]].copy()
# One fingerprint array per compound, using the default method of smiles_to_fp.
chembl_df["fingerprints_df"] = chembl_df["smiles"].apply(smiles_to_fp)

# Split data into training and test sets
# 70 % training / 30 % test; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    chembl_df["fingerprints_df"], chembl_df[["pIC50"]], test_size=0.3, random_state=42
)

# Define neural network model
def neural_network_model(hidden1, hidden2):
    """Build and compile a fully connected regression network.

    Parameters
    ----------
    hidden1 : int
        Number of units in the first hidden layer ("layer1").
    hidden2 : int
        Number of units in the second hidden layer ("layer2").

    Returns
    -------
    Sequential
        Model with two ReLU hidden layers and a single linear output unit
        ("layer3"), compiled with the Adam optimizer, mean-squared-error loss
        and the "mse" and "mae" metrics.
    """
    # (units, activation, name) for each Dense layer, in stacking order.
    layer_specs = (
        (hidden1, "relu", "layer1"),
        (hidden2, "relu", "layer2"),
        (1, "linear", "layer3"),
    )
    network = Sequential()
    for units, activation, layer_name in layer_specs:
        network.add(Dense(units, activation=activation, name=layer_name))
    network.compile(
        optimizer="adam",
        loss="mean_squared_error",
        metrics=["mse", "mae"],
    )
    return network

# Neural network parameters
batch_sizes = [16, 32, 64]
nb_epoch = 50
layer1_size = 64
layer2_size = 32

# Stack the per-compound fingerprint arrays into 2-D float matrices once;
# the conversion does not depend on the batch size, so it is hoisted out of
# the loop below.
x_train_array = np.array(list(x_train)).astype(float)
x_test_array = np.array(list(x_test)).astype(float)

# Plot loss for different batch sizes
# One subplot per batch size; each trains a fresh model for `nb_epoch` epochs
# and uses the held-out test split as validation data.
plt.figure(figsize=(12, 6))
sns.set(color_codes=True)
for index, batch in enumerate(batch_sizes):
    plt.subplot(1, len(batch_sizes), index + 1)
    model = neural_network_model(layer1_size, layer2_size)
    history = model.fit(
        x_train_array,
        y_train.values,
        batch_size=batch,
        validation_data=(x_test_array, y_test.values),
        verbose=0,
        epochs=nb_epoch,
    )
    plt.plot(history.history["loss"], label="train")
    plt.plot(history.history["val_loss"], label="test")
    # Labels come from the label= arguments of the two plot calls above.
    plt.legend(loc="upper right")
    plt.ylabel("loss")
    plt.xlabel("epoch")
    plt.ylim((0, 15))
    # Validation loss after the last epoch.
    final_val_loss = history.history["val_loss"][-1]
    plt.title(f"test loss = {final_val_loss:.2f}, batch size = {batch}")
plt.show()

# Train model with best batch size (64) and save best weights
filepath = "best_weights.weights.h5"
# Weights-only checkpoint that overwrites `filepath` whenever the training
# loss reaches a new minimum.
# NOTE(review): this checkpoint file is not loaded again anywhere in the
# visible code; the weights saved and reloaded later are the final-epoch ones.
checkpoint = ModelCheckpoint(
    filepath,
    monitor="loss",
    mode="min",
    save_best_only=True,
    save_weights_only=True,
    verbose=0,
)
callbacks_list = [checkpoint]

model = neural_network_model(layer1_size, layer2_size)
model.fit(
    np.array(list(x_train)).astype(float),
    y_train.values,
    batch_size=64,
    epochs=nb_epoch,
    callbacks=callbacks_list,
    verbose=0,
)

# Save model to JSON and weights to HDF5
# The architecture goes to model.json; the current weights of the trained
# model go to model.weights.h5.
model_json = model.to_json()
Path("model.json").write_text(model_json)
model.save_weights("model.weights.h5")

# Evaluate model
# Build the test matrix once and cast it to float, matching the dtype used
# for the training and external-prediction inputs elsewhere in this cell.
x_test_array = np.array(list(x_test)).astype(float)
# scores follows the compile() order: [loss, mse, mae].
scores = model.evaluate(x_test_array, y_test.values, verbose=0)
print(f"Evaluate the model on the test data\n loss: {scores[0]:.2f}\n mse: {scores[1]:.2f}\n mae: {scores[2]:.2f}")

# Predict on test set
y_pred = model.predict(x_test_array)
print("\nFirst 5 predicted pIC50 values:")
# Plain loop: the prints are side effects, so no throwaway list is built.
for value in y_pred[:5]:
    print(f"{value[0]:.2f}")

# Scatter plot of predicted vs true values
# Predictions on the x axis, measured values on the y axis, with the identity
# line drawn over the same 0-15 range used for both axes.
axis_limits = (0, 15)
fig, ax = plt.subplots()
ax.scatter(y_pred, y_test, marker=".")
lin = np.linspace(axis_limits[0], axis_limits[1], 100)
ax.plot(lin, lin)
ax.set_aspect("equal", adjustable="box")
ax.set_xlabel("Predicted values")
ax.set_ylabel("True values")
ax.set_title("Scatter plot: pIC50 values")
ax.set_xlim(axis_limits)
ax.set_ylim(axis_limits)
plt.show()

# Load and predict on external data
external_data = pd.read_csv("test.csv").reset_index(drop=True)
# Fingerprints use the default method of smiles_to_fp, as for the training data.
external_data["fingerprints_df"] = external_data["canonical_smiles"].apply(smiles_to_fp)

# Load model and predict
# Rebuild the architecture from JSON, then restore the saved weights. The
# context manager closes the file even if reading fails.
with open("model.json", "r") as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
model.load_weights("model.weights.h5")
# No callbacks at inference: the training ModelCheckpoint has nothing to
# monitor during predict(), and passing it tied this step to the training
# section having been run in the same session.
external_fingerprints = np.array(list(external_data["fingerprints_df"])).astype(float)
predictions = model.predict(external_fingerprints)

# Save predictions
# Attach the model output as a "predicted_pIC50" column (joined on the row
# index) and write the combined table to CSV.
# NOTE(review): the "fingerprints_df" array column is written to the CSV as
# well - confirm that is intended.
prediction_column = pd.DataFrame(predictions, columns=["predicted_pIC50"])
predicted_pIC50_df = external_data.join(prediction_column)
predicted_pIC50_df.to_csv("predicted_pIC50_df.csv")

# Select and display top 3 compounds
# The three rows with the highest predicted pIC50 are drawn side by side,
# each captioned with its predicted value.
top3_drug = predicted_pIC50_df.nlargest(3, "predicted_pIC50")
mols_EGFR = list(map(Chem.MolFromSmiles, top3_drug["canonical_smiles"]))
pIC50_values = [
    f"pIC50 value: {predicted:.2f}" for predicted in top3_drug["predicted_pIC50"]
]
# Kept as the last expression so the notebook renders the returned image.
Draw.MolsToGridImage(
    mols_EGFR,
    legends=pIC50_values,
    molsPerRow=3,
    subImgSize=(450, 300),
)