# RDKit-Based Approach

In [1]:
import pandas as pd
from rdkit import Chem
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

## 1. Preprocessing SMILES Data into Graphs

In [2]:
def atom_features(atom):
    """Encode a single RDKit atom as a 1-D float tensor with four entries.

    The entries are, in order: atomic number, degree, implicit valence and
    an aromaticity flag (1.0 when the atom is aromatic, otherwise 0.0).
    """
    atomic_num = atom.GetAtomicNum()
    degree = atom.GetDegree()
    implicit_valence = atom.GetImplicitValence()
    aromatic_flag = int(atom.GetIsAromatic())

    descriptors = [atomic_num, degree, implicit_valence, aromatic_flag]
    return torch.tensor(descriptors, dtype=torch.float)


def bond_features(bond):
    """Encode a single RDKit bond as a one-element float tensor.

    The only entry is the bond type expressed as a float, as reported by
    ``bond.GetBondTypeAsDouble()``.
    """
    bond_order = bond.GetBondTypeAsDouble()
    return torch.tensor([bond_order], dtype=torch.float)


def smiles_to_graph(smiles, label):
    """Convert a SMILES string and its class label into a PyG ``Data`` graph.

    Args:
        smiles: SMILES string describing the molecule.
        label: Integer class label stored as the graph-level target ``y``.

    Returns:
        A ``Data`` object with
        * ``x``          - ``[num_atoms, 4]`` float node features from
                           ``atom_features``,
        * ``edge_index`` - ``[2, 2 * num_bonds]`` long tensor; every bond is
                           stored in both directions,
        * ``edge_attr``  - ``[2 * num_bonds, 1]`` float edge features from
                           ``bond_features``,
        * ``y``          - ``[1]`` long tensor holding ``label``.

    Raises:
        ValueError: If RDKit cannot parse ``smiles`` or the parsed molecule
            contains no atoms.
    """
    mol = Chem.MolFromSmiles(smiles)
    # RDKit signals a parse failure by returning None rather than raising;
    # fail loudly here instead of with an AttributeError further down.
    if mol is None or mol.GetNumAtoms() == 0:
        raise ValueError(f"Cannot build a graph from SMILES: {smiles!r}")

    # Nodes (atoms)
    x = torch.stack([atom_features(atom) for atom in mol.GetAtoms()])

    # Edges (bonds): emit both (i, j) and (j, i) so the graph is undirected.
    edge_pairs = []
    edge_feats = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        feat = bond_features(bond)
        edge_pairs.extend(([i, j], [j, i]))
        edge_feats.extend((feat, feat))

    if edge_pairs:
        edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
        edge_attr = torch.stack(edge_feats)
    else:
        # Molecules without bonds (single atoms, monoatomic ions) still need
        # correctly shaped empty tensors: torch.stack([]) raises and a 1-D
        # edge_index would break batching. The feature width of 1 mirrors
        # what bond_features returns.
        edge_index = torch.empty((2, 0), dtype=torch.long)
        edge_attr = torch.empty((0, 1), dtype=torch.float)

    y = torch.tensor([label], dtype=torch.long)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)


def load_data_from_csv(
    file_path,
    smiles_col="Smiles",
    label_col="Liver",
    positive_label="Hepatotoxicity",
):
    """Read a CSV of labelled molecules and convert every row to a graph.

    Args:
        file_path: Path to the CSV file.
        smiles_col: Name of the column holding SMILES strings.
        label_col: Name of the column holding the class annotation.
        positive_label: Value in ``label_col`` that maps to class 1; every
            other value (including missing ones) maps to class 0.

    Returns:
        A list with one graph per CSV row, in file order, as produced by
        ``smiles_to_graph``.

    Raises:
        KeyError: If ``smiles_col`` or ``label_col`` is not in the CSV.
    """
    df = pd.read_csv(file_path)

    # Report all missing columns at once instead of failing on the first.
    missing = [col for col in (smiles_col, label_col) if col not in df.columns]
    if missing:
        raise KeyError(f"{file_path}: missing required column(s) {missing}")

    # Vectorized binary encoding: 1 where the annotation equals positive_label.
    labels = df[label_col].eq(positive_label).astype(int)

    return [
        smiles_to_graph(smiles, int(label))
        for smiles, label in zip(df[smiles_col], labels)
    ]

In [4]:
# Build the graph datasets for both splits from their CSV files
training_data = load_data_from_csv("data_smiles/Training_Group.csv")
testing_data = load_data_from_csv("data_smiles/Testing_Group.csv")

# Wrap each split in a mini-batch loader; only the training split is shuffled
loader_kwargs = {"batch_size": 32}
train_loader = DataLoader(training_data, shuffle=True, **loader_kwargs)
test_loader = DataLoader(testing_data, shuffle=False, **loader_kwargs)

In [5]:
# Peek at the first mini-batch (if any) to sanity-check the tensor shapes
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    print(first_batch)

DataBatch(x=[973, 4], edge_index=[2, 2054], edge_attr=[2054, 1], y=[32], batch=[973], ptr=[33])


## 2. Define and Train the GCN Model

In [9]:
from torch_geometric.nn import global_mean_pool


class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for graph-level classification.

    Node features pass through two ``GCNConv`` layers (each followed by
    ReLU), are mean-pooled per graph, and are mapped to class scores by a
    linear layer. Only ``x``, ``edge_index`` and ``batch`` are read from the
    input; ``edge_attr`` is not used by this model.

    Args:
        num_node_features: Size of each input node feature vector.
        num_classes: Number of output classes.
        hidden_channels: Width of both graph-convolution layers
            (defaults to 64).
    """

    def __init__(self, num_node_features, num_classes, hidden_channels=64):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc = torch.nn.Linear(hidden_channels, num_classes)

    def forward(self, data):
        """Return per-graph log-probabilities of shape ``[num_graphs, num_classes]``."""
        x, edge_index, batch = data.x, data.edge_index, data.batch

        # Two rounds of message passing over the molecular graph
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))

        # Average node embeddings within each graph to get one vector per graph
        x = global_mean_pool(x, batch)

        # Log-probabilities over classes (callers rely on this output form)
        return F.log_softmax(self.fc(x), dim=1)

In [10]:
# Seed PyTorch's global RNG so that weight initialisation, and any shuffling
# that draws from this generator afterwards, is repeatable between runs.
torch.manual_seed(42)

# Initialize the model, optimizer, and loss function
model = GCN(num_node_features=4, num_classes=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# NOTE: GCN.forward already returns log_softmax output and CrossEntropyLoss
# applies log_softmax again. The second application is a no-op on values that
# are already log-probabilities, so the loss is unchanged - just redundant.
criterion = torch.nn.CrossEntropyLoss()


def train(model, data_loader, num_epochs=50, optim=None, loss_fn=None):
    """Train ``model`` on ``data_loader`` and print the loss after every epoch.

    Args:
        model: Module called as ``model(batch)`` that returns class scores.
        data_loader: Iterable of batches; each batch must expose ``y``.
        num_epochs: Number of full passes over ``data_loader`` (default 50).
        optim: Optimizer that updates ``model``'s parameters. Defaults to the
            notebook-level ``optimizer``.
        loss_fn: Callable ``loss_fn(out, y)`` returning a scalar tensor.
            Defaults to the notebook-level ``criterion``.

    The printed value is the sum of the per-batch losses in that epoch,
    not their mean.
    """
    # Fall back to the notebook globals so train(model, loader) keeps working.
    # Pass optim explicitly when training any model other than the one the
    # global optimizer was built for, otherwise the wrong parameters are
    # stepped.
    optim = optimizer if optim is None else optim
    loss_fn = criterion if loss_fn is None else loss_fn

    model.train()

    for epoch in range(1, num_epochs + 1):
        total_loss = 0.0
        for data in data_loader:
            optim.zero_grad()
            out = model(data)
            loss = loss_fn(out, data.y)
            loss.backward()
            optim.step()
            total_loss += loss.item()
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

In [11]:
# Fit the model on the training split (prints one loss line per epoch)
train(model=model, data_loader=train_loader)

Epoch 1, Loss: 27.3317
Epoch 2, Loss: 26.8665
Epoch 3, Loss: 26.6688
Epoch 4, Loss: 27.0128
Epoch 5, Loss: 26.6740
Epoch 6, Loss: 26.1469
Epoch 7, Loss: 25.9940
Epoch 8, Loss: 25.6781
Epoch 9, Loss: 25.5442
Epoch 10, Loss: 25.4485
Epoch 11, Loss: 25.4367
Epoch 12, Loss: 25.4290
Epoch 13, Loss: 25.4247
Epoch 14, Loss: 25.5633
Epoch 15, Loss: 25.5268
Epoch 16, Loss: 25.2362
Epoch 17, Loss: 25.4831
Epoch 18, Loss: 25.3682
Epoch 19, Loss: 25.3603
Epoch 20, Loss: 25.5353
Epoch 21, Loss: 25.3558
Epoch 22, Loss: 25.1048
Epoch 23, Loss: 25.6165
Epoch 24, Loss: 25.2927
Epoch 25, Loss: 25.1852
Epoch 26, Loss: 25.5483
Epoch 27, Loss: 25.2417
Epoch 28, Loss: 25.1769
Epoch 29, Loss: 25.1897
Epoch 30, Loss: 25.1714
Epoch 31, Loss: 25.2801
Epoch 32, Loss: 25.0546
Epoch 33, Loss: 25.0823
Epoch 34, Loss: 25.0177
Epoch 35, Loss: 25.2474
Epoch 36, Loss: 25.1695
Epoch 37, Loss: 25.2873
Epoch 38, Loss: 25.1001
Epoch 39, Loss: 25.0107
Epoch 40, Loss: 24.9007
Epoch 41, Loss: 24.9690
Epoch 42, Loss: 25.0805
E

## 3. Evaluation Function

In [12]:
def test(model, data_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in data_loader:
            out = model(data)
            pred = out.argmax(dim=1)
            correct += (pred == data.y).sum().item()
            total += data.num_graphs
    return correct / total


# Score the trained model on the held-out split and report it as a percentage
accuracy = test(model=model, data_loader=test_loader)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 71.33%
