In [3]:
import torch
import numpy as np
import random
import pandas as pd
from rdkit import Chem
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import torch.nn.functional as F

# Set seeds for full reproducibility
seed_value = 42
np.random.seed(seed_value)
torch.manual_seed(seed_value)
random.seed(seed_value)

# Ensure deterministic behavior in PyTorch
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(True)

# Allowed atoms (C, H, N, S, Cl, Br, O, F, I)
allowed_atoms = [6, 1, 7, 16, 17, 35, 8, 9, 53]

# Function to convert SMILES to molecular graphs
def smiles_to_graph(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    
    # Get atom features
    atoms = []
    for atom in mol.GetAtoms():
        atomic_num = atom.GetAtomicNum()
        if atomic_num not in allowed_atoms:
            return None  # Skip molecules with invalid atoms
        atoms.append(atomic_num)
    
    # Get bond information
    edges = []
    edge_attr = []
    for bond in mol.GetBonds():
        start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        bond_type = bond.GetBondType()
        
        if bond_type == Chem.rdchem.BondType.SINGLE:
            bond_feature = [1, 0, 0, 0]
        elif bond_type == Chem.rdchem.BondType.DOUBLE:
            bond_feature = [0, 1, 0, 0]
        elif bond_type == Chem.rdchem.BondType.TRIPLE:
            bond_feature = [0, 0, 1, 0]
        elif bond_type == Chem.rdchem.BondType.AROMATIC:
            bond_feature = [0, 0, 0, 1]
        
        edges.append((start, end))
        edge_attr.append(bond_feature)
        edges.append((end, start))  # Bidirectional edge
        edge_attr.append(bond_feature)
    
    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
    x = torch.tensor(atoms, dtype=torch.float).view(-1, 1)
    edge_attr = torch.tensor(edge_attr, dtype=torch.float)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

# Load data
AA = "C:/Users/ganes/OneDrive/Desktop/AI/Drug/DNA/DNA_Smiles_Final.xlsx"
df = pd.read_excel(AA)
smiles_list = df['Smiles'].tolist()
graph_data = [smiles_to_graph(smiles) for smiles in smiles_list]
graph_data = [g for g in graph_data if g is not None]

# Create DataLoader for batched training
batch_size = 32
loader = DataLoader(graph_data, batch_size=batch_size, shuffle=False, num_workers=0)

class GCNGenerator(torch.nn.Module):
    def __init__(self, num_node_features, hidden_channels, num_edge_features):
        super(GCNGenerator, self).__init__()
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.fc_mu = torch.nn.Linear(hidden_channels, hidden_channels)
        self.fc_logvar = torch.nn.Linear(hidden_channels, hidden_channels)
        self.fc_decode = torch.nn.Linear(hidden_channels, num_node_features)

        self.edge_mlp = torch.nn.Linear(num_edge_features, hidden_channels)
        self.fc_bond = torch.nn.Linear(hidden_channels, num_edge_features)

    def encode(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        mu = self.fc_mu(x)
        logvar = self.fc_logvar(x)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.zeros_like(std)  # Always use zeros to eliminate randomness
        return mu + eps * std

    def decode(self, z):
        # Decode node features
        x = self.fc_decode(z)
        return x

    def forward(self, data):
        x, edge_index, edge_attr = data.x, data.edge_index, data.edge_attr
        mu, logvar = self.encode(x, edge_index)
        z = self.reparameterize(mu, logvar)
        decoded_x = self.decode(z)
        
        # Edge feature processing
        edge_attr = F.relu(self.edge_mlp(edge_attr))
        bond_pred = self.fc_bond(edge_attr)
        
        return decoded_x, bond_pred, mu, logvar

# Initialize the model
model = GCNGenerator(num_node_features=1, hidden_channels=64, num_edge_features=4)

# Apply fixed initialization for full determinism
def reset_weights(m):
    if isinstance(m, torch.nn.Linear) or isinstance(m, GCNConv):
        torch.manual_seed(seed_value)  # Fix seed before initialization
        m.reset_parameters()

model.apply(reset_weights)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Function to convert graph back to SMILES (canonical form)
def graph_to_smiles(data):
    mol = Chem.RWMol()
    atom_map = {}
    
    # Add atoms
    for i, atom_feature in enumerate(data.x):
        atomic_num = int(atom_feature.item())
        if atomic_num not in allowed_atoms:
            return None  # Skip if invalid atom is generated
        atom_idx = mol.AddAtom(Chem.Atom(atomic_num))
        atom_map[i] = atom_idx
    
    # Add bonds
    for i, (start, end) in enumerate(data.edge_index.t()):
        start, end = start.item(), end.item()
        bond_pred = data.edge_attr[i].argmax().item()
        
        if bond_pred == 0:
            bond_type = Chem.rdchem.BondType.SINGLE
        elif bond_pred == 1:
            bond_type = Chem.rdchem.BondType.DOUBLE
        elif bond_pred == 2:
            bond_type = Chem.rdchem.BondType.TRIPLE
        elif bond_pred == 3:
            bond_type = Chem.rdchem.BondType.AROMATIC
        
        # Check if bond already exists
        if mol.GetBondBetweenAtoms(atom_map[start], atom_map[end]) is None:
            mol.AddBond(atom_map[start], atom_map[end], bond_type)
    
    # Convert to canonical SMILES
    try:
        smiles = Chem.MolToSmiles(mol, canonical=True, doRandom=False)  # Disable randomness
    except:
        return None  # In case RDKit fails to generate valid SMILES
    return smiles

# Function to train the model and collect generated SMILES
def train_and_collect_smiles(loader, max_smiles=200):
    model.train()  # Set model to training mode
    total_loss = 0
    generated_smiles_list = set()

    for data in loader:
        if len(generated_smiles_list) >= max_smiles:
            break  # Stop once we have enough SMILES

        optimizer.zero_grad()
        
        # Forward pass
        decoded_x, bond_pred, mu, logvar = model(data)
        
        # Loss for atom type prediction (mean squared error for simplicity)
        atom_loss = F.mse_loss(decoded_x, data.x)
        
        # Loss for bond type prediction
        bond_loss = F.mse_loss(bond_pred, data.edge_attr)
        
        # Kullback-Leibler divergence
        kl_loss = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
        
        # Total loss
        loss = atom_loss + bond_loss + kl_loss
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()

        # Collect generated SMILES
        generated_smiles = graph_to_smiles(data)
        if generated_smiles:
            generated_smiles_list.add(generated_smiles)
        
        if len(generated_smiles_list) >= max_smiles:
            break  # Stop once we have enough SMILES
    
    return total_loss / len(loader), generated_smiles_list

# Train and collect generated SMILES
for epoch in range(20):  # Single epoch since we only need 200 SMILES
    avg_loss, generated_smiles = train_and_collect_smiles(loader)
    print(f'Epoch: {epoch + 1}, Avg Loss: {avg_loss:.4f}, Generated SMILES Count: {len(generated_smiles)}')
    if len(generated_smiles) >= 200:
        break  # Stop training if we have enough unique SMILES



Epoch: 1, Avg Loss: 12.2082, Generated SMILES Count: 42
Epoch: 2, Avg Loss: 5.2339, Generated SMILES Count: 42
Epoch: 3, Avg Loss: 5.0397, Generated SMILES Count: 42
Epoch: 4, Avg Loss: 4.9272, Generated SMILES Count: 42
Epoch: 5, Avg Loss: 4.8409, Generated SMILES Count: 42
Epoch: 6, Avg Loss: 4.7667, Generated SMILES Count: 42
Epoch: 7, Avg Loss: 4.6998, Generated SMILES Count: 42
Epoch: 8, Avg Loss: 4.6389, Generated SMILES Count: 42
Epoch: 9, Avg Loss: 4.5808, Generated SMILES Count: 42
Epoch: 10, Avg Loss: 4.5213, Generated SMILES Count: 42
Epoch: 11, Avg Loss: 4.4542, Generated SMILES Count: 42
Epoch: 12, Avg Loss: 4.3685, Generated SMILES Count: 42
Epoch: 13, Avg Loss: 4.2462, Generated SMILES Count: 42
Epoch: 14, Avg Loss: 4.0736, Generated SMILES Count: 42
Epoch: 15, Avg Loss: 3.8579, Generated SMILES Count: 42
Epoch: 16, Avg Loss: 3.6247, Generated SMILES Count: 42
Epoch: 17, Avg Loss: 3.3649, Generated SMILES Count: 42
Epoch: 18, Avg Loss: 3.1276, Generated SMILES Count: 42


In [4]:
# Split SMILES strings and collect them in a list
final_smiles_list = sorted(list(generated_smiles))
split_smiles_list = []
for smiles in final_smiles_list:
    split_smiles_list.extend(smiles.split('.'))

# Create a DataFrame from the split SMILES list
df_split_smiles = pd.DataFrame(split_smiles_list, columns=['SMILES'])

# Save the DataFrame to an Excel file
output_file_path = "split_smiles_list.xlsx"
df_split_smiles.to_excel(output_file_path, index=False)

print(f"SMILES strings have been successfully saved to {output_file_path}")


SMILES strings have been successfully saved to split_smiles_list.xlsx


In [5]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools

# Load your original Excel file with the SMILES
AA = "C:/Users/ganes/OneDrive/Desktop/AI/Drug/DNA/Valid_Smiles.xlsx"
df = pd.read_excel(AA)

# Add molecule column to DataFrame
PandasTools.AddMoleculeColumnToFrame(df, smilesCol='Smiles', molCol='Mol')

# Create a list to store invalid SMILES and their indices
invalid_smiles = []

# Loop through the DataFrame and check each SMILES
for index, row in df.iterrows():
    try:
        mol = Chem.MolFromSmiles(row['Smiles'])  # Attempt to create the molecule from SMILES

        if mol is None:  # Check if the molecule is valid
            invalid_smiles.append({'Index': index, 'SMILES': row['Smiles'], 'Error': 'Invalid SMILES'})
    except ValueError as e:
        # Capture the SMILES and index number if there's a ValueError related to valence
        if "Explicit valence" in str(e):
            invalid_smiles.append({'Index': index, 'SMILES': row['Smiles'], 'Error': 'Explicit valence error'})

# Convert the list of invalid SMILES to a DataFrame
invalid_smiles_df = pd.DataFrame(invalid_smiles)

# Save the invalid SMILES DataFrame to a new Excel file
output_path = "C:/Users/ganes/OneDrive/Desktop/AI/Drug/DNA/Invalid_Smiles.xlsx"
invalid_smiles_df.to_excel(output_path, index=False)

print(f"Invalid SMILES saved to {output_path}")


Invalid SMILES saved to C:/Users/ganes/OneDrive/Desktop/AI/Drug/DNA/Invalid_Smiles.xlsx
