In [None]:
pip install torch torchvision torchaudio

In [None]:
pip install pandas

In [None]:
pip install rdkit

In [None]:
pip install scikit-learn

# Function for calculating descriptors using rdkit

In [12]:
import pandas as pd
from rdkit.Chem import Descriptors

def get_descriptors(mol_list):
    """
    Calculate a fixed set of RDKit descriptors for each molecule in a list.

    Parameters
    ----------
    mol_list : iterable
        Molecule objects accepted by the rdkit.Chem.Descriptors functions.
        An entry that is None (presumably a molecule that failed to parse -
        verify against the caller) produces a row of NaN values instead of
        crashing inside the descriptor call.

    Returns
    -------
    list of dict
        One dict per input entry, in input order, with the keys 'MolWt',
        'MolLogP', 'TPSA', 'NumHDonors' and 'NumHAcceptors'.
        To convert the list to a dataframe just do df = pd.DataFrame(data);
        rows that came from None entries can then be removed with df.dropna().
    """
    descriptor_names = ('MolWt', 'MolLogP', 'TPSA', 'NumHDonors', 'NumHAcceptors')

    data = []
    for mol in mol_list:
        if mol is None:
            # Emit a placeholder row rather than skipping, so the output keeps
            # the same length and order as the input (and as any label list).
            data.append(dict.fromkeys(descriptor_names, float('nan')))
            continue

        # Each name is looked up as a function on the Descriptors module.
        data.append({name: getattr(Descriptors, name)(mol) for name in descriptor_names})

    return data

print('get_descriptors function initialized')

get_descriptors function initialized


# Function for taking a dataframe as input and using it to train a model

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

def train_fragment_model(df, target_col="target", epochs=50, batch_size=16, lr=1e-3, hidden_dim=128, test_split=0.2, seed=None):
    """
    Train a neural network using molecular descriptors to predict labels eg.
    ligand efficiency or fragment hit probability.

    As written the network ends in a Sigmoid and is trained with BCELoss, so
    the target column is expected to hold values in [0, 1] (binary labels).

    Parameters
    ----------
    df : pandas.DataFrame
        Every column except `target_col` is used as a feature and must be
        convertible to float32.
    target_col : str
        Name of the label column.
    epochs : int
        Number of passes over the training split.
    batch_size : int
        Mini-batch size for the training loader.
    lr : float
        Learning rate for Adam.
    hidden_dim : int
        Width of the first hidden layer; the next two are hidden_dim//2 and
        hidden_dim//4.
    test_split : float
        Fraction of rows held out for testing, 0 <= test_split < 1.
    seed : int or None
        When given, torch.manual_seed(seed) is called first so the split, the
        weight initialisation and the batch shuffling are reproducible. Note
        that this reseeds torch's global random number generator.

    Returns
    -------
    (model, (X_test, y_test))
        The trained nn.Sequential model and the held-out features/labels as
        tensors of shape (n_test, n_features) and (n_test, 1). X_test is
        already scaled. The fitted StandardScaler is attached to the model as
        `model.feature_scaler` so new data can be scaled the same way.

    Raises
    ------
    ValueError
        If test_split is outside [0, 1) or no rows are left for training.
    """
    if not 0 <= test_split < 1:
        raise ValueError(f"test_split must satisfy 0 <= test_split < 1, got {test_split}")

    if seed is not None:
        torch.manual_seed(seed)

    # Split features and target
    X = df.drop(columns=[target_col]).values.astype(np.float32)
    y = df[target_col].values.astype(np.float32)

    # Train/test split (done BEFORE scaling so test rows cannot influence the scaler)
    n_total = len(df)
    n_test = int(n_total * test_split)
    n_train = n_total - n_test
    if n_train <= 0:
        raise ValueError("No rows available for training; provide a non-empty dataframe")

    shuffled = torch.randperm(n_total)
    train_idx, test_idx = shuffled[:n_train], shuffled[n_train:]

    # Scale features: statistics come from the training rows only, then the
    # same transform is applied to every row.
    scaler = StandardScaler()
    scaler.fit(X[train_idx.numpy()])

    # Convert to tensors
    X_tensor = torch.tensor(scaler.transform(X), dtype=torch.float32)
    y_tensor = torch.tensor(y).view(-1, 1)

    train_ds = TensorDataset(X_tensor[train_idx], y_tensor[train_idx])
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

    # Model definition
    input_dim = X.shape[1]  # Number of feature columns
    model = nn.Sequential(
        nn.Linear(input_dim, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, hidden_dim//2),
        nn.ReLU(),
        nn.Linear(hidden_dim//2, hidden_dim//4),
        nn.ReLU(),
        nn.Linear(hidden_dim//4, 1),

        # Remove if for regression task
        nn.Sigmoid()
    )

    # Set criterion depending on expected output eg. for classification or regression
    # For regression use MSELoss
    criterion = nn.BCELoss()

    # As usual Adam is used
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training loop
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        if (epoch+1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}")

    # Keep the fitted scaler reachable; without it new samples cannot be
    # brought onto the scale the model was trained on.
    model.feature_scaler = scaler

    # Get test data as tensors (plain indexing also works for an empty split)
    X_test, y_test = X_tensor[test_idx], y_tensor[test_idx]

    return model, (X_test, y_test)

print('train_fragment_model function initialized')

train_fragment_model function initialized


# Function to compute evaluation metrics for a trained model

In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix

def evaluate_model(model, X_test, y_test, threshold=0.5):
    """
    Evaluate a binary classifier on held-out data and print/return metrics.

    Parameters
    ----------
    model : callable torch module
        Called as model(X_test); its outputs are compared against
        `threshold`, so they are assumed to be probabilities - TODO confirm
        for models that do not end in a Sigmoid.
    X_test : torch.Tensor
        Feature tensor passed straight to the model.
    y_test : torch.Tensor
        True labels; flattened and cast to int, so they are expected to be
        0/1 values.
    threshold : float
        Outputs >= threshold are predicted as class 1.

    Returns
    -------
    dict
        Keys "accuracy", "sensitivity", "specificity" and "confusion_matrix"
        (a 2x2 array laid out as [[tn, fp], [fn, tp]]).
    """
    model.eval()
    with torch.no_grad():
        # .cpu() keeps this working when the model/tensors live on another device
        probs = model(X_test).detach().cpu().numpy().flatten()

    preds = (probs >= threshold).astype(int)
    y_true = y_test.detach().cpu().numpy().flatten().astype(int)

    if y_true.size == 0:
        # Nothing to score; report an all-zero matrix instead of failing
        cm = np.zeros((2, 2), dtype=int)
    else:
        # labels=[0, 1] forces a 2x2 matrix even when only one class is
        # present; otherwise the 4-way unpacking below raises ValueError.
        cm = confusion_matrix(y_true, preds, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total > 0 else 0
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    print(f"Accuracy: {accuracy:.3f}")
    print(f"Sensitivity (Recall): {sensitivity:.3f}")
    print(f"Specificity: {specificity:.3f}")
    print("Confusion Matrix:")
    print(cm)

    eval_metrics = {"accuracy": accuracy,
                    "sensitivity": sensitivity,
                    "specificity": specificity,
                    "confusion_matrix": cm
                   }

    return eval_metrics

print('evaluate_model function initialized')

evaluate_model function initialized
