In [3]:
import torch

In [6]:
import torch.nn.functional as F

In [16]:
import torch.optim as optim

In [4]:
from torch import nn

In [5]:
from torch.utils import data

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
import sys

In [18]:
import numpy as np

# Practice Exercise

For the exercise, you will train a network, but this time using the toxicity data from notebook 5.
First, load all the required libraries and data again. This time, also use Batchnorm, Dropout and the ADAM algorithm.

In [8]:

if 'google.colab' in sys.modules: # checks whether the notebook runs on collab
    !wget https://raw.githubusercontent.com/kochgroup/intro_pharma_ai/main/utils/utils.py
    !pip install rdkit==2025.3.6
    %run utils.py
else:
    %run ../utils/utils.py # loads pre-written functions

--2025-09-30 08:47:57--  https://raw.githubusercontent.com/kochgroup/intro_pharma_ai/main/utils/utils.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8367 (8.2K) [text/plain]
Saving to: ‘utils.py’


2025-09-30 08:47:58 (88.9 MB/s) - ‘utils.py’ saved [8367/8367]

Collecting rdkit==2025.3.6
  Downloading rdkit-2025.3.6-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit-2025.3.6-cp312-cp312-manylinux_2_28_x86_64.whl (36.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.1/36.1 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.3.6


In [9]:
data_tox = pd.read_csv("https://raw.githubusercontent.com/filipsPL/tox21_dataset/master/compounds/sr-mmp.tab", sep = "\t")
data_tox = data_tox.iloc[:,1:] # all columns except the first (index 0) are chosen
data_tox.columns = ["smiles", "activity"]
data_tox.head()

Unnamed: 0,smiles,activity
0,OC(=O)[C@H](O)[C@@H](O)[C@H](O)C(=O)CO,0
1,C[C@]12CC[C@H]3[C@@H](CCc4cc(O)ccc43)[C@@H]1CC...,1
2,CC(C)(C)c1cc(O)ccc1O,1
3,CN(C)c1ccc(cc1)C(c1ccccc1)=C1C=CC(C=C1)=[N+](C)C,1
4,NC(Cc1ccccc1)C(O)=O,0


Next, you calculate the fingerprints. As in notebook 5, the function `get_fingerprints` is available for this purpose.

In [10]:
fps = get_fingerprints(data_tox)
fps["activity"] = data_tox.activity
fps.head()

100%|██████████| 2246/2246 [00:04<00:00, 525.01it/s]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,activity
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,1
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Before you can use them in Pytorch, you need to convert both the fingerprints and the `acitivty` to `tensors`. Note that both are  in the DataFrame `fps`.

`.values` converts a DataFrame into an `np.array`.

Then the data is split into a training set and a test set.

In [11]:
fps = torch.tensor(fps.values, dtype=torch.float32)

In [12]:
train, test=train_test_split(fps,test_size= 0.2 , train_size= 0.8, random_state=1234)


train_x = train[:,:-1]
train_y = train[:,-1]
test_x = test[:,:-1]
test_y = test[:,-1]

Now we want to use minibatches again. For this we still have to convert our training data into a `DataLoader`. Why only the training data? The use of minibatches is only relevant for training. As long as your computer is able to run the test dataset through the network all at once, we don`t need to split the test dataset into minibatches.

In [13]:
train_data=data.TensorDataset(train_x, train_y) # input are our tensors, for the fingerprints and the activities
loader=data.DataLoader(train_data, batch_size = 32)
len(loader)

57

In [14]:
train_x.shape[1]

2048

Adjust the net so that the input and output are the right size. So the length of the fingerprints and the number of classes we predict.

In [23]:
net= nn.Sequential(nn.Linear(2048, 512),
                   nn.BatchNorm1d(512),
                   nn.ReLU(),
                   nn.Dropout(0.5),
                   nn.Linear(512, 128),
                   nn.BatchNorm1d(128),
                   nn.ReLU(),
                   nn.Dropout(0.5),
                   nn.Linear(128, 1))

loss_function = nn.BCEWithLogitsLoss()
update = optim.SGD(net.parameters(), lr=0.01)
EPOCHS = 10

Last, fill the `for loop`.

`.squeeze` converts the `(n,1)` `output` tensor to a 1-dimensional `tensor` of length `n`.

In [25]:
for i in range(EPOCHS):
    loss_list = [] # in this list we save the loss of each minibatch
    net.train()
    for minibatch in loader: # loop through all minibatches
        update.zero_grad() #reset grad
        molecules, activity = minibatch # divide minibatches in labels and molecules
        output = net(molecules) # forward propagation
        loss   = loss_function(output.squeeze(), activity.float()) #ADDED
        loss.backward() # backprop ADDED
        loss_list.append(loss.item())
        update.step()
    # here the accuracy for the testset is calculated
    net.eval()
    output = net(test_x)
    acc = torch.sum((output>0).squeeze().int() == test_y)/float(test_y.shape[0])

    print(
        "Training Loss: %.2f Test Accuracy: %.2f"
        % (np.mean(loss_list), acc.item())
    )

Training Loss: 0.19 Test Accuracy: 0.81
Training Loss: 0.16 Test Accuracy: 0.81
Training Loss: 0.15 Test Accuracy: 0.82
Training Loss: 0.13 Test Accuracy: 0.81
Training Loss: 0.12 Test Accuracy: 0.81
Training Loss: 0.10 Test Accuracy: 0.81
Training Loss: 0.10 Test Accuracy: 0.80
Training Loss: 0.09 Test Accuracy: 0.81
Training Loss: 0.07 Test Accuracy: 0.81
Training Loss: 0.08 Test Accuracy: 0.81
