<a href="https://colab.research.google.com/github/lmVl12/AI-Biotech-course/blob/main/Assignment_3_Pubchem_Fingerprint_Calculation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **AI And Biotechnology/Bioinformatics**

## **AI and Drug Discovery Course: QSAR Modeling**
This notebook demonstrates how to calculate molecular fingerprints for preprocessed ChEMBL bioactivity data, as input for QSAR modeling.

# **Part 3: Descriptor Calculation**

PaDELPy is a Python wrapper for the PaDEL-Descriptor (molecular descriptor calculation) software.  

It provides the following descriptors and fingerprints:  
* 1444 - 2D Descriptors
* 431 - 3D Descriptors
* 881 bits - PubChem Fingerprints

## **Install PaDELpy**

In [None]:
!pip install padelpy

Collecting padelpy
  Downloading padelpy-0.1.16-py3-none-any.whl.metadata (7.7 kB)
Downloading padelpy-0.1.16-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m60.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: padelpy
Successfully installed padelpy-0.1.16


## **Import libraries**

In [None]:
import pandas as pd
import numpy as np
from google.colab import files
from padelpy import padeldescriptor

## **Load dataset**

In [None]:
# Mount Google Drive at /content/gdrive/ so files stored there can be
# addressed by ordinary filesystem paths in the cells below.
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [None]:
# Location of the input table on the mounted Drive.
path = "/content/gdrive/My Drive/Colab Notebooks/data/df_lipinski.csv"
# Read the CSV into a DataFrame and preview the first rows.
df = pd.read_csv(path)
df.head()

Unnamed: 0,molecule_chembl_id,bioactivity_class,pIC50,canonical_smiles,MW,LogP,NumHDonors,NumHAcceptors
0,CHEMBL4208168,inactive,5.922632,Brc1ccc(Nc2nc(N3CCOCC3)nc3c2ncn3C2CCCCO2)cc1,459.348,3.8681,1.0,8.0
1,CHEMBL1173420,inactive,4.69897,Brc1ccc(Nc2nc(N3CCOCC3)nc3c2ncn3Cc2ccccc2)cc1,465.355,4.2173,1.0,7.0
2,CHEMBL6005160,active,7.070581,Brc1ccc2ncc(-c3cccc(NC4CNC4)n3)n2c1,344.216,2.5425,2.0,5.0
3,CHEMBL3900620,inactive,5.0,C#CCN(c1cc(OC)cc(OC)c1)c1ccc2ncc(-c3cnn(C)c3)n...,399.454,3.8188,0.0,7.0
4,CHEMBL3939018,inactive,5.004971,C#CCN(c1cc(OC)cc(OC)c1)c1ccc2ncc(-c3cnn(CC4CCO...,483.572,4.7084,0.0,8.0


In [None]:
# Keep just the structure string and its ChEMBL identifier.
data = df.loc[:, ['canonical_smiles', 'molecule_chembl_id']]
data.head()

Unnamed: 0,canonical_smiles,molecule_chembl_id
0,Brc1ccc(Nc2nc(N3CCOCC3)nc3c2ncn3C2CCCCO2)cc1,CHEMBL4208168
1,Brc1ccc(Nc2nc(N3CCOCC3)nc3c2ncn3Cc2ccccc2)cc1,CHEMBL1173420
2,Brc1ccc2ncc(-c3cccc(NC4CNC4)n3)n2c1,CHEMBL6005160
3,C#CCN(c1cc(OC)cc(OC)c1)c1ccc2ncc(-c3cnn(C)c3)n...,CHEMBL3900620
4,C#CCN(c1cc(OC)cc(OC)c1)c1ccc2ncc(-c3cnn(CC4CCO...,CHEMBL3939018


## **Convert to .smi format**

In [None]:
# Write one SMILES string per line, with no header and no index column.
# to_csv() returns None when given a path, so the result is not assigned.
data['canonical_smiles'].to_csv('smiles_chembl.smi', index=False, header=False)

In [None]:
! cat smiles_chembl.smi | head

Brc1ccc(Nc2nc(N3CCOCC3)nc3c2ncn3C2CCCCO2)cc1
Brc1ccc(Nc2nc(N3CCOCC3)nc3c2ncn3Cc2ccccc2)cc1
Brc1ccc2ncc(-c3cccc(NC4CNC4)n3)n2c1
C#CCN(c1cc(OC)cc(OC)c1)c1ccc2ncc(-c3cnn(C)c3)nc2c1
C#CCN(c1cc(OC)cc(OC)c1)c1ccc2ncc(-c3cnn(CC4CCOCC4)c3)nc2c1
C#CCN(c1ccc2ncc(-c3cnn(C)c3)nc2c1)c1c(Cl)c(OC)cc(OC)c1Cl
C#CCN(c1ccc2ncc(-c3cnn(C)c3)nc2c1)c1c(F)c(OC)cc(OC)c1F
C#CCN1CCc2cc(Nc3ncc(C)c(-c4cnn(C(C)C)c4)n3)ccc2C1
C#CCOc1ccc(Nc2ccc3ncc(N4CCOCC4)nc3c2C#N)cc1OC
C#CCn1cc(-c2ccc(NC(=O)Nc3cc(C(C)(C)C)on3)c(F)c2)c2c(N)ncnc21


## **Calculate molecular Pubchem Fingerprints using "padeldescriptor" function**


In [None]:
# Run PaDEL-Descriptor on the SMILES file and write the fingerprints to CSV.
# No descriptor-types XML is passed here. retainorder=True is requested so the
# output rows follow the input order, which the positional merge below needs.
# The optional standardisation flags (removesalt, standardizetautomers,
# standardizenitro) are not enabled.
padeldescriptor(
    mol_dir="smiles_chembl.smi",
    d_file="pubchem_fingerprints.csv",
    fingerprints=True,
    retainorder=True,
)

In [None]:
# Confirm that the fingerprint file was written and check its size.
!ls -lh pubchem_fingerprints.csv

-rw-r--r-- 1 root root 8.0M Feb 10 21:28 pubchem_fingerprints.csv


In [None]:
# Load the CSV written by padeldescriptor and preview the first rows.
df_fingerprint = pd.read_csv("pubchem_fingerprints.csv")
df_fingerprint.head()

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,AUTOGEN_smiles_chembl_1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AUTOGEN_smiles_chembl_2,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AUTOGEN_smiles_chembl_3,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AUTOGEN_smiles_chembl_4,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AUTOGEN_smiles_chembl_5,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## **Prepare Dataset for ML**

In [None]:
# Re-display the source table to recall which metadata columns are available.
df.head()

Unnamed: 0,molecule_chembl_id,bioactivity_class,pIC50,canonical_smiles,MW,LogP,NumHDonors,NumHAcceptors
0,CHEMBL4208168,inactive,5.922632,Brc1ccc(Nc2nc(N3CCOCC3)nc3c2ncn3C2CCCCO2)cc1,459.348,3.8681,1.0,8.0
1,CHEMBL1173420,inactive,4.69897,Brc1ccc(Nc2nc(N3CCOCC3)nc3c2ncn3Cc2ccccc2)cc1,465.355,4.2173,1.0,7.0
2,CHEMBL6005160,active,7.070581,Brc1ccc2ncc(-c3cccc(NC4CNC4)n3)n2c1,344.216,2.5425,2.0,5.0
3,CHEMBL3900620,inactive,5.0,C#CCN(c1cc(OC)cc(OC)c1)c1ccc2ncc(-c3cnn(C)c3)n...,399.454,3.8188,0.0,7.0
4,CHEMBL3939018,inactive,5.004971,C#CCN(c1cc(OC)cc(OC)c1)c1ccc2ncc(-c3cnn(CC4CCO...,483.572,4.7084,0.0,8.0


In [None]:
# Select only the columns we need for ML
meta_cols = df[['molecule_chembl_id', 'bioactivity_class', 'pIC50']]

# Reset index to ensure proper alignment
meta_cols = meta_cols.reset_index(drop=True)
df_fingerprint = df_fingerprint.reset_index(drop=True)

# Combine meta data with fingerprints
combined_df = pd.concat([meta_cols, df_fingerprint.drop(df_fingerprint.columns[0], axis=1)], axis=1)

# Inspect the first few rows
combined_df.head()


Unnamed: 0,molecule_chembl_id,bioactivity_class,pIC50,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,CHEMBL4208168,inactive,5.922632,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL1173420,inactive,4.69897,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL6005160,active,7.070581,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL3900620,inactive,5.0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL3939018,inactive,5.004971,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## **Save and download the dataset**

In [None]:
# Save as CSV
path = "/content/gdrive/My Drive/Colab Notebooks/data/QSAR_dataset.csv"
combined_df.to_csv(path, index=False)
print("Combined dataset saved as QSAR_dataset.csv")

# Download file in Colab
files.download(path)

Combined dataset saved as QSAR_dataset.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Calculate other fingerprints**

## **Download XML files from GitHub**

In [None]:
!wget https://github.com/AI-Biotechnology-Bioinformatics/Drug_Discovery_AI_Course_2026/raw/main/padel_descriptors_xml.zip

--2026-02-10 21:38:09--  https://github.com/AI-Biotechnology-Bioinformatics/Drug_Discovery_AI_Course_2026/raw/main/padel_descriptors_xml.zip
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/AI-Biotechnology-Bioinformatics/Drug_Discovery_AI_Course_2026/main/padel_descriptors_xml.zip [following]
--2026-02-10 21:38:09--  https://raw.githubusercontent.com/AI-Biotechnology-Bioinformatics/Drug_Discovery_AI_Course_2026/main/padel_descriptors_xml.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10871 (11K) [application/zip]
Saving to: ‘padel_descriptors_xml.zip’


2026-02-10 21:38:09 (108 MB/s) - ‘pad

## **Unzip all files**

In [None]:
!unzip padel_descriptors_xml.zip

Archive:  padel_descriptors_xml.zip
  inflating: AtomPairs2DFingerprintCount.xml  
  inflating: AtomPairs2DFingerprinter.xml  
  inflating: EStateFingerprinter.xml  
  inflating: ExtendedFingerprinter.xml  
  inflating: Fingerprinter.xml       
  inflating: GraphOnlyFingerprinter.xml  
  inflating: KlekotaRothFingerprintCount.xml  
  inflating: KlekotaRothFingerprinter.xml  
  inflating: MACCSFingerprinter.xml  
  inflating: PubchemFingerprinter.xml  
  inflating: SubstructureFingerprintCount.xml  
  inflating: SubstructureFingerprinter.xml  


## **Calculate Fingerprints**

In [None]:
# Specify the XML file for SubstructureFingerprinter directly
Substruc_fp = "SubstructureFingerprinter.xml"

# Calculate Substructure fingerprints
padeldescriptor(
    mol_dir='smiles_chembl.smi',
    d_file='Substructure_fingerprints.csv',
    fingerprints=True,
    descriptortypes= Substruc_fp,
    retainorder=True
    # removesalt=True, standardizetautomers=True
)

In [None]:
# Load the substructure fingerprint CSV written above and preview it.
df_substructure = pd.read_csv('Substructure_fingerprints.csv')
df_substructure.head()

Unnamed: 0,Name,SubFP1,SubFP2,SubFP3,SubFP4,SubFP5,SubFP6,SubFP7,SubFP8,SubFP9,...,SubFP298,SubFP299,SubFP300,SubFP301,SubFP302,SubFP303,SubFP304,SubFP305,SubFP306,SubFP307
0,AUTOGEN_smiles_chembl_1,0,1,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
1,AUTOGEN_smiles_chembl_2,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
2,AUTOGEN_smiles_chembl_3,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
3,AUTOGEN_smiles_chembl_4,0,0,0,0,0,1,0,0,0,...,0,0,1,1,1,0,0,0,0,1
4,AUTOGEN_smiles_chembl_5,0,1,1,0,0,1,0,0,0,...,0,0,1,1,1,0,0,0,0,1


This set of fingerprints contains fewer columns (307 bits, compared with 881 for the PubChem fingerprints).

In [None]:
# Rows are matched by position, so both frames must have the same length;
# otherwise pd.concat(axis=1) would silently pad the shorter one with NaN.
assert len(meta_cols) == len(df_substructure), (
    f"Row mismatch: {len(meta_cols)} molecules vs {len(df_substructure)} fingerprint rows"
)

# Drop PaDEL's auto-generated "Name" column by label rather than by position,
# then attach the fingerprint bits to the metadata columns.
df_sub_fp = pd.concat([meta_cols, df_substructure.drop(columns="Name")], axis=1)
df_sub_fp.head(3)

Unnamed: 0,molecule_chembl_id,bioactivity_class,pIC50,SubFP1,SubFP2,SubFP3,SubFP4,SubFP5,SubFP6,SubFP7,...,SubFP298,SubFP299,SubFP300,SubFP301,SubFP302,SubFP303,SubFP304,SubFP305,SubFP306,SubFP307
0,CHEMBL4208168,inactive,5.922632,0,1,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
1,CHEMBL1173420,inactive,4.69897,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1
2,CHEMBL6005160,active,7.070581,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,1


In [None]:
# Write the substructure dataset to Drive as CSV (no index column), then
# trigger a browser download of the saved file.
path_sub = "/content/gdrive/My Drive/Colab Notebooks/data/QSAR_dataset_substructure.csv"
df_sub_fp.to_csv(path_sub, index=False)
# Plain string: the message has no placeholders, so no f-prefix is needed.
print("Dataset with substructure fingerprints saved as QSAR_dataset_substructure.csv")
files.download(path_sub)

Dataset with substructure fingerprints saved as QSAR_dataset_substructure.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>