<a href="https://colab.research.google.com/github/lmVl12/AI_and_Drug_Discovery_Course_2026/blob/main/Assignment_3_Task3_Pubchem_Fingerprint_Calculation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **AI And Biotechnology/Bioinformatics**

## **AI and Drug Discovery Course: QSAR Modeling**


# **Part 3: Fingerprint Descriptor Calculation**

## **1. Technical framework**

In [None]:
!pip install padelpy

Collecting padelpy
  Downloading padelpy-0.1.16-py3-none-any.whl.metadata (7.7 kB)
Downloading padelpy-0.1.16-py3-none-any.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: padelpy
Successfully installed padelpy-0.1.16


In [None]:
import pandas as pd
import numpy as np
from google.colab import files
from padelpy import padeldescriptor

## **2. Load dataset**

In [None]:
# Mount Google Drive inside the Colab VM so the course data folder under
# /content/gdrive/ can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
# Folder on the mounted Drive that holds the course CSV files; it is kept as a
# plain string (with trailing slash) because the save cell appends a file name to it.
results_path = "/content/gdrive/My Drive/Colab Notebooks/data/"

# Load the table of compounds (IDs, activity class, pIC50, SMILES, Lipinski columns)
# and preview the first rows.
df = pd.read_csv(f"{results_path}df_lipinski.csv")
df.head()

Unnamed: 0,molecule_chembl_id,bioactivity_class,pIC50,canonical_smiles,MW,LogP,NumHDonors,NumHAcceptors
0,CHEMBL4208168,inactive,5.922632,Brc1ccc(Nc2nc(N3CCOCC3)nc3c2ncn3C2CCCCO2)cc1,459.348,3.8681,1.0,8.0
1,CHEMBL1173420,inactive,4.69897,Brc1ccc(Nc2nc(N3CCOCC3)nc3c2ncn3Cc2ccccc2)cc1,465.355,4.2173,1.0,7.0
2,CHEMBL6005160,active,7.070581,Brc1ccc2ncc(-c3cccc(NC4CNC4)n3)n2c1,344.216,2.5425,2.0,5.0
3,CHEMBL3900620,inactive,5.0,C#CCN(c1cc(OC)cc(OC)c1)c1ccc2ncc(-c3cnn(C)c3)n...,399.454,3.8188,0.0,7.0
4,CHEMBL3939018,inactive,5.004971,C#CCN(c1cc(OC)cc(OC)c1)c1ccc2ncc(-c3cnn(CC4CCO...,483.572,4.7084,0.0,8.0


In [None]:
# Keep only the structure (SMILES) and the compound identifier, in that order.
data = df.loc[:, ['canonical_smiles', 'molecule_chembl_id']]
data.head()

Unnamed: 0,canonical_smiles,molecule_chembl_id
0,Brc1ccc(Nc2nc(N3CCOCC3)nc3c2ncn3C2CCCCO2)cc1,CHEMBL4208168
1,Brc1ccc(Nc2nc(N3CCOCC3)nc3c2ncn3Cc2ccccc2)cc1,CHEMBL1173420
2,Brc1ccc2ncc(-c3cccc(NC4CNC4)n3)n2c1,CHEMBL6005160
3,C#CCN(c1cc(OC)cc(OC)c1)c1ccc2ncc(-c3cnn(C)c3)n...,CHEMBL3900620
4,C#CCN(c1cc(OC)cc(OC)c1)c1ccc2ncc(-c3cnn(CC4CCO...,CHEMBL3939018


## **3. Convert to .smi format**

In [None]:
# Write one SMILES string per line (no index, no header) to 'smiles_chembl.smi',
# the input file of the fingerprint step below.
# to_csv returns None when given a path, so its result is intentionally not stored.
data['canonical_smiles'].to_csv('smiles_chembl.smi', index=False, header=False)

In [None]:
! cat smiles_chembl.smi | head

Brc1ccc(Nc2nc(N3CCOCC3)nc3c2ncn3C2CCCCO2)cc1
Brc1ccc(Nc2nc(N3CCOCC3)nc3c2ncn3Cc2ccccc2)cc1
Brc1ccc2ncc(-c3cccc(NC4CNC4)n3)n2c1
C#CCN(c1cc(OC)cc(OC)c1)c1ccc2ncc(-c3cnn(C)c3)nc2c1
C#CCN(c1cc(OC)cc(OC)c1)c1ccc2ncc(-c3cnn(CC4CCOCC4)c3)nc2c1
C#CCN(c1ccc2ncc(-c3cnn(C)c3)nc2c1)c1c(Cl)c(OC)cc(OC)c1Cl
C#CCN(c1ccc2ncc(-c3cnn(C)c3)nc2c1)c1c(F)c(OC)cc(OC)c1F
C#CCN1CCc2cc(Nc3ncc(C)c(-c4cnn(C(C)C)c4)n3)ccc2C1
C#CCOc1ccc(Nc2ccc3ncc(N4CCOCC4)nc3c2C#N)cc1OC
C#CCn1cc(-c2ccc(NC(=O)Nc3cc(C(C)(C)C)on3)c(F)c2)c2c(N)ncnc21


## **4. Calculate PubChem molecular fingerprints using the `padeldescriptor` function**


Molecular structures are converted into PubChem fingerprints: each molecule is represented by an 881-bit binary vector, where every bit indicates the presence (1) or absence (0) of a specific structural fragment.

In [None]:
# Run PaDEL-Descriptor on the SMILES file and write the fingerprint table to CSV.
# - fingerprints=True requests fingerprint output; the resulting file contains
#   the PubchemFP0..PubchemFP880 columns shown below.
# - retainorder=True asks PaDEL to keep the output rows in the same order as the
#   input file, which the later row-wise join with the metadata depends on.
# Structure standardisation options (removesalt, standardizetautomers,
# standardizenitro) are not passed, so padelpy's defaults apply.
padeldescriptor(
    mol_dir="smiles_chembl.smi",
    d_file='pubchem_fingerprints.csv',
    fingerprints=True,
    retainorder=True,
)

In [None]:
# Confirm the fingerprint file was written and check its size.
!ls -lh pubchem_fingerprints.csv

-rw-r--r-- 1 root root 8.0M Feb 11 18:29 pubchem_fingerprints.csv


In [None]:
# Load the PaDEL output: the first column ('Name') holds the auto-generated
# molecule label, the remaining columns are the PubchemFP* bits.
df_fingerprint = pd.read_csv("pubchem_fingerprints.csv")
df_fingerprint.head(n=5)

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,AUTOGEN_smiles_chembl_1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AUTOGEN_smiles_chembl_2,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AUTOGEN_smiles_chembl_3,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AUTOGEN_smiles_chembl_4,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AUTOGEN_smiles_chembl_5,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## **5. Prepare Dataset for ML**

In [None]:
# Re-display the source table: its ID, class and pIC50 columns are the metadata
# that get attached to the fingerprint bits in the next cell.
df.head()

Unnamed: 0,molecule_chembl_id,bioactivity_class,pIC50,canonical_smiles,MW,LogP,NumHDonors,NumHAcceptors
0,CHEMBL4208168,inactive,5.922632,Brc1ccc(Nc2nc(N3CCOCC3)nc3c2ncn3C2CCCCO2)cc1,459.348,3.8681,1.0,8.0
1,CHEMBL1173420,inactive,4.69897,Brc1ccc(Nc2nc(N3CCOCC3)nc3c2ncn3Cc2ccccc2)cc1,465.355,4.2173,1.0,7.0
2,CHEMBL6005160,active,7.070581,Brc1ccc2ncc(-c3cccc(NC4CNC4)n3)n2c1,344.216,2.5425,2.0,5.0
3,CHEMBL3900620,inactive,5.0,C#CCN(c1cc(OC)cc(OC)c1)c1ccc2ncc(-c3cnn(C)c3)n...,399.454,3.8188,0.0,7.0
4,CHEMBL3939018,inactive,5.004971,C#CCN(c1cc(OC)cc(OC)c1)c1ccc2ncc(-c3cnn(CC4CCO...,483.572,4.7084,0.0,8.0


Constant features (bits with zero variance across all molecules) are identified and removed, so that only informative bits are used during model training.

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Fingerprint bit matrix: drop the first column of the PaDEL output (the
# auto-generated molecule 'Name'), leaving only the PubchemFP* columns.
X_fp = df_fingerprint.drop(columns=[df_fingerprint.columns[0]])

# The fingerprint rows carry no ChEMBL ID (names are AUTOGEN_*), so they are
# matched to the metadata purely by row position. Fail loudly if the row counts
# differ instead of letting pd.concat silently pad the shorter frame with NaN.
if len(X_fp) != len(df):
    raise ValueError(
        f"Row count mismatch: {len(X_fp)} fingerprint rows vs {len(df)} "
        "molecules in the source table; cannot align them by position."
    )

# Remove constant bits (zero variance across all molecules): they carry no
# information for a model. get_support() marks the columns that were kept.
selector = VarianceThreshold(threshold=0)
X_fp_reduced = selector.fit_transform(X_fp)
selected_fp_cols = X_fp.columns[selector.get_support()]
# Built from a plain array, so this frame already has a fresh 0..n-1 index.
df_fp_clean = pd.DataFrame(X_fp_reduced, columns=selected_fp_cols)

# Report how many bits survived the filter
print(f"Total PubChem bits generated: {X_fp.shape[1]}")
print(f"Unique informative bits:      {df_fp_clean.shape[1]}")

# Metadata: identifier, class label and regression target, re-indexed 0..n-1
# so the positional join below lines up with the fingerprint rows.
meta_cols = df[['molecule_chembl_id', 'bioactivity_class', 'pIC50']].reset_index(drop=True)

# Prepare QSAR dataset: metadata columns followed by the informative bits
combined_df = pd.concat([meta_cols, df_fp_clean], axis=1)

# Check results
print(f"Final shape (rows, columns): {combined_df.shape}")
combined_df.head()


Total PubChem bits generated: 881
Unique informative bits:      599
Final shape (rows, columns): (4647, 602)


Unnamed: 0,molecule_chembl_id,bioactivity_class,pIC50,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP12,PubchemFP13,PubchemFP14,PubchemFP15,...,PubchemFP840,PubchemFP842,PubchemFP847,PubchemFP854,PubchemFP860,PubchemFP861,PubchemFP862,PubchemFP863,PubchemFP866,PubchemFP868
0,CHEMBL4208168,inactive,5.922632,1,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,CHEMBL1173420,inactive,4.69897,1,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,CHEMBL6005160,active,7.070581,1,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,CHEMBL3900620,inactive,5.0,1,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,CHEMBL3939018,inactive,5.004971,1,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0


## **6. Save and download the dataset**

In [None]:
# Build the destination path once, then persist the QSAR table to Drive
# (without the index column) and offer the same file as a browser download.
qsar_csv_path = results_path + 'QSAR_dataset_fp.csv'

combined_df.to_csv(qsar_csv_path, index=False)
print("Combined dataset saved as QSAR_dataset_fp.csv")

files.download(qsar_csv_path)

Combined dataset saved as QSAR_dataset_fp.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>