# LIBRARIES

In [3]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np

# DATASET READING 

In [4]:
# Load the raw compound table; the preview below shows it carries SMILES,
# pIC50, mol, num_atoms and logP columns.
df = pd.read_csv(filepath_or_buffer="SMILES_Big_Data_Set.csv")

# PREVIEW

In [5]:
# Sanity check of the freshly loaded table: dimensions first, then the
# first five rows.
initial_shape = df.shape
print("Initial dataset shape:", initial_shape)
print(df.head(5))

Initial dataset shape: (16087, 5)
                                              SMILES  pIC50  \
0         O=S(=O)(Nc1cccc(-c2cnc3ccccc3n2)c1)c1cccs1   4.26   
1  O=c1cc(-c2nc(-c3ccc(-c4cn(CCP(=O)(O)O)nn4)cc3)...   4.34   
2             NC(=O)c1ccc2c(c1)nc(C1CCC(O)CC1)n2CCCO   4.53   
3                NCCCn1c(C2CCNCC2)nc2cc(C(N)=O)ccc21   4.56   
4                  CNC(=S)Nc1cccc(-c2cnc3ccccc3n2)c1   4.59   

                                                mol  num_atoms    logP  
0  <rdkit.Chem.rdchem.Mol object at 0x7f59df45bc30>         25  4.1591  
1  <rdkit.Chem.rdchem.Mol object at 0x7f59a320c9e0>         36  3.6743  
2  <rdkit.Chem.rdchem.Mol object at 0x7f59a320cac0>         23  1.5361  
3  <rdkit.Chem.rdchem.Mol object at 0x7f59a320cba0>         22  0.9510  
4  <rdkit.Chem.rdchem.Mol object at 0x7f59a320c7b0>         21  3.2130  


# CONVERT SMILES TO FINGERPRINT

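A molecular fingerprint is a fixed-length numeric vector that encodes a molecule's structure, so that molecules can be compared and fed to machine-learning models efficiently. Here each compound is encoded as a Morgan fingerprint (radius 2 by default) folded into a 2048-bit vector.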

In [6]:
def smiles_to_fingerprint(smiles, radius=2, n_bits=2048):
    """Convert a SMILES string into a Morgan fingerprint bit list.

    Parameters
    ----------
    smiles : str
        SMILES representation of a single compound. Anything that is not a
        non-blank string (``None``, ``NaN`` from a missing cell, numbers,
        ``""``) is treated as invalid.
    radius : int, default 2
        Radius passed to the Morgan fingerprint routine.
    n_bits : int, default 2048
        Length of the folded bit vector.

    Returns
    -------
    list of int or None
        The ``n_bits`` fingerprint bits as a plain Python list, or ``None``
        when the input cannot be turned into a fingerprint. Callers use the
        ``None`` marker to drop unusable rows.

    Notes
    -----
    Only ``Exception`` subclasses are converted to ``None``;
    ``KeyboardInterrupt`` and ``SystemExit`` propagate so a long-running
    ``apply`` over the dataset can still be interrupted.
    """
    # Reject missing / non-text cells explicitly instead of relying on the
    # parser raising. Blank strings carry no structure, so a fingerprint
    # computed from them would be meaningless.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    try:
        mol = Chem.MolFromSmiles(smiles)
        # RDKit signals an unparsable SMILES by returning None, not raising.
        if mol is None:
            return None
        # NOTE(review): newer RDKit releases deprecate this call in favour of
        # rdFingerprintGenerator.GetMorganGenerator; consider migrating.
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        return list(fp)
    except Exception:
        # Best-effort by design: one bad compound must not abort the whole
        # featurisation run, but interpreter-level signals are not swallowed.
        return None

# GENERATING FINGERPRINTS FOR EACH COMPOUND

In [7]:
# Featurise every compound; rows whose SMILES cannot be converted get None.
smiles_column = df['SMILES']
fingerprints = smiles_column.apply(smiles_to_fingerprint)
df['fingerprint'] = fingerprints

# REMOVING ROWS WITH INVALID SMILES 

In [8]:
# Keep only the compounds for which a fingerprint was produced.
has_fingerprint = df['fingerprint'].notna()
df = df.loc[has_fingerprint]
print("Cleaned dataset shape:", df.shape)

Cleaned dataset shape: (16087, 6)


# FINGERPRINTS TO NUMPY ARRAY 

In [9]:
# Stack the per-compound bit lists into one 2-D array (one row per compound).
fingerprint_rows = df['fingerprint'].to_list()
fingerprint_matrix = np.array(fingerprint_rows)

# SAVING PROCESSED DATA 

In [10]:
# Persist the feature matrix and the matching SMILES strings. Both are
# written in the same row order, so row i of the .npy file corresponds to
# data row i of the CSV.
np.save("compound_features.npy", fingerprint_matrix)
df.to_csv("compound_metadata.csv", columns=['SMILES'], index=False)

# FINAL SHAPE 

In [11]:
# Report the dimensions of the saved feature matrix.
matrix_shape = fingerprint_matrix.shape
print("Fingerprint matrix shape:", matrix_shape)

Fingerprint matrix shape: (16087, 2048)
