In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, Lipinski, Fragments

In [2]:
training_data_path = 'training_smiles.csv'
test_data_path = 'test_smiles.csv'

training_data = pd.read_csv(training_data_path, dtype = {'ACTIVE': int})
test_data = pd.read_csv(test_data_path)

In [3]:
def extract_fingerprints(smiles):
    mol = Chem.MolFromSmiles(smiles)

    features = {}

    # Try nBits 2048, 1024, 512, 256
    # Morgan Fingerprint
    morgan_fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=512)
    for i in range(512):
        features[f'fp_{i}'] = morgan_fp[i]

    return features


    
training_features_df = training_data['SMILES'].apply(extract_fingerprints)

training_features_df = training_features_df.apply(pd.Series)

training_data_fingerprint = training_data.join(training_features_df)

training_data_fingerprint.to_csv('training_data_fingerprint.csv', index=False)


  training_features_df = training_features_df.apply(pd.Series)


In [4]:

def extract_features(smiles):

    mol = Chem.MolFromSmiles(smiles)

    features = {}

    # Basic Properties
    features['num_atoms'] = mol.GetNumAtoms()
    features['num_bonds'] = mol.GetNumBonds()
    features['num_rings'] = mol.GetRingInfo().NumRings()

    # Molecular Descriptors
    for desc_name, desc_func in Descriptors.descList:
        features[desc_name] = desc_func(mol)

    # Lipinski Descriptors
    features['num_rotatable_bonds'] = Lipinski.NumRotatableBonds(mol)
    features['num_aromatic_rings'] = Lipinski.NumAromaticRings(mol)
    features['num_heteroatoms'] = Lipinski.NumHeteroatoms(mol)
    features['num_heavy_atoms'] = Lipinski.HeavyAtomCount(mol)
    features['num_h_donors'] = Lipinski.NumHDonors(mol)
    features['num_h_acceptors'] = Lipinski.NumHAcceptors(mol)
    features['num_aliphatic_rings'] = Lipinski.NumAliphaticRings(mol)
    features['num_saturated_rings'] = Lipinski.NumSaturatedRings(mol)
    features['num_aromatic_heterocycles'] = Lipinski.NumAromaticHeterocycles(mol)
    features['num_aromatic_carbocycles'] = Lipinski.NumAromaticCarbocycles(mol)
    features['num_aliphatic_heterocycles'] = Lipinski.NumAliphaticHeterocycles(mol)
    features['num_aliphatic_carbocycles'] = Lipinski.NumAliphaticCarbocycles(mol)

    # Fragment Descriptors
    for frag_func in dir(Fragments):
        if frag_func.startswith('fr_'):
            features[frag_func] = getattr(Fragments, frag_func)(mol)


    return features

training_features_df = training_data['SMILES'].apply(extract_features)

training_features_df = training_features_df.apply(pd.Series)

training_data_features = training_data.join(training_features_df)

training_data_features.to_csv('training_data_features.csv', index=False)



  training_features_df = training_features_df.apply(pd.Series)
