In [2]:
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors, Lipinski, GraphDescriptors, AllChem, MACCSkeys

In [None]:
def extract_molecule_info(smiles, name):

    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return pd.Series({})

    info = {
        f"{name}_mol_wt": Descriptors.MolWt(mol),
        f"{name}_logp": Descriptors.MolLogP(mol),
        f"{name}_atoms": mol.GetNumAtoms(),
        f"{name}_bonds": mol.GetNumBonds(),
        f"{name}_rotbonds": Lipinski.NumRotatableBonds(mol),
        f"{name}_hdonors": Lipinski.NumHDonors(mol),
        f"{name}_hacceptors": Lipinski.NumHAcceptors(mol),
        f"{name}_tpsa": rdMolDescriptors.CalcTPSA(mol),
        f"{name}_aromrings": rdMolDescriptors.CalcNumAromaticRings(mol),
        f"{name}_satrings": rdMolDescriptors.CalcNumSaturatedRings(mol),
        f"{name}_alirings": rdMolDescriptors.CalcNumAliphaticRings(mol),
        f"{name}_csp3": rdMolDescriptors.CalcFractionCSP3(mol),
        f"{name}_hka": rdMolDescriptors.CalcHallKierAlpha(mol),
        f"{name}_chi0n": GraphDescriptors.Chi0n(mol),
        f"{name}_chi1n": GraphDescriptors.Chi1n(mol),
        f"{name}_kappa1": GraphDescriptors.Kappa1(mol),
    }

    return pd.Series(info)

In [3]:

def generate_ecfp(smiles,
                  radius=2,
                  nBits=1024,
                  use_features=False,
                  use_chirality=False):

    molecule = Chem.MolFromSmiles(smiles)

    if molecule is None:
        return np.zeros((nBits,), dtype=np.uint8)

    feature_list = AllChem.GetMorganFingerprintAsBitVect(molecule,
                                                         radius=radius,
                                                         nBits=nBits,
                                                         useFeatures=use_features,
                                                         useChirality=use_chirality)
    # Add numpy to uint8

    return np.array(feature_list)

In [None]:
def smiles_to_maccs(smiles):

    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None
    else:
        maccs_key = MACCSkeys.GenMACCSKeys(molecule)
        return [int(bit) for bit in maccs_key.ToBitString()]

In [4]:
df = pd.read_parquet("data/test.parquet")
df

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name
0,295246830,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,BRD4
1,295246831,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,HSA
2,295246832,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,C=Cc1ccc(N)cc1,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ccc(C=C...,sEH
3,295246833,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,CC(O)Cn1cnc2c(N)ncnc21,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2...,BRD4
4,295246834,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,C=Cc1ccc(N)cc1,CC(O)Cn1cnc2c(N)ncnc21,C#CCCC[C@H](Nc1nc(Nc2ccc(C=C)cc2)nc(Nc2ncnc3c2...,HSA
...,...,...,...,...,...,...
1674891,296921721,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,COC1CCC(CCN)CC1,COC1CCC(CCNc2nc(Nc3noc4ccc(F)cc34)nc(N[C@@H](C...,HSA
1674892,296921722,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,COC1CCC(CCN)CC1,COC1CCC(CCNc2nc(Nc3noc4ccc(F)cc34)nc(N[C@@H](C...,sEH
1674893,296921723,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,NCc1cccs1,[N-]=[N+]=NCCC[C@H](Nc1nc(NCc2cccs2)nc(Nc2noc3...,BRD4
1674894,296921724,[N-]=[N+]=NCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc...,Nc1noc2ccc(F)cc12,NCc1cccs1,[N-]=[N+]=NCCC[C@H](Nc1nc(NCc2cccs2)nc(Nc2noc3...,HSA


In [5]:
df.columns

Index(['id', 'buildingblock1_smiles', 'buildingblock2_smiles',
       'buildingblock3_smiles', 'molecule_smiles', 'protein_name'],
      dtype='object')

In [6]:
batch_size = 500
num_batches = (len(df) + batch_size - 1) // batch_size

all_fingerprints = []

for i in range(num_batches):
    batch = df.iloc[i*batch_size:(i+1)*batch_size]
    fingerprints = batch['molecule_smiles'].apply(generate_ecfp)
    all_fingerprints.append(np.vstack(fingerprints.values))
    print(f"Processed batch {i+1}/{num_batches}")

ecfp_matrix = np.vstack(all_fingerprints)
ecfp_matrix = ecfp_matrix.astype(np.uint8)

Processed batch 1/3350
Processed batch 2/3350
Processed batch 3/3350
Processed batch 4/3350
Processed batch 5/3350
Processed batch 6/3350
Processed batch 7/3350
Processed batch 8/3350
Processed batch 9/3350
Processed batch 10/3350
Processed batch 11/3350
Processed batch 12/3350
Processed batch 13/3350
Processed batch 14/3350
Processed batch 15/3350
Processed batch 16/3350
Processed batch 17/3350
Processed batch 18/3350
Processed batch 19/3350
Processed batch 20/3350
Processed batch 21/3350
Processed batch 22/3350
Processed batch 23/3350
Processed batch 24/3350
Processed batch 25/3350
Processed batch 26/3350
Processed batch 27/3350
Processed batch 28/3350
Processed batch 29/3350
Processed batch 30/3350
Processed batch 31/3350
Processed batch 32/3350
Processed batch 33/3350
Processed batch 34/3350
Processed batch 35/3350
Processed batch 36/3350
Processed batch 37/3350
Processed batch 38/3350
Processed batch 39/3350
Processed batch 40/3350
Processed batch 41/3350
Processed batch 42/3350
P

In [9]:
ecfp_matrix = ecfp_matrix.astype(np.uint8)

In [10]:
np.save("data/processed_data/ecfp_test.npy", ecfp_matrix)

In [None]:
ecfp_matrix.shape

In [None]:
ecfp_matrix

In [None]:
df['ecfp'] = df['molecule_smiles'].apply(generate_ecfp)
df

In [None]:
df = df[['ecfp']]

In [None]:
df.to_parquet("data/ecfp_train.parquet")

In [None]:
ecfp = df['ecfp'].apply(pd.Series)
ecfp.columns = [f'ecfp{i+1}' for i in range(ecfp.shape[1])]
ecfp

In [None]:
ecfp.to_parquet("data/ecfp_train.parquet")