In [3]:
import deepchem as dc
from rdkit import Chem

In [6]:
smiles = ["C1CCCCC1", "O1CCOCC1"]  # cyclohexane and dioxane
mols = [Chem.MolFromSmiles(smile) for smile in smiles]
feat = dc.feat.CircularFingerprint(size=1024)
arr = feat.featurize(mols)
print(type(arr))
arr

<class 'numpy.ndarray'>


array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [7]:
print(len(arr[0]))

1024


# Load and save list of building blocks

In [20]:
import duckdb
import pandas as pd

train_path = "leash-BELKA/train.parquet"
test_path = "leash-BELKA/test.parquet"

In [10]:
def get_building_blocks(bb_name, file_path):
    con = duckdb.connect()

    # Query using DuckDB
    sql_stm = f"SELECT DISTINCT({bb_name}) FROM parquet_scan('{file_path}')"
    result = con.execute(sql_stm)

    bb_smiles = []
    for bb in result.fetchall():
        bb_smiles.append(bb[0])

    return bb_smiles

In [11]:
bb1_smiles_train = get_building_blocks(
    bb_name="buildingblock1_smiles", file_path=train_path
)
bb2_smiles_train = get_building_blocks(
    bb_name="buildingblock2_smiles", file_path=train_path
)
bb3_smiles_train = get_building_blocks(
    bb_name="buildingblock3_smiles", file_path=train_path
)

print(f"No. BB1: {len(bb1_smiles_train)}")
print(f"No. BB2: {len(bb2_smiles_train)}")
print(f"No. BB3: {len(bb3_smiles_train)}")

No. BB1: 271
No. BB2: 693
No. BB3: 872


In [12]:
bb1_smiles_test = get_building_blocks(
    bb_name="buildingblock1_smiles", file_path=test_path
)
bb2_smiles_test = get_building_blocks(
    bb_name="buildingblock2_smiles", file_path=test_path
)
bb3_smiles_test = get_building_blocks(
    bb_name="buildingblock3_smiles", file_path=test_path
)

print(f"No. BB1: {len(bb1_smiles_test)}")
print(f"No. BB2: {len(bb2_smiles_test)}")
print(f"No. BB3: {len(bb3_smiles_test)}")

No. BB1: 341
No. BB2: 1140
No. BB3: 1389


In [14]:
# check overlap or not -> yes
overlap_train_set = bb1_smiles_train + bb2_smiles_train + bb3_smiles_train
print("Before:", len(overlap_train_set))
print("After:", len(set(overlap_train_set)))

Before: 1836
After: 1145


In [15]:
# check overlap or not -> yes
overlap_test_set = bb1_smiles_test + bb2_smiles_test + bb3_smiles_test
print("Before:", len(overlap_test_set))
print("After:", len(set(overlap_test_set)))

Before: 2870
After: 2110


In [17]:
# check train and test -> yes (35.17%)
overlap_all_set = list(set(overlap_train_set)) + list(set(overlap_test_set))
print("Before:", len(overlap_all_set))
print("After:", len(set(overlap_all_set)))

Before: 3255
After: 2110


In [26]:
# indexing all smiles
overlap_all_set = sorted(set(overlap_all_set))
pd_bb_smiles = pd.DataFrame(overlap_all_set)
pd_bb_smiles.columns = ["bb_smiles"]
pd_bb_smiles = pd_bb_smiles.reset_index()
print(pd_bb_smiles.shape)
pd_bb_smiles.head()

(2110, 2)


Unnamed: 0,index,bb_smiles
0,0,Br.Br.NCC1CCCN1c1cccnn1
1,1,Br.NCc1cccc(Br)n1
2,2,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
3,3,C#CCN(C)CCCN
4,4,C#CCOc1ccc(CN)cc1.Cl


In [25]:
pd_bb_smiles.to_csv("featurized/all_bb_smiles.csv", index=False)

### Feature 1: Extended-connectivity fingerprints (ECFPs)

In [11]:
import deepchem as dc
from rdkit import Chem
import pandas as pd
import numpy as np

In [2]:
pd_bb_smiles = pd.read_csv("featurized/all_bb_smiles.csv")
print(pd_bb_smiles.shape)
pd_bb_smiles.head()

(2110, 2)


Unnamed: 0,index,bb_smiles
0,0,Br.Br.NCC1CCCN1c1cccnn1
1,1,Br.NCc1cccc(Br)n1
2,2,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
3,3,C#CCN(C)CCCN
4,4,C#CCOc1ccc(CN)cc1.Cl


In [4]:
pd_bb_smiles["mols"] = pd_bb_smiles["bb_smiles"].apply(
    lambda smile: Chem.MolFromSmiles(smile)
)
pd_bb_smiles.head()

Unnamed: 0,index,bb_smiles,mols
0,0,Br.Br.NCC1CCCN1c1cccnn1,<rdkit.Chem.rdchem.Mol object at 0x0000022A4E4...
1,1,Br.NCc1cccc(Br)n1,<rdkit.Chem.rdchem.Mol object at 0x0000022A4E4...
2,2,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,<rdkit.Chem.rdchem.Mol object at 0x0000022A4E4...
3,3,C#CCN(C)CCCN,<rdkit.Chem.rdchem.Mol object at 0x0000022A4E4...
4,4,C#CCOc1ccc(CN)cc1.Cl,<rdkit.Chem.rdchem.Mol object at 0x0000022A4E4...


In [7]:
feat = dc.feat.CircularFingerprint(size=1024)
pd_bb_smiles["f1_ecfp"] = pd_bb_smiles["mols"].apply(
    lambda mols: feat.featurize(mols)[0]
)
pd_bb_smiles.head()



Unnamed: 0,index,bb_smiles,mols,f1_ecfp
0,0,Br.Br.NCC1CCCN1c1cccnn1,<rdkit.Chem.rdchem.Mol object at 0x0000022A4E4...,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,Br.NCc1cccc(Br)n1,<rdkit.Chem.rdchem.Mol object at 0x0000022A4E4...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,C#CCCC[C@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O,<rdkit.Chem.rdchem.Mol object at 0x0000022A4E4...,"[0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3,C#CCN(C)CCCN,<rdkit.Chem.rdchem.Mol object at 0x0000022A4E4...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4,C#CCOc1ccc(CN)cc1.Cl,<rdkit.Chem.rdchem.Mol object at 0x0000022A4E4...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [9]:
pd_bb_smiles.loc[0, "f1_ecfp"]

array([0., 0., 0., ..., 0., 0., 0.])

In [13]:
# Create a new dataframe where each element in the array is split into its own column
ecfp_df = pd.DataFrame(pd_bb_smiles["f1_ecfp"].tolist(), index=pd_bb_smiles.index)

# Rename the columns with the desired format (f1_ecfp_1, f1_ecfp_2, ..., f1_ecfp_1024)
ecfp_df.columns = [f"f1_ecfp_{i+1}" for i in range(ecfp_df.shape[1])]

# Convert values to byte (uint8) type
ecfp_df = ecfp_df.astype(np.uint8)
ecfp_df = ecfp_df.reset_index()

print(ecfp_df.shape)
ecfp_df.head()

(2110, 1025)


Unnamed: 0,index,f1_ecfp_1,f1_ecfp_2,f1_ecfp_3,f1_ecfp_4,f1_ecfp_5,f1_ecfp_6,f1_ecfp_7,f1_ecfp_8,f1_ecfp_9,...,f1_ecfp_1015,f1_ecfp_1016,f1_ecfp_1017,f1_ecfp_1018,f1_ecfp_1019,f1_ecfp_1020,f1_ecfp_1021,f1_ecfp_1022,f1_ecfp_1023,f1_ecfp_1024
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# check coverage
ecfp_df.mean().sort_values(ascending=False)

index          1054.500000
f1_ecfp_357       0.856872
f1_ecfp_850       0.818483
f1_ecfp_727       0.623697
f1_ecfp_148       0.605213
                  ...     
f1_ecfp_29        0.000948
f1_ecfp_49        0.000948
f1_ecfp_90        0.000948
f1_ecfp_543       0.000474
f1_ecfp_451       0.000474
Length: 1025, dtype: float64

In [14]:
ecfp_df.to_csv("featurized/all_f1_ecfp.csv", index=False)