In [7]:
import pandas as pd
import numpy as np
import os


from rdkit import Chem
from rdkit.Chem import AllChem

In [5]:
# Directory holding the preprocessed CSV files, relative to the notebook's
# working directory.
preprocessed_path = os.path.join('..', 'data', 'preprocessed')

# Build the file paths with os.path.join as well, so the whole path is
# assembled the same way instead of mixing in a hard-coded '/' separator.
train_dataset = pd.read_csv(os.path.join(preprocessed_path, '001_preprocessed_train_data.csv'))
test_dataset = pd.read_csv(os.path.join(preprocessed_path, '001_preprocessed_test_data.csv'))

# Preview the first rows of the training split.
train_dataset.head()

Unnamed: 0,CANONICAL_SMILES,ACTIVITY
0,C1CCN(C1)C(=O)C2=NOC(=C2)C3=CC=CC=C3Cl,1
1,C1C2CC3CC1CC(C2)(C3)C(=O)NCC4=NN=C(N4C5=CC=C(C...,1
2,CCN(CCC#N)C1=CC(=C(C=C1)/C=N/NC2=NC(=CC(=O)N2)C)C,0
3,CC1=C(N(C2=C1C=C(C=C2)C(=O)OCCCN(C)C)CC3=CC=CC...,1
4,C1=CC=C2C(=C1)NC3=C(C(C(=C(N23)N)C#N)C4=CC=C(C...,1


In [8]:
def morgan_fps(data, radius=2, n_bits=2048):
    """Build a DataFrame of Morgan fingerprint bits, one row per molecule.

    Parameters
    ----------
    data : iterable of rdkit.Chem.Mol
        Molecules to featurise, e.g. the results of ``Chem.MolFromSmiles``.
    radius : int, default 2
        Radius passed to ``AllChem.GetMorganFingerprintAsBitVect``.
    n_bits : int, default 2048
        Length of each bit vector, and therefore the number of columns.

    Returns
    -------
    pandas.DataFrame
        One row per input molecule, in input order, with columns named
        ``morgan_0`` ... ``morgan_<n_bits - 1>``.

    Raises
    ------
    ValueError
        If any entry of ``data`` is ``None`` (which is what RDKit's SMILES
        parser returns for a string it cannot parse).
    """
    # Materialise first so that generators can be validated and then reused.
    mols = list(data)

    # Fail with the offending positions rather than letting RDKit raise an
    # opaque argument-type error from inside the fingerprint call.
    missing = [i for i, mol in enumerate(mols) if mol is None]
    if missing:
        raise ValueError(
            f"{len(missing)} molecule(s) are None (unparsable SMILES?); "
            f"first positions: {missing[:10]}"
        )

    fps = [
        AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
        for mol in mols
    ]
    fp_array = [np.array(fp) for fp in fps]
    # Column count is derived from n_bits so it always matches the vector length.
    column_names = ["morgan_" + str(i) for i in range(n_bits)]
    return pd.DataFrame(fp_array, columns=column_names)


def smiles_to_mols(smiles_list, split_name):
    """Parse SMILES strings into RDKit molecules, failing loudly on bad input.

    Parameters
    ----------
    smiles_list : list of str
        SMILES strings to parse with ``Chem.MolFromSmiles``.
    split_name : str
        Label used only in the error message (e.g. ``'train'``).

    Returns
    -------
    list
        One parsed molecule per input string, in the same order.

    Raises
    ------
    ValueError
        If any string parses to ``None``. Such rows are reported, not
        dropped, so the molecule list keeps the same length and order as the
        dataset it was read from.
    """
    mols = [Chem.MolFromSmiles(smi) for smi in smiles_list]
    unparsed = [smi for smi, mol in zip(smiles_list, mols) if mol is None]
    if unparsed:
        raise ValueError(
            f"{len(unparsed)} SMILES in the {split_name} set could not be parsed, "
            f"e.g. {unparsed[:5]}"
        )
    return mols


train_smiles_list = train_dataset['CANONICAL_SMILES'].tolist()
test_smiles_list = test_dataset['CANONICAL_SMILES'].tolist()
train_mols = smiles_to_mols(train_smiles_list, 'train')
test_mols = smiles_to_mols(test_smiles_list, 'test')

In [None]:
# Featurise both splits through the same function call so the train and test
# frames share an identical column layout.
X_train_morgan_fps, X_test_morgan_fps = morgan_fps(train_mols), morgan_fps(test_mols)

In [12]:
# Preview the first five fingerprint rows as a quick sanity check.
X_train_morgan_fps.head(n=5)

Unnamed: 0,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,morgan_9,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Persist the fingerprint tables next to the preprocessed inputs. Paths are
# built with os.path.join rather than a hard-coded '/' separator.
# to_csv is left at its defaults, so the row index is written as the first
# (unnamed) column of each file.
X_train_morgan_fps.to_csv(os.path.join(preprocessed_path, '002_train_morgan_fps.csv'))
X_test_morgan_fps.to_csv(os.path.join(preprocessed_path, '002_test_morgan_fps.csv'))