## QSAR Challenge

In [9]:
## Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


import rdkit
from rdkit import Chem, DataStructs
from rdkit.Chem import rdFingerprintGenerator


In [12]:
## Load Dataset
data = pd.read_csv('resources/data_train.csv', index_col=0).reset_index(drop=True)
data.head()

## Data Preprocessing 
data['mol'] = [Chem.MolFromSmiles(smi) for smi in tqdm(data['smiles'], desc='Convert Smiles to Mol')]


#Convert the molecules to Morgan fingerprints
mfpgen = Chem.rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)
fps = np.array([mfpgen.GetFingerprintAsNumPy(mol) for mol in tqdm(data.mol, desc='Gen Morgen Fingerprints')])
fps


Convert Smiles to Mol:   0%|          | 0/12000 [00:00<?, ?it/s]



Gen Morgen Fingerprints:   0%|          | 0/12000 [00:00<?, ?it/s]

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]], dtype=uint8)

In [21]:
## Split the data
# Separate features (fingerprints) and targets
X = fps  # fingerprints we generated earlier
Y = data.drop(['smiles', 'mol'], axis=1)  # all columns except smiles and mol

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(f'Train shape: {X_train.shape}')

Train shape: (9600, 1024)


In [26]:
## The model for the tasks

def train_rf(X_train, X_test, Y_train, Y_test):
    seed = 1234
    n_tasks = Y_train.shape[1]
    y_hats_probability = np.empty((X_test.shape[0], n_tasks))
    y_hats_class = np.empty_like(y_hats_probability)

    #Train a random forest for each task
    for i in tqdm(range(n_tasks), desc='Training'):
        random_forest = RandomForestClassifier(n_estimators=100, random_state=seed)
        print(~np.isnan(Y_train[:, i]))
        index = ~np.isnan(Y_train[:, i])

        random_forest.fit(X_train[index], Y_train[index, i])
        y_hats_probability[:, i] = random_forest.predict_proba(X_test)[:, 1]
        y_hats_class[:, i] = random_forest.predict(X_test)
    return y_hats_probability, y_hats_class

#Train the model
y_hat_probability, y_hat_class = train_rf(X_train=X_train, X_test=X_test, Y_train=Y_train.values, Y_test=Y_test.values)


Training:   0%|          | 0/11 [00:00<?, ?it/s]

[ True  True  True ...  True  True  True]
[ True  True  True ...  True  True  True]
[ True  True  True ...  True  True  True]
[ True  True  True ...  True  True  True]
[ True  True  True ...  True  True  True]
[ True  True  True ...  True  True  True]
[ True  True  True ...  True  True  True]
[ True  True  True ...  True  True  True]
[ True  True  True ...  True  True  True]
[ True  True  True ...  True  True  True]
[ True  True  True ...  True  True  True]


In [27]:
pd.DataFrame(y_hat_probability)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.920000,1.000000,0.980,0.950000,0.540000,0.565000,0.980000,0.990000,0.99,0.99,0.755000
1,1.000000,0.990000,0.885,0.990000,0.930000,0.900000,0.590000,0.970000,0.99,0.98,0.405000
2,0.976833,1.000000,1.000,1.000000,0.489333,0.940000,1.000000,1.000000,1.00,1.00,0.940000
3,0.895000,0.903333,0.890,0.885000,0.888333,0.870000,0.865000,0.940000,0.82,0.90,0.445000
4,0.890000,0.854167,0.790,0.835000,0.785833,0.720000,0.750000,0.871667,0.96,0.95,0.340000
...,...,...,...,...,...,...,...,...,...,...,...
2395,0.960000,0.985000,0.930,0.966000,0.900000,0.870000,0.930000,0.970000,0.96,0.58,0.628333
2396,1.000000,0.980000,0.990,0.980000,0.692000,0.503333,1.000000,0.980000,1.00,0.98,0.940000
2397,0.900000,0.980000,0.920,0.945000,0.689000,0.591667,0.966667,0.971667,0.99,0.98,0.775000
2398,0.953333,0.976667,0.900,0.946667,0.732500,0.646667,0.940000,0.960000,0.98,0.98,0.663333
