# 3 - Many encodings, many algorithms, and ensembles.


In the last example we used HIV1 protease inhibition data to compare the QSAR modelling performance when using different descriptors, why did we not iterate through ML algorithms? Let's do that here!

You will then learn that using the pooled output of many models can often outperform any given model, this has the term " an ensemble model".

### Import libraries

In [1]:
# general
import os
from multiprocessing import Pool
import random

# data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# cheminformatics
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import deepchem as dc

2023-06-06 22:32:16.248334: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-06 22:32:18.862747: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-06 22:32:18.872668: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Skipped loading some PyTorch models, missing a dependency. No module named 'torch'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch'
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'torch'
Skipped loading some Jax models, missing a dependency. jax requires jaxlib to be installed. See https://github.com/google/j

### Data cleaning as previously

In [2]:
df = pd.read_table('lib/data/2_hiv_protease.tsv')
# some entries are missing label values
df = df[df['Standard Value'] > 0]
# there are duplicate values, let's get the mean Ki per smiles
df = pd.DataFrame( df.groupby(['Smiles'])['Standard Value'].mean() )
# now the Ki values are very diverse, let's take the log10 transform
data = {'smiles': df['Standard Value'].index,
        'pKi': np.log10(df['Standard Value'].values)}
df = pd.DataFrame(data)
df

Unnamed: 0,smiles,pKi
0,C#CCN1C(=O)N(CC#C)[C@H](Cc2ccccc2)[C@@H]2OC(C)...,1.342423
1,C#CCN1C(=O)N(CC#C)[C@H](Cc2ccccc2)[C@H](O)[C@@...,1.342423
2,C/C(=C\C(=O)N(C)[C@@H](Cc1ccccc1)[C@H](O)CN(Cc...,1.720490
3,C/C(=N/O)c1cccc(CN2C(=O)N(Cc3cccc(/C(C)=N/O)c3...,-1.744727
4,C/C(=N/O)c1cccc(CN2[C@H](COc3ccccc3)[C@H](O)[C...,0.531479
...,...,...
2524,O[C@H]1C[C@H](Cc2ccccc2)[C@H](O)[C@@H](Cc2cccc...,6.301030
2525,S=C(NCCOc1nc2cc(Cl)ccc2n2cccc12)Nc1ccc(Br)cn1,1.845098
2526,[N-]=[N+]=N[C@@H](Cc1ccccc1)[C@@H]1[C@@H](O)[C...,2.518514
2527,[N-]=[N+]=Nc1ccc(S(=O)(=O)Nc2cccc(C(c3c(O)oc4c...,0.414973


In [3]:
# define helper functions to help calculate vectors in parallel
def smiles_2_rdkitDescr(smiles):
    featurizer = dc.feat.RDKitDescriptors()
    return(featurizer(smiles).flatten())

def smiles_2_ecfp3(smiles):
    mol = Chem.MolFromSmiles(smiles)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol,3,nBits=1024,useFeatures=True)
    arr = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return(arr)

def smiles_2_ecfp4(smiles):
    mol = Chem.MolFromSmiles(smiles)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol,4,nBits=1024,useFeatures=True)
    arr = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return(arr)

def smiles_2_MACCS(smiles):
    featurizer = dc.feat.MACCSKeysFingerprint()
    return(featurizer(smiles).flatten())

def smiles_2_mordredDescr(smiles):
    featurizer = dc.feat.MordredDescriptors(ignore_3D=False)
    return(featurizer(smiles).flatten())  

In [10]:
# Now we generate the vector of vectors for each type of descriptor
df = df[0:200]
y = df['pKi']

# rdkit descriptors - 208 total
with Pool(processes=os.cpu_count()) as p:
    x_rdkit = np.stack(p.map(smiles_2_rdkitDescr, df.smiles.values))

# Extended fingerprint
with Pool(processes=os.cpu_count()) as p:
    x_ecfp3 = np.stack(p.map(smiles_2_ecfp3, df.smiles.values))

with Pool(processes=os.cpu_count()) as p:
    x_ecfp4 = np.stack(p.map(smiles_2_ecfp4, df.smiles.values))

# MACCS
with Pool(processes=os.cpu_count()) as p:
    x_MACCS = np.stack(p.map(smiles_2_MACCS, df.smiles.values))

# Mordred - commented out to save time
#with Pool(processes=os.cpu_count()) as p:
#    x_mordred = np.stack(p.map(smiles_2_mordredDescr, df.Smiles.values))

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

dict_algorithm = {
    "LR": LinearRegression(),
    "KNN": KNeighborsRegressor(),
    "DT": DecisionTreeRegressor(max_depth=8),
    "RF": RandomForestRegressor(),
    "xgb3": xgb.XGBRegressor(n_estimators=200, max_depth=3),
    "xgb4": xgb.XGBRFRegressor(n_estimators=200, max_depth=4),
}

dict_feats = {
    'x_rdkit': x_rdkit,
    'x_ecfp3': x_ecfp3,
    'x_ecfp4': x_ecfp4,
    'x_MACCS': x_MACCS,
    #'x_mordred': x_mordred,    # commented out to save time
}

print('feature - algorithm - RMSE - r2')

resample_num = 10

# for each molvector type
for f, f_df in dict_feats.items():

    # for each algorithm
    for m, model_instantiation in dict_algorithm.items():

        #traack metrics
        rmse_total = 0
        r2_total = 0

        for rep in range(resample_num):
            
            # each resmaple produces a novel dataset and model
            x_train, x_test, y_train, y_test = train_test_split(
                f_df,
                y,
                test_size=0.3,
                random_state=random.randint(1,999))
            model_instantiation.fit(x_train, y_train)

            # tot up metrics
            test_preds = model_instantiation.predict(x_test)
            col1 = f
            col2 = m
            rmse_total = rmse_total + mean_squared_error(y_test, test_preds)
            r2_total = r2_total + r2_score(y_test, test_preds)
        
        # get mean summary stats
        mean_rmse = rmse_total / resample_num
        mean_r2 = r2_total / resample_num

        newrow = pd.DataFrame( [[col1, col2, mean_rmse, mean_r2]] ,columns=['feat', 'model', 'RMSE', 'r2']) 
        print(newrow.to_string(index=False, header=False))

feature - algorithm - RMSE - r2
x_rdkit LR 9.227669e+19 -1.135705e+19
x_rdkit KNN 8.119523 -0.087871
x_rdkit DT 8.778026 -0.699454
x_rdkit RF 5.685718 0.19222
x_rdkit xgb3 6.503069 0.000019
x_rdkit xgb4 7.019775 0.152754
x_ecfp3 LR 2.955073e+27 -3.895561e+26
x_ecfp3 KNN 4.745328 0.425525
x_ecfp3 DT 6.523491 0.233176
x_ecfp3 RF 4.658947 0.276264
x_ecfp3 xgb3 5.980744 0.024374
x_ecfp3 xgb4 5.185611 0.365605
x_ecfp4 LR 3.008078e+27 -4.928348e+26
x_ecfp4 KNN 4.132746 0.419916
x_ecfp4 DT 7.740574 0.016389
x_ecfp4 RF 4.960041 0.29256
x_ecfp4 xgb3 6.209366 0.124537
x_ecfp4 xgb4 5.068801 0.286116
x_MACCS LR 1.841422e+25 -3.307634e+24
x_MACCS KNN 5.63692 0.22788
x_MACCS DT 9.493749 -0.205244
x_MACCS RF 5.660772 0.244222
x_MACCS xgb3 7.104456 0.080607
x_MACCS xgb4 6.139033 0.209061


### So what did we learn?

Here we took a problem, predicting the activity of HIV1 protease inhibitors. And using a set of classic ML QSAR models, tried to find the optimal mix of encoding and algorithm for a QSAR model.

In this example the stochastic nature of the test:train split between different runs means each of us may have a different best model. I actually found KNN with ecfp4 the best, this may be as the data has a clumpy chemical space, with groups of active frameworks, and less active frameworks, but more on visualising that another time.   

What happens if you increase the data size from 200 to 500, or 1000? does the best algorithm always stay the same?



Now this is fairly comprehensive as far as classicML goes, but we can do a little better. There's one final concept to introduce that will / should improve our model, and that's combining models!

### The endgame of classicML, the ensemble model

Ensemble tree models, expand on decison trees. Rather thatn rely one "well" trained tree, they rely on many sub-optimal tree's then collate the output. Think of this as a medical problem, rather than taking one consultants opinion you instead take the average opinion of a diverse pool of junior doctors. Now why  stop your analogy there? If we asked for the opinions of many junior doctors across many hospitals, and totted dup their votes we could (should?) achieve better predictive power. This is what ensemble models (as in a model of models) try to do, Let's build one here!

Now what we will do, it take the best performing models from above, re-run them, but collate their outputs.
Let's use the xgb4 algorthm, and sum over all the different descriptors.

In [None]:
dict_algorithm = {
    "KNN": KNeighborsRegressor(),
}

dict_feats = {
    'x_rdkit': x_rdkit,
    'x_ecfp3': x_ecfp3,
    'x_ecfp4': x_ecfp4,
    'x_MACCS': x_MACCS,
    #'x_mordred': x_mordred,
}

print(' RMSE - r2')

resample_num = 5

for rep in range(resample_num):
    i = 0

    #traack metrics
    rmse_total = 0
    r2_total = 0

    r_state = random.randint(1,999)
         
    # for each molvector type
    for f, f_df in dict_feats.items():

        # each resmaple produces a novel dataset and model
        x_train, x_test, y_train, y_test = train_test_split(
            f_df,
            y,
            test_size=0.3,
            random_state=r_state)

        # for each algorithm
        for m, model_instantiation in dict_algorithm.items():

            # get the sum of prediction value per test point
            model_instantiation.fit(x_train, y_train)
            if i == 0:
                test_preds = model_instantiation.predict(x_test)
            else:
                test_preds = test_preds + model_instantiation.predict(x_test)
            i = i+1
        
    # now get the mean predicted value per test datapoint
    test_preds = test_preds / len(dict_feats)

    rmse_total = rmse_total + mean_squared_error(y_test, test_preds)
    r2_total = r2_total + r2_score(y_test, test_preds)
        
    # get mean summary stats
    mean_rmse = rmse_total
    mean_r2 = r2_total 

    newrow = pd.DataFrame( [[mean_rmse, mean_r2]] ,columns=['RMSE', 'r2']) 
    print(newrow.to_string(index=False, header=False))

 RMSE - r2
4.478775 0.49514
3.93618 0.501792
2.570388 0.607279
4.560828 0.51086
4.98882 0.376435


And there we have it, you can see that ensemble models, where we take the pooled output from many different models, is (or normally should be) an improvement on any given model. 


Try tinkering with the above code block, can you find the best combination?