# 3 - Many encodings, many algorithms, and ensembles.


In the last example 23 used HIV1 protease inhibition data to compare the QSAR modelling performance when using different descriptors, why did we not iterate through ML algorithms? Let's do that here!

You will then learn that using the pooled output of many models can often outperform any given model, this has the term " an ensemble model".

### Import libraries

In [15]:
# general
import os
from multiprocessing import Pool
import random

# data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# cheminformatics
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import deepchem as dc

### Data cleaning as previously

In [16]:
df = pd.read_table('lib/data/2_hiv_protease.tsv')
# some entries are missing label values
df = df[df['Standard Value'] > 0]
# there are duplicate values, let's get the mean Ki per smiles
df = pd.DataFrame( df.groupby(['Smiles'])['Standard Value'].mean() )
# now the Ki values are very diverse, let's take the log10 transform
data = {'smiles': df['Standard Value'].index,
        'pKi': np.log10(df['Standard Value'].values)}
df = pd.DataFrame(data)
df

Unnamed: 0,smiles,pKi
0,C#CCN1C(=O)N(CC#C)[C@H](Cc2ccccc2)[C@@H]2OC(C)...,1.342423
1,C#CCN1C(=O)N(CC#C)[C@H](Cc2ccccc2)[C@H](O)[C@@...,1.342423
2,C/C(=C\C(=O)N(C)[C@@H](Cc1ccccc1)[C@H](O)CN(Cc...,1.720490
3,C/C(=N/O)c1cccc(CN2C(=O)N(Cc3cccc(/C(C)=N/O)c3...,-1.744727
4,C/C(=N/O)c1cccc(CN2[C@H](COc3ccccc3)[C@H](O)[C...,0.531479
...,...,...
2524,O[C@H]1C[C@H](Cc2ccccc2)[C@H](O)[C@@H](Cc2cccc...,6.301030
2525,S=C(NCCOc1nc2cc(Cl)ccc2n2cccc12)Nc1ccc(Br)cn1,1.845098
2526,[N-]=[N+]=N[C@@H](Cc1ccccc1)[C@@H]1[C@@H](O)[C...,2.518514
2527,[N-]=[N+]=Nc1ccc(S(=O)(=O)Nc2cccc(C(c3c(O)oc4c...,0.414973


In [17]:
# define helper functions to help calculate vectors in parallel
def smiles_2_rdkitDescr(smiles):
    featurizer = dc.feat.RDKitDescriptors()
    return(featurizer(smiles).flatten())

def smiles_2_ecfp3(smiles):
    mol = Chem.MolFromSmiles(smiles)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol,3,nBits=1024,useFeatures=True)
    arr = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return(arr)

def smiles_2_ecfp4(smiles):
    mol = Chem.MolFromSmiles(smiles)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol,4,nBits=1024,useFeatures=True)
    arr = np.zeros((1,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return(arr)

def smiles_2_MACCS(smiles):
    featurizer = dc.feat.MACCSKeysFingerprint()
    return(featurizer(smiles).flatten())

def smiles_2_mordredDescr(smiles):
    featurizer = dc.feat.MordredDescriptors(ignore_3D=False)
    return(featurizer(smiles).flatten())  

In [18]:
# Now we generate the vector of vectors for each type of descriptor
df = df[0:200]
y = df['pKi']

# rdkit descriptors - 208 total
with Pool(processes=os.cpu_count()) as p:
    x_rdkit = np.stack(p.map(smiles_2_rdkitDescr, df.smiles.values))

# Extended fingerprint
with Pool(processes=os.cpu_count()) as p:
    x_ecfp3 = np.stack(p.map(smiles_2_ecfp3, df.smiles.values))

with Pool(processes=os.cpu_count()) as p:
    x_ecfp4 = np.stack(p.map(smiles_2_ecfp4, df.smiles.values))

# MACCS
with Pool(processes=os.cpu_count()) as p:
    x_MACCS = np.stack(p.map(smiles_2_MACCS, df.smiles.values))

# Mordred - commented out to save time
#with Pool(processes=os.cpu_count()) as p:
#    x_mordred = np.stack(p.map(smiles_2_mordredDescr, df.Smiles.values))

In [25]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

dict_algorithm = {
    "LR": LinearRegression(),
    "KNN": KNeighborsRegressor(),
    "DT": DecisionTreeRegressor(max_depth=8),
    "RF": RandomForestRegressor(),
    "xgb3": xgb.XGBRegressor(n_estimators=200, max_depth=3),
    "xgb4": xgb.XGBRFRegressor(n_estimators=200, max_depth=4),
    "xgb5": xgb.XGBRFRegressor(n_estimators=200, max_depth=5),
}

dict_feats = {
    'x_rdkit': x_rdkit,
    'x_ecfp3': x_ecfp3,
    'x_ecfp4': x_ecfp4,
    'x_MACCS': x_MACCS,
    #'x_mordred': x_mordred,    # commented out to save time
}

print('feature - algorithm - RMSE - r2')

resample_num = 10

# for each molvector type
for f, f_df in dict_feats.items():

    # for each algorithm
    for m, model_instantiation in dict_algorithm.items():

        #traack metrics
        rmse_total = 0
        r2_total = 0

        for rep in range(resample_num):
            
            # each resmaple produces a novel dataset and model
            x_train, x_test, y_train, y_test = train_test_split(
                f_df,
                y,
                test_size=0.3,
                random_state=random.randint(1,999))
            model_instantiation.fit(x_train, y_train)

            # tot up metrics
            test_preds = model_instantiation.predict(x_test)
            col1 = f
            col2 = m
            rmse_total = rmse_total + mean_squared_error(y_test, test_preds)
            r2_total = r2_total + r2_score(y_test, test_preds)
        
        # get mean summary stats
        mean_rmse = rmse_total / resample_num
        mean_r2 = r2_total / resample_num

        newrow = pd.DataFrame( [[col1, col2, mean_rmse, mean_r2]] ,columns=['feat', 'model', 'RMSE', 'r2']) 
        print(newrow.to_string(index=False, header=False))

feature - algorithm - RMSE - r2
x_rdkit LR 1.867885e+21 -2.627160e+20
x_rdkit KNN 8.969781 -0.052794
x_rdkit DT 10.086065 -0.328299
x_rdkit RF 5.321819 0.251444
x_rdkit xgb3 6.968863 0.108608
x_rdkit xgb4 6.684751 0.239276
x_rdkit xgb5 7.041868 0.089502
x_ecfp3 LR 1.502057e+27 -3.219055e+26
x_ecfp3 KNN 4.667032 0.319367
x_ecfp3 DT 6.871945 -0.00026
x_ecfp3 RF 4.898402 0.24375
x_ecfp3 xgb3 6.886354 0.061699
x_ecfp3 xgb4 6.222483 0.288439
x_ecfp3 xgb5 5.489033 0.3548
x_ecfp4 LR 2.437789e+27 -3.833814e+26
x_ecfp4 KNN 4.193992 0.422053
x_ecfp4 DT 6.99137 0.186972
x_ecfp4 RF 5.548642 0.368625
x_ecfp4 xgb3 6.200743 0.049698
x_ecfp4 xgb4 5.878062 0.097405
x_ecfp4 xgb5 5.527232 0.344559
x_MACCS LR 8.378004e+25 -7.127467e+24
x_MACCS KNN 5.651942 0.245493
x_MACCS DT 10.510051 -0.485777
x_MACCS RF 5.820756 0.247792
x_MACCS xgb3 7.615086 -0.047532
x_MACCS xgb4 5.854964 0.160063
x_MACCS xgb5 6.679641 0.069235


### So what did we learn?

Here we took a problem, predicting the activity of HIV1 protease inhibitors. And using a set of classic ML QSAR models, tried to find the optimal balance of encoding and algorithm in order to find the best model.
In this example the stochastic nature of the test:train split between different runs means each of us may have a different best model. I actually found KNN with ecfp4 the best, this may be as the data has a clumpy chemical space, with groups of active frameworks, and less active frameworks, but more on visualisation in another chapter.

How could we overcome this? Well by increasing the resample_num to a much higher number means we can get a much better resolution on the probability distributions of the performance metrics. Try this at home.


Now this is fairly comprehensive as far as classicML goes, but we can do a little better. 

### The endgame of classicML, the ensemble model

Ensemlbe tree models, expand on decison trees. Rather thatn rely one "well" trained tree, they rely on many sub-optimal tree's then collate the output. Think of this as a medical problem, rather than taking one consultants opinion you instead take the average opinion of a diverse pool of junior doctors. Now why  stop your analogy there? If we asked for the opinions of many junior doctors across many hospitals, and totted dup their votes we could (should?) achieve better predictive power. This is what ensemble models (as in a model of models) try to do, Let's build one here!

Now what we will do, it take the best performing models from above, re-run them, but collate their outputs.
Let's use the xgb4 algorthm, and sum over all the different descriptors.

In [33]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

dict_algorithm = {
    #"xgb3": xgb.XGBRFRegressor(n_estimators=200, max_depth=3),
    "KNN": KNeighborsRegressor(),
}

dict_feats = {
    'x_rdkit': x_rdkit,
    'x_ecfp3': x_ecfp3,
    'x_ecfp4': x_ecfp4,
    'x_MACCS': x_MACCS,
    #'x_mordred': x_mordred,
}

print(' RMSE - r2')

resample_num = 5



for rep in range(resample_num):
    i = 0

    #traack metrics
    rmse_total = 0
    r2_total = 0

    r_state = random.randint(1,999)
         
    # for each molvector type
    for f, f_df in dict_feats.items():

        # each resmaple produces a novel dataset and model
        x_train, x_test, y_train, y_test = train_test_split(
            f_df,
            y,
            test_size=0.3,
            random_state=r_state)

        # for each algorithm
        for m, model_instantiation in dict_algorithm.items():

            # get the sum of prediction value per test point
            model_instantiation.fit(x_train, y_train)
            if i == 0:
                test_preds = model_instantiation.predict(x_test)
            else:
                test_preds = test_preds + model_instantiation.predict(x_test)
            i = i+1
        
    # now get the mean predicted value per test datapoint
    test_preds = test_preds / len(dict_feats)

    rmse_total = rmse_total + mean_squared_error(y_test, test_preds)
    r2_total = r2_total + r2_score(y_test, test_preds)
        
    # get mean summary stats
    mean_rmse = rmse_total
    mean_r2 = r2_total 

    newrow = pd.DataFrame( [[mean_rmse, mean_r2]] ,columns=['RMSE', 'r2']) 
    print(newrow.to_string(index=False, header=False))

 RMSE - r2
4.478775 0.49514
3.93618 0.501792
2.570388 0.607279
4.560828 0.51086
4.98882 0.376435


And there we have it, you can see that ensemble models, where we take the pooled output from many different models, is (or normally should be) an improvement on any given model. 


Try tinkering with the above code block, can you find the best combination?