In [None]:
import rdkit

In [27]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, make_scorer

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

# Context

<p>  This concept of pipelining  + gridSearch was asked in an interview assignment in 2020.<br> The Concept is easy to forget when you do not use Scikit-Learn.
<br> This work can easily be extended to RandomSearch or even <b>BayesianSearch<b>.
<br> The company is specialized in chemoinformatics<p>

### Custom Transformers

Although Scikit-Learn provides many useful transformers, you will need to write
your own for tasks such as custom cleanup operations or combining specific
attributes. 

1. Transform must integrate seamlessly with Scikit-Learn Pipelines

<i>You will want your transformer to work seamlessly with Scikit-Learn functionalities (such as pipelines), and since Scikit-Learn relies on duck typing (not inheritance)</i>

 2. Transform must have 3 methods explicitly defined

<i>All you need is to create a class and implement three methods:</i>
    - fit() (returning self)
    - transform()
    - and fit_transform()
    
<i>You can get the last one for <b>free</b> by simply adding TransformerMixin as a base class.</i>

 3. Tricks with Inheritance

Also, if you add BaseEstimator as a base class 
(and avoid *args and **kargs in your constructor) 

you will get two extra methods (get_params() and set_params()) 
that will be useful for automatic hyperparameter tuning. 
For example, here is a small transformer class that adds
the combined attributes we discussed earlier:

4. Example of Transformer:

In [177]:
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    
    # initialization of the class (with boolean add_bedrooms_per_room)
    def __init__(self, my_add_bedrooms_per_room = True): # no *args or **kargs
        # when initialized we set the property add_bedrooms_per_room 
        # to the value passed when initialized
        self.add_bedrooms_per_room = my_add_bedrooms_per_room
    
    def fit(self, X, y=None):
        '''do not implement anything here because there is no learning'''
        return self # nothing else to do
    
    # here is the magic
    def transform(self, X, y=None):
    
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household, bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

In [178]:
from sklearn.pipeline import Pipeline 
from sklearn.base import BaseEstimator, TransformerMixin

In [239]:
class MorganFreeman(BaseEstimator, TransformerMixin):
    '''
    Transformer operation which output the MorganFingerPrints
    (Chemistry descriptors)
    '''
    def __init__(self, input_morgan_fingerprint_size = 512, input_radius = 1): # no *args or **kargs
        self.input_morgan_fingerprint_size = input_morgan_fingerprint_size
        self.input_radius = input_radius
    
    def fit(self, X, y=None):
        return self # nothing else to do
    
    def transform(self, X, y=None):
        X = AllChem.GetMorganFingerprintAsBitVect(X, self.radius,self.morgan_fingerprint_size)
        return X
    

In [242]:
mg = MorganFreeman(512,2)
mg.input_radius

2

In [243]:
from sklearn.linear_model import LogisticRegression, SGDClassifier

In [None]:
# the idea is to run gridsearches for each model
# and consolidate in a final table

In [244]:
def build_pipeline(classifier):
    '''
    build pipeline with classifier as input
    '''
    # Defining the steps in the categorical pipeline 
    categorical_pipeline = Pipeline( steps = [ ('fingerprint', MorganFreeman()),
                                         ('classifier', classifier)
                                         ])
    return categorical_pipeline

In [245]:
def build_grid_search(pipeline, param_grid, cv_folds=5):
    '''
    build gridsearchCV function with pipeline, param_grid, cv_folds will default to 5
    '''
    return GridSearchCV(pipeline, param_grid, cv=cv_folds, return_train_score=True, refit='accuracy',
                        scoring={ 'accuracy': make_scorer(accuracy_score),
                                  'precision': make_scorer(precision_score)
                                },
                        verbose=1)

In [246]:
def pretty_cv_results(cv_results, 
                      sort_by='rank_test_accuracy',
                      sort_ascending=True,
                      n_rows=5):
    '''
    utility function which displays the results namely: 
    mean_train, mean_test and rank_
    TODO: parameterized the function
    '''
    df = pd.DataFrame(cv_results)
    
    cols_of_interest = [key for key in df.keys() if key.startswith('param_') 
                        or key.startswith('mean_train') 
                        or key.startswith('mean_test_')
                        or key.startswith('rank')]
    return df.loc[:, cols_of_interest].sort_values(by=sort_by, ascending=sort_ascending).head(n_rows)


In [247]:
def run_grid_search(grid_search, X_train, y_train):
    '''
    fit gridsearch on x_train and y_train and return the best score
    with the best parameters
    '''
    grid_search.fit(X_train, y_train)
    print('Best test score accuracy is:', grid_search.best_score_)
    return pretty_cv_results(grid_search.cv_results_)

In [248]:
# Be careful the parameters of the preprocessing step (fingerprint)
# should be refered to as pipelinekey__<input_param>

param_grid = [
    {'fingerprint__input_morgan_fingerprint_size' : [512],
     'fingerprint__input_radius' :[1],

    'classifier': [SGDClassifier(loss='log', tol=None, random_state=42)],
      'classifier__alpha': np.logspace(-5, -3, 3),
      'classifier__penalty': ['l2'],
      'classifier__max_iter': [20],
    }
]

# dummy data
X_train = [[1],[2],[3],[4],[5],[6],[7],[4],[5],[6],[7]]
y_train = [[0],[1],[0],[0],[1],[0],[0],[1],[0],[1],[1]]

<p>We will iterate over classifiers we believe make sense for our problem<br>
They will be classifiers (because we are solving  a classification problem)</p>

##  1. SGD classifier

In [249]:
cls = SGDClassifier

    a) Execute the pipeline with the classifier as argument

In [254]:
# pipeline verification
pipeline = build_pipeline(cls)

In [251]:
#fit the pipeline to X_train , y_train
#in this case no parameter optimisation
pipeline.fit(X_train, y_train)

NameError: name 'AllChem' is not defined

In [252]:
# define gridsearch over the pipeline and the param_grid
sgd_grid_search = build_grid_search(pipeline=build_pipeline(cls), param_grid=param_grid)

# run gridsearch over X_train, y_train
sgd_cv = run_grid_search(sgd_grid_search, X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 292, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/opt/anaconda3/lib/python3.8/site-packages/joblib/memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/opt/anaconda3/lib/python3.8/site-packages/sklearn/base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transfo

NameError: name 'AllChem' is not defined