# Another summer out of office notebook with TPOT

<div align="center"><img src="https://raw.githubusercontent.com/EpistasisLab/tpot/master/images/tpot-logo.jpg" alt="drawing" width="300"/></div>
TPOT is a Python Automated Machine Learning tool that optimizes machine learning pipelines using genetic programming.

You can find more about it <a href="https://epistasislab.github.io/tpot/">here</a>.

Let's see how to use it in this playground to have the machines do most of the work while we enjoy the summer time !

<u>*Side note:*</u> 

*TPOT uses genetic programming to find the best model. This takes time: it assesses a population of candidate solutions, then adds new solutions or mixes existing ones together to try to obtain better solutions at each generation. I've seen people keep TPOT running for a couple of days.*

*So my first intent was to run the notebook for close to the maximum time allowed before the notebook times out, then serialize the TPOT model and save it in a dataset so that I could run the notebook again. That way I could resume the search for solutions where it left off.*

*Unfortunately, as far as I could find, TPOT models can't be serialized for later re-use. So I can't re-run the notebook over and over to find a better solution.*

*See the feature request for this, which has been open since about 2019: <a href="https://github.com/EpistasisLab/tpot/issues/977">TPOT GitHub Repo - issue #977</a>*

In [None]:
# first install the TPOT package
# (notebook shell escape: the leading "!" runs the rest of the line as a shell command)
! pip install tpot

In [None]:
# let's import the needed packages

import numpy as np 
import pandas as pd
import os

import tpot

In [None]:
# Load the competition data.
# Both CSV files are read with their first column as the index; the two
# frames are then stacked into df_all so that every value seen in either
# split is available when the encoders are fitted.

root = os.path.join("..","input","tabular-playground-series-aug-2022")

train_path = os.path.join(root, "train.csv")
test_path = os.path.join(root, "test.csv")

df_train = pd.read_csv(train_path, index_col=0)
df_test = pd.read_csv(test_path, index_col=0)

frames = [df_train.copy(), df_test.copy()]
df_all = pd.concat(frames)

In [None]:
# for TPOT, we need to give only numerical features
# so we will now encode text features

from sklearn.preprocessing import LabelEncoder

def create_encoders(df, col):
    """Fit a LabelEncoder on one column and overwrite the column with its codes.

    Args:
        df: DataFrame containing ``col``; it is modified in place.
        col: Name of the column to encode.

    Returns:
        A tuple ``(encoder, df)``: the fitted LabelEncoder and the same
        DataFrame object with ``col`` replaced by the encoded labels.
    """
    raw_values = df[col].values
    encoder = LabelEncoder().fit(raw_values)
    df[col] = encoder.transform(raw_values)
    return encoder, df

def apply_encoders(encoders, df):
    """Encode every column of ``df`` for which an encoder is available.

    Args:
        encoders: Mapping of column name to an object exposing ``transform``.
        df: DataFrame to encode; it is modified in place.

    Returns:
        The same DataFrame object. Columns named in ``encoders`` but absent
        from ``df`` are skipped; columns without an encoder are left as is.
    """
    for col, encoder in encoders.items():
        if col not in df.columns:
            continue
        df[col] = encoder.transform(df[col].values)

    return df

# Fit one encoder per text (object dtype) column, using train and test
# together so that the encoders know every value present in either split.
encoders = {}
df = df_all.copy()
d_unique = df_all.nunique().to_dict()
text_columns = [name for name in d_unique if df_all[name].dtype == "object"]
for name in text_columns:
    encoders[name], df = create_encoders(df.copy(), name)

# Encode copies of the train and test frames with the fitted encoders.
df_test = apply_encoders(encoders, df_test.copy())
df_train = apply_encoders(encoders, df_train.copy())

In [None]:
# Search space handed to TPOT through its `config_dict` argument.
# Each key is the import path of an operator; each value maps a parameter
# name to the list (or range / array) of candidate values to explore.
#
# The space is restricted primarily to avoid models that do not provide the
# "predict_proba" method (needed later to produce probabilities),
# but also to refine the base models settings.

classifier_config_dict = {

    # Classifiers
    'sklearn.naive_bayes.GaussianNB': {
    },

    'sklearn.naive_bayes.BernoulliNB': {
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
        'fit_prior': [True, False]
    },

    'sklearn.naive_bayes.MultinomialNB': {
        'alpha': [1e-3, 1e-2, 1e-1, 1., 10., 100.],
        'fit_prior': [True, False]
    },

    'sklearn.tree.DecisionTreeClassifier': {
        'criterion': ["gini", "entropy"],
        'max_depth': range(1, 11),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21)
    },

    'sklearn.ensemble.ExtraTreesClassifier': {
        'n_estimators': [50, 100, 200],
        'criterion': ["gini", "entropy"],
        'max_features': np.arange(0.05, 1.01, 0.05),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21),
        'bootstrap': [True, False]
    },

    'sklearn.ensemble.RandomForestClassifier': {
        'n_estimators': [50, 100, 200],
        'criterion': ["gini", "entropy"],
        'max_features': np.arange(0.05, 1.01, 0.05),
        'min_samples_split': range(2, 21),
        'min_samples_leaf':  range(1, 21),
        'bootstrap': [True, False]
    },

    'sklearn.ensemble.GradientBoostingClassifier': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
        'max_depth': range(1, 11),
        'min_samples_split': range(2, 21),
        'min_samples_leaf': range(1, 21),
        'subsample': np.arange(0.05, 1.01, 0.05),
        'max_features': np.arange(0.05, 1.01, 0.05)
    },

    'sklearn.neighbors.KNeighborsClassifier': {
        'n_neighbors': range(1, 101),
        'weights': ["uniform", "distance"],
        'p': [1, 2]
    },

    # 'sklearn.svm.LinearSVC' is deliberately left out of the search space:
    # no predict_proba() method is available for this model.

    'sklearn.linear_model.LogisticRegression': {
        'penalty': ["l1", "l2"],
        'C': [1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1., 5., 10., 15., 20., 25.],
        'dual': [True, False]
    },

    'xgboost.XGBClassifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': range(1, 11),
        'learning_rate': [1e-3, 1e-2, 1e-1, 0.5, 1.],
        'subsample': np.arange(0.05, 1.01, 0.05),
        'min_child_weight': range(1, 21),
        'n_jobs': [1],
        'verbosity': [0]
    },

    'sklearn.linear_model.SGDClassifier': {
        # Only the 'log' and 'modified_huber' losses expose predict_proba();
        # 'hinge', 'squared_hinge' and 'perceptron' are excluded so that the
        # selected pipeline can always produce probabilities.
        # NOTE(review): scikit-learn >= 1.1 renames 'log' to 'log_loss' --
        # confirm against the installed version.
        'loss': ['log', 'modified_huber'],
        'penalty': ['elasticnet'],
        'alpha': [0.0, 0.01, 0.001],
        'learning_rate': ['invscaling', 'constant'],
        'fit_intercept': [True, False],
        'l1_ratio': [0.25, 0.0, 1.0, 0.75, 0.5],
        'eta0': [0.1, 1.0, 0.01],
        'power_t': [0.5, 0.0, 1.0, 0.1, 100.0, 10.0, 50.0]
    },

    'sklearn.neural_network.MLPClassifier': {
        'alpha': [1e-4, 1e-3, 1e-2, 1e-1],
        'learning_rate_init': [1e-3, 1e-2, 1e-1, 0.5, 1.]
    },

    # Preprocessors
    'sklearn.preprocessing.Binarizer': {
        'threshold': np.arange(0.0, 1.01, 0.05)
    },

    'sklearn.decomposition.FastICA': {
        'tol': np.arange(0.0, 1.01, 0.05)
    },

    'sklearn.cluster.FeatureAgglomeration': {
        'linkage': ['ward', 'complete', 'average'],
        'affinity': ['euclidean', 'l1', 'l2', 'manhattan', 'cosine']
    },

    'sklearn.preprocessing.MaxAbsScaler': {
    },

    'sklearn.preprocessing.MinMaxScaler': {
    },

    'sklearn.preprocessing.Normalizer': {
        'norm': ['l1', 'l2', 'max']
    },

    'sklearn.kernel_approximation.Nystroem': {
        'kernel': ['rbf', 'cosine', 'chi2', 'laplacian', 'polynomial', 'poly', 'linear', 'additive_chi2', 'sigmoid'],
        'gamma': np.arange(0.0, 1.01, 0.05),
        'n_components': range(1, 11)
    },

    'sklearn.decomposition.PCA': {
        'svd_solver': ['randomized'],
        'iterated_power': range(1, 11)
    },

    'sklearn.preprocessing.PolynomialFeatures': {
        'degree': [2],
        'include_bias': [False],
        'interaction_only': [False]
    },

    'sklearn.kernel_approximation.RBFSampler': {
        'gamma': np.arange(0.0, 1.01, 0.05)
    },

    'sklearn.preprocessing.RobustScaler': {
    },

    'sklearn.preprocessing.StandardScaler': {
    },

    'tpot.builtins.ZeroCount': {
    },

    'tpot.builtins.OneHotEncoder': {
        'minimum_fraction': [0.05, 0.1, 0.15, 0.2, 0.25],
        'sparse': [False],
        'threshold': [10]
    },

    # Selectors
    'sklearn.feature_selection.SelectFwe': {
        'alpha': np.arange(0, 0.05, 0.001),
        'score_func': {
            'sklearn.feature_selection.f_classif': None
        }
    },

    'sklearn.feature_selection.SelectPercentile': {
        'percentile': range(1, 100),
        'score_func': {
            'sklearn.feature_selection.f_classif': None
        }
    },

    'sklearn.feature_selection.VarianceThreshold': {
        'threshold': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.2]
    },

    'sklearn.feature_selection.RFE': {
        'step': np.arange(0.05, 1.01, 0.05),
        'estimator': {
            'sklearn.ensemble.ExtraTreesClassifier': {
                'n_estimators': [100],
                'criterion': ['gini', 'entropy'],
                'max_features': np.arange(0.05, 1.01, 0.05)
            }
        }
    },

    'sklearn.feature_selection.SelectFromModel': {
        'threshold': np.arange(0, 1.01, 0.05),
        'estimator': {
            'sklearn.ensemble.ExtraTreesClassifier': {
                'n_estimators': [100],
                'criterion': ['gini', 'entropy'],
                'max_features': np.arange(0.05, 1.01, 0.05)
            }
        }
    },

    # PyTorch-based classifiers shipped with TPOT
    # NOTE(review): confirm that these estimators expose predict_proba();
    # if they do not, they should be removed like LinearSVC.
    'tpot.builtins.PytorchLRClassifier': {
        'learning_rate': [1e-3, 1e-2, 1e-1],
        'batch_size': [4, 8, 16, 32, 64, 128, 256, 512, 1024],
        'num_epochs': [10, 25, 50, 100, 250],
        'weight_decay': [0, 1e-4, 1e-3, 1e-2]
    },

    'tpot.builtins.PytorchMLPClassifier': {
        'learning_rate': [1e-3, 1e-2, 1e-1],
        'batch_size': [4, 8, 16, 32, 64, 128, 256, 512],
        'num_epochs': [10, 25, 50, 100, 250],
        'weight_decay': [0, 1e-4, 1e-3, 1e-2]
    },
}

In [None]:
# Instantiate the TPOT classifier.
# TPOT will generate an initial population,
# then refresh it and apply mutations to its members.
# A checkpoint folder is specified so that TPOT stores the best models
# found at each generation; it is interesting to look at what is investigated.

tpot_settings = {
    "scoring": "roc_auc",
    "verbosity": 2,
    "population_size": 100,
    "offspring_size": 50,
    "warm_start": True,
    "generations": 10000,
    "cv": 5,
    "n_jobs": -1,
    "periodic_checkpoint_folder": "tpot",
    "max_time_mins": 11 * 60,
    "max_eval_time_mins": 7,
    "config_dict": classifier_config_dict,
    "template": 'Selector-Transformer-Classifier',
}

model_tpot = tpot.TPOTClassifier(**tpot_settings)

In [None]:
# Fit the TPOT model: every column except "failure" is a feature,
# and "failure" is the label.

features = []
for column in df_train.columns:
    if column != "failure":
        features.append(column)

X_train = df_train[features].values
y_train = df_train["failure"].values
model_tpot.fit(X_train, y_train)

In [None]:
# 0.5947982151377031

In [None]:
# export the selected model as a file and have a quick glance at it

model_tpot.export('model_tpot.py')

! cat model_tpot.py

In [None]:
# Predict on the test set with the optimized pipeline.
# Only the second column of the probability matrix is kept
# (presumably the probability of failure == 1 -- verify the class order).

test_matrix = df_test[features].values
class_probabilities = model_tpot.predict_proba(test_matrix)
df_test["pred"] = class_probabilities[:, 1]

In [None]:
# Build the submission file: start from the sample submission, attach the
# predictions by index, copy them into the "failure" column and save it.

sample_path = os.path.join(root, "sample_submission.csv")
df_submission = pd.read_csv(sample_path, index_col=0).join(df_test["pred"])

df_submission["failure"] = df_submission["pred"]
df_submission["failure"].to_csv("submission.csv")

# last expression of the cell: displays the submitted values
df_submission["failure"]

In [None]:
# Quick look at how the predicted values are distributed (100-bin histogram).

predictions = df_submission["failure"]
predictions.hist(bins=100)

In [None]:
nan