## Traditional ML pipeline

In [10]:
# Configuration
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import modules
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# Load MNIST (simplified) dataset
digits = load_digits()

# Define predictors (features) and target
X = digits["data"]
y = digits["target"]

print(X)
print(y)

print(X.shape)
print(y.shape)

[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]
[0 1 2 ... 8 9 8]
(1797, 64)
(1797,)


In [3]:
import numpy as np
np.unique(y, return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([178, 182, 177, 183, 181, 182, 181, 179, 174, 180]))

In [4]:
# Train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state=42)

In [5]:
# Fit a Logistic regression classifier
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9622222222222222

In [6]:
# Print (default) model parameters
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Automated ML pipeline with TPOT

In [7]:
from tpot import TPOTClassifier

clf = TPOTClassifier(verbosity = 2,
                     n_jobs = -1,
                     generations = 5,
                     config_dict="TPOT light")

clf.fit(X_train, y_train)

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=600.0, style=ProgressStyle(de…

Generation 1 - Current best internal CV score: 0.9814484372848685
Generation 2 - Current best internal CV score: 0.9851576483546743
Generation 3 - Current best internal CV score: 0.9851576483546743
Generation 4 - Current best internal CV score: 0.9851576483546743
Generation 5 - Current best internal CV score: 0.9851576483546743

Best pipeline: KNeighborsClassifier(input_matrix, n_neighbors=7, p=2, weights=distance)


TPOTClassifier(config_dict='TPOT light', crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=None, generations=5,
               max_eval_time_mins=5, max_time_mins=None, memory=None,
               mutation_rate=0.9, n_jobs=-1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=100,
               random_state=None, scoring=None, subsample=1.0, template=None,
               use_dask=False, verbosity=2, warm_start=False)

In [8]:
clf.score(X_test, y_test)

0.98

**Configuration Space** <br/>
Documentation: https://epistasislab.github.io/tpot/using/#built-in-tpot-configurations