# TPOT for Classification
In this section, we use TPOT to search for the best-performing model for a classification task.

In [1]:
# import needed libraries
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedStratifiedKFold
from tpot import TPOTClassifier



In [2]:
# fetch the sonar CSV; header=None because the file is read without a header row
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
dataframe = read_csv(url, header=None)
# every column except the last is an input; the last column is the output
data = dataframe.values
X = data[:, :-1]
y = data[:, -1]
print(X.shape, y.shape)

(208, 60) (208,)


In [3]:
# minimal preparation: inputs become float32, outputs become integer class codes
X = X.astype('float32')
y = y.astype('str')
y = LabelEncoder().fit_transform(y)

In [4]:
# model evaluation scheme: stratified 10-fold CV, repeated 3 times, fixed seed
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=1,
)

In [5]:
# configure the TPOT search; candidates are scored by accuracy using `cv`
model = TPOTClassifier(
    generations=5,
    population_size=50,
    cv=cv,
    scoring='accuracy',
    verbosity=2,
    random_state=1,
    n_jobs=-1,
)

In [7]:
# run the search on the prepared inputs and labels
model.fit(X, y)
# save the best pipeline found as Python code
model.export('tpot_sonar_best_model.py')

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=300.0, style=ProgressStyle(de…


Generation 1 - Current best internal CV score: 0.8634920634920636
Generation 2 - Current best internal CV score: 0.8650000000000001
Generation 3 - Current best internal CV score: 0.8650000000000001
Generation 4 - Current best internal CV score: 0.8744444444444445
Generation 5 - Current best internal CV score: 0.8744444444444445
Best pipeline: KNeighborsClassifier(GradientBoostingClassifier(input_matrix, learning_rate=0.1, max_depth=5, max_features=0.7000000000000001, min_samples_leaf=18, min_samples_split=11, n_estimators=100, subsample=0.7500000000000001), n_neighbors=4, p=1, weights=distance)


Next, we fit the best-performing pipeline found by TPOT on the dataset and use it to make a prediction for a single row of new data.

In [10]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# reload the sonar data so this cell rebuilds X and y from the raw CSV
# (read_csv and LabelEncoder come from the imports in the first cell)
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
dataframe = read_csv(url, header=None)
# inputs are all columns but the last; the output is the last column
data = dataframe.values
X = data[:, :-1]
y = data[:, -1]
# minimal preparation: float32 inputs, integer-encoded class labels
X = X.astype('float32')
y = y.astype('str')
y = LabelEncoder().fit_transform(y)

# Average CV score on the training set was: 0.8744444444444445
exported_pipeline = make_pipeline(
    StackingEstimator(
        estimator=GradientBoostingClassifier(
            learning_rate=0.1,
            max_depth=5,
            max_features=0.7000000000000001,
            min_samples_leaf=18,
            min_samples_split=11,
            n_estimators=100,
            subsample=0.7500000000000001,
        )
    ),
    KNeighborsClassifier(n_neighbors=4, p=1, weights="distance"),
)
# set random_state=1 on every step of the exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 1)
# train the pipeline on all rows
exported_pipeline.fit(X, y)

# one new observation with 60 input values
row = [
    0.0200, 0.0371, 0.0428, 0.0207, 0.0954, 0.0986, 0.1539, 0.1601, 0.3109, 0.2111,
    0.1609, 0.1582, 0.2238, 0.0645, 0.0660, 0.2273, 0.3100, 0.2999, 0.5078, 0.4797,
    0.5783, 0.5071, 0.4328, 0.5550, 0.6711, 0.6415, 0.7104, 0.8080, 0.6791, 0.3857,
    0.1307, 0.2604, 0.5121, 0.7547, 0.8537, 0.8507, 0.6692, 0.6097, 0.4943, 0.2744,
    0.0510, 0.2834, 0.2825, 0.4256, 0.2641, 0.1386, 0.1051, 0.1343, 0.0383, 0.0324,
    0.0232, 0.0027, 0.0065, 0.0159, 0.0072, 0.0167, 0.0180, 0.0084, 0.0090, 0.0032,
]
# predict expects a 2-D input, so the single row is wrapped in a list
yhat = exported_pipeline.predict([row])
# NOTE(review): yhat[0] is an encoded class label, yet it is formatted as a float
print('Predicted: %.3f' % (yhat[0],))

Predicted: 1.000


You can read more at https://machinelearningmastery.com/tpot-for-automated-machine-learning-in-python/