# Auto ML Models - HyperoptEstimator and TPOT

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the diabetes CSV (expected next to the notebook) and preview the first rows.
df = pd.read_csv("diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(768, 9)

In [4]:
# Separate the "Outcome" target from the remaining predictor columns.
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

In [15]:
# example of hyperopt-sklearn for a classification dataset
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from hpsklearn import HyperoptEstimator
from hpsklearn import any_classifier
from hpsklearn import any_preprocessing
from hyperopt import tpe
# define dataset
# NOTE(review): this synthetic dataset replaces the X / y built from diabetes.csv
# in the earlier cell, so the diabetes data is never modelled -- confirm that is intended.
X, y = make_classification(n_samples=50, n_features=8, n_informative=8, n_redundant=0, random_state=1)
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# define search: TPE over any classifier / any preprocessing step, at most 30
# evaluations with a 10 second limit per trial. The seed pins the search's random
# state; without it the reported accuracy and best model change on every run.
model = HyperoptEstimator(
    classifier=any_classifier('cla'),
    preprocessing=any_preprocessing('pre'),
    algo=tpe.suggest,
    max_evals=30,
    trial_timeout=10,
    seed=1,
)
# perform the search
model.fit(X_train, y_train)
# summarize performance
acc = model.score(X_test, y_test)
print("Accuracy: %.3f" % acc)
# summarize the best model
print(model.best_model())

100%|██████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.29s/trial, best loss: 0.5714285714285714]
100%|██████████████████████████████████████████████████| 2/2 [00:03<00:00,  3.33s/trial, best loss: 0.5714285714285714]
100%|██████████████████████████████████████████████████| 3/3 [00:03<00:00,  3.79s/trial, best loss: 0.5714285714285714]
100%|██████████████████████████████████████████████████| 4/4 [00:03<00:00,  3.59s/trial, best loss: 0.1428571428571429]
100%|██████████████████████████████████████████████████| 5/5 [00:04<00:00,  4.29s/trial, best loss: 0.1428571428571429]
100%|██████████████████████████████████████████████████| 6/6 [00:03<00:00,  3.89s/trial, best loss: 0.1428571428571429]
100%|██████████████████████████████████████████████████| 7/7 [00:04<00:00,  4.29s/trial, best loss: 0.1428571428571429]
100%|██████████████████████████████████████████████████| 8/8 [00:05<00:00,  5.81s/trial, best loss: 0.1428571428571429]
100%|███████████████████████████████████

In [12]:
# Read the Titanic training CSV (expected next to the notebook) and preview it.
titanic = pd.read_csv("titanic_train.csv")
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [13]:
# Run this cell once per fresh load of `titanic`: on an already-encoded frame the
# Sex comparison below would turn every value into 0.

# Encode Sex as 1 for 'male' and 0 for every other value (missing values included).
titanic['Sex'] = titanic['Sex'].eq('male').astype('int64')
# Fill missing ages with the column mean.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())
# Fill missing embarkation ports with the most frequent one. The result is assigned
# back instead of calling fillna(inplace=True) on the selected column: that form acts
# on an intermediate object, warns in pandas 2.x and has no effect under copy-on-write.
titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])
# Drop the identifier, cabin, name and ticket columns so they are not used as features.
titanic = titanic.drop(columns=["PassengerId", "Cabin", "Name", "Ticket"])

In [14]:
# One-hot encode the embarkation port; drop_first removes one level so the
# remaining indicator columns are not redundant.
titanic = pd.get_dummies(
    data=titanic,
    columns=["Embarked"],
    drop_first=True,
)

In [15]:
# Target is the Survived flag; every remaining column is a feature.
y = titanic['Survived']
X = titanic.drop(columns=['Survived'])

In [16]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
# NOTE(review): the TPOT cell below splits X, y again with test_size=0.25, so this
# split is overwritten before it is used.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=1,
)

In [17]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer

# Split with a 25% hold-out for the TPOT run. random_state is fixed so that the
# split -- and every score computed from it below -- is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
# Make a custom metric function
def my_custom_accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted labels, same shape as ``y_true``.

    Returns
    -------
    float
        Number of matching positions divided by ``len(y_true)``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    ZeroDivisionError
        If the inputs are empty.
    """
    # Compare by position on plain arrays: this accepts lists and avoids
    # index-alignment errors when both inputs are Series with different indexes.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got %s and %s"
            % (true_arr.shape, pred_arr.shape)
        )
    return float(np.count_nonzero(pred_arr == true_arr)) / len(true_arr)

# Make a custom scorer from the custom metric function
# Note: greater_is_better=False in make_scorer below would mean that the scoring function should be minimized.
my_custom_scorer = make_scorer(my_custom_accuracy, greater_is_better=True)

# Evolve pipelines for 5 generations with a population of 40, scored with the
# custom accuracy. random_state pins the search so that a re-run produces the
# same pipelines and scores.
tpot = TPOTClassifier(generations=5, population_size=40, verbosity=2,
                      scoring=my_custom_scorer, random_state=1)
tpot.fit(X_train, y_train)
# Score the best pipeline found on the held-out test set.
print(tpot.score(X_test, y_test))

HBox(children=(HTML(value='Optimization Progress'), FloatProgress(value=0.0, max=240.0), HTML(value='')))


Generation 1 - Current best internal CV score: 0.8278756592974974

Generation 2 - Current best internal CV score: 0.8338570306362921

Generation 3 - Current best internal CV score: 0.8338570306362921

Generation 4 - Current best internal CV score: 0.8353046796094714

Generation 5 - Current best internal CV score: 0.839838401975087

Best pipeline: GradientBoostingClassifier(PolynomialFeatures(input_matrix, degree=2, include_bias=False, interaction_only=False), learning_rate=0.1, max_depth=3, max_features=1.0, min_samples_leaf=6, min_samples_split=17, n_estimators=100, subsample=0.8500000000000001)
0.8475336322869955


In [18]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Rebuild the pipeline that the TPOT run above reported as best: degree-2 polynomial
# features feeding a gradient-boosting classifier with the reported hyper-parameters.
# subsample < 1 makes the boosting stochastic, so random_state is fixed to keep the
# metrics in the following cells reproducible.
model = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    GradientBoostingClassifier(
        learning_rate=0.1,
        max_depth=3,
        max_features=1.0,
        min_samples_leaf=6,
        min_samples_split=17,
        n_estimators=100,
        subsample=0.8500000000000001,
        random_state=1,
    ),
)

# Fit on the training split and predict labels for the held-out rows.
model.fit(X_train, y_train)
results = model.predict(X_test)

In [19]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [20]:
# Per-class precision / recall / F1 of the rebuilt pipeline on the test split.
report_text = classification_report(y_test, results)
print(report_text)

              precision    recall  f1-score   support

           0       0.86      0.89      0.87       146
           1       0.77      0.71      0.74        77

    accuracy                           0.83       223
   macro avg       0.81      0.80      0.81       223
weighted avg       0.83      0.83      0.83       223



In [23]:
# Confusion matrix on the test split (rows: true class, columns: predicted class).
conf_mat = confusion_matrix(y_test, results)
print(conf_mat)

[[130  16]
 [ 22  55]]


In [22]:
# Overall accuracy of the rebuilt pipeline on the test split.
test_accuracy = accuracy_score(y_test, results)
print(test_accuracy)

0.8295964125560538
