In [85]:
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [86]:
class IsolationForestOutlierRemover:
    def __init__(self, contamination):
        self.contamination = contamination

    def transform(self, X, y):
        iforest = IsolationForest(n_estimators=100, contamination=self.contamination, random_state=0)
        pred = iforest.fit_predict(X)
        return X.iloc[pred == 1], y.iloc[pred == 1]

In [87]:
def load_iris_features_and_target():
        features = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
        target = ['species']
        iris_df = pd.read_csv('iris.csv')
        return iris_df[features], iris_df[target]

In [88]:
features_df, target_df = load_iris_features_and_target()

X_train, X_test, y_train, y_test = train_test_split(features_df, target_df, train_size=0.8, random_state=0, stratify=target_df)

outlier_remover = IsolationForestOutlierRemover(0.05)

X_train, y_train = outlier_remover.transform(X_train, y_train)



## SVM

In [89]:

param_grid = [
  {'C': [0.1, 1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [0.1, 1, 10, 100, 1000], 'gamma': [0.1, 0.001, 0.0001], 'kernel': ['rbf']},
 ]

svm_classifier = SVC(random_state=0)

svm_pipe = Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=0.99)), ('svc', GridSearchCV(svm_classifier, param_grid, cv=5, n_jobs=-1))])

In [90]:
svm_pipe.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [91]:
y_pred = svm_pipe.predict(X_test)
accuracy_score(y_test, y_pred)

1.0

## Gradient Boosting

In [92]:
param_grid = {'n_estimators': [25, 50, 100, 150, 200, 300, 500], 'learning_rate': [0.5,0.2,0.1, 0.01], 'max_depth': [3, 5, 10], 'min_samples_split': [2, 5, 10]}

gb_classifier = GradientBoostingClassifier(random_state=0)

gb_pipe = Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=0.99)), ('gb', GridSearchCV(gb_classifier, param_grid, cv=5, n_jobs=-1))])

In [93]:

gb_pipe.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [94]:
y_pred = gb_pipe.predict(X_test)

accuracy_score(y_pred=y_pred, y_true=y_test)

1.0

## Random Forest

In [95]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {'n_estimators': [25, 50, 100, 150], 'max_depth': [3, 5, 10], 'min_samples_split': [2, 5, 10]}

rf_classifier = RandomForestClassifier(random_state=0)

rf_pipe = Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=0.99)), ('gb', GridSearchCV(rf_classifier, param_grid, cv=5, n_jobs=-1))])

In [96]:
rf_pipe.fit(X_train, y_train)

  self.best_estimator_.fit(X, y, **fit_params)


In [97]:
y_pred = rf_pipe.predict(X_test)

accuracy_score(y_pred=y_pred, y_true=y_test)

0.9666666666666667

## MLP

In [98]:
from sklearn.neural_network import MLPClassifier

param_grid = {'hidden_layer_sizes': [(5,),(10,),(20,)], 'alpha': [1e-05, 1e-03, 1e-02, 1e-01, 0], 'learning_rate': ['constant', 'invscaling', 'adaptive'], 'learning_rate_init': [1e-05, 1e-03, 1e-02, 1e-01]}

mlp_classifier = MLPClassifier(solver='sgd', max_iter=10000000000, random_state=0)

mlp_pipe = Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=0.99)), ('gb', GridSearchCV(mlp_classifier, param_grid, cv=5, n_jobs=-1))])

In [99]:
mlp_pipe.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [100]:
y_pred = mlp_pipe.predict(X_test)

accuracy_score(y_pred=y_pred, y_true=y_test)

1.0

## Runtime measurements for inference

In [101]:
%timeit -r 15 svm_pipe.predict(X_test)

934 µs ± 25.5 µs per loop (mean ± std. dev. of 15 runs, 1,000 loops each)


In [102]:
%timeit -r 15 gb_pipe.predict(X_test)

1.45 ms ± 147 µs per loop (mean ± std. dev. of 15 runs, 1,000 loops each)


In [103]:
%timeit -r 15 rf_pipe.predict(X_test)

3.2 ms ± 105 µs per loop (mean ± std. dev. of 15 runs, 100 loops each)


In [104]:
%timeit -r 15 mlp_pipe.predict(X_test)

919 µs ± 25.5 µs per loop (mean ± std. dev. of 15 runs, 1,000 loops each)
