In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to module files are picked up live.
%load_ext autoreload
%autoreload 2

## Let's attack our house-prices example

In [2]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Load the raw CSVs. Only `train` is used below; `test` is read but not
# referenced again in this cell.
train = pd.read_csv('../data/train.csv', sep=",")
test = pd.read_csv('../data/test.csv')

# Separate the target from the features; the model is trained on log1p(SalePrice).
y = np.log1p(train['SalePrice'])
X = train.drop('SalePrice', axis=1)

# Object-dtype columns are treated as categorical, everything else as numerical.
categorical = [name for name, dtype in X.dtypes.items() if dtype == 'O']
numerical = [name for name, dtype in X.dtypes.items() if dtype != 'O']

# Turn missing categorical values into an explicit 'None' level.
X[categorical] = X[categorical].fillna('None')

# One-hot encode the categorical columns before handing the frame to
# auto-sklearn (sklearn's OneHotEncoder with handle_unknown='ignore' would be
# an alternative encoder).
# NOTE(review): the categoricals no longer contain NaN after the fillna above,
# so dummy_na=True presumably only adds all-zero "_nan" columns -- confirm
# before dropping it, since that changes the column count.
encoded = pd.get_dummies(X[categorical], dummy_na=True)
X = pd.concat([encoded, X[numerical]], axis=1)

# Cast any boolean indicator columns to int so every feature is numeric.
to_convert = {name: int for name, dtype in X.dtypes.items() if dtype == 'bool'}
X = X.astype(to_convert)

# Hold out 20% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [3]:
# Display the dtype of every column in the encoded feature frame.
X.dtypes

MSZoning_C (all)    int64
MSZoning_FV         int64
MSZoning_RH         int64
MSZoning_RL         int64
MSZoning_RM         int64
                    ...  
ScreenPorch         int64
PoolArea            int64
MiscVal             int64
MoSold              int64
YrSold              int64
Length: 347, dtype: object

# Auto-Sklearn

See the [auto-sklearn installation guide](https://automl.github.io/auto-sklearn/master/installation.html) for setup instructions.

In [4]:
import sklearn.metrics
import autosklearn.regression
import pandas as pd
import numpy as np
import warnings
# Install the "default" warning action for all warnings, so each warning is
# printed the first time it is raised from a given location.
warnings.filterwarnings("default")

[Parameters](https://automl.github.io/auto-sklearn/master/api.html#regression)

In [5]:
# Show the installed auto-sklearn version (constructor arguments differ between releases).
autosklearn.__version__

'0.15.0'

In [6]:
! rm -rf /tmp/autosklearn_*
automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=300,
    #time_left_for_this_task=60,
    per_run_time_limit=30,
    memory_limit = 4096,
    ensemble_size = 8, 
    ensemble_nbest=4,
    max_models_on_disc = 16,
    n_jobs = 4,
    include = {'regressor': ['gradient_boosting', 'ard_regression', 'sgd', 'random_forest'],
    'feature_preprocessor': ["no_preprocessing"]
              },
    resampling_strategy = 'cv',
    # include_preprocessors=["no_preprocessing"],
    tmp_folder='/tmp/autosklearn_regression_example_tmp',
    # output_folder='/tmp/autosklearn_regression_example_out',
    delete_tmp_folder_after_terminate = True,
    # delete_output_folder_after_terminate = False
)

  automl = autosklearn.regression.AutoSklearnRegressor(


In [7]:
# Run the AutoML search on the training split only; the held-out split is kept
# for the final evaluation. `dataset_name` presumably just labels the run.
automl.fit(X_train, y_train, dataset_name='house-prices')





AutoSklearnRegressor(ensemble_class=<class 'autosklearn.ensembles.ensemble_selection.EnsembleSelection'>,
                     ensemble_kwargs={'ensemble_size': 8}, ensemble_nbest=4,
                     ensemble_size=8,
                     include={'feature_preprocessor': ['no_preprocessing'],
                              'regressor': ['gradient_boosting',
                                            'ard_regression', 'sgd',
                                            'random_forest']},
                     max_models_on_disc=16, memory_limit=4096, n_jobs=2,
                     per_run_time_limit=30, resampling_strategy='cv',
                     time_left_for_this_task=600,
                     tmp_folder='/tmp/autosklearn_regression_example_tmp')

In [8]:
# Print the description of the models that make up the final ensemble.
ensemble_models = automl.show_models()
print(ensemble_models)

# Display the auto-sklearn version that produced the ensemble above.
autosklearn.__version__

{4: {'model_id': 4, 'rank': 1, 'cost': 0.11099382061766988, 'ensemble_weight': 0.5, 'voting_model': VotingRegressor(estimators=None), 'estimators': [{'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0xffff42d59fc0>, 'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0xffff3cddd210>, 'regressor': <autosklearn.pipeline.components.regression.RegressorChoice object at 0xffff3cddd630>, 'sklearn_regressor': HistGradientBoostingRegressor(l2_regularization=0.005746611563553693,
                              learning_rate=0.0913971028976721, max_iter=512,
                              max_leaf_nodes=9, min_samples_leaf=2,
                              n_iter_no_change=20, random_state=1,
                              validation_fraction=None, warm_start=True)}, {'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0xffff3d6

'0.15.0'

In [9]:
# Score the fitted ensemble on the held-out split. The target was
# log1p-transformed, so both metrics are computed on the log scale.
predictions = automl.predict(X_test)

# Take the square root of the MSE explicitly: this is the RMSE that the
# original `squared=False` call returned, without relying on a keyword that
# newer scikit-learn releases deprecate.
rmse = np.sqrt(sklearn.metrics.mean_squared_error(y_test, predictions))

print("R2 score:", sklearn.metrics.r2_score(y_test, predictions))
print("root-mean-squared-error:", rmse)

R2 score: 0.9081811636338023
mean-squared-error: 0.13089844852170957
