In [None]:
%pip install auto-sklearn

Collecting auto-sklearn
  Downloading auto-sklearn-0.14.6.tar.gz (6.4 MB)
[K     |████████████████████████████████| 6.4 MB 4.4 MB/s 
Collecting distro
  Downloading distro-1.7.0-py3-none-any.whl (20 kB)
Collecting scipy>=1.7.0
  Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)
[K     |████████████████████████████████| 38.1 MB 1.3 MB/s 
Collecting scikit-learn<0.25.0,>=0.24.0
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 56.7 MB/s 
[?25hCollecting dask>=2021.12
  Downloading dask-2022.2.0-py3-none-any.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 40.5 MB/s 
[?25hCollecting distributed>=2012.12
  Downloading distributed-2022.2.0-py3-none-any.whl (837 kB)
[K     |████████████████████████████████| 837 kB 48.8 MB/s 
Collecting liac-arff
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
Collecting ConfigSpace<0.5,>=0.4.14
  Downloading ConfigSpace-0

In [None]:
from pprint import pprint

import autosklearn.classification
import autosklearn.metrics
import pandas as pd
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection

In [None]:
def dataSetup(name):
  """Fetch an OpenML dataset, integer-encode it and split it into train/test.

  Args:
    name: OpenML dataset identifier, forwarded to ``fetch_openml`` as
      ``data_id``.

  Returns:
    Tuple ``(X_train, X_test, y_train, y_test)`` as produced by
    ``train_test_split`` with ``random_state=42``. Every column, the target
    included, holds the integer codes produced by ``pd.factorize``; the last
    column of the combined frame is used as the target.
  """
  X, y = sklearn.datasets.fetch_openml(data_id=name, as_frame=True, return_X_y=True)
  # Join features and target so that incomplete rows are removed from both.
  train = pd.concat([X, y], axis=1, join='inner')
  # dropna() returns a new frame; keep the result, otherwise rows with missing
  # values survive and factorize() silently encodes each NaN as -1.
  train = train.dropna()
  # Replace every column's values by integer codes (order of first appearance).
  train = train.apply(lambda x: pd.factorize(x)[0])
  X, y = train.iloc[:, :-1], train.iloc[:, -1]
  X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, random_state=42)
  return (X_train, X_test, y_train, y_test)

In [None]:
# Build the encoded train/test split for OpenML dataset 40975 and preview
# the first five training rows.
X_train, X_test, y_train, y_test = dataSetup(name=40975)
X_train.head(5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
1036,2,1,2,1,0,1
757,1,3,0,0,0,1
589,1,1,1,2,1,1
907,2,0,1,1,2,1
1159,2,2,2,2,2,1


In [None]:
# Run the auto-sklearn search with a fixed seed, scoring candidate pipelines
# by cross-validation on the training split.
estimator_askl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=3600,
    seed=42,
    resampling_strategy='cv',
)
estimator_askl.fit(X_train, y_train, dataset_name='car')
# With resampling_strategy='cv' the models are fitted per fold during the
# search; the auto-sklearn documentation calls for refit() so the final
# ensemble is trained on the whole training set before predicting.
estimator_askl.refit(X_train.copy(), y_train.copy())
prediction = estimator_askl.predict(X_test)

In [None]:
# Fraction of test rows whose predicted label equals the true label.
n_correct = (prediction == y_test).sum()
n_correct / len(prediction)

0.9791666666666666

In [None]:
# Text summary of the search (metric, best validation score, run counts).
search_summary = estimator_askl.sprint_statistics()
print(search_summary)

auto-sklearn results:
  Dataset name: car
  Metric: accuracy
  Best validation score: 0.998457
  Number of target algorithm runs: 220
  Number of successful target algorithm runs: 218
  Number of crashed target algorithm runs: 1
  Number of target algorithms that exceeded the time limit: 1
  Number of target algorithms that exceeded the memory limit: 0



In [None]:
# Pretty-print the members of the final ensemble. pprint is imported in the
# notebook's import cell rather than mid-notebook, so all dependencies are
# declared in one place.
pprint(estimator_askl.show_models(), indent=4)

{   43: {   'cost': 0.0023148148148148103,
            'ensemble_weight': 0.04,
            'estimators': [   {   'balancing': Balancing(random_state=42, strategy='weighting'),
                                  'classifier': <autosklearn.pipeline.components.classification.ClassifierChoice object at 0x7f72e2762590>,
                                  'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice object at 0x7f72e51d9050>,
                                  'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice object at 0x7f72e275bdd0>,
                                  'sklearn_classifier': HistGradientBoostingClassifier(early_stopping=True,
                               l2_regularization=2.268842963809998e-09,
                               learning_rate=0.09814042923887036, max_iter=512,
                               max_leaf_nodes=23, min_samples_leaf=18,
                             

In [None]:
# Dump every evaluated configuration: the run key followed by its recorded value.
for run_key, run_value in estimator_askl.automl_.runhistory_.data.items():
    print('#########')
    print(run_key)
    print(run_value)

#########
RunKey(config_id=1, instance_id='{"task_id": "car"}', seed=0, budget=0.0)
RunValue(cost=0.018518518518518476, time=10.526482582092285, status=<StatusType.SUCCESS: 1>, starttime=1651513462.1403139, endtime=1651513472.687515, additional_info={'duration': 10.192305326461792, 'num_run': 2, 'train_loss': 0.0, 'configuration_origin': 'Initial design'})
#########
RunKey(config_id=2, instance_id='{"task_id": "car"}', seed=0, budget=0.0)
RunValue(cost=0.29320987654320985, time=11.30854082107544, status=<StatusType.SUCCESS: 1>, starttime=1651513472.7363832, endtime=1651513484.061751, additional_info={'duration': 11.12779450416565, 'num_run': 3, 'train_loss': 0.2932098765432099, 'configuration_origin': 'Initial design'})
#########
RunKey(config_id=3, instance_id='{"task_id": "car"}', seed=0, budget=0.0)
RunValue(cost=0.0810185185185185, time=10.140499591827393, status=<StatusType.SUCCESS: 1>, starttime=1651513484.10659, endtime=1651513494.2657015, additional_info={'duration': 9.91186642

In [None]:
# Accuracy on the held-out split. sklearn.metrics is imported explicitly in
# the import cell rather than relying on it being loaded as a side effect of
# importing other sklearn subpackages.
test_accuracy = sklearn.metrics.accuracy_score(y_test, prediction)
print("Accuracy score", test_accuracy)

Accuracy score 0.9791666666666666
