In [1]:
import pandas as pd
from datacleaner import autoclean
from concurrent.futures import ProcessPoolExecutor

from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.model_selection import train_test_split

from tpot import TPOTClassifier, TPOTRegressor
import openml as oml

## Selected datasets

- [anneal](https://www.openml.org/d/2)
- [vehicle](https://www.openml.org/d/54)
- [kdd_el_nino-small](https://www.openml.org/d/839)
- [vowel](https://www.openml.org/d/1016)
- [xd6](https://www.openml.org/d/40693)
- [tokyo1](https://www.openml.org/d/40705)
- [tic-tac-toe](https://www.openml.org/d/50)
- [stock](https://www.openml.org/d/841)
- [oil_spill](https://www.openml.org/d/311)

These datasets were chosen because they are small in number of records and features allowing a fast execution of the AutoML algorithm.

In [108]:
datasets_ids = [2, 54, 839, 1016, 40693, 40705, 50, 841, 311]

best_results = {}
for ds_id in datasets_ids:
    tasks = oml.tasks.list_tasks(task_type_id=1, data_id=ds_id)
    evals = oml.evaluations.list_evaluations("f_measure", task=list(tasks.keys()))
    values = list(evals.values())
    evals = [v for v in values if getattr(v, 'value', None)]
    evals.sort(key=lambda x: x.value, reverse=True)
    e = evals[0]
    print(ds_id, e.flow_name, e.function, e.value)
    best_results[ds_id] = (e.flow_name, e.function, e.value)

2 weka.LogitBoost_DecisionStump(3) f_measure 0.997506
54 sklearn.pipeline.Pipeline(imputation=hyperimp.utils.preprocessing.ConditionalImputer2,hotencoding=sklearn.preprocessing.data.OneHotEncoder,scaling=sklearn.preprocessing.data.StandardScaler,variencethreshold=sklearn.feature_selection.variance_threshold.VarianceThreshold,clf=sklearn.svm.classes.SVC)(1) f_measure 0.869092164757794
839 weka.RotationForest_PrincipalComponents_J48(14) f_measure 0.962931
1016 classif.IBk(5) f_measure 1.0
40693 weka.kf.RandomForest(1) f_measure 1.0
40705 weka.kf.RandomForest(1) f_measure 0.934157
50 weka.SMO_PolyKernel(1) f_measure 1.0
841 weka.Decorate(1) f_measure 0.973682
311 classif.lda(11) f_measure 0.966721


In [120]:
df = pd.DataFrame(best_results).T
df.columns = ['flow_name', 'measure', 'value']
names = pd.Series(
    data=['anneal','vehicle','kdd_el_nino-small','vowel','xd6','tokyo1','tic-tac-toe','stock','oil_spill'],
    index=[2, 54, 839, 1016, 40693, 40705, 50, 841, 311]
)
df['dataset'] = names
df

Unnamed: 0,flow_name,measure,value,dataset
2,weka.LogitBoost_DecisionStump(3),f_measure,0.997506,anneal
50,weka.SMO_PolyKernel(1),f_measure,1.0,tic-tac-toe
54,sklearn.pipeline.Pipeline(imputation=hyperimp....,f_measure,0.869092,vehicle
311,classif.lda(11),f_measure,0.966721,oil_spill
839,weka.RotationForest_PrincipalComponents_J48(14),f_measure,0.962931,kdd_el_nino-small
841,weka.Decorate(1),f_measure,0.973682,stock
1016,classif.IBk(5),f_measure,1.0,vowel
40693,weka.kf.RandomForest(1),f_measure,1.0,xd6
40705,weka.kf.RandomForest(1),f_measure,0.934157,tokyo1


In [121]:
df.to_csv('data/openml_best_results.csv')

## Checkpoint

In [2]:
df = pd.read_csv('data/openml_best_results.csv', index_col=0)
df

Unnamed: 0,flow_name,measure,value,dataset
2,weka.LogitBoost_DecisionStump(3),f_measure,0.997506,anneal
50,weka.SMO_PolyKernel(1),f_measure,1.0,tic-tac-toe
54,sklearn.pipeline.Pipeline(imputation=hyperimp....,f_measure,0.869092,vehicle
311,classif.lda(11),f_measure,0.966721,oil_spill
839,weka.RotationForest_PrincipalComponents_J48(14),f_measure,0.962931,kdd_el_nino-small
841,weka.Decorate(1),f_measure,0.973682,stock
1016,classif.IBk(5),f_measure,1.0,vowel
40693,weka.kf.RandomForest(1),f_measure,1.0,xd6
40705,weka.kf.RandomForest(1),f_measure,0.934157,tokyo1


In [3]:
datasets_ids = [2, 54, 839, 1016, 40693, 40705, 50, 841, 311]

datasets = oml.datasets.get_datasets(datasets_ids)

In [6]:
from marvin_python_toolbox.common.data import MarvinData

In [7]:
MarvinData.data_path

'/home/rafael/marvin/data'

In [8]:
ds = datasets[0]

X, y, attribute_names = ds.get_data(
        target=ds.default_target_attribute,
        return_attribute_names=True,
    )
df = pd.DataFrame(X, columns=attribute_names)
df['class'] = y

In [11]:
path = '%s/%s.csv' % (MarvinData.data_path, ds.name)
df.to_csv(path, sep=';', encoding='utf-8')

In [14]:
print(ds.description)

**Author**: Unknown. Donated by David Sterling and Wray Buntine  
**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Annealing) - 1990  
**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html)  

The original Annealing dataset from UCI. The exact meaning of the features and classes is largely unknown. Annealing, in metallurgy and materials science, is a heat treatment that alters the physical and sometimes chemical properties of a material to increase its ductility and reduce its hardness, making it more workable. It involves heating a material to above its recrystallization temperature, maintaining a suitable temperature, and then cooling. (Wikipedia)

### Attribute Information:
     1. family:          --,GB,GK,GS,TN,ZA,ZF,ZH,ZM,ZS
     2. product-type:    C, H, G
     3. steel:           -,R,A,U,K,M,S,W,V
     4. carbon:          continuous
     5. hardness:        continuous
     6. temper_rolling:  -,T
     7. condition:       -,S,A,X
     8. formability

In [17]:
df.head(1).values

array([[          nan,    0.        ,    1.        ,    8.        ,
           0.        ,           nan,    0.        ,           nan,
           0.        ,           nan,           nan,    3.        ,
                  nan,           nan,           nan,           nan,
                  nan,           nan,           nan,           nan,
                  nan,           nan,           nan,           nan,
                  nan,           nan,           nan,           nan,
                  nan,           nan,           nan,    0.        ,
           0.69999999,  610.        ,    0.        ,           nan,
           0.        ,           nan,    2.        ]])

In [6]:
def automl(dataset, results, metric='f1_score'):
    print('Start: %s' % dataset.name)
    X, y, attribute_names = dataset.get_data(
        target=dataset.default_target_attribute,
        return_attribute_names=True,
    )
    df = pd.DataFrame(X, columns=attribute_names)
    df['class'] = y
    df = autoclean(df)

    X = df.drop('class', axis=1)
    y = df['class']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    try:
        model = TPOTClassifier(
            generations=5,
            population_size=200,
            verbosity=0,
            scoring='f1_weighted',
            n_jobs=2
        )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    except Exception as e:
        print(dataset.name, e)
        return
#         print('Regression')
#         model = TPOTRegressor(
#             generations=5,
#             population_size=200, 
#             verbosity=1,
#             scoring='f1_weighted',
#             n_jobs=2
#         )
#         model.fit(X_train, y_train)
#         y_pred = model.predict(X_test)
    
    metric_func = accuracy_score if metric == 'acc' else f1_score 
    try:
        result = metric_func(y_test, y_pred)
    except:
        result = metric_func(y_test, y_pred, average='micro')
    
    results[dataset.dataset_id] = result
    print(f'dataset: {dataset.name} - automl result: {result}')

In [7]:
results = {}
with ProcessPoolExecutor(max_workers=3) as pool:
    for d in datasets:
        pool.submit(automl, d, results)

Start: kdd_el_nino-small
Start: anneal
Start: vehicle
dataset: kdd_el_nino-small - automl result: 0.9803921568627451
Start: vowel
dataset: vowel - automl result: 1.0
Start: xd6
dataset: anneal - automl result: 0.9500000000000001
Start: tokyo1
dataset: xd6 - automl result: 1.0
Start: tic-tac-toe
dataset: vehicle - automl result: 0.8294117647058825
Start: stock
tic-tac-toe feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8'] ['top-left-square', 'top-middle-square', 'top-right-square', 'middle-left-square', 'middle-middle-square', 'middle-right-square', 'bottom-left-square', 'bottom-middle-square', 'bottom-right-square']
expected f7, f6, f1, f2, f4, f3, f8, f0, f5 in input data
training data did not have the following fields: top-left-square, bottom-left-square, middle-left-square, top-middle-square, top-right-square, middle-middle-square, middle-right-square, bottom-middle-square, bottom-right-square
Start: oil_spill
dataset: stock - automl result: 0.96296296296

In [14]:
import pickle

with open('data/automl_results.pkl', 'wb') as f:
    pickle.dump(results, f)

In [15]:
import pickle

with open('data/automl_results.pkl', 'rb') as f:
    results = pickle.load(f)

results

{839: 0.9803921568627451,
 1016: 1.0,
 2: 0.9500000000000001,
 40693: 1.0,
 54: 0.8294117647058825,
 50: nan,
 841: 0.9629629629629629,
 40705: 0.9365079365079365,
 311: 0.7142857142857143}

In [18]:
res = pd.Series(results, index=results.keys())
res

839      0.980392
1016     1.000000
2        0.950000
40693    1.000000
54       0.829412
50            NaN
841      0.962963
40705    0.936508
311      0.714286
dtype: float64

In [20]:
df['automl_value'] = res
df

Unnamed: 0,flow_name,measure,value,dataset,automl_value
2,weka.LogitBoost_DecisionStump(3),f_measure,0.997506,anneal,0.95
50,weka.SMO_PolyKernel(1),f_measure,1.0,tic-tac-toe,
54,sklearn.pipeline.Pipeline(imputation=hyperimp....,f_measure,0.869092,vehicle,0.829412
311,classif.lda(11),f_measure,0.966721,oil_spill,0.714286
839,weka.RotationForest_PrincipalComponents_J48(14),f_measure,0.962931,kdd_el_nino-small,0.980392
841,weka.Decorate(1),f_measure,0.973682,stock,0.962963
1016,classif.IBk(5),f_measure,1.0,vowel,1.0
40693,weka.kf.RandomForest(1),f_measure,1.0,xd6,1.0
40705,weka.kf.RandomForest(1),f_measure,0.934157,tokyo1,0.936508


In [21]:
df['delta'] = df['automl_value'] - df['value']
df

Unnamed: 0,flow_name,measure,value,dataset,automl_value,delta
2,weka.LogitBoost_DecisionStump(3),f_measure,0.997506,anneal,0.95,-0.047506
50,weka.SMO_PolyKernel(1),f_measure,1.0,tic-tac-toe,,
54,sklearn.pipeline.Pipeline(imputation=hyperimp....,f_measure,0.869092,vehicle,0.829412,-0.03968
311,classif.lda(11),f_measure,0.966721,oil_spill,0.714286,-0.252435
839,weka.RotationForest_PrincipalComponents_J48(14),f_measure,0.962931,kdd_el_nino-small,0.980392,0.017461
841,weka.Decorate(1),f_measure,0.973682,stock,0.962963,-0.010719
1016,classif.IBk(5),f_measure,1.0,vowel,1.0,0.0
40693,weka.kf.RandomForest(1),f_measure,1.0,xd6,1.0,0.0
40705,weka.kf.RandomForest(1),f_measure,0.934157,tokyo1,0.936508,0.002351


In [30]:
df['flow_name'][54] = 'sklearn.pipeline.Pipeline(..., clf=sklearn.svm.classes.SVC)(1)'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [32]:
from tabulate import tabulate

print(tabulate(df, tablefmt="pipe", headers='keys'))

|       | flow_name                                                      | measure   |    value | dataset           |   automl_value |        delta |
|------:|:---------------------------------------------------------------|:----------|---------:|:------------------|---------------:|-------------:|
|     2 | weka.LogitBoost_DecisionStump(3)                               | f_measure | 0.997506 | anneal            |       0.95     |  -0.047506   |
|    50 | weka.SMO_PolyKernel(1)                                         | f_measure | 1        | tic-tac-toe       |     nan        | nan          |
|    54 | sklearn.pipeline.Pipeline(..., clf=sklearn.svm.classes.SVC)(1) | f_measure | 0.869092 | vehicle           |       0.829412 |  -0.0396804  |
|   311 | classif.lda(11)                                                | f_measure | 0.966721 | oil_spill         |       0.714286 |  -0.252435   |
|   839 | weka.RotationForest_PrincipalComponents_J48(14)                | f_measure | 0.962931 | kd