# OpenML Cheat Sheet (Python)

In [None]:
# General imports
from openml import datasets, tasks, runs, flows, config, evaluations, study, extensions
import os, pandas, sklearn, arff, pprint, numpy, seaborn

## config

Find your API key (required for uploads):
* `www.openml.org` > Your profile > API Authentication

Main OpenML servers:
* Public: `https://www.openml.org/api/v1` (default)
* Test: `https://test.openml.org/api/v1` 

Set server, API key and cache directory (default: `~/.openml/cache`) from Python through the `config` module, e.g. `config.server = '...'` and `config.apikey = '...'`.

Or, create a config file called `~/.openml/config`
and add these lines:

In [None]:
server=https://www.openml.org/api/v1
apikey=qxlfpbeaudtprb23985hcqlfoebairtd
cachedir=/homedir/.openml/cache

## datasets  
**`list_datasets(offset=None, size=None, tag=None)`**
* `offset` and `size` for paging results
* `tag` to filter datasets (e.g. 'uci')
* `status`: active, in_preparation, deactivated
* `data_name`, `data_version`, `number_instances`,...

In [None]:
# Fetch metadata for up to 100 datasets and preview three of them.
dlist = datasets.list_datasets(size=100)
pandas.DataFrame.from_dict(dlist, orient='index').loc[
    :, ['name', 'NumberOfInstances', 'NumberOfFeatures']
].head(3)

**`get_dataset(dataset_id)`**
* returns **OpenMLDataset** object
* automatically downloads and caches the data itself

In [None]:
# Fetch dataset 1471, then print its name, its default target attribute
# and a short excerpt of its description, one item per line.
odata = datasets.get_dataset(1471)
print(
    odata.name,
    "Target: " + odata.default_target_attribute,
    odata.description[260:308],
    sep='\n',
)

**`OpenMLDataset`**  

**`.features`**: list of features and their properties  
**`.qualities`**: list of all dataset properties  
**`.get_data`**(target=None, dataset_format='dataframe'):  
  returns the data (a pandas DataFrame, or numpy arrays with `dataset_format='array'`), the target values (when `target` is given), which attributes are categorical, and the attribute names  
**`.retrieve_class_labels(target_name='class')`**: returns all class labels for the given target attribute



In [None]:
# Split the dataset into features and target, using the dataset's own
# default target attribute; get_data() hands back four values.
(data, targets, categorical, attribute_names) = odata.get_data(
    target=odata.default_target_attribute,
)

# Preview the first five rows.
data.head(n=5)

**Upload new datasets**
* Create a new OpenML dataset with all relevant information
* `datasets.functions.create_dataset` for uploading pandas dataframes or numpy arrays
* Call **`.publish()`** to upload

In [None]:
# Describe a local ARFF file as a new OpenML dataset and upload it
# (uploads require an API key).
md = datasets.OpenMLDataset(data_file='dataset.arff', name='t',
    description='t', version='1', format='ARFF', licence='CC0',
    visibility='public', default_target_attribute='class')
md.publish()
# Read the ID from the dataset object rather than from publish()'s return
# value: recent openml-python releases return the object itself, which would
# print an object representation instead of the numeric ID.
data_id = md.dataset_id

print("New dataset ID: " + str(data_id))

## tasks  
**`list_tasks(task_type_id=None, offset=None, size=None, ...)`**
* `offset` and `size` for paging results, `tag` to filter tags
* `task_type_id`: 1=Classification, 2=Regression,...
* Dataset properties: `data_tag`, `status`, `data_id`, `data_name`, `number_instances`, `number_features`, `number_classes`, ...

In [None]:
# Fetch up to 100 tasks of type 1 (classification) and preview three of them.
tlist = tasks.list_tasks(task_type_id=1, size=100)
pandas.DataFrame.from_dict(tlist, orient='index').loc[
    :, ['name', 'estimation_procedure']
].head(3)

**`get_task(task_id)`**
* returns **OpenMLTask** object
    *  includes estimation procedure, target name, cost matrix,...
* automatically caches the task description

In [None]:
# Fetch task 14951 and pretty-print its estimation procedure.
task = tasks.get_task(14951)
print(pprint.pformat(task.estimation_procedure))

**`OpenMLTask`**  
**`.get_dataset()`**: downloads associated dataset   
**`.download_split()`**: downloads train/test splits

**Create new tasks**  
...  
`Under development`

## flows  
**`list_flows(offset=None, size=None, tag=None, uploader=None)`**
* returns ID -> flow dict mapping
* `offset` and `size` for paging results, `tag` to filter tags
* `uploader`: list of uploader IDs to filter on, e.g. [1,2,3]

In [None]:
# Fetch up to 200 flows and show the two at positions 100 and 101.
flist = flows.list_flows(size=200)
pandas.DataFrame.from_dict(flist, orient='index').loc[
    :, ['name', 'version', 'external_version']
].iloc[100:102]

**`sklearn_to_flow(sklearn_estimator)`**

* converts a scikit-learn estimator or pipeline to an OpenML Flow

**`publish()`**

* Uploads the flow to the server. Returns ID

In [None]:
# Convert scikit-learn models into OpenML flows.
# `import sklearn` alone does not guarantee that submodules are loaded, so
# import the ones reached through the `sklearn.` namespace explicitly.
import sklearn.pipeline
import sklearn.preprocessing
from sklearn.linear_model import LinearRegression

# NOTE(review): this fits the last column of `data` against the remaining
# columns; presumably it only serves to obtain a fitted model for the demo.
lr = LinearRegression().fit(data.values[:,0:-1], data.values[:,-1])
extension = extensions.get_extension_by_model(lr)
flow = extension.model_to_flow(lr)

# The same extension converts a complete pipeline as well.
pipe = sklearn.pipeline.Pipeline(steps=[
    ('Imputer', sklearn.preprocessing.Normalizer()),
    ('Classifier', sklearn.linear_model.LinearRegression())])
flow2 = extension.model_to_flow(pipe)
# Upload with the flow's own publish() method (requires an API key):
# flow.publish()

## runs  
**`list_runs(offset=None, size=None, tag=None, id=None, task=None, flow=None, uploader=None, display_errors=False)`**
* `offset` and `size` for paging results, `tag` to filter tags
* `id`, `task`, `flow`, `uploader`: list of IDs to filter, e.g. [1,2,3]
* `display_errors`: whether to return failed runs

In [None]:
# Fetch up to 100 runs on task 14951 and show rows 1 through 4.
rl = runs.list_runs(task=[14951], size=100)
pandas.DataFrame.from_dict(rl, orient='index').iloc[1:5]

**`get_run(run_id)`**
* returns **OpenMLRun** object
    *  includes the exact task, exact flow, and all evaluations
* automatically caches the run description

**OpenMLRun**  
**.uploader_name**: full name of the run author  
**.flow_name**: full name of the flow  
**.parameter_settings**: hyperparameters of the flow  
**.evaluations**: key-value pairs of metric and score  
**.fold_evaluations**: dict of per-fold evaluations  

In [None]:
# Collect the flow name and AUC score of up to 50 runs on task 14951.
rlist = runs.list_runs(task=[14951], size=50)
scores = []
# Iterate over the run IDs only. A descriptive loop variable avoids rebinding
# the built-in id() (and the notebook's `_`) for the rest of the session.
for run_id in rlist:
    run = runs.get_run(run_id)
    scores.append({"flow": run.flow_name,
                   "score": run.evaluations['area_under_roc_curve']})
pandas.DataFrame.from_dict(scores)[17:20]

**`run_flow_on_task(flow, task)`**  
**`run_model_on_task(model, task)`**

* Runs a flow or model (e.g. sklearn model) on the task
* Returns an **OpenMLRun** with all information
* Trains and tests the flow on all train/test splits defined by the task

**`publish()`**
* Publishes the run on OpenML


In [None]:
# Train and evaluate a scikit-learn model on task 14951, then upload the run
# (uploads require an API key).
task = tasks.get_task(14951)
clf = sklearn.linear_model.LogisticRegression()
run = runs.run_model_on_task(clf, task)
score = run.get_metric_fn(sklearn.metrics.accuracy_score)
myrun = run.publish()

print(myrun)
# accuracy_score yields a fraction in [0, 1]; the '%' presentation type
# multiplies by 100, so the printed number really is a percentage.
print("Accuracy: {:.2%}".format(score.mean()))

## evaluations  
**`list_evaluations(function=None, offset=None, size=None, tag=None, id=None, task=None, flow=None, uploader=None, display_errors=False)`**
* `function`: evaluation measure, e.g. `area_under_roc_curve`
* `offset` and `size` for paging results, `tag` to filter tags
* `id`, `task`, `flow`, `uploader`: list of IDs to filter, e.g. [1,2,3]
* `per_fold`: if True, returns per-fold evaluations 
* `setup`: list of hyperparameter setup IDs

In [None]:
# Fetch up to 100 AUC evaluations on task 167133 and plot the score
# distribution per flow (flow names truncated to 70 characters).
evals = evaluations.list_evaluations(
    task=[167133],
    function='area_under_roc_curve',
    size=100,
)
scores = [
    {"flow": e.flow_name[0:70], "score": e.value}
    for e in evals.values()
]
seaborn.violinplot(
    x="score",
    y="flow",
    cut=0,
    scale="width",
    data=pandas.DataFrame(scores),
);

## Benchmark suites
* Curated collections of tasks for benchmarking
* Run any model or pipeline on all tasks
* Frictionless evaluation and sharing

In [None]:
# Evaluate one model on the first two tasks of the OpenML100 benchmark suite.
benchmark_suite = study.get_study('OpenML100')
clf = sklearn.linear_model.LogisticRegression()
for task_id in benchmark_suite.tasks[0:2]: # take small subset
    # Bind the task being evaluated so the report names its own dataset,
    # not the dataset of a `task` left over from an earlier cell.
    task = tasks.get_task(task_id)
    run = runs.run_model_on_task(clf, task)
    score = run.get_metric_fn(sklearn.metrics.accuracy_score)
    print('Data set: %s; Accuracy: %0.2f' % (task.get_dataset().name,score.mean()))
    # run.publish()

In [None]:
# Notebook cosmetics: hide the cell prompts and widen preformatted output.
from IPython.display import HTML, display, set_matplotlib_formats
HTML('''<style>.prompt {display:none;}
        .output_subarea pre{width:130%}
        </style>''')

In [None]:
# Point the client at the public OpenML server.
config.server = 'https://www.openml.org/api/v1'