In [1]:
import openml
from openml.tasks import TaskType
from statistics import mean 

In [24]:
# Task-selection thresholds, interpolated into the task filter expression.
# The three MAX_* values are used there as strict upper bounds (`<`).
MAX_INSTANCES = 5_000
MAX_FEATURES = 50
MAX_CLASSES = 15
# These two are matched exactly (`==`) by the filter expression.
SIMBOLIC_FEATURES = 0  # compared against NumberOfSymbolicFeatures
MISSING_VALUES = 0  # compared against NumberOfInstancesWithMissingValues

In [3]:
def get_valid_tasks(filter, task_type = TaskType.SUPERVISED_CLASSIFICATION):
    """Return the ids of the listed tasks that satisfy ``filter``.

    Parameters
    ----------
    filter
        Expression handed unchanged to the ``query`` method of the task
        listing, so it must refer to columns of that listing.
    task_type
        Task type forwarded to ``openml.tasks.list_tasks``; defaults to
        ``TaskType.SUPERVISED_CLASSIFICATION``.

    Returns
    -------
    list
        Values of the ``tid`` column of the rows that match ``filter``.
    """
    task_listing = openml.tasks.list_tasks(
        task_type=task_type,
        output_format="dataframe",
    )
    matching_rows = task_listing.query(filter)
    return [tid for tid in matching_rows.tid]

In [5]:
def evaluate_tasks(task_ids):
    """Evaluate every task in ``task_ids`` with ``evaluate_roc_auc``.

    Parameters
    ----------
    task_ids
        Iterable of task ids; each one is passed to ``evaluate_roc_auc``.

    Returns
    -------
    list of dict
        One ``{"task_id": ..., "auc": ...}`` record per task, in input
        order, where ``"auc"`` holds whatever ``evaluate_roc_auc`` returned.
    """
    return [
        {"task_id": task_id, "auc": evaluate_roc_auc(task_id)}
        for task_id in task_ids
    ]

In [18]:
def evaluate_roc_auc(task_id):
    """Return the mean area under the ROC curve over the runs of a task.

    The runs of ``task_id`` are listed with ``openml.runs.list_runs``, each
    one is fetched with ``openml.runs.get_run`` and the
    ``"area_under_roc_curve"`` entry of its ``evaluations`` is collected.

    Runs that have no evaluations, or whose evaluations lack an AUC value,
    are skipped so that a single incomplete run does not abort the whole
    computation with a ``KeyError``/``TypeError``.

    Parameters
    ----------
    task_id
        Id of the task whose runs are evaluated.

    Returns
    -------
    number or None
        Mean of the collected AUC values, or ``None`` when no run provides
        one. In that case the task id is printed so the task can be spotted.
    """
    metric = "area_under_roc_curve"
    auc_values = []
    # Iterating the listing is expected to yield run ids
    # (assumes the listing is keyed by run id — TODO confirm for the
    # installed openml version).
    for run_id in list(openml.runs.list_runs(task=[task_id])):
        run = openml.runs.get_run(run_id)
        # `evaluations` may be missing/empty for runs that were never scored.
        evaluations = run.evaluations or {}
        auc = evaluations.get(metric)
        if auc is not None:
            auc_values.append(auc)
    if not auc_values:
        # Nothing to average: report the task and signal "no result".
        print(task_id)
        return None
    return mean(auc_values)

In [25]:
# Query expression applied to the task listing: strict upper bounds on size,
# exact matches on symbolic-feature and missing-value counts.
filter = (
    f"NumberOfInstances<{MAX_INSTANCES}"
    f" and NumberOfFeatures<{MAX_FEATURES}"
    f" and NumberOfClasses<{MAX_CLASSES}"
    f" and NumberOfSymbolicFeatures=={SIMBOLIC_FEATURES}"
    f" and NumberOfInstancesWithMissingValues=={MISSING_VALUES}"
)


In [27]:
# Ids of the tasks that satisfy the filter expression; shown as the cell output.
valid_tasks_ids = get_valid_tasks(filter)
valid_tasks_ids

[8, 238, 1772, 1888, 1949, 211702]

In [21]:
# One {"task_id", "auc"} record per selected task. Tasks for which no AUC value
# could be collected have their id printed and are reported with auc=None.
evaluate_tasks(valid_tasks_ids)

238
1772
1888
1949
211702


[{'task_id': 8, 'auc': 0.6299079433962265},
 {'task_id': 238, 'auc': None},
 {'task_id': 1772, 'auc': None},
 {'task_id': 1888, 'auc': None},
 {'task_id': 1949, 'auc': None},
 {'task_id': 211702, 'auc': None}]