In [1]:
%matplotlib inline
import openml as oml
import seaborn as sns
import numpy as np
import pandas as pd
import sys
import math
from scipy.stats import norm
from matplotlib import pyplot
import sklearn.tree
import sklearn.preprocessing
from sklearn.externals.joblib import Parallel, delayed
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Step 1: Get datasets from OpenML
- Only classification datasets
- Only active (verified) datasets

In [2]:
# Get all OpenML datasets
openml_list = oml.datasets.list_datasets() # Returns a dict
datalist = pd.DataFrame.from_dict(openml_list, orient='index') # Transform to pandas
datalist = datalist[datalist.status == 'active'] # Only use active (verified) datasets
datalist = datalist[datalist.NumberOfClasses>=2] # Only classification
print("{} active classification datasets".format(len(datalist)))

1025 active classification datasets


In [3]:
# Bookkeeping
data_names = {k: v for (k, v) in datalist[['did','name']].values} # dataset names
data_status = {k: 'OK' for k in datalist.index} # dataset status (OK or reason for removal)
datalist_full = datalist.copy()

## Step 1: Apply simple preconditions
- Number of observations larger than 500 (meaningful evaluations)
- Number of observations smaller than 100000 (keep runtime manageable)
- Number of features does not exceed 5000 (keep runtime manageable)
- The ratio of the minority class and the majority class is > 0.05 (severely imbalanced datasets complicate analysis)
- Number of values for categorical features must not exceed 100 (severely slows down some algorithms)
- Sparsely formatted data (requires special data readers)

In [4]:
# Apply preconditions
data_status.update({k: 'Too small' for k in datalist.index[datalist.NumberOfInstances<500]})
data_status.update({k: 'Too large' for k in datalist.index[datalist.NumberOfInstances>100000]})
data_status.update({k: 'High-dimensional' for k in datalist.index[datalist.NumberOfFeatures>5000]})
data_status.update({k: 'Extreme imbalance' for k in datalist.index[datalist.MinorityClassSize / datalist.MajorityClassSize < 0.05]})
data_status.update({k: 'Too many categories' for k in datalist.index[datalist.MaxNominalAttDistinctValues > 100]})
data_status.update({k: 'Sparse format' for k in datalist.index[datalist.format == 'Sparse_ARFF']})


# Filter dataset list
datalist = datalist[pd.Series({k:(v=='OK') for k,v in data_status.items()})] 

# Status update
[[x,list(data_status.values()).count(x)] for x in set(data_status.values())]

[['Sparse format', 32],
 ['Too large', 102],
 ['Too small', 332],
 ['OK', 302],
 ['Too many categories', 22],
 ['Extreme imbalance', 175],
 ['High-dimensional', 60]]

## Step 2: Filter out special datasets
- Artificial datasets (may bias the results)
- Time series dataset (cannot use random sampling for evaluation)
- Text data (contains string features which need additional preprocessing)
- Multilabel data (multiple targets need to be predicted)
- Derived versions of datasets (with additional preprocessing)
- Datasets where the intended classification target is unclear
- Binarized regression problems
- Unknown origin (no description how data was collected and what the problem is)
- Grouped data (instances form groups (blocks) and can't be randomly sampled)

In [5]:
# Get lists of special datasets
artificial_set = set(oml.datasets.list_datasets(tag="artificial").keys()) # Artificial datasets
timeseries_set = set(oml.datasets.list_datasets(tag="time_series").keys()) # Time series datasets
text_set = set(oml.datasets.list_datasets(tag="text_data").keys()) # Text dataset (contains string features)
multilabel_set = set(oml.datasets.list_datasets(tag="multi_label").keys()) # Multi-label datasets
derived_set = set(oml.datasets.list_datasets(tag="derived").keys()) # Derived datasets
unspecified_set = set(oml.datasets.list_datasets(tag="unspecified_target_feature").keys()) # Unspecified target
binarized_set = set(oml.datasets.list_datasets(tag="binarized_regression_problem").keys()) # Binarized data
unknown_set = set(oml.datasets.list_datasets(tag="origin_unknown").keys())
grouped_set = set(oml.datasets.list_datasets(tag="grouped_data").keys())

data_status.update({k: 'Artificial data' for k in artificial_set})
data_status.update({k: 'Time series data' for k in timeseries_set})
data_status.update({k: 'Text data' for k in text_set})
data_status.update({k: 'Multi-label data' for k in multilabel_set})
data_status.update({k: 'Derived (non-original) data' for k in derived_set})
data_status.update({k: 'Unspecified target feature' for k in unspecified_set})
data_status.update({k: 'Binarized regression problem' for k in binarized_set})
data_status.update({k: 'Unknown origin' for k in unknown_set})
data_status.update({k: 'Grouped data' for k in grouped_set})
#data_status.update({k: 'OpenML100' for k in openml100_set})

# Filter dataset list
datalist = datalist[pd.Series({k:(v=='OK') for k,v in data_status.items()})] 

# Status update
[[x,list(data_status.values()).count(x)] for x in set(data_status.values())]

[['Binarized regression problem', 84],
 ['Artificial data', 196],
 ['Time series data', 4],
 ['Grouped data', 1],
 ['Unknown origin', 7],
 ['Sparse format', 32],
 ['Text data', 2],
 ['Too large', 22],
 ['Too small', 331],
 ['OK', 142],
 ['Derived (non-original) data', 33],
 ['Multi-label data', 6],
 ['Too many categories', 12],
 ['Unspecified target feature', 6],
 ['Extreme imbalance', 110],
 ['High-dimensional', 60]]

## Step 3: Remove alternative versions of datasets
- Remove binarized versions of multi-class datasets
- Check other possible duplicates

In [6]:
# Sorting makes things easier
# We need the full list because there may be binarized versions of already removed datasets
datalist_full = datalist_full.sort_values(by=['name','NumberOfClasses'], ascending=[True, False])

checked_datasets = {
    40979: [1022, 20], # Correct version of mfeat-pixel
    40984: [958, 36], # Correct version of segment
    40994: [40990, 40989, 1467], # Correct version of climate-model-simulation-crashes
    1590: [179], # Correct version of adult
    40983: [1570], # Correct version of wilt
    40945: [40704], # Correct version of Titanic
    772: [948], # Correct version of classification version of the quake dataset
}
duplicates_of = {}
# Mark the duplicates of datasets where we know which version is the correct one!
for cd in checked_datasets:
    for dup_id in checked_datasets[cd]:
        duplicates_of[dup_id] = cd

data_unique = {}
for index, row in datalist_full.iterrows():
    if row['did'] in duplicates_of:
        data_status[row['did']] = 'Duplicate of %d' % duplicates_of[row['did']]
    elif row['did'] in checked_datasets and data_status[row['did']] in ('OK', 'Possible duplicate'):
        data_status[row['did']] = 'OK'
    elif row['name'] not in data_unique:
        data_unique[row['name']] = row
    else:
        previous = data_unique[row['name']]
        if previous['NumberOfClasses'] > 2 and row['NumberOfClasses'] == 2:
            data_status[row['did']] = 'Binarized version of multiclass dataset'
        elif data_status[row['did']] in ('OK', 'Possible duplicate'):
            data_status[row['did']] = 'Possible duplicate'

# Filter dataset list
datalist = datalist[pd.Series({k:(v=='OK') for k,v in data_status.items()})] 
               
# Status update
[[x,list(data_status.values()).count(x)] for x in set(data_status.values())]

[['Duplicate of 40979', 2],
 ['Time series data', 4],
 ['Too large', 22],
 ['Too small', 291],
 ['Duplicate of 40983', 1],
 ['Duplicate of 40984', 2],
 ['Binarized regression problem', 82],
 ['Unknown origin', 7],
 ['OK', 103],
 ['Duplicate of 772', 1],
 ['Artificial data', 195],
 ['Grouped data', 1],
 ['Binarized version of multiclass dataset', 78],
 ['Duplicate of 40994', 1],
 ['Duplicate of 40945', 1],
 ['Text data', 2],
 ['Derived (non-original) data', 31],
 ['Multi-label data', 6],
 ['Too many categories', 11],
 ['Sparse format', 31],
 ['Duplicate of 1590', 1],
 ['Unspecified target feature', 6],
 ['Extreme imbalance', 109],
 ['High-dimensional', 60]]

In [7]:
# These need to be checked
[k for k,v in data_status.items() if v=='Possible duplicate']

[]

## Step 3: Remove trivial datasets
- See if a model (e.g. random forest) based on 1 feature can get perfect CV performance

In [8]:
datasets = [k for k,v in data_status.items() if v=='OK']

def get_per_feature_score(dataset_id):
    dataset = oml.datasets.get_dataset(dataset_id)
    X, y = dataset.get_data(target=dataset.default_target_attribute)
    cv = StratifiedShuffleSplit(n_splits=3, random_state=0)
    n_features = X.shape[1]
    scores = []
    for feat_idx in range(n_features):
        try:
            X1 = X[:, feat_idx].reshape((-1, 1))
            clf = make_pipeline(sklearn.preprocessing.Imputer(strategy='median'), sklearn.tree.DecisionTreeClassifier())
            scores.append(cross_val_score(clf, X1, y, cv=cv))
        except ValueError:
            continue
    
    return {
        'score': np.max(scores), 
        'argmax': np.argmax(scores),
        'name': dataset.name
    }
    
max_score_per_dataset_list = Parallel(n_jobs=-1, backend='multiprocessing')(
    delayed(get_per_feature_score)(dataset_id) for dataset_id in datasets
)
max_score_per_dataset = {}
for i, dataset_id in enumerate(datasets):
    max_score_per_dataset[dataset_id] = max_score_per_dataset_list[i]
    if max_score_per_dataset[dataset_id]["score"] > 0.99:
        dataset = oml.datasets.get_dataset(dataset_id)
        data_status[dataset_id] = 'Too easy'
        print("Dataset ", dataset.name, "is too easy.")
    
results = pd.DataFrame(max_score_per_dataset).transpose()

Dataset  mushroom is too easy.
Dataset  irish is too easy.
Dataset  cjs is too easy.


In [9]:
results.sort_values(by='score')

Unnamed: 0,argmax,name,score
40923,2027,Devnagari-Script,0.0518478
1493,56,one-hundred-plants-texture,0.075
1491,126,one-hundred-plants-margin,0.0875
1492,122,one-hundred-plants-shape,0.10625
40971,2,collins,0.14
6,32,letter,0.1765
40927,6329,CIFAR_10,0.1875
300,1749,isolet,0.197436
1501,235,semeion,0.2
1468,1160,cnae-9,0.212963


### Results
Final list of selected datasets:

In [10]:
final_datasets = [k for k,v in data_status.items() if v=='OK']
print('{} datasets selected'.format(len(final_datasets)))
{k:v for k,v in data_names.items() if k in final_datasets}

100 datasets selected


{3: 'kr-vs-kp',
 6: 'letter',
 11: 'balance-scale',
 12: 'mfeat-factors',
 14: 'mfeat-fourier',
 15: 'breast-w',
 16: 'mfeat-karhunen',
 18: 'mfeat-morphological',
 22: 'mfeat-zernike',
 23: 'cmc',
 28: 'optdigits',
 29: 'credit-approval',
 31: 'credit-g',
 32: 'pendigits',
 37: 'diabetes',
 38: 'sick',
 42: 'soybean',
 44: 'spambase',
 46: 'splice',
 50: 'tic-tac-toe',
 54: 'vehicle',
 60: 'waveform-5000',
 151: 'electricity',
 182: 'satimage',
 188: 'eucalyptus',
 300: 'isolet',
 307: 'vowel',
 375: 'JapaneseVowels',
 377: 'synthetic_control',
 458: 'analcatdata_authorship',
 469: 'analcatdata_dmft',
 470: 'profb',
 554: 'mnist_784',
 1036: 'sylva_agnostic',
 1038: 'gina_agnostic',
 1043: 'ada_agnostic',
 1046: 'mozilla4',
 1049: 'pc4',
 1050: 'pc3',
 1053: 'jm1',
 1063: 'kc2',
 1067: 'kc1',
 1068: 'pc1',
 1120: 'MagicTelescope',
 1176: 'Internet-Advertisements',
 1461: 'bank-marketing',
 1462: 'banknote-authentication',
 1464: 'blood-transfusion-service-center',
 1466: 'cardiotocogr

Passed all tests, but not in OpenML100:

In [16]:
openml100_set = set(oml.datasets.list_datasets(tag="OpenML100").keys()) # OpenML100

new_datasets = [k for k,v in data_status.items() if v=='OK' and k not in openml100_set]
{k:v for k,v in data_names.items() if k in new_datasets}

{23517: 'numerai28.6',
 40645: 'GAMETES_Epistasis_2-Way_1000atts_0.4H_EDM-1_EDM-1_1',
 40646: 'GAMETES_Epistasis_2-Way_20atts_0.1H_EDM-1_1',
 40647: 'GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1',
 40648: 'GAMETES_Epistasis_3-Way_20atts_0.2H_EDM-1_1',
 40649: 'GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_50_EDM-2_001',
 40650: 'GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_75_EDM-2_001',
 40670: 'dna',
 40687: 'solar-flare',
 40701: 'churn',
 40705: 'tokyo1',
 40922: 'Run_or_walk_information',
 40923: 'Devnagari-Script',
 40927: 'CIFAR_10',
 40966: 'MiceProtein',
 40971: 'collins',
 40979: 'mfeat-pixel',
 40982: 'steel-plates-fault',
 40983: 'wilt',
 40984: 'segment',
 40994: 'climate-model-simulation-crashes',
 40996: 'Fashion-MNIST',
 41001: 'jungle_chess_2pcs_endgame_complete',
 41027: 'jungle_chess_2pcs_raw_endgame_complete'}

Datasets tagged with OpenML100 that did not pass all tests:

In [18]:
new_datasets = [k for k,v in data_status.items() if k in openml100_set and v!='OK']
{k:v for k,v in data_names.items() if k in new_datasets}

{20: 'mfeat-pixel',
 24: 'mushroom',
 36: 'segment',
 312: 'scene',
 333: 'monks-problems-1',
 334: 'monks-problems-2',
 335: 'monks-problems-3',
 451: 'irish',
 1112: 'KDDCup09_churn',
 1114: 'KDDCup09_upselling',
 1459: 'artificial-characters',
 1467: 'climate-model-simulation-crashes',
 1471: 'eeg-eye-state',
 1476: 'gas-drift',
 1504: 'steel-plates-fault',
 1570: 'wilt',
 4135: 'Amazon_employee_access',
 23380: 'cjs',
 40496: 'LED-display-domain-7digit',
 40536: 'SpeedDating'}

Reasons to exclude datasets:

In [35]:
v = {}
for key, value in sorted(data_status.items()):
    v.setdefault(value, []).append(key)
{k:str(v) for k,v in v.items()}

{'Artificial data': '[70, 71, 72, 73, 74, 75, 76, 77, 78, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 146, 147, 148, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 271, 272, 333, 334, 335, 1177, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194, 1195, 1196, 1197, 1198, 1199, 1200, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1351, 1352, 1353, 1354, 1355, 1356, 1357, 1358, 1359, 1360, 1361, 1362, 1363, 1364, 1365, 1366, 1367, 1368, 1369, 1370, 1371, 1372, 1373, 1374, 1375, 1376, 1377, 1378, 1379, 1380, 1381, 1382, 1383, 1384, 1385, 1386, 1387, 1388, 1389, 1390, 1391, 1392, 1393, 1394, 1395, 1396, 1397, 1398, 1399, 1400, 1401, 1402, 1403, 1404, 1405, 1406, 1407, 1408, 1409, 1410, 1459, 1460, 1496, 1507, 1547, 1548, 1549

## Difference to google doc

In [None]:
names_selected = {v for k,v in data_names.items() if k in final_datasets}
google_doc = {
    "phoneme","breast-w","Australian","banknote-authentication","eeg-eye-state","electricity",
    "analcatdata_dmft","higgs","blood-transfusion-service-center","ilpd","steel-plates-fault",
    "satimage","mfeat-morphological","balance-scale","credit-a","cmc","diabetes","wilt","MagicTelescope",
    "vowel","pc3","adult","pc4","ada_agnostic","GesturePhaseSegmentationProcessed","PhishingWebsites",
    "bank-marketing","cardiotocography","climate-model-simulation-crashes","first-order-theorem-proving",
    "wall-robot-navigation","dresses-sales","sick","waveform-5000","wdbc","car","tic-tac-toe","mfeat-zernike",
    "segment","connect-4","kc2","jm1","pc1","kc1","qsar-biodeg","eucalyptus","credit-g","pendigits","vehicle",
    "letter","optdigits","mfeat-fourier","mfeat-karhunen","kr-vs-kp","ozone-level-8hr","sylva_agnostic",
    "nomao","spambase","splice","mushroom","cylinder-bands","SpeedDating","texture","mfeat-factors","collins",
    "mnist_784","gina_agnostic","Bioresponse","Internet-Advertisements","semeion","soybean","madelon","har",
    "isolet","micro-mass","cnae-9","MiceProtein","one-hundred-plants-margin","one-hundred-plants-shape",
    "one-hundred-plants-texture","mfeat-pixel",
}

In [15]:
print("New ones!")
for d in sorted(names_selected - google_doc):
    print('  ', d)
print("Dropped ones!")
for d in sorted(google_doc - names_selected):
    print('  ', d)
print('Difference')
for d in sorted(google_doc ^ names_selected):
    print('  ', d)

New ones!
   CIFAR_10
   Devnagari-Script
   Fashion-MNIST
   GAMETES_Epistasis_2-Way_1000atts_0.4H_EDM-1_EDM-1_1
   GAMETES_Epistasis_2-Way_20atts_0.1H_EDM-1_1
   GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1
   GAMETES_Epistasis_3-Way_20atts_0.2H_EDM-1_1
   GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_50_EDM-2_001
   GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_75_EDM-2_001
   JapaneseVowels
   Run_or_walk_information
   analcatdata_authorship
   churn
   credit-approval
   dna
   hill-valley
   jungle_chess_2pcs_endgame_complete
   jungle_chess_2pcs_raw_endgame_complete
   mozilla4
   numerai28.6
   profb
   solar-flare
   synthetic_control
   tokyo1
Dropped ones!
   SpeedDating
   car
   credit-a
   eeg-eye-state
   mushroom
Difference
   CIFAR_10
   Devnagari-Script
   Fashion-MNIST
   GAMETES_Epistasis_2-Way_1000atts_0.4H_EDM-1_EDM-1_1
   GAMETES_Epistasis_2-Way_20atts_0.1H_EDM-1_1
   GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1
   GAMETES_Epistasis_3-Way_20atts_0.2H_EDM-1_1
   GA