In [None]:
%matplotlib inline
import openml
import seaborn as sns
import numpy as np
import pandas as pd
import sys
import math
from scipy.stats import norm
from matplotlib import pyplot
import sklearn.tree
import sklearn.ensemble
import sklearn.preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score

In [None]:
# Fetch the listing of OpenML datasets that carry the 'OpenML100' tag.
# The loops in the cells below iterate over this object to obtain the
# dataset ids (presumably it is a dict keyed by dataset id -- TODO confirm).
datasets = openml.datasets.list_datasets(tag='OpenML100')

Approach 1: check whether a small decision tree (at most one leaf per class) trained on a single feature can reach perfect accuracy on the full data set it was trained on

In [None]:
# Approach 1: for every dataset, fit a small decision tree on each single
# feature and record the best training accuracy any one feature achieves.
#
# Produces:
#   max_score_per_dataset -- dict: dataset id -> {'score', 'argmax', 'name'}
#       'score'  : highest training accuracy over all usable features
#       'argmax' : column index (in X) of the feature that achieved it
#       'name'   : dataset name
#   results -- the same information as a DataFrame, one row per dataset
max_score_per_dataset = {}
for dataset_id in datasets:
    dataset = openml.datasets.get_dataset(dataset_id)
    X, y = dataset.get_data(target=dataset.default_target_attribute)
    n_features = X.shape[1]
    # Allow one leaf per class; this does not change per feature, so compute once.
    n_classes = int(float(dataset.qualities["NumberOfClasses"]))

    scores = []
    # Feature index for each entry in `scores`. Needed because skipped
    # features make positions in `scores` diverge from column indices in X.
    feature_indices = []
    for feat_idx in range(n_features):
        X1 = X[:, feat_idx].reshape((-1, 1))
        imp = sklearn.preprocessing.Imputer(strategy='median')
        X1 = imp.fit_transform(X1)
        if X1.shape[1] == 0:
            # The imputer returned no columns (presumably the feature is
            # entirely missing), so there is nothing to fit on.
            continue
        clf = sklearn.tree.DecisionTreeClassifier(max_leaf_nodes=n_classes)
        clf.fit(X1, y)
        # Accuracy is measured on the training data itself, on purpose.
        scores.append(clf.score(X1, y))
        feature_indices.append(feat_idx)

    if not scores:
        # np.max / np.argmax raise on an empty sequence; skip instead of
        # aborting the whole run for one unusable dataset.
        print("Dataset ", dataset.name, " skipped: no usable feature")
        continue

    best_pos = int(np.argmax(scores))
    max_score_per_dataset[dataset_id] = {
        'score': np.max(scores),
        'argmax': feature_indices[best_pos],
        'name': dataset.name
    }
    print("Dataset ", dataset.name, " score ", max_score_per_dataset[dataset_id]["score"])

results = pd.DataFrame(max_score_per_dataset).transpose()

In [29]:
# Display the per-dataset results ordered by best single-feature score,
# lowest first.
results.sort_values(by='score')

Unnamed: 0,argmax,name,score
1491,5,one-hundred-plants-margin,0.133125
1493,0,one-hundred-plants-texture,0.138837
6,12,letter,0.1734
1501,161,semeion,0.19774
1468,545,cnae-9,0.205556
300,583,isolet,0.207516
40496,6,LED-display-domain-7digit,0.21
469,1,analcatdata_dmft,0.222083
1492,18,one-hundred-plants-shape,0.22375
1459,6,artificial-characters,0.237816


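Approach 2: check whether a random forest trained on a single feature can reach perfect cross-validated accuracy (mean over 10 stratified shuffle splits, each holding out 10% of the data)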

In [None]:
# Approach 2: for every dataset, cross-validate a random forest on each
# single feature and record the best mean CV accuracy any one feature achieves.
#
# Produces:
#   max_score_per_dataset -- dict: dataset id -> {'score', 'argmax', 'name'}
#       'score'  : highest mean cross-validation accuracy over usable features
#       'argmax' : column index (in X) of the feature that achieved it
#       'name'   : dataset name
#   results -- the same information as a DataFrame, one row per dataset
max_score_per_dataset = {}
# The splitter is seeded and holds no per-dataset state, so build it once.
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
for dataset_id in datasets:
    dataset = openml.datasets.get_dataset(dataset_id)
    X, y = dataset.get_data(target=dataset.default_target_attribute)
    n_features = X.shape[1]

    scores = []
    # Feature index for each entry in `scores`. Needed because skipped
    # features make positions in `scores` diverge from column indices in X.
    feature_indices = []
    for feat_idx in range(n_features):
        try:
            X1 = X[:, feat_idx].reshape((-1, 1))
            clf = make_pipeline(
                sklearn.preprocessing.Imputer(strategy='median'),
                # Seeded so that repeated runs give the same scores.
                sklearn.ensemble.RandomForestClassifier(random_state=0),
            )
            mean_cv_score = np.mean(cross_val_score(clf, X1, y, cv=cv))
        except ValueError:
            # Best effort: a feature that cannot be fitted/evaluated is
            # skipped rather than aborting the whole dataset.
            continue
        scores.append(mean_cv_score)
        feature_indices.append(feat_idx)

    if not scores:
        # np.max / np.argmax raise on an empty sequence; skip instead of
        # aborting the whole run for one unusable dataset.
        print("Dataset ", dataset.name, " skipped: no usable feature")
        continue

    best_pos = int(np.argmax(scores))
    max_score_per_dataset[dataset_id] = {
        'score': np.max(scores),
        'argmax': feature_indices[best_pos],
        'name': dataset.name
    }
    print("Dataset ", dataset.name, " score ", max_score_per_dataset[dataset_id]["score"])

results = pd.DataFrame(max_score_per_dataset).transpose()

In [26]:
# Display the per-dataset results ordered by best single-feature CV score,
# lowest first.
results.sort_values(by='score')

Unnamed: 0,argmax,name,score
1491,429,one-hundred-plants-margin,0.08125
1493,482,one-hundred-plants-texture,0.08125
1492,149,one-hundred-plants-shape,0.11875
6,109,letter,0.178
1501,64,semeion,0.2
300,5834,isolet,0.202564
1468,3863,cnae-9,0.222222
40496,62,LED-display-domain-7digit,0.24
1515,845,micro-mass,0.241379
554,4069,mnist_784,0.25
