In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Archive holding the arrays read in the next cell. np.load on an .npz
# returns a lazy, dict-like handle, so `data` must stay open while the
# individual arrays are pulled out of it.
DATA_PATH = 'meta_dataset.npz'
data = np.load(DATA_PATH)

In [3]:
# Pull the five parallel arrays out of the archive and shuffle them in
# unison (same permutation for every array, fixed seed for repeatability).
# A holds classifier names (see the preview below); the later cells use
# X as model input and D as per-row dataset names.
# NOTE(review): H is presumably the hyper-parameter description and Y a
# score per row -- confirm against whatever produced meta_dataset.npz.
X, H, A, Y, D = shuffle(
    data['X'], data['H'], data['A'], data['y'], data['D'],
    random_state=42,
)

In [8]:
# Peek at the first 100 algorithm labels after shuffling.
A[:100]

array(['SGDClassifier', 'SGDClassifier', 'SGDClassifier',
       'GradientBoostingClassifier', 'SGDClassifier', 'SGDClassifier',
       'SGDClassifier', 'SGDClassifier', 'SGDClassifier',
       'DecisionTreeClassifier', 'SGDClassifier', 'RandomForestClassifier',
       'SGDClassifier', 'SGDClassifier', 'ExtraTreesClassifier',
       'SGDClassifier', 'SGDClassifier', 'GradientBoostingClassifier',
       'SGDClassifier', 'GradientBoostingClassifier', 'SGDClassifier',
       'SGDClassifier', 'SGDClassifier', 'SGDClassifier', 'SGDClassifier',
       'GradientBoostingClassifier', 'SGDClassifier', 'SGDClassifier',
       'GradientBoostingClassifier', 'GradientBoostingClassifier',
       'SGDClassifier', 'SGDClassifier', 'SGDClassifier', 'SGDClassifier',
       'SGDClassifier', 'RandomForestClassifier', 'SGDClassifier',
       'SGDClassifier', 'SGDClassifier', 'SGDClassifier', 'SGDClassifier',
       'SGDClassifier', 'SGDClassifier', 'SGDClassifier', 'SGDClassifier',
       'RandomForestClass

In [10]:
# Hand-picked row positions that each show a different classifier name
# in the shuffled label array.
example_rows = [0, 3, 9, 11, 14]
A[example_rows]

array(['SGDClassifier', 'GradientBoostingClassifier',
       'DecisionTreeClassifier', 'RandomForestClassifier',
       'ExtraTreesClassifier'],
      dtype='<U27')

In [None]:
# Row filter applied to every parallel array. Alternatives that were tried:
#   f = (D == 'yeast') * (A == 'SGDClassifier')
#   f = Y > 0.999
#   f = Y > 0
# Active choice: keep only rows whose Y value is exactly 1.0.
f = (Y == 1.0)

# Apply the same boolean mask to all five arrays so they stay aligned.
Xf, Hf, Af, Yf, Df = (arr[f] for arr in (X, H, A, Y, D))

In [None]:
# Dataset names whose rows go into the training split.
train_datasets = [
    'GAMETES_Epistasis_2-Way_1000atts_0.4H_EDM-1_EDM-1_1',
    'hayes-roth', 'heart-c', 'heart-h', 'heart-statlog', 'hepatitis',
    'horse-colic', 'house-votes-84', 'hungarian', 'hypothyroid',
    'ionosphere', 'iris', 'irish', 'kddcup', 'kr-vs-kp', 'krkopt',
    'labor', 'led24', 'led7', 'letter', 'liver-disorder', 'lupus',
    'lymphography', 'magic', 'mfeat-factors', 'mfeat-fourier',
    'mfeat-karhunen', 'mfeat-morphological', 'mfeat-pixel',
    'mfeat-zernike', 'mnist', 'mofn-3-7-10',
    'molecular-biology_promoters', 'monk1', 'monk2', 'monk3',
    'movement_libras', 'mushroom', 'mux6', 'new-thyroid', 'nursery',
    'optdigits', 'page-blocks', 'parity5', 'parity5+5', 'pendigits',
    'phoneme', 'pima', 'poker', 'postoperative-patient-data',
    'prnn_crabs', 'prnn_fglass', 'prnn_synth', 'profb', 'promoters',
    'ring', 'saheart', 'satimage', 'schizo', 'segmentation', 'shuttle',
    'sleep', 'solar-flare_1', 'solar-flare_2', 'sonar', 'soybean',
    'spambase', 'spect', 'spectf', 'splice', 'tae', 'texture',
    'threeOf9', 'tic-tac-toe', 'titanic', 'tokyo1', 'twonorm',
    'vehicle', 'vote', 'vowel', 'waveform-21', 'waveform-40', 'wdbc',
    'wine-quality-red', 'wine-quality-white', 'wine-recognition', 'xd6',
    'yeast',
]

# Dataset names whose rows go into the held-out test split.
test_datasets = [
    'GAMETES_Epistasis_2-Way_20atts_0.1H_EDM-1_1',
    'GAMETES_Epistasis_2-Way_20atts_0.4H_EDM-1_1',
    'GAMETES_Epistasis_3-Way_20atts_0.2H_EDM-1_1',
    'GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_50_EDM-2_001',
    'GAMETES_Heterogeneity_20atts_1600_Het_0.4_0.2_75_EDM-2_001',
    'Hill_Valley_with_noise', 'Hill_Valley_without_noise', 'adult',
    'agaricus-lepiota', 'allbp', 'allhyper', 'allhypo', 'allrep',
    'analcatdata_aids', 'analcatdata_asbestos',
    'analcatdata_authorship', 'analcatdata_bankruptcy',
    'analcatdata_boxing1', 'analcatdata_boxing2',
    'analcatdata_creditscore', 'analcatdata_cyyoung8092',
    'analcatdata_cyyoung9302', 'analcatdata_dmft', 'analcatdata_fraud',
    'analcatdata_germangss', 'analcatdata_happiness',
    'analcatdata_japansolvent', 'analcatdata_lawsuit', 'ann-thyroid',
    'appendicitis', 'australian', 'auto', 'backache', 'balance-scale',
    'banana', 'biomed', 'breast', 'breast-cancer',
    'breast-cancer-wisconsin', 'breast-w', 'buggyCrx', 'bupa',
    'calendarDOW', 'car', 'car-evaluation', 'cars', 'cars1', 'chess',
    'churn', 'clean1', 'clean2', 'cleve', 'cleveland',
    'cleveland-nominal', 'cloud', 'cmc', 'coil2000', 'colic', 'collins',
    'confidence', 'connect-4', 'contraceptive', 'corral', 'credit-a',
    'credit-g', 'crx', 'dermatology', 'diabetes', 'dis', 'dna', 'ecoli',
    'fars', 'flags', 'flare', 'german', 'glass', 'glass2', 'haberman',
]
    
# Boolean row masks: True where the row's dataset name belongs to the
# corresponding split. np.isin does the membership test in one vectorised
# call (instead of a linear list scan per row) and always yields a boolean
# array, so the indexing below also works when the filtered set is empty.
# NOTE(review): assumes Df is a 1-D array of dataset-name strings -- confirm.
train = np.isin(Df, train_datasets)
test = np.isin(Df, test_datasets)

# Training split: X rows as model input, algorithm names as the target,
# and the matching Y values kept alongside in Rtrain.
Xtrain = Xf[train]
Ytrain = Af[train]
Rtrain = Yf[train]

# Held-out split: same input/target pairing, taken from the test datasets.
Xtest = Xf[test]
Ytest = Af[test]

In [None]:
# Shallow random forest that predicts the algorithm name from the X rows.
# random_state is fixed (same seed as the shuffle above) so that the fitted
# trees, and therefore the accuracies reported below, are reproducible
# across kernel restarts.
clf = RandomForestClassifier(max_depth=5, random_state=42)
clf.fit(Xtrain, Ytrain)

# Training-set accuracy: fraction of training rows predicted correctly.
print((clf.predict(Xtrain) == Ytrain).mean())

In [None]:
# Held-out accuracy: fraction of test rows whose predicted algorithm
# matches the recorded one.
np.mean(clf.predict(Xtest) == Ytest)

In [None]:
# Inspect the first 1000 predicted labels on the held-out rows.
clf.predict(Xtest)[:1000]

In [None]:
import seaborn as sns

# Number of filtered rows per algorithm. countplot tallies the labels
# directly, replacing the earlier bar plot of ones with a summing
# estimator (which also ran seaborn's default bootstrap for error bars
# that are meaningless on constant data).
fig, ax = plt.subplots(figsize=(30, 10))
sns.countplot(x=Af, ax=ax)
ax.set(xlabel='Algorithm', ylabel='Count');