In [1]:
import pandas as pd
import itertools
import matplotlib.pyplot as plt
from IPython.display import display_html
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
import numpy as np
from sklearn.model_selection import train_test_split
from metadata_utils import get_metadata, get_tuned_alg_perf, process_metafeatures, compute_feature_corrs

In [2]:
dataset_version = "_v0"
# For choosing metafeatures
filter_families = [
    'general',
    'statistical',
    'info-theory']

In [3]:
metadataset_df, metafeatures_df = get_metadata(dataset_version)

metafeatures_processed = process_metafeatures(metafeatures_df, filter_families=filter_families)
metafeatures_df = metafeatures_processed

  metafeatures_processed = metafeatures_df.fillna(metafeatures_df.median())


In [4]:
metric = "F1" # Choices: "Accuracy", "F1", "Log Loss"

In [5]:
tuned_alg_perf = get_tuned_alg_perf(metadataset_df, metric=metric)
tuned_alg_perf

Unnamed: 0,results_bucket_path,dataset_fold_id,dataset_name,alg_name,hparam_source,trial_number,alg_hparam_id,exp_name,time__train,Log Loss__train,...,AUC__test,Accuracy__test,F1__test,time__train-eval,MSE__train,R2__train,MSE__val,R2__val,MSE__test,R2__test
20,results/openml__APSFailure__168868/CatBoost/gp...,openml__APSFailure__168868__fold_0,openml__APSFailure__168868,CatBoost,random_27_s0,27,CatBoost__seed_0__trial_27,gpu-expt-a_091822_065111_fdd9.zip,5.285700,0.012935,...,0.990643,0.994211,0.994211,0.146698,,,,,,
150,results/openml__APSFailure__168868/CatBoost/gp...,openml__APSFailure__168868__fold_1,openml__APSFailure__168868,CatBoost,random_3_s0,3,CatBoost__seed_0__trial_3,gpu-expt-a_091822_065111_fdd9.zip,3.450149,0.011320,...,0.988862,0.992105,0.992105,0.140466,,,,,,
282,results/openml__APSFailure__168868/CatBoost/gp...,openml__APSFailure__168868__fold_2,openml__APSFailure__168868,CatBoost,random_8_s0,8,CatBoost__seed_0__trial_8,gpu-expt-a_091822_065111_fdd9.zip,11.938921,0.007183,...,0.995551,0.993421,0.993421,0.173912,,,,,,
394,results/openml__APSFailure__168868/CatBoost/gp...,openml__APSFailure__168868__fold_3,openml__APSFailure__168868,CatBoost,random_20_s0,20,CatBoost__seed_0__trial_20,gpu-expt-a_091822_065111_fdd9.zip,7.538957,0.011646,...,0.991612,0.994868,0.994868,0.179523,,,,,,
536,results/openml__APSFailure__168868/CatBoost/gp...,openml__APSFailure__168868__fold_4,openml__APSFailure__168868,CatBoost,random_8_s0,8,CatBoost__seed_0__trial_8,gpu-expt-a_091822_065111_fdd9.zip,11.805938,0.007725,...,0.989614,0.995658,0.995658,0.179625,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191448,results/openml__wilt__146820/XGBoost/gpu-expt-...,openml__wilt__146820__fold_5,openml__wilt__146820,XGBoost,default,0,XGBoost__seed_0__trial_0,gpu-expt-a_091722_220903_21b9.zip,0.142135,0.039770,...,0.955786,0.983471,0.983471,0.006239,,,,,,
191671,results/openml__wilt__146820/XGBoost/gpu-expt-...,openml__wilt__146820__fold_6,openml__wilt__146820,XGBoost,random_22_s0,22,XGBoost__seed_0__trial_22,gpu-expt-a_091722_220903_21b9.zip,0.116242,0.022093,...,0.997229,0.985537,0.985537,0.003031,,,,,,
191865,results/openml__wilt__146820/XGBoost/gpu-expt-...,openml__wilt__146820__fold_7,openml__wilt__146820,XGBoost,random_1_s0,1,XGBoost__seed_0__trial_1,gpu-expt-a_091722_220903_21b9.zip,0.235586,0.032988,...,0.996977,0.983471,0.983471,0.006548,,,,,,
192092,results/openml__wilt__146820/XGBoost/gpu-expt-...,openml__wilt__146820__fold_8,openml__wilt__146820,XGBoost,random_27_s0,27,XGBoost__seed_0__trial_27,gpu-expt-a_091722_220903_21b9.zip,0.120812,0.016541,...,0.994732,0.991736,0.991736,0.005032,,,,,,


In [6]:
joined_df = tuned_alg_perf.merge(metafeatures_df, right_on="dataset_name", left_on="dataset_fold_id", how='left')

In [7]:
corr_columns = [col_name for col_name in joined_df if col_name.startswith("f__")]
f1_test = ['F1__test', 'alg_name']
features = corr_columns + f1_test

In [8]:
data = joined_df[joined_df.columns & features]
data = data.sample(frac=1).reset_index(drop=True)
train, test = train_test_split(data, test_size=0.2)

  data = joined_df[joined_df.columns & features]


In [9]:
data

Unnamed: 0,alg_name,F1__test,f__pymfe.general.attr_to_inst,f__pymfe.general.cat_to_num,f__pymfe.general.freq_class.count,f__pymfe.general.freq_class.histogram.0,f__pymfe.general.freq_class.histogram.1,f__pymfe.general.freq_class.histogram.2,f__pymfe.general.freq_class.histogram.3,f__pymfe.general.freq_class.histogram.4,...,f__pymfe.info-theory.mut_inf.quantiles.1,f__pymfe.info-theory.mut_inf.quantiles.2,f__pymfe.info-theory.mut_inf.quantiles.3,f__pymfe.info-theory.mut_inf.quantiles.4,f__pymfe.info-theory.mut_inf.range,f__pymfe.info-theory.mut_inf.sd,f__pymfe.info-theory.mut_inf.skewness,f__pymfe.info-theory.ns_ratio,f__pymfe.statistical.iq_range,f__pymfe.statistical.t_mean
0,XGBoost,0.917247,0.990741,0.000000,9,0.000000,0.000000,0.00,0.00,0.00,...,0.008414,0.018632,0.036939,0.132105,0.129517,0.036463,1.022602,38.780852,,
1,LinearModel,0.849057,0.048636,0.000000,2,0.500000,0.000000,0.00,0.00,0.00,...,0.008414,0.018632,0.036939,0.132105,0.129517,0.036463,1.022602,38.780852,,
2,SVM,0.992881,0.014235,0.000000,10,0.300000,0.100000,0.10,0.10,0.00,...,0.008414,0.018632,0.036939,0.132105,0.129517,0.036463,1.022602,38.780852,,
3,KNN,0.997801,0.000194,0.000000,7,0.714286,0.142857,0.00,0.00,0.00,...,0.008414,0.018632,0.036939,0.132105,0.129517,0.036463,1.022602,38.780852,,
4,KNN,0.887381,0.990741,0.000000,9,0.000000,0.000000,0.00,0.00,0.00,...,0.008414,0.018632,0.036939,0.132105,0.129517,0.036463,1.022602,38.780852,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6455,CatBoost,0.663317,0.000016,1.333333,2,0.500000,0.000000,0.00,0.00,0.00,...,0.015825,0.022118,0.030687,0.051745,0.050152,0.020681,0.241687,199.413060,,
6456,KNN,0.195738,0.000518,0.000000,100,0.640000,0.170000,0.05,0.01,0.06,...,0.008414,0.018632,0.036939,0.132105,0.129517,0.036463,1.022602,38.780852,,
6457,KNN,0.750219,0.000878,0.000000,2,0.500000,0.000000,0.00,0.00,0.00,...,0.008414,0.018632,0.036939,0.132105,0.129517,0.036463,1.022602,38.780852,,
6458,CatBoost,0.981470,0.089120,0.000000,8,0.125000,0.000000,0.00,0.00,0.00,...,0.008414,0.018632,0.036939,0.132105,0.129517,0.036463,1.022602,38.780852,,


In [10]:
def get_xy_test(data):

    x = data[data.columns & corr_columns]
    x = x.fillna(0)
    print(x.isnull().values.any())
    #X = x.values.reshape(-1, 650)

    y = data['F1__test']
    Y = y.values.reshape(-1, 1)
    
    
    return x, Y

In [11]:
def get_xy(data):
    c = data['alg_name']
    c = pd.get_dummies(c)
    C = c.values.reshape(-1, 7)

    x = data[data.columns & corr_columns]
    x = x.fillna(0)
    print(x.isnull().values.any())
    X = x.values.reshape(-1, 650)

    y = data['F1__test']
    Y = y.values.reshape(-1, 1)
    X1 = np.concatenate((X, C), axis=1)
    
    return X1, Y

In [12]:

algos = metadataset_df['alg_name'].unique()
algos = pd.get_dummies(algos)


In [13]:
algos

Unnamed: 0,CatBoost,KNN,LinearModel,MLP,SVM,TabNet,XGBoost
0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0
2,0,0,1,0,0,0,0
3,0,0,0,1,0,0,0
4,0,0,0,0,0,0,1
5,0,0,0,0,1,0,0
6,0,0,0,0,0,1,0


In [14]:
algo_index = ['CatBoost', 'KNN', 'LinearModel', 'MLP', 'SVM', 'TabNet', 'XGBoost']

In [15]:
def concatenate_per_row(A, B):
    m1,n1 = A.shape
    m2,n2 = B.shape

    out = np.zeros((m1,m2,n1+n2),dtype=A.dtype)
    out[:,:,:n1] = A[:,None,:]
    out[:,:,n1:] = B
    return out.reshape(m1*m2,-1)

In [16]:
def get_indices(lst):
    ind = []
    window_size = 7
    for i in range(int(len(lst)/window_size)):
        l = lst[i*window_size:(i+1)*window_size]
        ind.append(l.index(max(l)))
    return ind

In [17]:
def get_predictions(X_test):
    algos = metadataset_df['alg_name'].unique()
    algos = pd.get_dummies(algos)
    X = X_test.values.reshape(-1, 650)
    x1 = concatenate_per_row(X, algos)
    Y_pred = linear_regressor.predict(x1)
    Y_pred = Y_pred.tolist()
    indices = get_indices(Y_pred)
    preds=[]
    for i in indices:
        preds.append(algo_index[i])
        
    return preds, Y_pred
        

In [18]:
X_train, Y_train = get_xy(train)
linear_regressor = LinearRegression()
linear_regressor.fit(X_train, Y_train)
r_squared = linear_regressor.score(X_train, Y_train)
print(r_squared)

False
0.24020208857583147


  x = data[data.columns & corr_columns]


In [19]:
def get_best_algo(dataset, metric):
    temp_df = metadataset_df.loc[metadataset_df['dataset_name'] == dataset]
    return temp_df.loc[temp_df[metric].idxmax()]['alg_name']

In [20]:
def get_dataset_name(a):
    dataset=""
    a = a.fillna(0)
    a=a.values.tolist()
    a=[ '%.6f' % elem for elem in a ]
    a=[float(i) for i in a]
    for index, row in metafeatures_df.iterrows():
        row = row.fillna(0)
        row = row.values.tolist()
        dataset_current = row[0]
        row = row[1:]
        #print(row)
        row=[ '%.6f' % elem for elem in row ]
        row=[float(i) for i in row]


        if (row == a):
            dataset = dataset_current
    #print(dataset)
    #print(dataset[0:-8])
    return dataset[0:-8]

In [21]:
def get_ground_truths(X_test):
    ground_truths=[]
    metric='F1__test'
    for index, row in X_test.iterrows():
        print(index)
        dataset_name = get_dataset_name(row)
        if (dataset_name == ""):
            print("Error: No dataset returned")
        best_algo = get_best_algo(dataset_name, metric)
        ground_truths.append(best_algo)
    return ground_truths

In [22]:
X_test, Y_test = get_xy_test(test)

False


  x = data[data.columns & corr_columns]


In [23]:
ground_truths = get_ground_truths(X_test)

975
3837
3847
100
4399
2760
6405
3420
6165
3125
2652
927
2867
1944
2920
5663
2241
377
1170
5386
773
3559
4021
2437
6261
3247
3398
3901
595
1133
2103
2128
1379
549
4927
26
4604
3594
1822
6432
1050
1694
2282
3261
3550
4947
2002
5813
4366
3299
1157
4644
2403
2480
5485
2157
340
425
736
4667
3202
3495
1963
5293
4807
3143
4727
5079
3884
785
1187
2875
1506
1079
3544
4619
3648
468
6259
3596
4881
5699
1957
1239
3733
688
4841
6122
1034
6024
76
552
5873
2009
115
1542
2427
1140
236
5103
2704
3448
6268
3571
1059
2971
1286
3849
3026
3695
4403
1161
1091
2702
85
3256
2772
3846
792
1367
6092
3397
4315
1520
4480
5245
5170
1579
4763
5908
1437
5770
847
4018
864
1643
5856
678
1241
4867
2866
4215
5676
4163
3497
6447
2297
684
427
2636
3894
6161
5279
2124
5294
5204
911
1418
1472
1297
6253
5410
2337
1561
272
2869
1874
2529
498
1306
568
6197
1624
158
3238
6439
5693
3115
998
637
2808
952
1901
364
5109
1483
5263
2828
4286
2126
4138
4057
4451
3563
5518
6059
4677
6143
4489
6108
19
580
5968
3981
1242
4960
3391
4469


In [24]:
ground_truths

['SVM',
 'CatBoost',
 'XGBoost',
 'XGBoost',
 'XGBoost',
 'SVM',
 'TabNet',
 'MLP',
 'SVM',
 'XGBoost',
 'XGBoost',
 'SVM',
 'CatBoost',
 'CatBoost',
 'XGBoost',
 'XGBoost',
 'CatBoost',
 'XGBoost',
 'SVM',
 'XGBoost',
 'XGBoost',
 'XGBoost',
 'XGBoost',
 'XGBoost',
 'XGBoost',
 'XGBoost',
 'XGBoost',
 'XGBoost',
 'CatBoost',
 'KNN',
 'SVM',
 'CatBoost',
 'XGBoost',
 'TabNet',
 'SVM',
 'XGBoost',
 'CatBoost',
 'CatBoost',
 'XGBoost',
 'CatBoost',
 'CatBoost',
 'XGBoost',
 'XGBoost',
 'XGBoost',
 'SVM',
 'CatBoost',
 'KNN',
 'XGBoost',
 'SVM',
 'XGBoost',
 'TabNet',
 'CatBoost',
 'MLP',
 'XGBoost',
 'XGBoost',
 'SVM',
 'XGBoost',
 'XGBoost',
 'XGBoost',
 'SVM',
 'XGBoost',
 'XGBoost',
 'XGBoost',
 'CatBoost',
 'MLP',
 'MLP',
 'XGBoost',
 'XGBoost',
 'CatBoost',
 'SVM',
 'CatBoost',
 'CatBoost',
 'SVM',
 'CatBoost',
 'CatBoost',
 'XGBoost',
 'MLP',
 'XGBoost',
 'XGBoost',
 'XGBoost',
 'XGBoost',
 'XGBoost',
 'CatBoost',
 'XGBoost',
 'MLP',
 'XGBoost',
 'TabNet',
 'SVM',
 'SVM',
 'MLP',
 

In [25]:
preds, scores = get_predictions(X_test)

In [28]:
preds

['LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'CatBoost',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearModel',
 'LinearMod

In [26]:
scores

[[0.8122798589675773],
 [0.8122797114795718],
 [0.8122798641512541],
 [0.8122797164702753],
 [0.812279635330844],
 [0.812279672873802],
 [0.812279673038895],
 [0.8424165852346529],
 [0.8424164377466473],
 [0.8424165904183296],
 [0.8424164427373508],
 [0.8424163615979195],
 [0.8424163991408774],
 [0.8424163993059706],
 [1.01815636824012],
 [1.0181562207521146],
 [1.0181563734237968],
 [1.018156225742818],
 [1.0181561446033869],
 [1.0181561821463447],
 [1.0181561823114378],
 [0.8208896022493544],
 [0.8208894547613489],
 [0.8208896074330312],
 [0.8208894597520524],
 [0.8208893786126211],
 [0.820889416155579],
 [0.8208894163206721],
 [1.0142998661981826],
 [1.014299718710177],
 [1.0142998713818594],
 [1.0142997237008804],
 [1.0142996425614492],
 [1.014299680104407],
 [1.0142996802695003],
 [0.8282231666050968],
 [0.8282230191170912],
 [0.8282231717887735],
 [0.8282230241077947],
 [0.8282229429683634],
 [0.8282229805113214],
 [0.8282229806764145],
 [0.9019601573405749],
 [0.901960008328963]

In [27]:
correct_prediction = 0
for i in range(len(ground_truths)):
    if (ground_truths[i] == preds[i]):
        correct_prediction = correct_prediction + 1
print(correct_prediction/len(ground_truths))

0.010061919504643963
