In [67]:
import tensorflow as tf
import tensorflow_addons as tfa
from sklearn.metrics import r2_score
import openml
import random
import jsonlines
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from verstack import NaNImputer
import numpy as np


In [43]:
# Restore the saved network without its training configuration, then attach
# a fresh loss / optimizer / metric set so the model can be evaluated here.
best_model_loaded = tf.keras.models.load_model('best_model.h5', compile=False)
best_model_loaded.compile(
    optimizer='adam',
    loss='mse',
    # R^2 is tracked as the quality metric for the single regression output.
    metrics=[tfa.metrics.RSquare(y_shape=(1,), dtype=tf.float32)],
)
best_model_loaded.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 469)               54404     
                                                                 
 dropout (Dropout)           (None, 469)               0         
                                                                 
 dense_1 (Dense)             (None, 93)                43710     
                                                                 
 dropout_1 (Dropout)         (None, 93)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 94        
                                                                 
Total params: 98,208
Trainable params: 98,208
Non-trainable params: 0
_________________________________________________________________


In [44]:
# Index every record of the flow-classification file by its 'id' field so a
# flow's record can be looked up directly by flow id.
labeled_flows = {}
with jsonlines.open('flow_classification.jsonl', 'r') as reader:
    # A later record with the same id replaces an earlier one.
    labeled_flows.update((record['id'], record) for record in reader)

In [45]:
# Ids of all supervised-classification tasks known to OpenML; random tasks
# are drawn from this list.
tasks_list = list(
    openml.tasks.list_tasks(
        task_type=openml.tasks.TaskType.SUPERVISED_CLASSIFICATION,
    ).keys()
)

In [63]:
from func_timeout import func_timeout, FunctionTimedOut
def get_task():
    """Return a randomly chosen OpenML task, retrying until one is obtained.

    Each attempt picks a random id from ``tasks_list`` and fetches it through
    ``func_timeout`` with a limit of 5. A timed-out attempt is silently
    retried; a cache error prints a hint and is retried as well. A falsy
    result also triggers another attempt.
    """
    while True:
        task_id = random.choice(tasks_list)
        try:
            candidate = func_timeout(5, openml.tasks.get_task, args=[task_id])
        except openml.exceptions.OpenMLCacheException:
            print("Please connect the cache volume")
            continue
        except FunctionTimedOut:
            continue
        if candidate:
            return candidate
def get_x():
    """Collect at least 100 labeled run records from random OpenML tasks.

    For every task returned by ``get_task`` the task's runs are shuffled and
    up to 20 of them are inspected, each exactly once. A run contributes a
    record when its flow id is a key of ``labeled_flows`` and it carries a
    ``predictive_accuracy`` evaluation.

    Returns:
        list[dict]: one dict per accepted run with the keys ``"accuracy"``,
        ``"algorithm"`` (the flow's ``"label"``) and ``"task"``, merged with
        the dataset's qualities. The list holds at least 100 entries.
    """
    results = []
    while len(results) < 100:  # Loop until we actually find something useful
        random_task = get_task()
        dataset = openml.datasets.get_dataset(random_task.dataset_id)
        # A dataset without computed qualities would otherwise break the
        # dict merge below.
        qualities = dataset.qualities or {}
        task_runs = list(
            openml.runs.list_runs(task=[random_task.task_id]).values())
        random.shuffle(task_runs)
        # Walk the shuffled prefix itself: sampling without replacement, so
        # no run is downloaded (or recorded) twice for the same task pass.
        for run_summary in task_runs[:20]:
            run = openml.runs.get_run(run_summary['run_id'])
            if run.flow_id not in labeled_flows:
                continue
            evaluations = run.evaluations or {}
            if "predictive_accuracy" not in evaluations:
                # No usable target value for this run; skip it.
                continue
            results.append({
                "accuracy": evaluations["predictive_accuracy"],
                "algorithm": labeled_flows[run.flow_id]["label"],
                "task": run.task_id,
            } | qualities)
    return results
# Algorithm labels the one-hot encoder accepts for the 'algorithm' column;
# the order here is also the order of the encoded columns.
categories = ["DecisionTree", "Ensemble", "LinearRegression", "LogisticRegression", "NaiveBayes", "NeuralNetwork", "SVM", "kNearestNeighbor"]
ohe = OneHotEncoder(categories=[
    categories
])
# Sample fresh run records from OpenML (slow: performs many remote calls).
ics = get_x()
# drop=True discards the old index instead of materialising it as an 'index'
# column that would only have to be removed again.
data = pd.DataFrame(ics).drop_duplicates().reset_index(drop=True)
# Features: everything except the target and the task identifier.
X = data.drop(['accuracy', 'task'], axis=1)
y = data['accuracy']


- MaxAttributeEntropy:           imputed 61 NaNs
- MeanMutualInformation:         imputed 61 NaNs
- MaxMutualInformation:          imputed 61 NaNs
- MeanAttributeEntropy:          imputed 61 NaNs
- EquivalentNumberOfAtts:        imputed 61 NaNs
- MeanNoiseToSignalRatio:        imputed 61 NaNs
- MaxSkewnessOfNumericAtts:      imputed 8 NaNs
- MeanKurtosisOfNumericAtts:     imputed 8 NaNs
- MaxKurtosisOfNumericAtts:      imputed 8 NaNs
- MinAttributeEntropy:           imputed 61 NaNs
- MeanSkewnessOfNumericAtts:     imputed 8 NaNs
- MinKurtosisOfNumericAtts:      imputed 8 NaNs
- MinMutualInformation:          imputed 61 NaNs
- MeanMeansOfNumericAtts:        imputed 8 NaNs
- MaxStdDevOfNumericAtts:        imputed 8 NaNs
- MaxMeansOfNumericAtts:         imputed 8 NaNs
- MinMeansOfNumericAtts:         imputed 8 NaNs
- MeanStdDevOfNumericAtts:       imputed 8 NaNs
- Quartile1AttributeEntropy:     imputed 61 NaNs
- MinStdDevOfNumericAtts:        imputed 8 NaNs
- Quartile1MutualInformation:  

In [64]:
# One-hot encode the algorithm label of each row; the resulting columns are
# named after `categories`.
algo_column = X[["algorithm"]].to_numpy()
algo_one_hot = ohe.fit_transform(algo_column).toarray()
encoded_algo = pd.DataFrame(algo_one_hot, columns=categories)

# Swap the raw 'algorithm' column for its one-hot representation.
X_mod = pd.concat([X.drop(columns=['algorithm']), encoded_algo], axis=1)

# Fill in missing values.
imputer = NaNImputer(conservative=True)
X_mod = imputer.impute(X_mod)

# Standardise every column in place.
# NOTE(review): the scaler (and the encoder above) are fitted on this freshly
# sampled data - confirm this matches the preprocessing used when
# 'best_model.h5' was trained.
feature_columns = X_mod.columns
X_mod[feature_columns] = StandardScaler().fit_transform(X_mod[feature_columns])
X_mod

NaNImputer(conservative = True, n_feats = 10,            
           fix_string_nans = True, verbose = True,                
           multiprocessing_load = 3, fill_nans_in_pure_text = True,                    
           drop_empty_cols = True, drop_nan_cols_with_constant = True                        
           feature_selection = correlation)

Dataset dimensions:
 - rows:         83
 - columns:      115
 - mb in memory: 0.07
 - NaN cols num: 83
--------------------------

Deploy multiprocessing with 12 parallel proceses



  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index


- CfsSubsetEval_NaiveBayesErrRate: imputed 14 NaNs
- CfsSubsetEval_DecisionStumpErrRate: imputed 14 NaNs
- CfsSubsetEval_kNN1NErrRate:    imputed 14 NaNs
- DecisionStumpAUC:              imputed 14 NaNs
- DecisionStumpErrRate:          imputed 14 NaNs
- CfsSubsetEval_DecisionStumpAUC: imputed 14 NaNs
- CfsSubsetEval_NaiveBayesAUC:   imputed 14 NaNs
- CfsSubsetEval_kNN1NAUC:        imputed 14 NaNs
- CfsSubsetEval_NaiveBayesKappa: imputed 14 NaNs
- CfsSubsetEval_kNN1NKappa:      imputed 14 NaNs
- CfsSubsetEval_DecisionStumpKappa: imputed 14 NaNs
- DecisionStumpKappa:            imputed 14 NaNs
- J48.00001.ErrRate:             imputed 14 NaNs
- J48.00001.AUC:                 imputed 14 NaNs
- J48.0001.ErrRate:              imputed 14 NaNs
- J48.0001.AUC:                  imputed 14 NaNs
- J48.001.ErrRate:               imputed 14 NaNs
- J48.001.AUC:                   imputed 14 NaNs
- J48.00001.Kappa:               imputed 14 NaNs
- J48.0001.Kappa:                imputed 14 NaNs
- J48.001

Unnamed: 0,AutoCorrelation,CfsSubsetEval_DecisionStumpAUC,CfsSubsetEval_DecisionStumpErrRate,CfsSubsetEval_DecisionStumpKappa,CfsSubsetEval_NaiveBayesAUC,CfsSubsetEval_NaiveBayesErrRate,CfsSubsetEval_NaiveBayesKappa,CfsSubsetEval_kNN1NAUC,CfsSubsetEval_kNN1NErrRate,CfsSubsetEval_kNN1NKappa,...,kNN1NErrRate,kNN1NKappa,DecisionTree,Ensemble,LinearRegression,LogisticRegression,NaiveBayes,NeuralNetwork,SVM,kNearestNeighbor
0,-1.203293,1.079990,-0.548388,0.229579,1.079990,-0.548388,0.229579,1.079990,-0.548388,0.229579,...,-0.315026,-0.018310,0.0,0.941469,-0.110432,-0.193649,-0.430946,-0.348743,-0.326599,-0.253185
1,-1.203293,1.079990,-0.548388,0.229579,1.079990,-0.548388,0.229579,1.079990,-0.548388,0.229579,...,-0.315026,-0.018310,0.0,-1.062170,-0.110432,-0.193649,2.320477,-0.348743,-0.326599,-0.253185
2,-1.203293,1.079990,-0.548388,0.229579,1.079990,-0.548388,0.229579,1.079990,-0.548388,0.229579,...,-0.315026,-0.018310,0.0,0.941469,-0.110432,-0.193649,-0.430946,-0.348743,-0.326599,-0.253185
3,-1.203293,1.079990,-0.548388,0.229579,1.079990,-0.548388,0.229579,1.079990,-0.548388,0.229579,...,-0.315026,-0.018310,0.0,0.941469,-0.110432,-0.193649,-0.430946,-0.348743,-0.326599,-0.253185
4,-1.203293,1.079990,-0.548388,0.229579,1.079990,-0.548388,0.229579,1.079990,-0.548388,0.229579,...,-0.315026,-0.018310,0.0,-1.062170,-0.110432,-0.193649,-0.430946,2.867442,-0.326599,-0.253185
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,0.610032,-0.789386,0.467066,-0.775682,-0.789386,0.467066,-0.775682,-0.789386,0.467066,-0.775682,...,0.192990,-0.611348,0.0,-1.062170,-0.110432,-0.193649,2.320477,-0.348743,-0.326599,-0.253185
79,0.610032,-0.789386,0.467066,-0.775682,-0.789386,0.467066,-0.775682,-0.789386,0.467066,-0.775682,...,0.192990,-0.611348,0.0,-1.062170,-0.110432,-0.193649,2.320477,-0.348743,-0.326599,-0.253185
80,0.610032,-0.789386,0.467066,-0.775682,-0.789386,0.467066,-0.775682,-0.789386,0.467066,-0.775682,...,0.192990,-0.611348,0.0,0.941469,-0.110432,-0.193649,-0.430946,-0.348743,-0.326599,-0.253185
81,0.610032,-0.789386,0.467066,-0.775682,-0.789386,0.467066,-0.775682,-0.789386,0.467066,-0.775682,...,0.192990,-0.611348,0.0,0.941469,-0.110432,-0.193649,-0.430946,-0.348743,-0.326599,-0.253185


In [72]:
# Reload the saved network so this cell evaluates the weights stored on disk,
# independent of anything done to the model object in earlier cells.
best_model_loaded = tf.keras.models.load_model('best_model.h5', compile=False)
best_model_loaded.compile(
    loss='mse',
    optimizer='adam',
    metrics=[tfa.metrics.RSquare(dtype=tf.float32, y_shape=(1,))])
best_model_loaded.summary()
y_pred = best_model_loaded.predict(X_mod)
# Mean absolute error between predicted and recorded accuracy. Both sides are
# plain arrays so the reduction is vectorised and a NaN in either input still
# surfaces in the result instead of being skipped.
avg_error = np.mean(np.abs(y_pred.flatten() - y.to_numpy()))
print(f"The average error was {avg_error}")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 469)               54404     
                                                                 
 dropout (Dropout)           (None, 469)               0         
                                                                 
 dense_1 (Dense)             (None, 93)                43710     
                                                                 
 dropout_1 (Dropout)         (None, 93)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 94        
                                                                 
Total params: 98,208
Trainable params: 98,208
Non-trainable params: 0
_________________________________________________________________
The average error was 0.10399386123827278
