In [183]:
import ast
import os
import random
import sys

In [184]:
import pandas as pd
import numpy as np

In [185]:
from joblib import dump
from scipy.io import loadmat
from pathlib import Path
from dataclasses import dataclass
from sklearn.preprocessing import MinMaxScaler
from metaod.models.gen_meta_features import generate_meta_features
from metaod.models.core import MetaODClass
from metaod.models.utility import fix_nan

In [186]:
@dataclass
class Model:
    """One detector configuration, i.e. one model column of the performance table."""
    name: str  # full column label, e.g. 'LODA (5, 10)'
    type: str  # first whitespace-separated token of the label, e.g. 'LODA'
    pars: tuple  # hyperparameters taken from the label; the annotation is not enforced at runtime

@dataclass
class Dataset:
    """A benchmark dataset: its display name plus the .mat file it is loaded from."""
    name: str  # dataset name as listed in the first column of the performance table
    path: str  # .mat file name (lower-cased dataset name + '.mat')

In [187]:
# '__file__' is a string literal here (a notebook defines no __file__), so
# abspath() resolves it against the current working directory and the head of
# the split is that working directory.
script_directory = os.path.split(os.path.abspath('__file__'))[0]
relative_parts = ('..', 'data', 'performance_table.xlsx')
excel_file_path = os.path.join(script_directory, *relative_parts)

# Read the sheet named 'AP' from the performance workbook.
perf_df = pd.read_excel(excel_file_path, sheet_name='AP')

print(perf_df)

           Data  LODA (5, 10)  LODA (5, 20)  LODA (5, 30)  LODA (5, 40)  \
0    Annthyroid        0.0593        0.0513        0.0488        0.0469   
1    Arrhythmia        0.1753        0.2161        0.2275        0.1992   
2       Breastw        0.6431        0.9723        0.7711        0.9240   
3         Glass        0.0411        0.1193        0.0574        0.0585   
4    Ionosphere        0.7711        0.5320        0.6142        0.6257   
5        Letter        0.1130        0.0740        0.0799        0.0865   
6        Lympho        0.2946        0.9107        0.1109        0.5645   
7   Mammography        0.1886        0.3117        0.1198        0.0969   
8         Mnist        0.1070        0.1589        0.1155        0.1336   
9          Musk        0.1380        0.3413        0.1997        0.4673   
10    Optdigits        0.0172        0.0160        0.0180        0.0209   
11    Pendigits        0.1309        0.1212        0.0672        0.0982   
12         Pima        0.

In [188]:
# Every column after the first one is a model configuration whose label looks
# like "LODA (5, 10)": the detector family, whitespace, then its parameters.
models = perf_df.columns[1:].to_list()

model_list = []
for model in models:
    print(model)
    # Split only once so the complete parameter text survives; a plain
    # split() would cut "(5, 10)" at the inner space and keep just "(5,".
    family, *remainder = model.split(maxsplit=1)
    pars_text = remainder[0] if remainder else ''
    if not pars_text:
        # Label without any parameter part.
        pars = ()
    else:
        try:
            # literal_eval only accepts Python literals, so it is safe on labels.
            pars = ast.literal_eval(pars_text)
        except (ValueError, SyntaxError):
            # Not a literal: keep the raw text rather than failing the cell.
            pars = pars_text
        if not isinstance(pars, tuple):
            # Normalise a single parameter to a 1-tuple to honour Model.pars.
            pars = (pars,)
    # `family` instead of `type`, which would shadow the builtin for later cells.
    model_list.append(Model(model, family, pars))

# One Dataset per row; the .mat file name is the lower-cased dataset name.
data_list = [
    Dataset(dataset_name, dataset_name.lower() + '.mat')
    for dataset_name in perf_df.iloc[:, 0]
]

LODA (5, 10)
LODA (5, 20)
LODA (5, 30)
LODA (5, 40)
LODA (5, 50)
LODA (5, 75)
LODA (5, 100)
LODA (5, 150)
LODA (5, 200)
LODA (10, 10)
LODA (10, 20)
LODA (10, 30)
LODA (10, 40)
LODA (10, 50)
LODA (10, 75)
LODA (10, 100)
LODA (10, 150)
LODA (10, 200)
LODA (15, 10)
LODA (15, 20)
LODA (15, 30)
LODA (15, 40)
LODA (15, 50)
LODA (15, 75)
LODA (15, 100)
LODA (15, 150)
LODA (15, 200)
LODA (20, 10)
LODA (20, 20)
LODA (20, 30)
LODA (20, 40)
LODA (20, 50)
LODA (20, 75)
LODA (20, 100)
LODA (20, 150)
LODA (20, 200)
LODA (25, 10)
LODA (25, 20)
LODA (25, 30)
LODA (25, 40)
LODA (25, 50)
LODA (25, 75)
LODA (25, 100)
LODA (25, 150)
LODA (25, 200)
LODA (30, 10)
LODA (30, 20)
LODA (30, 30)
LODA (30, 40)
LODA (30, 50)
LODA (30, 75)
LODA (30, 100)
LODA (30, 150)
LODA (30, 200)
Iforest (10, 0.1)
Iforest (10, 0.2)
Iforest (10, 0.3)
Iforest (10, 0.4)
Iforest (10, 0.5)
Iforest (10, 0.6)
Iforest (10, 0.7)
Iforest (10, 0.8)
Iforest (10, 0.9)
Iforest (20, 0.1)
Iforest (20, 0.2)
Iforest (20, 0.3)
Iforest (20, 0.4)
I

In [189]:
# Convert the performance table to a NumPy array. The first column still holds
# the dataset names, so the array mixes strings and floats.
perf_mat = perf_df.to_numpy()
print(perf_mat)

[['Annthyroid' 0.0593 0.0513 ... 0.2043 0.2153 0.2153]
 ['Arrhythmia' 0.1753 0.2161 ... 0.4635 0.3863 0.3863]
 ['Breastw' 0.6431 0.9723 ... 0.3921 0.4085 0.4085]
 ...
 ['Vowels' 0.0274 0.0305 ... 0.3852 0.4056 0.4056]
 ['Wbc' 0.4221 0.5957 ... 0.6497 0.601 0.601]
 ['Wine' 0.633 0.1624 ... 0.2902 0.2589 0.2589]]


In [190]:
# Drop the dataset-name column (column 0) and cast the scores to float.
raw_scores = perf_mat[:, 1:].astype('float')
# NOTE(review): fix_nan presumably imputes missing scores — confirm in metaod.models.utility.
perf_mat_red = fix_nan(raw_scores)
print(perf_mat_red)

[[0.0593 0.0513 0.0488 ... 0.2043 0.2153 0.2153]
 [0.1753 0.2161 0.2275 ... 0.4635 0.3863 0.3863]
 [0.6431 0.9723 0.7711 ... 0.3921 0.4085 0.4085]
 ...
 [0.0274 0.0305 0.0322 ... 0.3852 0.4056 0.4056]
 [0.4221 0.5957 0.2277 ... 0.6497 0.601  0.601 ]
 [0.633  0.1624 0.2572 ... 0.2902 0.2589 0.2589]]


In [191]:
# Rows of the reduced performance matrix are datasets, columns are model configurations.
n_datasets, n_configs = perf_mat_red.shape
print('num_datasets:', n_datasets, '\nnum_configs:', n_configs)

# Dataset names live in the first column of the raw matrix.
data_headers = perf_mat[:, 0]
# Configuration names start at column 1, the same offset used to slice
# perf_mat_red. An offset of 4 would silently drop the first three
# configurations and leave the headers misaligned with the score columns.
config_headers = perf_df.columns[1:]
assert len(config_headers) == n_configs, 'config_headers is out of sync with perf_mat_red'
#dump(config_headers, 'model_list.joblib')  

num_datasets: 22 
num_configs: 207


In [192]:
# One row of meta-features per dataset; each row has 200 entries.
n_meta_features = 200
meta_mat = np.zeros((len(data_list), n_meta_features))

for row, dataset in enumerate(data_list):
    print(dataset)
    # Each .mat file stores its feature matrix under the key 'X'.
    X = loadmat('../data/datasets/' + dataset.path)['X']
    # generate_meta_features returns the feature vector and the feature names;
    # the names from the last dataset are kept for later cells.
    meta_vec, meta_vec_names = generate_meta_features(X)
    meta_mat[row, :] = meta_vec

Dataset(name='Annthyroid', path='annthyroid.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Arrhythmia', path='arrhythmia.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Breastw', path='breastw.mat')
Dataset(name='Glass', path='glass.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  quant_coeff_disp = (q75 - q25) / (q75 + q25)
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Ionosphere', path='ionosphere.mat')
Dataset(name='Letter', path='letter.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  quant_coeff_disp = (q75 - q25) / (q75 + q25)
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Lympho', path='lympho.mat')
Dataset(name='Mammography', path='mammography.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  b2 = skew(a, axis)
  b2 = kurtosis(a, axis, fisher=False)
  moment_5 = moment(X, moment=5)
  moment_6 = moment(X, moment=6)
  moment_7 = moment(X, moment=7)
  moment_8 = moment(X, moment=8)
  moment_9 = moment(X, moment=9)
  moment_10 = moment(X, moment=10)


Dataset(name='Mnist', path='mnist.mat')


  skewness_list = skew(X).reshape(-1, 1)
  kurtosis_list = kurtosis(X)
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Musk', path='musk.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  quant_coeff_disp = (q75 - q25) / (q75 + q25)


Dataset(name='Optdigits', path='optdigits.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  quant_coeff_disp = (q75 - q25) / (q75 + q25)
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Pendigits', path='pendigits.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  quant_coeff_disp = (q75 - q25) / (q75 + q25)
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Pima', path='pima.mat')
Dataset(name='Satellite', path='satellite.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Satimage-2', path='satimage-2.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  sample_range = sample_max - sample_min
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Shuttle', path='shuttle.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Speech', path='speech.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Thyroid', path='thyroid.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Vertebral', path='vertebral.mat')
Dataset(name='Vowels', path='vowels.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  quant_coeff_disp = (q75 - q25) / (q75 + q25)
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Wbc', path='wbc.mat')
Dataset(name='Wine', path='wine.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


In [193]:
# Show each dataset's meta-features as a {feature name: value} mapping.
for i in range(meta_mat.shape[0]):
    # Named `meta_features` rather than `dict`: rebinding `dict` at notebook
    # scope would break every later cell that calls the builtin.
    meta_features = {
        meta_feature: meta_mat[i, j]
        for j, meta_feature in enumerate(meta_vec_names)
    }
    print(data_list[i].name, meta_features)

Annthyroid {'n_samples': 7200.0, 'n_features': 6.0, 'sample_mean': 0.14430651111111112, 'sample_median': 0.095, 'sample_var': 0.03669061941821191, 'sample_min': 0.0, 'sample_max': 0.97, 'sample_std': 0.19154795592282345, 'q1': 0.0, 'q25': 1e-05, 'q75': 3e-05, 'q99': 5e-05, 'iqr': 1.9999999999999998e-05, 'normalized_mean': 0.14876959908361972, 'normalized_median': 0.0979381443298969, 'sample_range': 0.97, 'sample_gini': 0.5989186931249482, 'med_abs_dev': 0.069, 'avg_abs_dev': 0.12864815667901233, 'quant_coeff_disp': 0.4999999999999999, 'coeff_var': 0.25425477433905513, 'percent_outliers_15iqr': 5.932916666666666, 'percent_outliers_3iqr': 5.913611111111111, 'percent_outliers_1_99': 5.932916666666666, 'percent_outliers_3std': 0.17888888888888888, 'has_outliers_15iqr': 1.0, 'has_outliers_3iqr': 1.0, 'has_outliers_1_99': 1.0, 'has_outliers_3std': 1.0, 'normality_p_min': 0.0, 'normality_p_max': 1.1941469202585856e-202, 'normality_p_mean': 1.9902448670976426e-203, 'normality_p_std': 0.0, 'nor

In [194]:
# Report how many meta-features are NaN for every dataset.
print('Num of NaN in metafeatures of dataset')
nan_counts = np.isnan(meta_mat).sum(axis=1)  # one count per row / dataset
for entry, nan_count in zip(data_list, nan_counts):
    print(f'{entry.name}: {nan_count}')

Num of NaN in metafeatures of dataset
Annthyroid: 12
Arrhythmia: 6
Breastw: 12
Glass: 11
Ionosphere: 6
Letter: 9
Lympho: 8
Mammography: 14
Mnist: 8
Musk: 10
Optdigits: 8
Pendigits: 9
Pima: 11
Satellite: 10
Satimage-2: 10
Shuttle: 12
Speech: 6
Thyroid: 14
Vertebral: 8
Vowels: 10
Wbc: 11
Wine: 6


In [195]:
# Rescale every meta-feature column with a min-max scaler, then pass the
# result through fix_nan. The fitted scaler is kept in `meta_scalar`.
# NOTE(review): fix_nan presumably fills the remaining NaNs — confirm in metaod.models.utility.
meta_scalar = MinMaxScaler()
meta_mat_transformed = fix_nan(meta_scalar.fit_transform(meta_mat))

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  col_mean = np.nanmean(X, axis = 0)


In [196]:
# Fixed seed so the dataset shuffle, and therefore the split, is repeatable.
seed = 0
full_list = [*range(n_datasets)]
print(full_list)

# Shuffle in place with a dedicated generator instead of the global one.
shuffler = random.Random(seed)
shuffler.shuffle(full_list)

# 85% of the datasets (rounded down) go to training.
n_train = int(0.85 * n_datasets)
print('n_train:', n_train)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
n_train: 18


In [197]:
# The first n_train shuffled indices form the training split, the rest validation.
train_index, valid_index = full_list[:n_train], full_list[n_train:]

# Performance rows for each split.
train_set = perf_mat_red[train_index, :].astype('float64')
valid_set = perf_mat_red[valid_index, :].astype('float64')
print('train_set:\n', train_set, '\nvalid_set:\n', valid_set)

# Matching rows of the scaled meta-feature matrix.
train_meta = meta_mat_transformed[train_index, :].astype('float64')
valid_meta = meta_mat_transformed[valid_index, :].astype('float64')

# Zero out any NaN that is still present, in place.
for split_meta in (train_meta, valid_meta):
    split_meta[np.isnan(split_meta)] = 0

train_set:
 [[0.0172 0.016  0.018  ... 0.0206 0.0201 0.0201]
 [0.4221 0.5957 0.2277 ... 0.6497 0.601  0.601 ]
 [0.0886 0.0891 0.0924 ... 0.0877 0.087  0.087 ]
 ...
 [0.633  0.1624 0.2572 ... 0.2902 0.2589 0.2589]
 [0.4371 0.9198 0.0683 ... 0.1194 0.1216 0.1216]
 [0.0184 0.0191 0.0183 ... 0.0202 0.019  0.019 ]] 
valid_set:
 [[0.107  0.1589 0.1155 0.1336 0.1235 0.1393 0.3066 0.0996 0.0695 0.1788
  0.2214 0.1316 0.0969 0.0723 0.1458 0.0664 0.135  0.1981 0.0681 0.1111
  0.2371 0.1294 0.0706 0.2258 0.2243 0.14   0.6    0.1402 0.0901 0.1442
  0.198  0.1841 0.1061 0.3955 0.1774 0.2721 0.1188 0.1118 0.074  0.0822
  0.1894 0.1969 0.1015 0.2022 0.3189 0.2417 0.0836 0.1132 0.1087 0.1869
  0.283  0.1787 0.2336 0.2625 0.1284 0.2655 0.351  0.1452 0.229  0.1552
  0.2185 0.2748 0.2338 0.2081 0.1444 0.2902 0.1529 0.3558 0.2315 0.2712
  0.2948 0.2588 0.1823 0.1495 0.1546 0.2182 0.2369 0.2726 0.3357 0.2247
  0.1905 0.1839 0.2562 0.2564 0.1928 0.2693 0.1976 0.1858 0.2412 0.3209
  0.2196 0.2858 0.1971 0.27

In [198]:
# Value passed to MetaODClass as n_factors.
n_components = 15

clf = MetaODClass(
    train_set,
    valid_performance=valid_set,
    n_factors=n_components,
    learning='sgd',
)
# Kept as the last expression so the notebook displays the object train() returns.
clf.train(
    n_iter=50,
    meta_features=train_meta,
    valid_meta=valid_meta,
    learning_rate=0.05,
    max_rate=0.9,
    min_rate=0.1,
    discount=1,
    n_steps=8,
)

MetaOD 1 train 0.8827162031919586 valid 0.9463783391452918 learning rate 0.1


  if ((self.valid_loss_[-1] - self.valid_loss_[-2]) /


MetaOD 2 train 0.9033055303309199 valid 0.9522634774064296 learning rate 0.2142857142857143
MetaOD 3 train 0.9264223713938442 valid 0.9581081959851447 learning rate 0.3285714285714286
MetaOD 4 train 0.9433329287505272 valid 0.959708867984202 learning rate 0.4428571428571429
MetaOD 5 train 0.9545935097168914 valid 0.956777826610069 learning rate 0.5571428571428572
MetaOD 6 train 0.9472664923333133 valid 0.9511932597453178 learning rate 0.6714285714285715
MetaOD 7 train 0.9507412602451814 valid 0.9526417230555485 learning rate 0.7857142857142858
MetaOD 8 train 0.95213541645326 valid 0.9520825640655636 learning rate 0.9
MetaOD 9 train 0.9534469833981993 valid 0.9550003351817873 learning rate 0.9
MetaOD 10 train 0.954447945442458 valid 0.9557921461717286 learning rate 0.7857142857142858
MetaOD 11 train 0.9543879287768205 valid 0.9559778500017094 learning rate 0.6714285714285715
MetaOD 12 train 0.9547827357468148 valid 0.9560358016832335 learning rate 0.5571428571428572
MetaOD 13 train 0.95

<metaod.models.core.MetaODClass at 0x151477e50>