In [127]:
import os
import sys
import random

In [128]:
import pandas as pd
import numpy as np

In [129]:
from joblib import dump
from scipy.io import loadmat
from pathlib import Path
from dataclasses import dataclass
from sklearn.preprocessing import MinMaxScaler
from metaod.models.gen_meta_features import generate_meta_features
from metaod.models.core import MetaODClass
from metaod.models.utility import fix_nan

In [130]:
@dataclass
class Model:
    """Description of one outlier-detection model configuration.

    Instances are built in this notebook as, e.g.,
    ``Model('LODA (5, 10)', 'LODA', (5, 10))``.
    """

    # Display name of the configuration (detector plus its parameters).
    name: str
    # Detector family, e.g. 'LODA' or 'LOF'.
    type: str
    # Parameter tuple for the detector, e.g. (5, 10).
    pars: tuple

@dataclass
class Dataset:
    """A benchmark dataset referenced by name and by its data file."""

    # Dataset name as it appears in the performance table, e.g. 'Annthyroid'.
    name: str
    # File name of the dataset's .mat file, e.g. 'annthyroid.mat'.
    path: str

In [131]:
# `__file__` is not defined inside a notebook, so the "script directory" is
# simply the kernel's current working directory.
script_directory = os.getcwd()

# The performance table lives in ../data relative to that directory.
excel_file_path = os.path.join(
    script_directory, '..', 'data', 'performance_table.xlsx'
)

# Sheet 'AP': one row per dataset, one column per model configuration.
perf_df = pd.read_excel(excel_file_path, sheet_name='AP')

print(perf_df)

           Data  LODA (5, 10)  LOF (70, 'euclidean')
0    Annthyroid        0.0593                 0.1967
1    Arrhythmia        0.1753                 0.3798
2       Breastw        0.6431                 0.3453
3         Glass        0.0411                 0.1092
4    Ionosphere        0.7711                 0.8635
5        Letter        0.1130                 0.2714
6        Lympho        0.2946                 0.8012
7   Mammography        0.1886                 0.1381
8         Mnist        0.1070                 0.3401
9          Musk        0.1380                 0.0836
10    Optdigits        0.0172                 0.0222
11    Pendigits        0.1309                 0.0282
12         Pima        0.5441                 0.4686
13    Satellite        0.2217                 0.3958
14   SatImage-2        0.5139                 0.0422
15      Shuttle        0.4371                 0.1230
16       Speech        0.0184                 0.0194
17      Thyroid        0.0151                 

In [132]:
# Model configurations under study. The names are kept identical to the column
# headers of the performance table (note the single quotes around 'euclidean')
# so that a configuration can be looked up by name.
model_list = [
    Model('LODA (5, 10)', 'LODA', (5, 10)),
    Model("LOF (70, 'euclidean')", 'LOF', (70, 'euclidean')),
]

# One Dataset per row of the performance table: the first column holds the
# dataset name, and the data file is the lower-cased name plus '.mat'.
data_list = [
    Dataset(name, name.lower() + '.mat')
    for name in perf_df.iloc[:, 0]
]

print(data_list)

[Dataset(name='Annthyroid', path='annthyroid.mat'), Dataset(name='Arrhythmia', path='arrhythmia.mat'), Dataset(name='Breastw', path='breastw.mat'), Dataset(name='Glass', path='glass.mat'), Dataset(name='Ionosphere', path='ionosphere.mat'), Dataset(name='Letter', path='letter.mat'), Dataset(name='Lympho', path='lympho.mat'), Dataset(name='Mammography', path='mammography.mat'), Dataset(name='Mnist', path='mnist.mat'), Dataset(name='Musk', path='musk.mat'), Dataset(name='Optdigits', path='optdigits.mat'), Dataset(name='Pendigits', path='pendigits.mat'), Dataset(name='Pima', path='pima.mat'), Dataset(name='Satellite', path='satellite.mat'), Dataset(name='SatImage-2', path='satimage-2.mat'), Dataset(name='Shuttle', path='shuttle.mat'), Dataset(name='Speech', path='speech.mat'), Dataset(name='Thyroid', path='thyroid.mat'), Dataset(name='Vertebral', path='vertebral.mat'), Dataset(name='Vowels', path='vowels.mat'), Dataset(name='Wbc', path='wbc.mat'), Dataset(name='Wine', path='wine.mat')]


In [133]:
# Convert the performance table to a NumPy array: column 0 holds the dataset
# names, the remaining columns hold the per-configuration scores.
perf_mat = perf_df.to_numpy()
print(perf_mat)

[['Annthyroid' 0.0593 0.1967]
 ['Arrhythmia' 0.1753 0.3798]
 ['Breastw' 0.6431 0.3453]
 ['Glass' 0.0411 0.1092]
 ['Ionosphere' 0.7711 0.8635]
 ['Letter' 0.113 0.2714]
 ['Lympho' 0.2946 0.8012]
 ['Mammography' 0.1886 0.1381]
 ['Mnist' 0.107 0.3401]
 ['Musk' 0.138 0.0836]
 ['Optdigits' 0.0172 0.0222]
 ['Pendigits' 0.1309 0.0282]
 ['Pima' 0.5441 0.4686]
 ['Satellite' 0.2217 0.3958]
 ['SatImage-2' 0.5139 0.0422]
 ['Shuttle' 0.4371 0.123]
 ['Speech' 0.0184 0.0194]
 ['Thyroid' 0.0151 0.2832]
 ['Vertebral' 0.0886 0.0847]
 ['Vowels' 0.0274 0.4071]
 ['Wbc' 0.4221 0.5965]
 ['Wine' 0.633 0.3367]]


In [134]:
# Drop the dataset-name column and cast the remaining scores to float before
# passing them through MetaOD's fix_nan helper.
perf_scores = perf_mat[:, 1:].astype('float')
perf_mat_red = fix_nan(perf_scores)
print(perf_mat_red)

[[0.0593 0.1967]
 [0.1753 0.3798]
 [0.6431 0.3453]
 [0.0411 0.1092]
 [0.7711 0.8635]
 [0.113  0.2714]
 [0.2946 0.8012]
 [0.1886 0.1381]
 [0.107  0.3401]
 [0.138  0.0836]
 [0.0172 0.0222]
 [0.1309 0.0282]
 [0.5441 0.4686]
 [0.2217 0.3958]
 [0.5139 0.0422]
 [0.4371 0.123 ]
 [0.0184 0.0194]
 [0.0151 0.2832]
 [0.0886 0.0847]
 [0.0274 0.4071]
 [0.4221 0.5965]
 [0.633  0.3367]]


In [135]:
# Rows are datasets, columns are model configurations.
n_datasets, n_configs = perf_mat_red.shape
print('num_datasets:', n_datasets, '\nnum_configs:', n_configs)

# Dataset names come from the first column of the raw performance matrix.
data_headers = perf_mat[:, 0]

# The first column of the table is the dataset name; every remaining column is
# a model configuration. (Slicing from index 4 produced an empty header list
# for this three-column table.)
config_headers = perf_df.columns[1:]
assert len(config_headers) == n_configs, (
    'number of configuration headers must match the performance matrix columns'
)
#dump(config_headers, 'model_list.joblib')

num_datasets: 22 
num_configs: 2


In [136]:
# Width of the meta-feature matrix; each row receives the vector returned by
# generate_meta_features for one dataset.
n_meta_features = 200
meta_mat = np.zeros((len(data_list), n_meta_features))

for index, dataset in enumerate(data_list):
    print(dataset)
    # Each .mat file stores the feature matrix under the key 'X'.
    mat = loadmat('../data/datasets/' + dataset.path)
    X = mat['X']
    meta_vec, meta_vec_names = generate_meta_features(X)
    meta_mat[index, :] = meta_vec

Dataset(name='Annthyroid', path='annthyroid.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Arrhythmia', path='arrhythmia.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Breastw', path='breastw.mat')
Dataset(name='Glass', path='glass.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  quant_coeff_disp = (q75 - q25) / (q75 + q25)
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Ionosphere', path='ionosphere.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  quant_coeff_disp = (q75 - q25) / (q75 + q25)
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Letter', path='letter.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Lympho', path='lympho.mat')
Dataset(name='Mammography', path='mammography.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Mnist', path='mnist.mat')


  b2 = skew(a, axis)
  b2 = kurtosis(a, axis, fisher=False)
  moment_5 = moment(X, moment=5)
  moment_6 = moment(X, moment=6)
  moment_7 = moment(X, moment=7)
  moment_8 = moment(X, moment=8)
  moment_9 = moment(X, moment=9)
  moment_10 = moment(X, moment=10)
  skewness_list = skew(X).reshape(-1, 1)
  kurtosis_list = kurtosis(X)
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Musk', path='musk.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  quant_coeff_disp = (q75 - q25) / (q75 + q25)


Dataset(name='Optdigits', path='optdigits.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  quant_coeff_disp = (q75 - q25) / (q75 + q25)
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Pendigits', path='pendigits.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  quant_coeff_disp = (q75 - q25) / (q75 + q25)
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Pima', path='pima.mat')
Dataset(name='Satellite', path='satellite.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='SatImage-2', path='satimage-2.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  sample_range = sample_max - sample_min
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Shuttle', path='shuttle.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Speech', path='speech.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Thyroid', path='thyroid.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Vertebral', path='vertebral.mat')
Dataset(name='Vowels', path='vowels.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  quant_coeff_disp = (q75 - q25) / (q75 + q25)
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


Dataset(name='Wbc', path='wbc.mat')
Dataset(name='Wine', path='wine.mat')


  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


In [137]:
# Print each dataset's meta-features as a {feature name: value} mapping.
# The mapping is deliberately not called `dict`: rebinding that name would
# shadow the builtin for every later cell run in the same kernel.
for i in range(meta_mat.shape[0]):
    meta_features = {
        meta_feature: meta_mat[i, j]
        for j, meta_feature in enumerate(meta_vec_names)
    }
    print(data_list[i].name, meta_features)

Annthyroid {'n_samples': 7200.0, 'n_features': 6.0, 'sample_mean': 0.14430651111111112, 'sample_median': 0.095, 'sample_var': 0.03669061941821191, 'sample_min': 0.0, 'sample_max': 0.97, 'sample_std': 0.19154795592282345, 'q1': 0.0, 'q25': 1e-05, 'q75': 3e-05, 'q99': 5e-05, 'iqr': 1.9999999999999998e-05, 'normalized_mean': 0.14876959908361972, 'normalized_median': 0.0979381443298969, 'sample_range': 0.97, 'sample_gini': 0.5989186931249482, 'med_abs_dev': 0.069, 'avg_abs_dev': 0.12864815667901233, 'quant_coeff_disp': 0.4999999999999999, 'coeff_var': 0.25425477433905513, 'percent_outliers_15iqr': 5.932916666666666, 'percent_outliers_3iqr': 5.913611111111111, 'percent_outliers_1_99': 5.932916666666666, 'percent_outliers_3std': 0.17888888888888888, 'has_outliers_15iqr': 1.0, 'has_outliers_3iqr': 1.0, 'has_outliers_1_99': 1.0, 'has_outliers_3std': 1.0, 'normality_p_min': 0.0, 'normality_p_max': 1.1941469202585856e-202, 'normality_p_mean': 1.9902448670976426e-203, 'normality_p_std': 0.0, 'nor

In [138]:
# Report how many meta-features are NaN for each dataset (one row per dataset).
print('Num of NaN in metafeatures of dataset')
nan_per_dataset = np.isnan(meta_mat).sum(axis=1)
for entry, nan in zip(data_list, nan_per_dataset):
    print(f'{entry.name}: {nan}')

Num of NaN in metafeatures of dataset
Annthyroid: 12
Arrhythmia: 6
Breastw: 12
Glass: 11
Ionosphere: 6
Letter: 9
Lympho: 8
Mammography: 14
Mnist: 8
Musk: 10
Optdigits: 8
Pendigits: 9
Pima: 11
Satellite: 10
SatImage-2: 10
Shuttle: 12
Speech: 6
Thyroid: 14
Vertebral: 8
Vowels: 10
Wbc: 11
Wine: 6


In [139]:
# Min-max scale every meta-feature column, then pass the scaled matrix through
# MetaOD's fix_nan helper.
meta_scalar = MinMaxScaler()
meta_mat_transformed = fix_nan(meta_scalar.fit_transform(meta_mat))

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  col_mean = np.nanmean(X, axis = 0)


In [140]:
# Fixed seed so the train/validation split is reproducible.
seed = 0

# Dataset indices in their original order (printed before shuffling).
full_list = [*range(n_datasets)]
print(full_list)

# Shuffle in place with a dedicated, seeded generator.
shuffler = random.Random(seed)
shuffler.shuffle(full_list)

# 85% of the datasets go to training; the remainder is used for validation.
n_train = int(0.85 * n_datasets)
print('n_train:', n_train)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
n_train: 18


In [141]:
# The first n_train shuffled indices form the training split, the rest the
# validation split.
train_index, valid_index = full_list[:n_train], full_list[n_train:]

# Performance targets for each split.
train_set = perf_mat_red[train_index, :].astype('float64')
valid_set = perf_mat_red[valid_index, :].astype('float64')
print('train_set:\n', train_set, '\nvalid_set:\n', valid_set)

# Scaled meta-features for each split.
train_meta = meta_mat_transformed[train_index, :].astype('float64')
valid_meta = meta_mat_transformed[valid_index, :].astype('float64')

# Replace any remaining NaN values with 0, in place.
for split_meta in (train_meta, valid_meta):
    split_meta[np.isnan(split_meta)] = 0

train_set:
 [[0.0172 0.0222]
 [0.4221 0.5965]
 [0.0886 0.0847]
 [0.2946 0.8012]
 [0.0593 0.1967]
 [0.0274 0.4071]
 [0.1309 0.0282]
 [0.6431 0.3453]
 [0.0411 0.1092]
 [0.138  0.0836]
 [0.113  0.2714]
 [0.1886 0.1381]
 [0.7711 0.8635]
 [0.0151 0.2832]
 [0.5139 0.0422]
 [0.633  0.3367]
 [0.4371 0.123 ]
 [0.0184 0.0194]] 
valid_set:
 [[0.107  0.3401]
 [0.1753 0.3798]
 [0.2217 0.3958]
 [0.5441 0.4686]]


In [142]:
# Passed to MetaODClass as n_factors.
# NOTE(review): 15 exceeds the 2 configurations in the performance matrix —
# confirm this is intended.
n_components = 15

clf = MetaODClass(
    train_set,
    valid_performance=valid_set,
    n_factors=n_components,
    learning='sgd',
)

# The call is left as the cell's last expression so its return value is shown.
clf.train(
    n_iter=50,
    meta_features=train_meta,
    valid_meta=valid_meta,
    learning_rate=0.05,
    max_rate=0.9,
    min_rate=0.1,
    discount=1,
    n_steps=8,
)

MetaOD 1 train 0.9199165459881158 valid 0.9917044711487251 learning rate 0.1


  if ((self.valid_loss_[-1] - self.valid_loss_[-2]) /


MetaOD 2 train 0.9578529448843361 valid 0.9617166637828324 learning rate 0.2142857142857143
MetaOD 3 train 0.9753720018855998 valid 0.9617166637828324 learning rate 0.3285714285714286
MetaOD 4 train 0.9842863073665096 valid 0.9617166637828324 learning rate 0.4428571428571429
MetaOD 5 train 0.9886045595840199 valid 0.9617166637828324 learning rate 0.5571428571428572
MetaOD 6 train 0.9886045595840199 valid 0.9617166637828324 learning rate 0.6714285714285715


<metaod.models.core.MetaODClass at 0x16a7701d0>