In [97]:
import os
import sys
import random

In [98]:
import pandas as pd
import numpy as np

In [99]:
from joblib import dump
from scipy.io import loadmat
from pathlib import Path
from dataclasses import dataclass
from sklearn.preprocessing import MinMaxScaler
from metaod.models.gen_meta_features import generate_meta_features
from metaod.models.core import MetaODClass
from metaod.models.utility import fix_nan

In [100]:
@dataclass
class Model:
    """Record describing a single detector configuration.

    Instances are built positionally in ``model_list`` as
    ``Model(name, type, pars)``.
    """
    # Human-readable label of the configuration, e.g. 'LODA (5, 10)'.
    name: str
    # Detector family the configuration belongs to, e.g. 'LODA' or 'LOF'.
    type: str
    # Hyper-parameter values of the configuration, e.g. (5, 10).
    pars: tuple

@dataclass
class Dataset:
    """Record describing one benchmark dataset.

    Instances are built positionally in ``data_list`` as
    ``Dataset(name, path)``.
    """
    # Human-readable label of the dataset, e.g. 'Annthyroid'.
    name: str
    # File name of the dataset, e.g. 'annthyroid.mat'; the loading cell
    # prefixes it with '../data/'.
    path: str

In [101]:
# Detector configurations under study: label, detector family, hyper-parameters.
# NOTE(review): the LOF label embeds double quotes, whereas the performance
# table prints its column header with single quotes -- confirm the labels are
# never used to look up table columns.
model_list = [
    Model(name='LODA (5, 10)', type='LODA', pars=(5, 10)),
    Model(name='LOF (70, "euclidean")', type='LOF', pars=(70, "euclidean")),
]

# Benchmark datasets: label and .mat file name.
data_list = [
    Dataset(name='Annthyroid', path='annthyroid.mat'),
    Dataset(name='Arrhythmia', path='arrhythmia.mat'),
]

In [102]:
# Resolve the performance table relative to the notebook's working directory.
# A notebook defines no `__file__`; the previous code passed the *string*
# '__file__' to abspath/dirname, which only ever resolved to the current
# working directory. os.getcwd() states that intent directly and yields the
# same directory.
script_directory = os.getcwd()
excel_file_path = os.path.join(script_directory, '..', 'data', 'performance_table.xlsx')

# Read the 'AP' sheet (presumably average-precision scores -- confirm with the
# workbook): one row per dataset, one column per model configuration.
perf_df = pd.read_excel(excel_file_path, sheet_name='AP')

print(perf_df)

         Data  LODA (5, 10)  LOF (70, 'euclidean')
0  Annthyroid       0.11314                 0.0906
1  Arrhythmia       0.09884                 0.0745


In [104]:
# Raw table as an array: column 0 carries the dataset names, the remaining
# columns carry the scores.
perf_mat = perf_df.to_numpy()
print(perf_mat)

# Drop the name column and cast the scores to float before handing them to
# fix_nan (MetaOD utility; see its definition for how NaN entries are treated).
score_columns = perf_mat[:, 1:]
perf_mat_red = fix_nan(score_columns.astype('float'))
print(perf_mat_red)

[['Annthyroid' 0.11314 0.0906]
 ['Arrhythmia' 0.09884 0.0745]]
[[0.11314 0.0906 ]
 [0.09884 0.0745 ]]


In [108]:
# Table dimensions: rows are datasets, columns are model configurations.
n_datasets, n_configs = perf_mat_red.shape
print('num_datasets:', n_datasets, '\nnum_configs:', n_configs)

# Column 0 of the raw table holds the dataset names.
data_headers = perf_mat[:, 0]

# Fixed: the table has exactly one leading non-score column ('Data'), so the
# configuration names start at column 1 -- the same offset as the
# perf_mat[:, 1:] slice that produced perf_mat_red. The previous [4:] slice
# yielded an empty Index for this table.
config_headers = perf_df.columns[1:]
assert len(config_headers) == n_configs, 'config_headers must name every score column'
#dump(config_headers, 'model_list.joblib')

num_datasets: 2 
num_configs: 2


In [109]:
# One row of meta-features per dataset, filled in the order of data_list.
# NOTE(review): 200 is presumably the length of the vector returned by
# generate_meta_features -- confirm; a different length fails on assignment.
meta_mat = np.zeros(shape=(n_datasets, 200))

for index, dataset in enumerate(data_list):
    # Each .mat file is read from ../data/; its 'X' entry is the input to
    # the meta-feature generator.
    mat = loadmat(f'../data/{dataset.path}')
    X = mat['X']
    meta_vec, meta_vec_names = generate_meta_features(X)
    meta_mat[index, :] = meta_vec

  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


In [110]:
# Name -> value lookup of the meta-features of the first and second dataset.
dict_1 = {meta_feature: meta_mat[0, i] for i, meta_feature in enumerate(meta_vec_names)}
dict_2 = {meta_feature: meta_mat[1, i] for i, meta_feature in enumerate(meta_vec_names)}

print(dict_1)
print(dict_2)

# Boolean masks flagging the NaN meta-features of each dataset.
nan_mask_1 = np.isnan(meta_mat[0, :])
nan_mask_2 = np.isnan(meta_mat[1, :])
nan1, nan2 = nan_mask_1.sum(), nan_mask_2.sum()

print('\nNum of NaN in dataset 1:', nan1, '\nNum of NaN in dataset 2:', nan2, '\n')

# Column positions of the NaN entries, in ascending order.
indexes_1 = list(np.flatnonzero(nan_mask_1))
indexes_2 = list(np.flatnonzero(nan_mask_2))

# List position, name and value of every NaN meta-feature of dataset 1 ...
for i, value in enumerate(meta_vec_names):
    if i not in indexes_1:
        continue
    print(i, value, dict_1[value])

print()

# ... and of dataset 2.
for i, value in enumerate(meta_vec_names):
    if i not in indexes_2:
        continue
    print(i, value, dict_2[value])

{'n_samples': 7200.0, 'n_features': 6.0, 'sample_mean': 0.14430651111111112, 'sample_median': 0.095, 'sample_var': 0.03669061941821191, 'sample_min': 0.0, 'sample_max': 0.97, 'sample_std': 0.19154795592282345, 'q1': 0.0, 'q25': 1e-05, 'q75': 3e-05, 'q99': 5e-05, 'iqr': 1.9999999999999998e-05, 'normalized_mean': 0.14876959908361972, 'normalized_median': 0.0979381443298969, 'sample_range': 0.97, 'sample_gini': 0.5989186931249482, 'med_abs_dev': 0.069, 'avg_abs_dev': 0.12864815667901233, 'quant_coeff_disp': 0.4999999999999999, 'coeff_var': 0.25425477433905513, 'percent_outliers_15iqr': 5.932916666666666, 'percent_outliers_3iqr': 5.913611111111111, 'percent_outliers_1_99': 5.932916666666666, 'percent_outliers_3std': 0.17888888888888888, 'has_outliers_15iqr': 1.0, 'has_outliers_3iqr': 1.0, 'has_outliers_1_99': 1.0, 'has_outliers_3std': 1.0, 'normality_p_min': 0.0, 'normality_p_max': 1.1941469202585856e-202, 'normality_p_mean': 1.9902448670976426e-203, 'normality_p_std': 0.0, 'normality_p_sk

In [111]:
# Fit a MinMaxScaler (default settings) on the meta-feature matrix, rescale the
# same matrix with it, then pass the result through fix_nan.
# NOTE(review): the scaler is fitted on all datasets, including those that are
# later used for validation -- confirm this is intended.
meta_scalar = MinMaxScaler()
meta_scalar.fit(meta_mat)
meta_mat_transformed = fix_nan(meta_scalar.transform(meta_mat))
#dump(meta_scalar, Path('results') / 'meta_scalar.joblib')

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  col_mean = np.nanmean(X, axis = 0)


In [112]:
# Fixed seed so the train/validation shuffle is repeatable.
seed = 0

# Row indices of the performance table, one per dataset.
full_list = [*range(n_datasets)]
print(full_list)

# Shuffle the indices in place with a dedicated generator seeded above.
shuffler = random.Random(seed)
shuffler.shuffle(full_list)

# 85% of the datasets, truncated to an integer, are used for training.
n_train = int(0.85 * n_datasets)
print('n_train:', n_train)

[0, 1]
n_train: 1


In [115]:
# The first n_train shuffled indices form the training split; the rest are
# held out for validation.
train_index, valid_index = full_list[:n_train], full_list[n_train:]

# Performance rows of each split.
train_set = perf_mat_red[train_index, :].astype('float64')
valid_set = perf_mat_red[valid_index, :].astype('float64')
print('train_set:\n', train_set, '\nvalid_set:\n', valid_set)

# Scaled meta-feature rows of each split.
train_meta = meta_mat_transformed[train_index, :].astype('float64')
valid_meta = meta_mat_transformed[valid_index, :].astype('float64')

# Replace any remaining NaN entries with 0, in place, in both splits.
for meta_block in (train_meta, valid_meta):
    meta_block[np.isnan(meta_block)] = 0

train_set:
 [[0.11314 0.0906 ]] 
valid_set:
 [[0.09884 0.0745 ]]


In [116]:
# Value passed to MetaODClass as n_factors.
n_components = 1

# Build the MetaOD model from the training performance rows, with the held-out
# rows supplied as validation performance.
clf = MetaODClass(
    train_set,
    valid_performance=valid_set,
    n_factors=n_components,
    learning='sgd',
)

# Train on the meta-features. The call is kept as the last expression so the
# notebook echoes its return value as the cell output.
clf.train(
    n_iter=50,
    meta_features=train_meta,
    valid_meta=valid_meta,
    learning_rate=0.05,
    max_rate=0.9,
    min_rate=0.1,
    discount=1,
    n_steps=8,
)

#dump(clf, Path('results')  /  str('train_' + str(seed) + '.joblib'))

  explained_variance_ = (S**2) / (n_samples - 1)


MetaOD 1 train 0.9755762356213441 valid 0.9692028695917866 learning rate 0.1
MetaOD 2 train 0.9999999999999999 valid 0.9999999999999998 learning rate 0.2142857142857143
MetaOD 3 train 0.9999999999999999 valid 0.9999999999999998 learning rate 0.3285714285714286
MetaOD 4 train 0.9999999999999999 valid 0.9999999999999998 learning rate 0.4428571428571429
MetaOD 5 train 0.9999999999999999 valid 0.9999999999999998 learning rate 0.5571428571428572


  if ((self.valid_loss_[-1] - self.valid_loss_[-2]) /


MetaOD 6 train 0.9999999999999999 valid 0.9999999999999998 learning rate 0.6714285714285715
MetaOD 7 train 0.9999999999999999 valid 0.9999999999999998 learning rate 0.7857142857142858


<metaod.models.core.MetaODClass at 0x135fd5c90>