In [1]:
import os
import sys
import random

In [2]:
import pandas as pd
import numpy as np

In [3]:
from joblib import dump
from scipy.io import loadmat
from pathlib import Path
from dataclasses import dataclass
from sklearn.preprocessing import MinMaxScaler
from metaod.models.gen_meta_features import generate_meta_features
from metaod.models.core import MetaODClass
from metaod.models.utility import fix_nan

In [4]:
@dataclass
class Model:
    name: str
    type: str
    pars: tuple

@dataclass
class Dataset:
    name: str
    path: str

In [5]:
model_list = [Model('LODA (5, 10)', 'LODA', (5, 10)), Model('LOF (70, "euclidean")', 'LOF', (70, "euclidean"))]
    
data_list = [Dataset('Annthyroid', 'annthyroid.mat'), Dataset('Arrhythmia', 'arrhythmia.mat')]

In [6]:
script_directory = os.path.dirname(os.path.abspath('__file__'))
excel_file_path = os.path.join(script_directory, '..', 'data', 'performance_table.xlsx')
perf_df = pd.read_excel(excel_file_path, sheet_name='AP')

print(perf_df)

         Data  Index   #Samples   # Dimensions   Outlier Perc  LODA (5, 10)  \
0  Annthyroid      1       1986             15         5.0352       0.11314   
1  Arrhythmia      1       1986             15         5.0352       0.09884   

   LOF (70, 'euclidean')      Max   97% Max  tolerance  
0                 0.0906  0.11314  0.109746       0.03  
1                 0.0745  0.09884  0.095875       0.03  


In [7]:
perf_mat = perf_df.to_numpy()
print(perf_mat)

perf_mat_red = fix_nan(perf_mat[:, 4:].astype('float'))
print(perf_mat_red)

[['Annthyroid' 1 1986 15 5.0352 0.11314 0.0906 0.11314 0.1097458 0.03]
 ['Arrhythmia' 1 1986 15 5.0352 0.09884 0.0745 0.09884 0.0958748 0.03]]
[[5.0352    0.11314   0.0906    0.11314   0.1097458 0.03     ]
 [5.0352    0.09884   0.0745    0.09884   0.0958748 0.03     ]]


In [8]:
n_datasets, n_configs = perf_mat.shape[0], perf_mat.shape[1]
print('num_datasets:', n_datasets, '\nnum_configs:', n_configs)

data_headers = perf_mat[:, 0]
config_headers = perf_df.columns[4:]
dump(config_headers, 'model_list.joblib')  

num_datasets: 2 
num_configs: 10


['model_list.joblib']

In [9]:
meta_mat = np.zeros((n_datasets, 200))

for index, dataset in enumerate(data_list):
    mat = loadmat('../data/' + dataset.path)
    X = mat['X']
    meta_mat[index, :], meta_vec_names = generate_meta_features(X)

  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])
  return_list.append(skew(x, nan_policy='omit')[0])
  return_list.append(kurtosis(x, nan_policy='omit')[0])


In [10]:
meta_scalar = MinMaxScaler()
meta_mat_transformed = meta_scalar.fit_transform(meta_mat)
meta_mat_transformed = fix_nan(meta_mat_transformed)
#dump(meta_scalar, Path('results') / 'meta_scalar.joblib')

  data_min = np.nanmin(X, axis=0)
  data_max = np.nanmax(X, axis=0)
  col_mean = np.nanmean(X, axis = 0)


In [11]:
seed = 0
full_list = list(range(n_datasets))
random.Random(seed).shuffle(full_list)
n_train = int(0.85 * n_datasets)

In [12]:
train_index = full_list[:n_train]
valid_index = full_list[n_train:]

train_set = perf_mat_red[train_index, :].astype('float64')
valid_set = perf_mat_red[valid_index, :].astype('float64')

train_meta = meta_mat_transformed[train_index, :].astype('float64')
valid_meta = meta_mat_transformed[valid_index, :].astype('float64')

train_meta[np.isnan(train_meta)] = 0
valid_meta[np.isnan(valid_meta)] = 0

In [13]:
n_components = 1
clf = MetaODClass(train_set, valid_performance=valid_set, n_factors=n_components, learning='sgd')
clf.train(n_iter=50, meta_features=train_meta, valid_meta=valid_meta, learning_rate=0.05, max_rate=0.9, min_rate=0.1, discount=1, n_steps=8)

dump(clf, Path('results')  /  str('train_' + str(seed) + '.joblib'))

  explained_variance_ = (S**2) / (n_samples - 1)


MetaOD 1 train 0.5754731722470243 valid 0.5724258331350667 learning rate 0.1
MetaOD 2 train 0.5314278466007283 valid 0.5276112879055392 learning rate 0.2142857142857143
MetaOD 3 train 0.5314278466007283 valid 0.5276112879055392 learning rate 0.3285714285714286
MetaOD 4 train 0.5314278466007283 valid 0.5276112879055392 learning rate 0.4428571428571429


  if ((self.valid_loss_[-1] - self.valid_loss_[-2]) /


MetaOD 5 train 0.5314278466007283 valid 0.5276112879055392 learning rate 0.5571428571428572
MetaOD 6 train 0.5314278466007283 valid 0.5276112879055392 learning rate 0.6714285714285715


['results/train_0.joblib']