This notebook involves the steps for hyperparameter tuning for the standard one class models



# Importing the datasets and libraries

In [None]:
#Install the pyod library
!git clone https://github.com/yzhao062/pyod.git
%cd pyod
!pip install .

In [2]:
# Import the main libraries
from sklearn import datasets, metrics
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import rc
import matplotlib.pyplot as plt
import sys
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy.spatial.distance import squareform
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from sklearn.model_selection import KFold
from sklearn import preprocessing
from numpy import percentile
import warnings
from sklearn.preprocessing import MinMaxScaler
warnings.simplefilter("ignore", UserWarning)
import seaborn as sns
from random import Random

# Import all models
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.ocsvm import OCSVM
from pyod.models.cblof import CBLOF
from sklearn.mixture import GaussianMixture

# Hyperparameters for HBOS algorithm
from hyperopt import STATUS_OK
from sklearn.model_selection import KFold
from hyperopt import tpe
from hyperopt import Trials
from hyperopt import fmin
from hyperopt import hp

  import pandas.util.testing as tm


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [15]:
#df=  pd.read_csv('/content/drive/My Drive/cocrystal_design-master/data_test/df_reduced.csv')
df= pd.read_pickle('/content/drive/My Drive/cocrystal_design-master/data_test/df_reduced.pkl')
df=df.fillna(0)

In [16]:
contamination = 0.05
class GMM(GaussianMixture):
  def __init__(self, n_components, covariance_type, random_state):
    super().__init__(n_components=n_components , covariance_type=covariance_type, random_state=random_state)

  def fit(self, X, y):
    super().fit(X, y)
    self.prob = super().score_samples(X)
    self.c = percentile(self.prob, 100 * contamination)

  def predict(self, X):
    scores = []
    proba=super().score_samples(X)
    
    scores =(proba <= self.c).astype('int').ravel()

    return scores

In [103]:
# Create the dataset
X_train=df.iloc[:1722, 1:].values
X_train_val=pd.concat([pd.DataFrame(X_train), pd.DataFrame(np.zeros(len(X_train)))], axis=1)


In [107]:
# Perform k-fold cross validation
metric=[]

def Average(lst): 
    return sum(lst) / len(lst) 

def objective(params):
  n_components, covariance_type = int(params['n_components']), str(params['covariance_type'])
  kf = KFold(n_splits = 5)
  kf.get_n_splits(X_train_val)
  
  model =  GMM(n_components= n_components, covariance_type=covariance_type, random_state=0,)
  for train, test in kf.split(X_train_val):
    train_data = np.array(X_train_val)[train]
    train_label = train_data[:,-1]
    test_data = np.array(X_train_val)[test]
    test_label = test_data[:, -1]
    train_data = np.vstack([train_data, np.hstack([train_data[:,24:], train_data[:,:24]])])
    train_label = np.concatenate([train_label, train_label])
    model.fit(train_data, np.zeros(X_train.shape[0]))
    pred_train = model.predict(train_data)
    pred_test = model.predict(test_data)
    metric.append(metrics.accuracy_score(pred_test, test_label))
    best_score = Average(metric)
    loss=1-best_score
    return {'loss':loss, 'status':STATUS_OK}

space= {'n_components':hp.quniform('n_components', 2, 6,1), 
        'covariance_type': hp.choice('covariance_type', [ 'tied' ,'full', 'spherical','diag'])}
        
# Hyperopt settings
num_leaves = {'num_leaves': hp.quniform('num_leaves', 30, 150, 1)}
learning_rate={'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.01))}
tpe_algorithm =tpe.suggest
bayes_trials =Trials()
MAX_EVALS=50
rstate = np.random.RandomState(0)
# Print the hyperparameters that minimize the loss function
gmm_best = fmin(objective, space=space, algo=tpe.suggest, max_evals=MAX_EVALS, trials= bayes_trials, rstate = rstate)
print(gmm_best)

100%|██████████| 50/50 [00:06<00:00,  7.26it/s, best loss: 0.17753623188405787]
{'covariance_type': 3, 'n_components': 4.0}


In [108]:
#  Hyperparameters for HBOS algorithm

metric=[]

def objective(params):
  n_bins, alpha= int(params['n_bins']), float(params['alpha'])#, float(params['tol'])
  #n_components,  covariance_type= int(params['n_components']), str(params['covariance_type']) #int(params['n_clusters']) , int(params['beta']), (params['alpha'])
  kf = KFold(n_splits = 5)
  kf.get_n_splits(X_train_val)
  
  model = HBOS(contamination=0.05, n_bins= n_bins, alpha=alpha)
  for train, test in kf.split(X_train_val):
    train_data = np.array(X_train_val)[train]
    train_label = train_data[:,-1]
    test_data = np.array(X_train_val)[test]
    test_label = test_data[:, -1]
    train_data = np.vstack([train_data, np.hstack([train_data[:,24:], train_data[:,:24]])])
    train_label = np.concatenate([train_label, train_label])
    model.fit(train_data)
    pred_train = model.predict(train_data)
    pred_test = model.predict(test_data)
    metric.append(metrics.accuracy_score(pred_test, test_label))
    best_score = Average(metric)
    loss=1-best_score
    return {'loss':loss, 'status':STATUS_OK}

space= {'n_bins': hp.quniform('n_bins', 10, 20, 1), 
   'alpha': hp.quniform('alpha', 0.2, 0.9, 0.1)}
   
# Hyperopt settings
num_leaves = {'num_leaves': hp.quniform('num_leaves', 30, 150, 1)}
learning_rate={'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.01))}
tpe_algorithm =tpe.suggest
bayes_trials =Trials()
MAX_EVALS=50
rstate = np.random.RandomState(0)

# Print the hyperparameters that minimize the loss function
hbos_best = fmin(objective, space=space, algo=tpe.suggest, max_evals=MAX_EVALS, trials= bayes_trials, rstate = rstate)
print('Best HBOS hyperparemeters:', hbos_best)

100%|██████████| 50/50 [00:01<00:00, 33.64it/s, best loss: 0.42028985507246375]
Best HBOS hyperparemeters: {'alpha': 0.7000000000000001, 'n_bins': 15.0}


In [109]:
# Hyperparameters for OCSVM algorithm

metric=[]

def objective(params):
  nu, degree, gamma=  float(params['nu']) , int(params['degree']),float(params['gamma'])
  kf = KFold(n_splits = 5)
  kf.get_n_splits(X_train_val)
  
  model =  OCSVM(contamination=0.05, kernel='rbf' , nu= nu, degree=degree, gamma=gamma)

  for train, test in kf.split(X_train_val):
    train_data = np.array(X_train_val)[train]
    train_label = train_data[:,-1]
    test_data = np.array(X_train_val)[test]
    test_label = test_data[:, -1]
    train_data = np.vstack([train_data, np.hstack([train_data[:,24:], train_data[:,:24]])])
    train_label = np.concatenate([train_label, train_label])
    model.fit(train_data)
    pred_train = model.predict(train_data)
    pred_test = model.predict(test_data)
    metric.append(metrics.accuracy_score(pred_test, test_label))
    best_score = Average(metric)
    loss=1-best_score
    return {'loss':loss, 'status':STATUS_OK}


space= {'nu': hp.quniform('nu', 0.1, 0.9, 0.1),
        'degree': hp.quniform('degree', 2, 10, 1) , 
        'gamma' : hp.quniform('gamma', 1, 10, 1)}

# Hyperopt settings
num_leaves = {'num_leaves': hp.quniform('num_leaves', 30, 150, 1)}
learning_rate={'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.01))}
tpe_algorithm =tpe.suggest
bayes_trials =Trials()
MAX_EVALS=50
rstate = np.random.RandomState(0)
 
# Print the hyperparameters that minimize the loss function
ocsvm_best= fmin(objective, space=space, algo=tpe.suggest, max_evals=MAX_EVALS, trials= bayes_trials, rstate = rstate)
print('Best OCSVM hyperparemeters', ocsvm_best)

100%|██████████| 50/50 [01:07<00:00,  1.35s/it, best loss: 0.15652173913043477]
Best OCSVM hyperparemeters {'degree': 10.0, 'gamma': 7.0, 'nu': 0.5}


In [114]:
# Hyperparameters for Feature Bagging algorithm

metric=[]
def objective(params):
  n_neighbors = int(params['n_neighbors'])
  kf = KFold(n_splits = 5)
  kf.get_n_splits(X_train_val)
  
  model = FeatureBagging(LOF(n_neighbors=n_neighbors), contamination=0.05, random_state=0)
  for train, test in kf.split(X_train_val):
    train_data = np.array(X_train_val)[train]
    train_label = train_data[:,-1]
    test_data = np.array(X_train_val)[test]
    test_label = test_data[:, -1]
    train_data = np.vstack([train_data, np.hstack([train_data[:,24:], train_data[:,:24]])])
    train_label = np.concatenate([train_label, train_label])
    model.fit(train_data)
    pred_train = model.predict(train_data)
    pred_test = model.predict(test_data)
    metric.append(metrics.accuracy_score(pred_test, test_label))
    best_score = Average(metric)
    loss=1-best_score
    return {'loss':loss, 'status':STATUS_OK}

space= {'n_neighbors':hp.quniform('n_neighbors', 1, 15, 1)}

# Hyperopt settings
num_leaves = {'num_leaves': hp.quniform('num_leaves', 30, 150, 1)}
learning_rate={'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.01))}
tpe_algorithm =tpe.suggest
bayes_trials =Trials()
MAX_EVALS=50
rstate = np.random.RandomState(0)

# Print the hyperparameters that minimize the loss function
featbag_best = fmin(objective, space=space, algo=tpe.suggest, max_evals=MAX_EVALS, trials= bayes_trials, rstate = rstate)
print('Best Feature Bagging hyperparemeters:' , featbag_best)

100%|██████████| 50/50 [03:02<00:00,  3.64s/it, best loss: 0.08695652173913049]
Best Feature Bagging hyperparemeters: {'n_neighbors': 8.0}


In [73]:
# Hyperparameters for kNN algorithm

metric=[]
def objective(params):
  method, n_neighbors = str(params['methods']) , int(params['n_neighbors'])
  kf = KFold(n_splits = 5)
  kf.get_n_splits(X_train_val)
  model =  KNN(contamination=0.05, method=method, n_neighbors=n_neighbors) 
  LOF(n_neighbors=n_neighbors, contamination=0.05)
  for train, test in kf.split(X_train_val):
    train_data = np.array(X_train_val)[train]
    train_label = train_data[:,-1]
    test_data = np.array(X_train_val)[test]
    test_label = test_data[:, -1]
    train_data = np.vstack([train_data, np.hstack([train_data[:,24:], train_data[:,:24]])])
    train_label = np.concatenate([train_label, train_label])
    model.fit(train_data)
    pred_train = model.predict(train_data)
    pred_test = model.predict(test_data)
    metric.append(metrics.accuracy_score(pred_test, test_label))
    best_score = Average(metric)
    loss=1-best_score
    return {'loss':loss, 'status':STATUS_OK}

num_leaves = {'num_leaves': hp.quniform('num_leaves', 30, 150, 1)}
learning_rate={'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.01))}

space= { 'methods': hp.choice('methods', ['largest' ,'median', 'mean']),
    'n_neighbors':hp.quniform('n_neighbors', 5, 30, 1)}

# Hyperopt settings
num_leaves = {'num_leaves': hp.quniform('num_leaves', 30, 150, 1)}
learning_rate={'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.01))}
tpe_algorithm =tpe.suggest
bayes_trials =Trials()
MAX_EVALS=50
rstate = np.random.RandomState(0)

# Print the hyperparameters that minimize the loss function
knn_best = fmin(objective, space=space, algo=tpe.suggest, max_evals=MAX_EVALS, trials= bayes_trials, rstate = rstate)
print('Best kNN hyperparemeters:', knn_best)

100%|██████████| 50/50 [00:50<00:00,  1.00s/it, best loss: 0.10434782608695647]
Best kNN hyperparemeters: {'methods': 2, 'n_neighbors': 17.0}


In [72]:
# Hyperparameters for CBLOF algorithm

metric=[]
def objective(params):
  n_clusters,beta = int(params['n_clusters']),  int(params['beta'])  #alpha, float(params['alpha']),
  kf = KFold(n_splits = 5)
  kf.get_n_splits(X_train_val)
  
  model = CBLOF(contamination=0.05, n_clusters=n_clusters,  beta=beta, random_state=0) #alpha=alpha,
  for train, test in kf.split(X_train_val):
    train_data = np.array(X_train_val)[train]
    train_label = train_data[:,-1]
    test_data = np.array(X_train_val)[test]
    test_label = test_data[:, -1]
    train_data = np.vstack([train_data, np.hstack([train_data[:,24:], train_data[:,:24]])])
    train_label = np.concatenate([train_label, train_label])
    model.fit(train_data)
    pred_train = model.predict(train_data)
    pred_test = model.predict(test_data)
    metric.append(metrics.accuracy_score(pred_test, test_label))
    best_score = Average(metric)
    loss=1-best_score
    return {'loss':loss, 'status':STATUS_OK}

space= {'n_clusters':hp.quniform('n_clusters', 8, 16, 2),
    'beta': hp.quniform('beta', 2, 10 ,2)  }
    
# Hyperopt settings
num_leaves = {'num_leaves': hp.quniform('num_leaves', 30, 150, 1)}
learning_rate={'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.01))}
tpe_algorithm =tpe.suggest
bayes_trials =Trials()
MAX_EVALS=50
rstate = np.random.RandomState(0)

# Print the hyperparameters that minimize the loss function
cblof_best = fmin(objective, space=space, algo=tpe.suggest, max_evals=MAX_EVALS, trials= bayes_trials, rstate = rstate)
print('Best CBLOF hyperparemeters:', cblof_best)

100%|██████████| 50/50 [00:22<00:00,  2.19it/s, best loss: 0.230434782608696]
Best CBLOF hyperparemeters: {'beta': 4.0, 'n_clusters': 10.0}


In [116]:
# Hyperparameters for LOF algorithm

metric=[]
def objective(params):
  n_neighbors = int(params['n_neighbors'])
  #n_bins, alpha, tol = int(params['n_bins']), float(params['alpha']), float(params['tol'])
  #n_components,  covariance_type= int(params['n_components']), str(params['covariance_type']) #int(params['n_clusters']) , int(params['beta']), (params['alpha'])
  kf = KFold(n_splits = 5)
  kf.get_n_splits(X_train_val)
  
  model =  LOF(n_neighbors=n_neighbors, contamination=0.05)
  for train, test in kf.split(X_train_val):
    train_data = np.array(X_train_val)[train]
    train_label = train_data[:,-1]
    test_data = np.array(X_train_val)[test]
    test_label = test_data[:, -1]
    train_data = np.vstack([train_data, np.hstack([train_data[:,24:], train_data[:,:24]])])
    train_label = np.concatenate([train_label, train_label])
    model.fit(train_data)
    pred_train = model.predict(train_data)
    pred_test = model.predict(test_data)
    metric.append(metrics.accuracy_score(pred_test, test_label))
    best_score = Average(metric)
    loss=1-best_score
    return {'loss':loss, 'status':STATUS_OK}
space= {'n_neighbors':hp.quniform('n_neighbors', 5, 25, 1)}

# Hyperopt settings
num_leaves = {'num_leaves': hp.quniform('num_leaves', 30, 150, 1)}
learning_rate={'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.01))}
tpe_algorithm =tpe.suggest
bayes_trials =Trials()
MAX_EVALS=50
rstate = np.random.RandomState(0)

# Print the hyperparameters that minimize the loss function
lof_best = fmin(objective, space=space, algo=tpe.suggest, max_evals=MAX_EVALS, trials= bayes_trials, rstate = rstate )
print('Best LOF hyperparemeters:',lof_best)

100%|██████████| 50/50 [00:29<00:00,  1.68it/s, best loss: 0.061352657004830946]
Best LOF hyperparemeters: {'n_neighbors': 10.0}


In [112]:
# Hyperparameters for Iforest algorithm

metric=[]
def objective(params):
  n_estimators = int(params['n_estimators'])
  kf = KFold(n_splits = 5)
  kf.get_n_splits(X_train_val)
  
  model =  IForest(behaviour="new", bootstrap=False, contamination=0.05, n_estimators=n_estimators,  max_features=1.0, max_samples=1000)
  for train, test in kf.split(X_train_val):
    train_data = np.array(X_train_val)[train]
    train_label = train_data[:,-1]
    test_data = np.array(X_train_val)[test]
    test_label = test_data[:, -1]
    train_data = np.vstack([train_data, np.hstack([train_data[:,24:], train_data[:,:24]])])
    train_label = np.concatenate([train_label, train_label])
    model.fit(train_data)
    pred_train = model.predict(train_data)
    pred_test = model.predict(test_data)
    metric.append(metrics.accuracy_score(pred_test, test_label))
    best_score = Average(metric)
    loss=1-best_score
    return {'loss':loss, 'status':STATUS_OK}

space= {'n_estimators':hp.quniform('n_estimators', 100, 500, 10)}

# Hyperopt settings
num_leaves = {'num_leaves': hp.quniform('num_leaves', 30, 150, 1)}
learning_rate={'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.01))}
tpe_algorithm =tpe.suggest
bayes_trials =Trials()
MAX_EVALS=50
rstate = np.random.RandomState(0)

# Print the hyperparameters that minimize the loss function
ifor_best = fmin(objective, space=space, algo=tpe.suggest, max_evals=MAX_EVALS, trials= bayes_trials, rstate = rstate )
print('Best Iforest hyperparemeters:', ifor_best)

100%|██████████| 50/50 [01:47<00:00,  2.14s/it, best loss: 0.22246376811594193]
Best Iforest hyperparemeters: {'n_estimators': 310.0}


In [118]:
classifiers = {
    'Gaussiann Mixture Model (GMM)': GMM(n_components= gmm_best['n_components'], covariance_type=gmm_best['covariance_type'], random_state=0), 
    'K Nearest Neighbors (KNN)': KNN(contamination=0.05, method=knn_best['methods'], n_neighbors= knn_best['n_neighbors'] ),
    'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=0.05, n_bins=hbos_best['n_bins'], alpha=hbos_best['alpha']),
    'Feature Bagging':
     FeatureBagging(LOF(n_neighbors=featbag_best['n_neighbors']), contamination=0.05),
    'Isolation Forest': IForest(behaviour="new", bootstrap=False, contamination=0.05, n_estimators=ifor_best['n_estimators'],  max_features=1.0, max_samples=1000), 
    'One class SVM (OCSVM)': OCSVM(contamination=0.05, kernel='rbf' , nu=ocsvm_best['nu'] , degree=ocsvm_best['degree'], gamma=ocsvm_best['gamma']),
    'Local Outlier Factor (LOF)':
       LOF(n_neighbors=lof_best['n_neighbors'], contamination=0.05),
     'CBLOF':   CBLOF(contamination=0.05,  beta=cblof_best['beta'], n_clusters=cblof_best['n_clusters'])
}

In [119]:
classifiers

{'CBLOF': CBLOF(alpha=0.9, beta=4.0, check_estimator=False, clustering_estimator=None,
    contamination=0.05, n_clusters=10.0, n_jobs=1, random_state=None,
    use_weights=False),
 'Feature Bagging': FeatureBagging(base_estimator=LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
   metric_params=None, n_jobs=1, n_neighbors=8.0, p=2),
         bootstrap_features=False, check_detector=True,
         check_estimator=False, combination='average', contamination=0.05,
         estimator_params={}, max_features=1.0, n_estimators=10, n_jobs=1,
         random_state=None, verbose=0),
 'Gaussiann Mixture Model (GMM)': GMM(covariance_type=3, n_components=4.0, random_state=0),
 'Histogram-base Outlier Detection (HBOS)': HBOS(alpha=0.7000000000000001, contamination=0.05, n_bins=15.0, tol=0.5),
 'Isolation Forest': IForest(behaviour='new', bootstrap=False, contamination=0.05,
     max_features=1.0, max_samples=1000, n_estimators=310.0, n_jobs=1,
     random_state=None, verb