This notebook involves the code for hyperparameter tuning for the standard one class models



# Importing the datasets and libraries

In [None]:
#Install the pyod library
!git clone https://github.com/yzhao062/pyod.git
%cd pyod
!pip install .

In [7]:
# Import the main libraries
from sklearn import datasets, metrics
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import rc
import matplotlib.pyplot as plt
import sys
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy.spatial.distance import squareform
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from sklearn.model_selection import KFold
from sklearn import preprocessing
from numpy import percentile
import warnings
from sklearn.preprocessing import MinMaxScaler
warnings.simplefilter("ignore", UserWarning)
import seaborn as sns
from random import Random

# Import all models
from pyod.models.cblof import CBLOF
from pyod.models.feature_bagging import FeatureBagging
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.knn import KNN
from pyod.models.lof import LOF
from pyod.models.ocsvm import OCSVM
from pyod.models.cblof import CBLOF
from sklearn.mixture import GaussianMixture

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [10]:
df=  pd.read_csv('/content/drive/My Drive/cocrystal_design-master/data_test/df_reduced.csv')
df=df.fillna(0)

In [9]:
contamination = 0.05
class GMM(GaussianMixture):
  def __init__(self, n_components, covariance_type, random_state):
    super().__init__(n_components=n_components , covariance_type=covariance_type, random_state=random_state)

  def fit(self, X, y):
    super().fit(X, y)
    self.prob = super().score_samples(X)
    self.c = percentile(self.prob, 100 * contamination)

  def predict(self, X):
    scores = []
    proba=super().score_samples(X)
    
    scores =(proba <= self.c).astype('int').ravel()

    return scores

In [None]:
from sklearn.utils import shuffle

X_train=df.iloc[:1722, 1:].values
shuffle(X_train)

In [48]:
#Find GMM hyperparameters

from hyperopt import STATUS_OK
from sklearn.model_selection import KFold
from hyperopt import tpe
from hyperopt import Trials
from hyperopt import fmin
from hyperopt import hp

# Create the dataset


# Perform k-fold cross validation
from sklearn.model_selection import KFold
X_train_val=pd.concat([pd.DataFrame(X_train), pd.DataFrame(np.zeros(len(X_train)))], axis=1)
metric=[]

def Average(lst): 
    return sum(lst) / len(lst) 

def objective(params):
  n_components, covariance_type = int(params['n_components']), str(params['covariance_type'])
  kf = KFold(n_splits = 5)
  kf.get_n_splits(X_train_val)
  
  model =  GMM(n_components= n_components, covariance_type=covariance_type, random_state=0,)
  for train, test in kf.split(X_train_val):
    train_data = np.array(X_train_val)[train]
    train_label = train_data[:,-1]
    test_data = np.array(X_train_val)[test]
    test_label = test_data[:, -1]
    train_data = np.vstack([train_data, np.hstack([train_data[:,24:], train_data[:,:24]])])
    train_label = np.concatenate([train_label, train_label])
    model.fit(train_data, np.zeros(X_train.shape[0]))
    pred_train = model.predict(train_data)
    pred_test = model.predict(test_data)
    metric.append(metrics.accuracy_score(pred_test, test_label))
    best_score = Average(metric)
    loss=1-best_score
    return {'loss':loss, 'status':STATUS_OK}

num_leaves = {'num_leaves': hp.quniform('num_leaves', 30, 150, 1)}
learning_rate={'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.01))}

space= {'n_components':hp.quniform('n_components', 1, 5, 1),
        'covariance_type': hp.choice('covariance_type', [ 'tied' ,'full','spherical','diag'])}
  
tpe_algorithm =tpe.suggest
bayes_trials =Trials()
MAX_EVALS=50
rstate = np.random.RandomState(0)
# Print the hyperparameters that minimize the loss function
gmm_best = fmin(objective, space=space, algo=tpe.suggest, max_evals=MAX_EVALS, trials= bayes_trials, rstate = rstate)
print(gmm_best)

100%|██████████| 50/50 [00:04<00:00, 10.66it/s, best loss: 0.19769602378298012]
{'covariance_type': 0, 'n_components': 4.0}


In [60]:
# Hyperparameters for HBOS algorithm
from hyperopt import STATUS_OK
from sklearn.model_selection import KFold
from hyperopt import tpe
from hyperopt import Trials
from hyperopt import fmin
from hyperopt import hp


# Perform k-fold cross validation
from sklearn.model_selection import KFold
X_train_val=pd.concat([pd.DataFrame(X_train), pd.DataFrame(np.zeros(len(X_train)))], axis=1)
metric=[]

def Average(lst): 
    return sum(lst) / len(lst) 

def objective(params):
  n_bins, alpha, tol = int(params['n_bins']), float(params['alpha']), float(params['tol'])
  #n_components,  covariance_type= int(params['n_components']), str(params['covariance_type']) #int(params['n_clusters']) , int(params['beta']), (params['alpha'])
  kf = KFold(n_splits = 5)
  kf.get_n_splits(X_train_val)
  
  model = HBOS(contamination=0.05, n_bins= n_bins, alpha=alpha, tol=tol)
  for train, test in kf.split(X_train_val):
    train_data = np.array(X_train_val)[train]
    train_label = train_data[:,-1]
    test_data = np.array(X_train_val)[test]
    test_label = test_data[:, -1]
    train_data = np.vstack([train_data, np.hstack([train_data[:,24:], train_data[:,:24]])])
    train_label = np.concatenate([train_label, train_label])
    model.fit(train_data)
    pred_train = model.predict(train_data)
    pred_test = model.predict(test_data)
    metric.append(metrics.accuracy_score(pred_test, test_label))
    best_score = Average(metric)
    loss=1-best_score
    return {'loss':loss, 'status':STATUS_OK}

num_leaves = {'num_leaves': hp.quniform('num_leaves', 30, 150, 1)}
learning_rate={'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.01))}

space= {'n_bins': hp.quniform('n_bins', 10, 20, 1), 
   'alpha': hp.quniform('alpha', 0.1, 0.9, 0.1),
    'tol': hp.quniform('tol', 0.1, 0.5, 0.1) }


# HBOS(contamination=0.05, n_bins=15, alpha=0.6), 

tpe_algorithm =tpe.suggest
bayes_trials =Trials()
MAX_EVALS=50
rstate = np.random.RandomState(0)
# Print the hyperparameters that minimize the loss function
hbos_best = fmin(objective, space=space, algo=tpe.suggest, max_evals=MAX_EVALS, trials= bayes_trials, rstate = rstate)
print('Best HBOS hyperparemeters:', hbos_best)

100%|██████████| 50/50 [00:01<00:00, 32.03it/s, best loss: 0.4346014492753624]
Best HBOS hyperparemeters: {'alpha': 0.6000000000000001, 'n_bins': 15.0, 'tol': 0.1}


In [14]:
# Hyperparameters for OCSVM algorithm
from hyperopt import STATUS_OK
from sklearn.model_selection import KFold
from hyperopt import tpe
from hyperopt import Trials
from hyperopt import fmin
from hyperopt import hp

# Perform k-fold cross validation
from sklearn.model_selection import KFold
X_train_val=pd.concat([pd.DataFrame(X_train), pd.DataFrame(np.zeros(len(X_train)))], axis=1)
metric=[]

def Average(lst): 
    return sum(lst) / len(lst) 

def objective(params):
  nu, degree, gamma=  float(params['nu']) , int(params['degree']),float(params['gamma'])
  kf = KFold(n_splits = 5)
  kf.get_n_splits(X_train_val)
  
  model =  OCSVM(contamination=0.05, kernel='rbf' , nu= nu, degree=degree, gamma=gamma)

  for train, test in kf.split(X_train_val):
    train_data = np.array(X_train_val)[train]
    train_label = train_data[:,-1]
    test_data = np.array(X_train_val)[test]
    test_label = test_data[:, -1]
    train_data = np.vstack([train_data, np.hstack([train_data[:,24:], train_data[:,:24]])])
    train_label = np.concatenate([train_label, train_label])
    model.fit(train_data)
    pred_train = model.predict(train_data)
    pred_test = model.predict(test_data)
    metric.append(metrics.accuracy_score(pred_test, test_label))
    best_score = Average(metric)
    loss=1-best_score
    return {'loss':loss, 'status':STATUS_OK}

num_leaves = {'num_leaves': hp.quniform('num_leaves', 30, 150, 1)}
learning_rate={'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.01))}

space= {'nu': hp.quniform('nu', 0.1, 0.9, 0.1),
        'degree': hp.quniform('degree', 2, 10, 1) , 
        'gamma' : hp.quniform('gamma', 1, 10, 1)}
 
tpe_algorithm =tpe.suggest
bayes_trials =Trials()
MAX_EVALS=50
rstate = np.random.RandomState(0)

# Print the hyperparameters that minimize the loss function
ocsvm_best= fmin(objective, space=space, algo=tpe.suggest, max_evals=MAX_EVALS, trials= bayes_trials, rstate = rstate)
print('Best OCSVM hyperparemeters', ocsvm_best)

100%|██████████| 50/50 [01:03<00:00,  1.28s/it, best loss: 0.15652173913043477]
Best OCSVM hyperparemeters {'degree': 10.0, 'gamma': 7.0, 'nu': 0.5}


In [71]:
# Hyperparameters for Feature Bagging algorithm
from hyperopt import STATUS_OK
from sklearn.model_selection import KFold
from hyperopt import tpe
from hyperopt import Trials
from hyperopt import fmin
from hyperopt import hp

# Perform k-fold cross validation
from sklearn.model_selection import KFold
X_train_val=pd.concat([pd.DataFrame(X_train), pd.DataFrame(np.zeros(len(X_train)))], axis=1)
metric=[]

def Average(lst): 
    return sum(lst) / len(lst) 

def objective(params):
  n_neighbors = int(params['n_neighbors'])
  kf = KFold(n_splits = 5)
  kf.get_n_splits(X_train_val)
  
  model = FeatureBagging(LOF(n_neighbors=n_neighbors), contamination=0.05, random_state=0)
  for train, test in kf.split(X_train_val):
    train_data = np.array(X_train_val)[train]
    train_label = train_data[:,-1]
    test_data = np.array(X_train_val)[test]
    test_label = test_data[:, -1]
    train_data = np.vstack([train_data, np.hstack([train_data[:,24:], train_data[:,:24]])])
    train_label = np.concatenate([train_label, train_label])
    model.fit(train_data)
    pred_train = model.predict(train_data)
    pred_test = model.predict(test_data)
    metric.append(metrics.accuracy_score(pred_test, test_label))
    best_score = Average(metric)
    loss=1-best_score
    return {'loss':loss, 'status':STATUS_OK}

num_leaves = {'num_leaves': hp.quniform('num_leaves', 30, 150, 1)}
learning_rate={'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.01))}

space= {'n_neighbors':hp.quniform('n_neighbors', 1, 15, 1)}

tpe_algorithm =tpe.suggest
bayes_trials =Trials()
MAX_EVALS=50
rstate = np.random.RandomState(0)
# Print the hyperparameters that minimize the loss function
featbag_best = fmin(objective, space=space, algo=tpe.suggest, max_evals=MAX_EVALS, trials= bayes_trials, rstate = rstate)
print('Best Feature Bagging hyperparemeters:' , featbag_best)

100%|██████████| 50/50 [02:49<00:00,  3.39s/it, best loss: 0.08695652173913049]
Best Feature Bagging hyperparemeters: {'n_neighbors': 8.0}


In [49]:
# Hyperparameters for kNN algorithm
from hyperopt import STATUS_OK
from sklearn.model_selection import KFold
from hyperopt import tpe
from hyperopt import Trials
from hyperopt import fmin
from hyperopt import hp

# Perform k-fold cross validation
from sklearn.model_selection import KFold
X_train_val=pd.concat([pd.DataFrame(X_train), pd.DataFrame(np.zeros(len(X_train)))], axis=1)
metric=[]

def Average(lst): 
    return sum(lst) / len(lst) 

def objective(params):
  method, n_neighbors = str(params['methods']) , int(params['n_neighbors'])
  kf = KFold(n_splits = 5)
  kf.get_n_splits(X_train_val)
  model =  KNN(contamination=0.05, method=method, n_neighbors=n_neighbors) 
  LOF(n_neighbors=n_neighbors, contamination=0.05)
  for train, test in kf.split(X_train_val):
    train_data = np.array(X_train_val)[train]
    train_label = train_data[:,-1]
    test_data = np.array(X_train_val)[test]
    test_label = test_data[:, -1]
    train_data = np.vstack([train_data, np.hstack([train_data[:,24:], train_data[:,:24]])])
    train_label = np.concatenate([train_label, train_label])
    model.fit(train_data)
    pred_train = model.predict(train_data)
    pred_test = model.predict(test_data)
    metric.append(metrics.accuracy_score(pred_test, test_label))
    best_score = Average(metric)
    loss=1-best_score
    return {'loss':loss, 'status':STATUS_OK}

num_leaves = {'num_leaves': hp.quniform('num_leaves', 30, 150, 1)}
learning_rate={'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.01))}

space= { 'methods': hp.choice('methods', ['largest' ,'median', 'mean']),
    'n_neighbors':hp.quniform('n_neighbors', 5, 25, 1)}
tpe_algorithm =tpe.suggest
bayes_trials =Trials()
MAX_EVALS=50
rstate = np.random.RandomState(0)
# Print the hyperparameters that minimize the loss function
knn_best = fmin(objective, space=space, algo=tpe.suggest, max_evals=MAX_EVALS, trials= bayes_trials, rstate = rstate)
print('Best kNN hyperparemeters:', knn_best)

100%|██████████| 50/50 [00:47<00:00,  1.05it/s, best loss: 0.08695652173913049]
Best kNN hyperparemeters: {'methods': 2, 'n_neighbors': 15.0}


In [75]:
# Hyperparameters for CBLOF algorithm
from hyperopt import STATUS_OK
from sklearn.model_selection import KFold
from hyperopt import tpe
from hyperopt import Trials
from hyperopt import fmin
from hyperopt import hp

# Perform k-fold cross validation
from sklearn.model_selection import KFold
X_train_val=pd.concat([pd.DataFrame(X_train), pd.DataFrame(np.zeros(len(X_train)))], axis=1)
metric=[]

def Average(lst): 
    return sum(lst) / len(lst) 

def objective(params):
  n_clusters,alpha, beta = int(params['n_clusters']), float(params['alpha']), int(params['beta'])
  kf = KFold(n_splits = 5)
  kf.get_n_splits(X_train_val)
  
  model = CBLOF(contamination=0.05, n_clusters=n_clusters, alpha=alpha, beta=beta, random_state=0)
  for train, test in kf.split(X_train_val):
    train_data = np.array(X_train_val)[train]
    train_label = train_data[:,-1]
    test_data = np.array(X_train_val)[test]
    test_label = test_data[:, -1]
    train_data = np.vstack([train_data, np.hstack([train_data[:,24:], train_data[:,:24]])])
    train_label = np.concatenate([train_label, train_label])
    model.fit(train_data)
    pred_train = model.predict(train_data)
    pred_test = model.predict(test_data)
    metric.append(metrics.accuracy_score(pred_test, test_label))
    best_score = Average(metric)
    loss=1-best_score
    return {'loss':loss, 'status':STATUS_OK}

num_leaves = {'num_leaves': hp.quniform('num_leaves', 30, 150, 1)}
learning_rate={'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.01))}

space= {'n_clusters':hp.quniform('n_clusters', 5, 15, 1),
    'alpha': hp.quniform('alpha', 0.2, 0.9, 0.1),
    'beta': hp.quniform('beta', 2, 10, 2)  }

#CBLOF(contamination=0.05,  alpha=0.9, beta=4, n_clusters=12

tpe_algorithm =tpe.suggest
bayes_trials =Trials()
MAX_EVALS=50
rstate = np.random.RandomState(0)
# Print the hyperparameters that minimize the loss function
cblof_best = fmin(objective, space=space, algo=tpe.suggest, max_evals=MAX_EVALS, trials= bayes_trials, rstate = rstate)
print('Best CBLOF hyperparemeters:', cblof_best)

100%|██████████| 50/50 [00:20<00:00,  2.47it/s, best loss: 0.12454710144927539]
Best CBLOF hyperparemeters: {'alpha': 0.2, 'beta': 4.0, 'n_clusters': 6.0}


In [67]:
# Hyperparameters for LOF algorithm
from hyperopt import STATUS_OK
from sklearn.model_selection import KFold
from hyperopt import tpe
from hyperopt import Trials
from hyperopt import fmin
from hyperopt import hp

# Perform k-fold cross validation
from sklearn.model_selection import KFold
X_train_val=pd.concat([pd.DataFrame(X_train), pd.DataFrame(np.zeros(len(X_train)))], axis=1)
metric=[]

def Average(lst): 
    return sum(lst) / len(lst) 

def objective(params):
  n_neighbors = int(params['n_neighbors'])
  #n_bins, alpha, tol = int(params['n_bins']), float(params['alpha']), float(params['tol'])
  #n_components,  covariance_type= int(params['n_components']), str(params['covariance_type']) #int(params['n_clusters']) , int(params['beta']), (params['alpha'])
  kf = KFold(n_splits = 5)
  kf.get_n_splits(X_train_val)
  
  model =  LOF(n_neighbors=n_neighbors, contamination=0.05)
  for train, test in kf.split(X_train_val):
    train_data = np.array(X_train_val)[train]
    train_label = train_data[:,-1]
    test_data = np.array(X_train_val)[test]
    test_label = test_data[:, -1]
    train_data = np.vstack([train_data, np.hstack([train_data[:,24:], train_data[:,:24]])])
    train_label = np.concatenate([train_label, train_label])
    model.fit(train_data)
    pred_train = model.predict(train_data)
    pred_test = model.predict(test_data)
    metric.append(metrics.accuracy_score(pred_test, test_label))
    best_score = Average(metric)
    loss=1-best_score
    return {'loss':loss, 'status':STATUS_OK}

num_leaves = {'num_leaves': hp.quniform('num_leaves', 30, 150, 1)}
learning_rate={'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.01))}

space= {'n_neighbors':hp.quniform('n_neighbors', 5, 25, 1)}
tpe_algorithm =tpe.suggest
bayes_trials =Trials()
MAX_EVALS=50
rstate = np.random.RandomState(0) 
# Print the hyperparameters that minimize the loss function
lof_best = fmin(objective, space=space, algo=tpe.suggest, max_evals=MAX_EVALS, trials= bayes_trials, rstate = rstate )
print('Best LOF hyperparemeters:',lof_best)

100%|██████████| 50/50 [00:28<00:00,  1.72it/s, best loss: 0.061352657004830946]
Best LOF hyperparemeters: {'n_neighbors': 10.0}


In [30]:
# Hyperparameters for Iforest algorithm
from hyperopt import STATUS_OK
from sklearn.model_selection import KFold
from hyperopt import tpe
from hyperopt import Trials
from hyperopt import fmin
from hyperopt import hp

# Perform k-fold cross validation
from sklearn.model_selection import KFold
X_train_val=pd.concat([pd.DataFrame(X_train), pd.DataFrame(np.zeros(len(X_train)))], axis=1)
metric=[]

def Average(lst): 
    return sum(lst) / len(lst) 

def objective(params):
  n_estimators = int(params['n_estimators'])
  kf = KFold(n_splits = 5)
  kf.get_n_splits(X_train_val)
  
  model =  IForest(behaviour="new", bootstrap=False, contamination=0.05, n_estimators=n_estimators,  max_features=1.0, max_samples=1000)
  for train, test in kf.split(X_train_val):
    train_data = np.array(X_train_val)[train]
    train_label = train_data[:,-1]
    test_data = np.array(X_train_val)[test]
    test_label = test_data[:, -1]
    train_data = np.vstack([train_data, np.hstack([train_data[:,24:], train_data[:,:24]])])
    train_label = np.concatenate([train_label, train_label])
    model.fit(train_data)
    pred_train = model.predict(train_data)
    pred_test = model.predict(test_data)
    metric.append(metrics.accuracy_score(pred_test, test_label))
    best_score = Average(metric)
    loss=1-best_score
    return {'loss':loss, 'status':STATUS_OK}

num_leaves = {'num_leaves': hp.quniform('num_leaves', 30, 150, 1)}
learning_rate={'learning_rate':hp.loguniform('learning_rate', np.log(0.001), np.log(0.01))}

space= {'n_estimators':hp.quniform('n_estimators', 100, 500, 10)}
tpe_algorithm =tpe.suggest
bayes_trials =Trials()
MAX_EVALS=50
rstate = np.random.RandomState(0) 
# Print the hyperparameters that minimize the loss function
ifor_best = fmin(objective, space=space, algo=tpe.suggest, max_evals=MAX_EVALS, trials= bayes_trials, rstate = rstate )
print('Best Iforest hyperparemeters:', ifor_best)

100%|██████████| 50/50 [01:35<00:00,  1.92s/it, best loss: 0.221256038647343]
Best Iforest hyperparemeters: {'n_estimators': 200.0}


In [36]:
classifiers = {
    'Gaussiann Mixture Model (GMM)': GMM(n_components= gmm_best['n_components'], covariance_type=gmm_best['covariance_type'], random_state=0), 
    'K Nearest Neighbors (KNN)': KNN(contamination=0.05, method=knn_best['methods'], n_neighbors= knn_best['n_neighbors'] ),
    'Histogram-base Outlier Detection (HBOS)': HBOS(contamination=0.05, n_bins=hbos_best['n_bins'], alpha=hbos_best['alpha']),
    'Feature Bagging':
     FeatureBagging(LOF(n_neighbors=featbag_best['n_neighbors']), contamination=0.05),
    'Isolation Forest': IForest(behaviour="new", bootstrap=False, contamination=0.05, n_estimators=ifor_best['n_estimators'],  max_features=1.0, max_samples=1000), 
    'One class SVM (OCSVM)': OCSVM(contamination=0.05, kernel='rbf' , nu=ocsvm_best['nu'] , degree=ocsvm_best['degree'], gamma=ocsvm_best['gamma']),
    'Local Outlier Factor (LOF)':
       LOF(n_neighbors=lof_best['n_neighbors'], contamination=0.05),
     'CBLOF':   CBLOF(contamination=0.05,  alpha=cblof_best['alpha'], beta=cblof_best['beta'], n_clusters=cblof_best['n_clusters'])
}

In [37]:
classifiers

{'CBLOF': CBLOF(alpha=0.6000000000000001, beta=2.0, check_estimator=False,
    clustering_estimator=None, contamination=0.05, n_clusters=6.0, n_jobs=1,
    random_state=None, use_weights=False),
 'Feature Bagging': FeatureBagging(base_estimator=LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
   metric_params=None, n_jobs=1, n_neighbors=11.0, p=2),
         bootstrap_features=False, check_detector=True,
         check_estimator=False, combination='average', contamination=0.05,
         estimator_params={}, max_features=1.0, n_estimators=10, n_jobs=1,
         random_state=None, verbose=0),
 'Gaussiann Mixture Model (GMM)': GMM(covariance_type=2, n_components=3.0, random_state=0),
 'Histogram-base Outlier Detection (HBOS)': HBOS(alpha=0.7000000000000001, contamination=0.05, n_bins=15.0, tol=0.5),
 'Isolation Forest': IForest(behaviour='new', bootstrap=False, contamination=0.05,
     max_features=1.0, max_samples=1000, n_estimators=200.0, n_jobs=1,
     random_s