### Imports

In [1]:
import numpy as np
import pandas as pd

In [2]:
#Data preparation
from sklearn.model_selection import train_test_split
#Classification models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from SmoteEnsemble import SmoteEnsemble as HSME
#Result analysis
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score as f1_score_func
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score



In [3]:
#Custom imports
from utility import calculate_results
#
from REPD_Impl import REPD
from autoencoder import AutoEncoder

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  return f(*args, **kwds)


In [4]:
import os
import warnings

### Loading datasets

In [5]:
# Experiment configuration. Only the "ant" 1.5 / "da" combination is enabled;
# the alternatives that are switched off are listed in the comments below.

# Versions evaluated for each project.
# Disabled: ant "1.6"; camel "1.2", "1.4"; log4j "1.1", "1.2"; poi "2.0", "2.5"
dataset_versions = dict(ant=["1.5"])

# Projects to process. Disabled: "camel", "log4j", "poi"
datasets = ["ant"]

# Feature type identifiers used to locate the feature files. Disabled: "dbn", "ca"
feature_types = ["da"]

# Number of feature files (indices 0 .. count-1) loaded per feature type.
per_feature_type_count = 30

In [6]:
# Labels are stored per (dataset, version); feature matrices are stored per
# (dataset, version, feature_type, index).
X = {}
Y = {}

for dataset in datasets:
    for version in dataset_versions[dataset]:
        # Load labels
        y_file_name = "{}-{}_y.npy".format(dataset, version)
        y = np.load("./data/{}".format(y_file_name))
        Y[(dataset, version)] = y

        # Print basic dataset information
        non_defective = y[y != 1]
        defective = y[y == 1]
        print("Dataset", dataset, version)
        print("Dataset size", len(y))
        print("Non-Defective count", len(non_defective))
        print("Defective count", len(defective))
        print("defective share", (round(100 * len(defective) / len(y), 2)))
        print()

        # Load every feature matrix available for this dataset version
        for feature_type in feature_types:
            feature_dir = "./data/features/{}/".format(feature_type)
            for i in range(per_feature_type_count):
                x_file_name = "{}-{}_{}_{}_X_feat.npy".format(dataset, version, i, feature_type)
                x = np.load(feature_dir + x_file_name)
                X[(dataset, version, feature_type, i)] = x

Dataset ant 1.5
Dataset size 292
Non-Defective count 260
Defective count 32
defective share 10.96



### Training models

In [7]:
# Number of train/test episodes the training loop runs for each feature set.
episode_count = 30

In [8]:
# Silence library warnings so the per-episode progress output stays readable.
warnings.simplefilter("ignore")

# Create the output directory up front so that a missing folder cannot make a
# fully trained run fail at the very last step.
os.makedirs("results", exist_ok=True)

# Column layout of one stored result row.
_RESULT_COLUMNS = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 score', "Dataset", "Version", "Feature_type", "i"]

# Baseline models in the order their rows are written. Factories (not
# instances) are stored so every episode trains a freshly constructed model,
# built immediately before it is fitted.
_BASELINE_FACTORIES = [
    ('GaussianNB', GaussianNB),
    ('LogisticRegression', LogisticRegression),
    ('KNeighborsClassifier', lambda: KNeighborsClassifier(n_neighbors=3)),
    ('DecisionTreeClassifier', DecisionTreeClassifier),
    ('HSME', HSME),
]


def _fit_and_score(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters:
        model: object exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: training features and labels.
        X_test, y_test: held-out features and labels.

    Returns:
        A pair ``(y_pred, scores)`` where ``y_pred`` is the model's prediction
        for ``X_test`` and ``scores`` is the list
        ``[accuracy, precision, recall, f1_score]`` taken from
        ``calculate_results(y_test, y_pred)``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # calculate_results also returns a confusion matrix first; it is not stored.
    _, accuracy, precision, recall, f1_score = calculate_results(y_test, y_pred)
    return y_pred, [accuracy, precision, recall, f1_score]


for dataset in datasets:
    for version in dataset_versions[dataset]:
        y = Y[(dataset, version)]

        for feature_type in feature_types:
            for i in range(per_feature_type_count):

                #TODO: remove - temporary filter limiting the ant 1.5 "da" run to feature set 9
                if dataset == "ant" and version == "1.5" and feature_type == "da" and i != 9:
                    continue

                x = X[(dataset, version, feature_type, i)]
                # Identifying columns appended to every result row of this run.
                run_tags = [dataset, version, feature_type, i]

                print(dataset, version, feature_type, i)
                performance_data = []

                # Run all the models in the experiment
                for experiment_episode in range(1, episode_count + 1):
                    print("Running episode ", experiment_episode)
                    try:
                        # Fresh random 80/20 split for every episode
                        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

                        # REPD: the autoencoder is closed even when fitting,
                        # prediction or scoring raises.
                        autoencoder = AutoEncoder([100, 50], 0.01)
                        try:
                            _, scores = _fit_and_score(REPD(autoencoder), X_train, y_train, X_test, y_test)
                            performance_data.append(['REPD'] + scores + run_tags)
                        finally:
                            autoencoder.close()

                        # Baselines
                        for model_name, make_model in _BASELINE_FACTORIES:
                            y_p, scores = _fit_and_score(make_model(), X_train, y_train, X_test, y_test)
                            if model_name == 'HSME':
                                # NOTE: HSME stores balanced accuracy in the
                                # 'Accuracy' column, unlike the other models;
                                # kept as in the original experiment.
                                scores[0] = balanced_accuracy_score(y_test, y_p)
                            performance_data.append([model_name] + scores + run_tags)
                    except Exception as error:
                        # A failed episode is skipped (rows already stored for it
                        # are kept), but the failure is reported instead of being
                        # silently discarded. KeyboardInterrupt is deliberately
                        # not caught so the cell can still be interrupted.
                        print("Episode", experiment_episode, "failed:", repr(error))

                results_df = pd.DataFrame(performance_data, columns=_RESULT_COLUMNS)

                # Append without a header so repeated runs accumulate in one
                # file. Passing the path lets pandas manage newline handling.
                results_path = "results/" + dataset + '_' + version + '_' + feature_type + '_' + str(i)
                results_df.to_csv(results_path, mode='a', header=False)

                print()
            print()

ant 1.5 da 9
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Running episode  9
Running episode  10
Running episode  11
Running episode  12
Running episode  13
Running episode  14
Running episode  15
Running episode  16
Running episode  17
Running episode  18
Running episode  19
Running episode  20
Running episode  21
Running episode  22
Running episode  23
Running episode  24
Running episode  25
Running episode  26
Running episode  27
Running episode  28
Running episode  29
Running episode  30


