## Imports

In [1]:
#Data loading
from scipy.io import arff
#General
import pandas as pd
import numpy as np
from numpy import std, mean, sqrt
import tensorflow as tf
import math
#Statistics
from scipy.stats import normaltest
from scipy.stats import chisquare
from scipy.stats import ttest_ind
#Visualization
from matplotlib import pyplot as plt
#Utility
from tabulate import tabulate

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  return f(*args, **kwds)


In [2]:
#Data preparation
from sklearn.model_selection import train_test_split
#Classification models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from SmoteEnsemble import SmoteEnsemble as HSME
#Result analysis
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score as f1_score_func
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score



In [3]:
#Custom imports
from utility import calculate_results
from utility import print_confusion_matrix
from utility import print_results
#
from REPD_Impl import REPD
from autoencoder import AutoEncoder

In [4]:
import os
import warnings

## Dataset preparation

In [5]:
def calculate_pd(matrix):
    """Return the PD value derived from the second row of ``matrix``.

    Computes ``matrix[1][1] / (matrix[1][0] + matrix[1][1])``.

    NOTE(review): presumably ``matrix`` is a 2x2 confusion matrix with true
    labels on the rows and predictions on the columns, which would make this
    the probability of detection (recall of the defective class) -- confirm
    against ``utility.calculate_results``.
    """
    defective_row = matrix[1]
    hits = defective_row[1]
    return hits / (defective_row[0] + hits)

def calculate_pf(matrix):
    """Return the PF value derived from the first row of ``matrix``.

    Computes ``matrix[0][1] / (matrix[0][0] + matrix[0][1])``.

    NOTE(review): presumably ``matrix`` is a 2x2 confusion matrix with true
    labels on the rows and predictions on the columns, which would make this
    the probability of false alarm (false-positive rate) -- confirm against
    ``utility.calculate_results``.
    """
    clean_row = matrix[0]
    false_alarms = clean_row[1]
    return false_alarms / (clean_row[0] + false_alarms)

In [6]:
# Names of the datasets processed by the experiment, in execution order.
datasets = ["cm1", "jm1", "kc1", "kc2", "pc1"]


def _label_encoder(positive_repr):
    """Build a function mapping a raw label to 1 (defective) or 0.

    A label counts as defective when its ``str()`` form equals
    ``positive_repr``; the comparison is done on the textual repr of the
    value (e.g. ``"b'true'"`` for the bytes object ``b'true'``).
    """
    def encode(x):
        if str(x) == positive_repr:
            return 1
        return 0
    return encode


# Per dataset: (name of the label column, repr of the "defective" label).
_label_specs = (
    ("cm1", "defects", "b'true'"),
    ("jm1", "defects", "b'true'"),
    ("kc1", "defects", "b'true'"),
    ("kc2", "problems", "b'yes'"),
    ("pc1", "defects", "b'true'"),
)

# dataset name -> [label column name, function turning a raw label into 0/1]
dataset_settings = {
    name: [column, _label_encoder(positive_repr)]
    for name, column, positive_repr in _label_specs
}

In [7]:
# Number of train/test repetitions ("episodes") run for every dataset.
episode_count = 30

In [8]:
def _score_predictions(model_name, y_true, y_pred):
    """Turn one model's predictions into a row of the results table.

    Parameters
    ----------
    model_name : str
        Label stored in the 'Model' column.
    y_true, y_pred
        Ground-truth and predicted labels for the test split.

    Returns
    -------
    list
        ``[model_name, accuracy, precision, recall, f1_score, PD, PF]``.
    """
    matrix, _, precision, recall, f1_score = calculate_results(y_true, y_pred)
    PD = calculate_pd(matrix)
    PF = calculate_pf(matrix)
    # The accuracy returned by calculate_results is discarded on purpose:
    # the 'Accuracy' column stores the balanced accuracy instead.
    accuracy = balanced_accuracy_score(y_true, y_pred)
    return [model_name, accuracy, precision, recall, f1_score, PD, PF]


# Baseline classifiers in the order they are trained within an episode.
# Each entry is (label, zero-argument factory) so that every episode works
# with a fresh, unfitted model.
_BASELINE_MODELS = [
    ('GaussianNB', GaussianNB),
    ('LogisticRegression', LogisticRegression),
    ('KNeighborsClassifier', lambda: KNeighborsClassifier(n_neighbors=3)),
    ('DecisionTreeClassifier', DecisionTreeClassifier),
    ('HSME', HSME),
]

warnings.simplefilter("ignore")

# Create the output directory up front so a missing folder cannot make the
# final to_csv fail after all models have already been trained.
os.makedirs("results", exist_ok=True)

for dataset in datasets:
    print(dataset)
    defect_column_name, defect_column_map_function = dataset_settings[dataset]

    # Load dataset
    raw_data, meta = arff.loadarff("./data/"+dataset+".arff")

    # Wrap data into a pandas dataframe
    df = pd.DataFrame(raw_data)

    # Convert the raw label column into 0/1 integers
    df[defect_column_name] = df[defect_column_name].map(defect_column_map_function)

    # Remove all rows with missing values, then duplicate instances
    df = df.dropna()
    df = df.drop_duplicates()

    # Dataset property constants (not used by the loop below; kept available
    # in the notebook namespace for inspection)
    total_count = len(df)
    non_defective_count = len(df[df[defect_column_name]==0])
    defective_count = len(df[df[defect_column_name]==1])

    # Feature matrix and label vector
    X = df.drop(columns=[defect_column_name]).values
    y = df[defect_column_name].values

    performance_data = []

    # Run all the models in the experiment
    for experiment_episode in range(1, episode_count+1):
        print("Running episode ", experiment_episode)

        # Test/train split shared by every model in this episode.
        # NOTE(review): the split is not seeded, so every run of this cell
        # yields different results; pass random_state if reproducibility
        # is required.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        # REPD -- handled separately because its autoencoder must be closed.
        # NOTE(review): the AutoEncoder arguments are undocumented here;
        # 21 presumably matches the number of input features -- confirm.
        autoencoder = AutoEncoder([21,10],0.01,100,50)
        try:
            classifier = REPD(autoencoder)
            classifier.fit(X_train, y_train)
            y_p = classifier.predict(X_test)
            performance_data.append(_score_predictions('REPD', y_test, y_p))
        finally:
            # Release the autoencoder even if fitting or scoring fails
            autoencoder.close()

        # Baseline models: identical fit / predict / score procedure
        for model_name, make_model in _BASELINE_MODELS:
            classifier = make_model()
            classifier.fit(X_train, y_train)
            y_p = classifier.predict(X_test)
            performance_data.append(_score_predictions(model_name, y_test, y_p))

    # One row per (episode, model); written to results/<dataset>
    results_df = pd.DataFrame(performance_data, columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 score','PD','PF'])
    results_df.to_csv("results/"+dataset)

cm1
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Running episode  9
Running episode  10
Running episode  11
Running episode  12
Running episode  13
Running episode  14
Running episode  15
Running episode  16
Running episode  17
Running episode  18
Running episode  19
Running episode  20
Running episode  21
Running episode  22
Running episode  23
Running episode  24
Running episode  25
Running episode  26
Running episode  27
Running episode  28
Running episode  29
Running episode  30
jm1
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Running episode  9
Running episode  10
Running episode  11
Running episode  12
Running episode  13
Running episode  14
Running episode  15
Running episode  16
Running episode  17
Running episode  18
Running episode  19
Running episode  20
Running ep