## Imports

In [29]:
#Data loading
from scipy.io import arff
#General
import pandas as pd
import numpy as np
from numpy import std, mean, sqrt
import tensorflow as tf
import math
#Statistics
from scipy.stats import normaltest
from scipy.stats import chisquare
from scipy.stats import ttest_ind
#Visualization
from matplotlib import pyplot as plt
#Utility
from tabulate import tabulate

In [30]:
#Data preparation
from sklearn.model_selection import train_test_split
#Classification models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from SmoteEnsemble import SmoteEnsemble as HSME
#Result analysis
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score as f1_score_func
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

In [31]:
#Custom imports
from utility import calculate_results
from utility import print_confusion_matrix
from utility import print_results
#
from REPD_Impl import REPD
from autoencoder import AutoEncoder

In [32]:
import warnings

## Dataset preparation

In [33]:
def calculate_pd(matrix):
    """Return the share of row-1 entries of a 2x2 matrix that fall in column 1.

    Computes ``matrix[1][1] / (matrix[1][0] + matrix[1][1])``.
    Presumably ``matrix`` is a confusion matrix with true classes as rows,
    which makes this the probability of detection (recall of the positive
    class) -- TODO confirm against ``utility.calculate_results``.

    Parameters
    ----------
    matrix : 2x2 nested sequence or array of counts.

    Returns
    -------
    float
        The ratio, or ``nan`` when row 1 sums to zero (the ratio is undefined).
    """
    hits = matrix[1][1]
    row_total = matrix[1][0] + hits
    # Plain Python ints would raise ZeroDivisionError here while numpy ints
    # yield nan; return nan explicitly so both input types behave the same.
    if row_total == 0:
        return float('nan')
    return hits / row_total

def calculate_pf(matrix):
    """Return the share of row-0 entries of a 2x2 matrix that fall in column 1.

    Computes ``matrix[0][1] / (matrix[0][0] + matrix[0][1])``.
    Presumably ``matrix`` is a confusion matrix with true classes as rows,
    which makes this the probability of false alarm (false positive rate)
    -- TODO confirm against ``utility.calculate_results``.

    Parameters
    ----------
    matrix : 2x2 nested sequence or array of counts.

    Returns
    -------
    float
        The ratio, or ``nan`` when row 0 sums to zero (the ratio is undefined).
    """
    false_alarms = matrix[0][1]
    row_total = matrix[0][0] + false_alarms
    # Plain Python ints would raise ZeroDivisionError here while numpy ints
    # yield nan; return nan explicitly so both input types behave the same.
    if row_total == 0:
        return float('nan')
    return false_alarms / row_total

In [34]:
def _binary_label(positive_repr):
    """Build a mapper returning 1 when str(value) equals positive_repr, else 0."""
    def to_label(value):
        if str(value) == positive_repr:
            return 1
        return 0
    return to_label


# Per dataset: [name of the label column, function turning a raw label into 0/1].
# The raw labels are compared through their str() form, e.g. "b'true'".
dataset_settings = {
  "cm1": ["defects", _binary_label("b'true'")],
  "jm1": ["defects", _binary_label("b'true'")],
  "kc1": ["defects", _binary_label("b'true'")],
  "kc2": ["problems", _binary_label("b'yes'")],
  "pc1": ["defects", _binary_label("b'true'")],
}

# Target defect ratios, listed from the highest (20%) down to the lowest (5%).
percentages = [
    0.20, 0.19, 0.18, 0.17, 0.16, 0.15, 0.14, 0.13,
    0.12, 0.11, 0.10, 0.09, 0.08, 0.07, 0.06, 0.05,
]

In [35]:
# Number of repetitions per (dataset, percentage) combination; the experiment
# loops iterate experiment_episode over range(1, episode_count+1).
episode_count = 30

## Undersampling

In [36]:
# Undersampling experiment.
# For every target ratio in `percentages` and every dataset in `dataset_settings`:
#   1. load and clean the dataset,
#   2. randomly drop defective rows until int(percentage * total rows) remain,
#   3. train and score every model on `episode_count` random 80/20 splits,
#   4. append the collected rows to "results/<dataset>_<percentage>".
# NOTE(review): no random seed is set, so the drawn rows and splits differ between runs.
warnings.simplefilter("ignore")

result_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 score','PD','PF','Percentage']

# (label, zero-argument factory) in evaluation order. REPD carries None because it
# needs an AutoEncoder that has to be closed after use, so it is built in the loop.
models = [
    ('REPD', None),
    ('GaussianNB', GaussianNB),
    ('LogisticRegression', LogisticRegression),
    ('KNeighborsClassifier', lambda: KNeighborsClassifier(n_neighbors=3)),
    ('DecisionTreeClassifier', DecisionTreeClassifier),
    ('HSME', HSME),
]

for percentage in percentages:
    print('Percentage',percentage)
    for dataset in dataset_settings:
        defect_column_name, defect_column_map_function = dataset_settings[dataset]

        # Load the dataset and wrap it into a pandas dataframe
        raw_data, meta = arff.loadarff("./data/"+dataset+".arff")
        df = pd.DataFrame(raw_data)

        # Map the raw label column to 0/1
        df[defect_column_name] = df[defect_column_name].map(defect_column_map_function)

        # Remove rows with missing values, then duplicate rows
        df = df.dropna()
        df = df.drop_duplicates()

        # How many defective rows have to go to reach the target count
        total_count = len(df)
        defective_count = len(df[df[defect_column_name]==1])
        desired_defective_count = int(percentage*total_count)
        remove_count = defective_count-desired_defective_count
        #
        print(dataset,
              'Defective count:',defective_count,
              'Desired defective count',desired_defective_count,
              'Remove count:',remove_count)

        # The dataset already has fewer defective rows than requested: skip it
        if remove_count < 0:
            continue

        # Positional (not label) indexes of the defective rows. They do not change
        # between episodes, so they are computed once per dataset.
        defective_positions = np.flatnonzero((df[defect_column_name]==1).values)
        #
        performance_data = []

        # Run all the models in the experiment
        for experiment_episode in range(1,episode_count+1):
            print("Running episode ", experiment_episode)

            # Remove a fresh random subset of defective examples; positions are
            # translated to index labels because drop() works on labels
            drop_positions = np.random.choice(defective_positions, size=remove_count, replace=False)
            df_p = df.drop(df.index[drop_positions])

            X = df_p.drop(columns=[defect_column_name]).values
            y = df_p[defect_column_name].values

            # Test train split
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

            for model_name, build_model in models:
                # NOTE(review): the AutoEncoder arguments are copied as-is; their
                # meaning is defined in autoencoder.py -- TODO document them there.
                autoencoder = AutoEncoder([21,10],0.01,100,50) if build_model is None else None
                try:
                    classifier = REPD(autoencoder) if build_model is None else build_model()
                    classifier.fit(X_train,y_train)
                    y_p = classifier.predict(X_test)
                    matrix, accuracy, precision, recall, f1_score = calculate_results(y_test,y_p)
                    PD = calculate_pd(matrix)
                    PF = calculate_pf(matrix)
                    # The 'Accuracy' column stores the balanced accuracy, which
                    # replaces the accuracy returned by calculate_results
                    accuracy = balanced_accuracy_score(y_test,y_p)

                    # Store results
                    performance_data.append([model_name,accuracy, precision, recall, f1_score,PD,PF,percentage])
                finally:
                    # Close the autoencoder even when fitting or scoring fails
                    if autoencoder is not None:
                        autoencoder.close()

        results_df = pd.DataFrame(performance_data, columns=result_columns)

        # Append mode: re-running the cell adds rows to an existing result file
        with open("results/"+dataset+'_'+str(percentage), 'a') as f:
            results_df.to_csv(f, header=False)

        print()
    print()

Percentage 0.2
cm1 Defective count: 48 Desired defective count 88 Remove count: -40
jm1 Defective count: 2004 Desired defective count 1781 Remove count: 223
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Running episode  9
Running episode  10
Running episode  11
Running episode  12
Running episode  13
Running episode  14
Running episode  15
Running episode  16
Running episode  17
Running episode  18
Running episode  19
Running episode  20
Running episode  21
Running episode  22
Running episode  23
Running episode  24
Running episode  25
Running episode  26
Running episode  27
Running episode  28
Running episode  29
Running episode  30

kc1 Defective count: 315 Desired defective count 242 Remove count: 73
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Running episode  9
Running ep

Running episode  16
Running episode  17
Running episode  18
Running episode  19
Running episode  20
Running episode  21
Running episode  22
Running episode  23
Running episode  24
Running episode  25
Running episode  26
Running episode  27
Running episode  28
Running episode  29
Running episode  30

pc1 Defective count: 70 Desired defective count 162 Remove count: -92

Percentage 0.16
cm1 Defective count: 48 Desired defective count 70 Remove count: -22
jm1 Defective count: 2004 Desired defective count 1425 Remove count: 579
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Running episode  9
Running episode  10
Running episode  11
Running episode  12
Running episode  13
Running episode  14
Running episode  15
Running episode  16
Running episode  17
Running episode  18
Running episode  19
Running episode  20
Running episode  21
Running episode  22
Running episode  23
Running episode  24

Running episode  30

kc2 Defective count: 105 Desired defective count 48 Remove count: 57
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Running episode  9
Running episode  10
Running episode  11
Running episode  12
Running episode  13
Running episode  14
Running episode  15
Running episode  16
Running episode  17
Running episode  18
Running episode  19
Running episode  20
Running episode  21
Running episode  22
Running episode  23
Running episode  24
Running episode  25
Running episode  26
Running episode  27
Running episode  28
Running episode  29
Running episode  30

pc1 Defective count: 70 Desired defective count 124 Remove count: -54

Percentage 0.12
cm1 Defective count: 48 Desired defective count 53 Remove count: -5
jm1 Defective count: 2004 Desired defective count 1068 Remove count: 936
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episod

Running episode  26
Running episode  27
Running episode  28
Running episode  29
Running episode  30

pc1 Defective count: 70 Desired defective count 95 Remove count: -25

Percentage 0.09
cm1 Defective count: 48 Desired defective count 39 Remove count: 9
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Running episode  9
Running episode  10
Running episode  11
Running episode  12
Running episode  13
Running episode  14
Running episode  15
Running episode  16
Running episode  17
Running episode  18
Running episode  19
Running episode  20
Running episode  21
Running episode  22
Running episode  23
Running episode  24
Running episode  25
Running episode  26
Running episode  27
Running episode  28
Running episode  29
Running episode  30

jm1 Defective count: 2004 Desired defective count 801 Remove count: 1203
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Runni

Running episode  26
Running episode  27
Running episode  28
Running episode  29
Running episode  30

pc1 Defective count: 70 Desired defective count 66 Remove count: 4
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Running episode  9
Running episode  10
Running episode  11
Running episode  12
Running episode  13
Running episode  14
Running episode  15
Running episode  16
Running episode  17
Running episode  18
Running episode  19
Running episode  20
Running episode  21
Running episode  22
Running episode  23
Running episode  24
Running episode  25
Running episode  26
Running episode  27
Running episode  28
Running episode  29
Running episode  30


Percentage 0.06
cm1 Defective count: 48 Desired defective count 26 Remove count: 22
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Run

## Oversampling

In [37]:
# Number of repetitions per (dataset, percentage) combination for the oversampling
# loop, which iterates experiment_episode over range(1, episode_count+1).
# NOTE(review): the saved output of the oversampling cell lists at most 10 episodes
# per dataset, so it looks like it was produced with a different value -- confirm.
episode_count = 30

In [38]:
# Oversampling experiment.
# For every target ratio in `percentages` and every dataset in `dataset_settings`:
#   1. load and clean the dataset,
#   2. per episode, split 80/20 and add randomly chosen non-defective training rows
#      (with replacement) until the training set has round(defective / percentage) rows;
#      the test set is left untouched,
#   3. train and score every model,
#   4. append the collected rows to "results/<dataset>_add_<percentage>".
# NOTE(review): no random seed is set, so the splits and drawn rows differ between runs.
warnings.simplefilter("ignore")

result_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 score','PD','PF','Percentage']

# (label, zero-argument factory) in evaluation order. REPD carries None because it
# needs an AutoEncoder that has to be closed after use, so it is built in the loop.
models = [
    ('REPD', None),
    ('GaussianNB', GaussianNB),
    ('LogisticRegression', LogisticRegression),
    ('KNeighborsClassifier', lambda: KNeighborsClassifier(n_neighbors=3)),
    ('DecisionTreeClassifier', DecisionTreeClassifier),
    ('HSME', HSME),
]

for percentage in percentages:
    print('Percentage',percentage)
    for dataset in dataset_settings:
        defect_column_name, defect_column_map_function = dataset_settings[dataset]

        # Load the dataset and wrap it into a pandas dataframe
        raw_data, meta = arff.loadarff("./data/"+dataset+".arff")
        df = pd.DataFrame(raw_data)

        # Map the raw label column to 0/1
        df[defect_column_name] = df[defect_column_name].map(defect_column_map_function)

        # Remove rows with missing values, then duplicate rows
        df = df.dropna()
        df = df.drop_duplicates()
        #
        performance_data = []
        #
        print(dataset)

        # Run all the models in the experiment
        for experiment_episode in range(1,episode_count+1):
            # Test train split
            df_train, df_test = train_test_split(df, test_size=0.2)

            # Rows to add so that the defective rows make up `percentage` of the training set
            train_defective_count = len(df_train[df_train[defect_column_name]==1])
            add_count = int(round(train_defective_count/percentage))-len(df_train)
            # The training split is already at or below the target ratio: skip this episode
            if add_count<0:
                continue

            print("Running episode ", experiment_episode)

            # Add examples: draw positional indexes of non-defective training rows
            # with replacement and append copies of those rows
            non_defective_positions = np.flatnonzero((df_train[defect_column_name]==0).values)
            add_positions = np.random.choice(non_defective_positions, add_count, replace=True)
            selected_df = df_train.iloc[add_positions,:]
            df_train_p = pd.concat([df_train,selected_df])

            X_train = df_train_p.drop(columns=[defect_column_name]).values
            y_train = df_train_p[defect_column_name].values
            X_test = df_test.drop(columns=[defect_column_name]).values
            y_test = df_test[defect_column_name].values

            for model_name, build_model in models:
                # NOTE(review): the AutoEncoder arguments are copied as-is; their
                # meaning is defined in autoencoder.py -- TODO document them there.
                autoencoder = AutoEncoder([21,10],0.01,100,50) if build_model is None else None
                try:
                    classifier = REPD(autoencoder) if build_model is None else build_model()
                    classifier.fit(X_train,y_train)
                    y_p = classifier.predict(X_test)
                    matrix, accuracy, precision, recall, f1_score = calculate_results(y_test,y_p)
                    PD = calculate_pd(matrix)
                    PF = calculate_pf(matrix)
                    # The 'Accuracy' column stores the balanced accuracy, which
                    # replaces the accuracy returned by calculate_results
                    accuracy = balanced_accuracy_score(y_test,y_p)

                    # Store results
                    performance_data.append([model_name,accuracy, precision, recall, f1_score,PD,PF,percentage])
                finally:
                    # Close the autoencoder even when fitting or scoring fails
                    if autoencoder is not None:
                        autoencoder.close()

        results_df = pd.DataFrame(performance_data, columns=result_columns)

        # Append mode: re-running the cell adds rows to an existing result file
        with open("results/"+dataset+'_add_'+str(percentage), 'a') as f:
            results_df.to_csv(f, header=False)
        print()
    print()

Percentage 0.2
cm1

jm1
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Running episode  9
Running episode  10

kc1
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Running episode  9
Running episode  10

kc2
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Running episode  9
Running episode  10

pc1


Percentage 0.19
cm1

jm1
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Running episode  9
Running episode  10

kc1
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Running episo

Running episode  10

kc1
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Running episode  9
Running episode  10

kc2
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Running episode  9
Running episode  10

pc1
Running episode  8
Running episode  9


Percentage 0.07
cm1
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Running episode  9
Running episode  10

jm1
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  7
Running episode  8
Running episode  9
Running episode  10

kc1
Running episode  1
Running episode  2
Running episode  3
Running episode  4
Running episode  5
Running episode  6
Running episode  