In [6]:
import warnings
#
from scipy.io import arff
#
import pandas as pd
import numpy as np
#
from tabulate import tabulate
#
from imblearn.over_sampling import SMOTE
#
from sklearn.tree import DecisionTreeClassifier
#
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
#
from sklearn.model_selection import train_test_split
#
from sklearn.metrics import f1_score

## Loading the dataset

In [7]:
# Names of the ARFF datasets used throughout the notebook.
datasets = ["cm1","jm1","kc1","kc2","pc1"]


def _binary_label_encoder(positive_label):
    """Build a function that maps a raw class label to 1 (positive) or 0.

    The returned callable accepts the label either as bytes or as any other
    object (converted with ``str``). Bytes are decoded before the comparison
    instead of being compared through their ``repr``.

    Parameters
    ----------
    positive_label : str
        Text of the label that marks a defective instance.

    Returns
    -------
    callable
        ``encode(value) -> int`` returning 1 when ``value`` equals
        ``positive_label`` and 0 for every other value.
    """
    def encode(value):
        if isinstance(value, (bytes, bytearray)):
            text = bytes(value).decode("utf-8", errors="replace")
        else:
            text = str(value)
        return 1 if text == positive_label else 0
    return encode


# Per dataset: [name of the label column, function mapping a raw label to 0/1].
dataset_settings = {
  "cm1": ["defects", _binary_label_encoder("true")],
  "jm1": ["defects", _binary_label_encoder("true")],
  "kc1": ["defects", _binary_label_encoder("true")],
  "kc2": ["problems", _binary_label_encoder("yes")],
  "pc1": ["defects", _binary_label_encoder("true")]
}

In [8]:
# Feature matrix (X) and 0/1 label vector (y) of every dataset, keyed by dataset name.
X = dict()
y = dict()
for dataset in datasets:
    print("Loading: ", dataset)
    defect_column_name = dataset_settings[dataset][0]
    defect_column_map_function = dataset_settings[dataset][1]

    # Load dataset
    data, meta = arff.loadarff("./data/"+dataset+".arff")

    # Wrap data into a pandas dataframe
    df = pd.DataFrame(data)

    # Convert the raw label column to 0/1 with the dataset-specific mapping
    df[defect_column_name] = df[defect_column_name].map(defect_column_map_function)

    # Remove all rows with missing values
    df = df.dropna()

    # Remove duplicate instances
    df = df.drop_duplicates()

    # Dataset property constants (computed once; not used further in this cell)
    total_count = len(df)
    non_defective_count = len(df[df[defect_column_name]==0])
    defective_count = len(df[df[defect_column_name]==1])

    # Split the cleaned frame into features (every column but the label) and labels
    X[dataset] = df.drop(columns=[defect_column_name]).values
    y[dataset] = df[defect_column_name].values

Loading:  cm1
Loading:  jm1
Loading:  kc1
Loading:  kc2
Loading:  pc1


## Determine the best model

In [9]:
# Candidate ensemble classifiers as (display name, estimator instance) pairs.
# The tree is passed to BaggingClassifier positionally because the keyword for
# this first parameter was renamed from `base_estimator` to `estimator` in
# newer scikit-learn releases; the positional form works with both.
models = [
            ("Ada",AdaBoostClassifier()),
            ("Bagging",BaggingClassifier(DecisionTreeClassifier())),
            ("RandomForest",RandomForestClassifier())
        ]

In [10]:
# Silence every warning from here on (the filter is global, not scoped to this cell).
warnings.simplefilter("ignore")

# Number of random train/test splits each dataset is evaluated on.
REPEAT = 1

# best_model[dataset][model name] -> number of epochs in which that model
# achieved the highest F1 score on the dataset.
best_model = {
    dataset: {name: 0 for name, _ in models}
    for dataset in datasets
}

for i in range(REPEAT):
    print("Epoch",i)
    for dataset in datasets:
        # Fresh random 80/20 split for this dataset and epoch.
        X_train, X_test, y_train, y_test = train_test_split(X[dataset], y[dataset], test_size=0.2)

        # Start below any possible F1 so the first model always becomes the incumbent.
        best, best_f1 = "", -1
        for name, model in models:
            y_pred = model.fit(X_train,y_train).predict(X_test)
            f1 = f1_score(y_test,y_pred)
            # Strict comparison: on a tie the earlier model in `models` stays the winner.
            if f1>best_f1:
                best, best_f1 = name, f1

        print("\tFor dataset",dataset,"the best model is",best,"having F1",best_f1)
        best_model[dataset][best] += 1
    print()

Epoch 0
	For dataset cm1 the best model is Ada having F1 0.2
	For dataset jm1 the best model is Bagging having F1 0.29333333333333333
	For dataset kc1 the best model is Ada having F1 0.3571428571428572
	For dataset kc2 the best model is Bagging having F1 0.4324324324324324
	For dataset pc1 the best model is Bagging having F1 0.22222222222222224



In [11]:
# Show, for each dataset in order, the entry stored in best_model.
for dataset in datasets:
    wins = best_model[dataset]
    print(wins)

{'Ada': 1, 'Bagging': 0, 'RandomForest': 0}
{'Ada': 0, 'Bagging': 1, 'RandomForest': 0}
{'Ada': 1, 'Bagging': 0, 'RandomForest': 0}
{'Ada': 0, 'Bagging': 1, 'RandomForest': 0}
{'Ada': 0, 'Bagging': 1, 'RandomForest': 0}


In [12]:
# Collapse each dataset's win tally (model name -> number of wins) into the
# single name of the model with the most wins. best_model[dataset] is a dict
# before this cell and a str after it.
for dataset in datasets:
    tally = best_model[dataset]
    if isinstance(tally, str):
        # Already collapsed by an earlier run of this cell; re-running must
        # not fail or change the stored winner.
        continue
    best = ''
    max_count = 0
    for model_name, count in tally.items():
        # Strict comparison: on a tie the model listed first keeps the win,
        # and a model with zero wins is never selected.
        if max_count < count:
            max_count = count
            best = model_name
    best_model[dataset] = best

## Perform SMOTE on the dataset

In [13]:
#sm = SMOTE()

In [14]:
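# Disabled: oversampling of each whole dataset. The evaluation cell below
# applies SMOTE to the training split only.
#Xs = dict()
#ys = dict()
##
#for dataset in datasets:
#    Xs[dataset], ys[dataset] = sm.fit_resample(X[dataset], y[dataset])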

## Train the best model on the oversampled dataset

In [15]:
def avg(l):
    """Return the arithmetic mean of the values in ``l``.

    ``l`` must support both ``sum`` and ``len``. An empty ``l`` raises
    ``ZeroDivisionError``.
    """
    total = sum(l)
    count = len(l)
    return total / count

In [17]:
# F1 scores per dataset, one entry appended per repetition.
performance_data = {dataset: [] for dataset in datasets}

# Builds a fresh, unfitted estimator for each winner name stored in best_model.
# The tree is passed to BaggingClassifier positionally because the keyword for
# this first parameter was renamed from `base_estimator` to `estimator` in
# newer scikit-learn releases; the positional form works with both.
model_factories = {
    "Ada": AdaBoostClassifier,
    "Bagging": lambda: BaggingClassifier(DecisionTreeClassifier()),
    "RandomForest": RandomForestClassifier,
}

# Number of random train/test splits evaluated per dataset.
REPEAT = 30
for i in range(REPEAT):
    for dataset in datasets:
        X_train, X_test, y_train, y_test = train_test_split(X[dataset], y[dataset], test_size=0.2)

        # Oversample the training split only; the test split is left as drawn.
        sm = SMOTE()
        X_train, y_train = sm.fit_resample(X_train,y_train)

        # Instantiate the model chosen for this dataset; fail with a clear
        # message when no known model name is stored for it.
        selected = best_model[dataset]
        factory = model_factories.get(selected) if isinstance(selected, str) else None
        if factory is None:
            raise ValueError(
                "No known best model for dataset %r: %r" % (dataset, selected)
            )
        model = factory()

        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test,y_pred)

        performance_data[dataset].append(f1)

In [18]:
# Report the mean of the F1 scores collected for each dataset.
for dataset in datasets:
    mean_f1 = avg(performance_data[dataset])
    print(dataset,"F1 score\n\t",mean_f1)

cm1 F1 score
	 0.2722158328147624
jm1 F1 score
	 0.35286602431562497
kc1 F1 score
	 0.40635381673021903
kc2 F1 score
	 0.5181199391878842
pc1 F1 score
	 0.30121339319251705
