## Imports

In [1]:
#General
import pandas as pd
import numpy as np
#Visualization
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib

## Result analysis

In [2]:
# Model names iterated by the confidence-interval analysis.
models = [
    "DecisionTreeClassifier",
    "GaussianNB",
    "KNeighborsClassifier",
    "LogisticRegression",
    "REPD",
]

# Versions evaluated for each dataset; insertion order defines the dataset order.
dataset_versions = dict(
    ant=["1.5", "1.6"],
    camel=["1.2", "1.4"],
    log4j=["1.1", "1.2"],
    poi=["2.0", "2.5"],
)
datasets = list(dataset_versions)

# Feature-type tags used as part of each result file name.
feature_types = "dbn da ca".split()

# Number of result files read per feature type (file indices 0 .. count-1).
per_feature_type_count = 30

# Column names assigned to every result file when it is read.
colnames = [
    'Unnamed: 0',
    'Model',
    'Accuracy',
    'Precision',
    'Recall',
    'F1 score',
    "Dataset",
    "Version",
    "Feature_type",
    "i",
]

In [6]:
# Global matplotlib font size (applies to any figure drawn after this point).
font = {'size': 80}
matplotlib.rc('font', **font)

# For every dataset/version pair: load all result files, then print the mean
# F1 score for each (feature type, model) combination.
for dataset in datasets:
    for version in dataset_versions[dataset]:
        # Read each result file exactly once and concatenate in a single call;
        # growing a DataFrame with pd.concat inside the loop re-copies all
        # previously loaded rows on every iteration.
        run_frames = [
            pd.read_csv(
                "results/"+dataset+'_'+version+'_'+feature_type+'_'+str(i),
                names=colnames,
                header=None,
            ).drop(columns=["Unnamed: 0"])
            for feature_type in feature_types
            for i in range(per_feature_type_count)
        ]
        main_df = pd.concat(run_frames)

        #Print performance results
        print(dataset,version)
        # Select the column *before* aggregating: averaging the whole frame
        # first also tries to average the remaining columns, which raises on
        # pandas >= 2.0 if any of them is non-numeric (presumably "Dataset").
        res = main_df.groupby(by=["Feature_type","Model"])["F1 score"].mean()
        print()
        print(res)#.to_latex()
        print()

        # Disabled plotting code, kept unchanged as a no-op string literal.
        # NOTE(review): it passes x/y to sns.violinplot positionally and calls
        # fig.suptitle twice -- review before re-enabling.
        """
        #Plot performance results
        for measure_name in ["F1 score","Recall","Precision"]:
            df = main_df.drop(columns=['Accuracy'])
            new_dfs = []

            for n, feature_type in enumerate(list(set(df.Feature_type.values))):
                new_df = []
                for model in list(set(df.Model.values)):
                    f1_scores = []
                    temp = df.loc[(df['Feature_type'] == feature_type) & (df['Model'] == model)][measure_name].values
                    for value in temp:
                        d = {
                            'Feature_typeModel' : '{: <26}'.format(", ".join([feature_type, model])), measure_name : value
                        }
                        new_df.append(d)

                new_df = pd.DataFrame(new_df)
                new_dfs.append(new_df)

            fig, axs = plt.subplots(3, 1, figsize=(64,64), sharex=True, sharey="row", gridspec_kw={'hspace': 0.1})

            for n, new_df in enumerate(new_dfs):
                ax=axs[n]
                sns.violinplot(measure_name,'Feature_typeModel', data=new_df, ax = ax)
                #
                yax = ax.get_yaxis()
                yax.grid(True)
                yax.set_ticks_position('right')
                pad = max([len(i) for i in new_df['Feature_typeModel'].values])
                yax.set_tick_params(pad=pad)
                y_label = yax.get_label()
                y_label.set_visible(False)
                #
                xax = ax.get_xaxis()
                xax.grid(True)
                x_label = xax.get_label()
                x_label.set_visible(False)
                #
                ax.set_title("")
            fig.suptitle(measure_name)
            st = fig.suptitle(measure_name, fontsize="x-large")
            st.set_y(0.95)
            fig.subplots_adjust(top=0.93)
            fig.tight_layout()
            plt.tight_layout()
            plt.savefig(dataset+"_"+version+"_"+measure_name+"_results.pdf",format='pdf',bbox_inches='tight')
            plt.show()
        """
        


ant 1.5

Feature_type  Model                 
ca            DecisionTreeClassifier    0.189696
              GaussianNB                0.397834
              HSME                      0.205727
              KNeighborsClassifier      0.172217
              LogisticRegression        0.000000
              REPD                      0.346162
da            DecisionTreeClassifier    0.193566
              GaussianNB                0.348395
              HSME                      0.230333
              KNeighborsClassifier      0.257090
              LogisticRegression        0.043307
              REPD                      0.267059
dbn           DecisionTreeClassifier    0.129656
              GaussianNB                0.116983
              HSME                      0.102012
              KNeighborsClassifier      0.030831
              LogisticRegression        0.076182
              REPD                      0.197420
Name: F1 score, dtype: float64

ant 1.6

Feature_type  Model            

In [4]:
from scipy.stats import sem, t
from scipy import mean

In [5]:
# Confidence level of the two-sided intervals computed below.
confidence = 0.95

# For every dataset/version pair: load all result files, print the mean F1
# score with its confidence-interval half-width for each (feature type, model),
# and report which models' intervals overlap with REPD's.
for dataset in datasets:
    for version in dataset_versions[dataset]:
        # Read each result file exactly once and concatenate in a single call;
        # growing a DataFrame with pd.concat inside the loop re-copies all
        # previously loaded rows on every iteration.
        run_frames = [
            pd.read_csv(
                "results/"+dataset+'_'+version+'_'+feature_type+'_'+str(i),
                names=colnames,
                header=None,
            ).drop(columns=["Unnamed: 0"])
            for feature_type in feature_types
            for i in range(per_feature_type_count)
        ]
        main_df = pd.concat(run_frames)

        print(dataset,version)
        for feature_type in feature_types:
            intervals = []
            for model in models:
                selected = (main_df["Feature_type"]==feature_type) & (main_df["Model"]==model)
                f1_values = main_df.loc[selected, "F1 score"].values
                #
                n = len(f1_values)
                # np.mean replaces the `scipy.mean` alias, which was only a
                # deprecated re-export of numpy.mean and is absent from newer
                # SciPy releases.
                m = np.mean(f1_values)
                std_err = sem(f1_values)
                # Interval half-width: standard error times the Student-t
                # quantile with n - 1 degrees of freedom.
                h = std_err * t.ppf((1 + confidence) / 2, n - 1)
                #
                start = m-h
                end = m+h
                intervals.append((model,start,end))
                #
                print("\t",feature_type, model, n, round(m,4),"+/-",round(h,4))

            # Compare every unordered pair of intervals; only pairs that
            # include REPD are printed and counted.
            count = 0
            for i in range(len(intervals)):
                for j in range(i+1,len(intervals)):
                    m1,s1,e1 = intervals[i]
                    m2,s2,e2 = intervals[j]
                    # Two closed intervals overlap when each one starts no
                    # later than the other one ends.
                    if s1 <= e2 and s2 <= e1:
                        if m1=="REPD" or m2=="REPD":
                            count = count + 1
                            print(m1,m2)
            if count > 0:
                print("Overlap count:", count)
            print()
        print()
        print()

ant 1.5
	 dbn DecisionTreeClassifier 900 0.1297 +/- 0.0077
	 dbn GaussianNB 900 0.117 +/- 0.0082
	 dbn KNeighborsClassifier 900 0.0308 +/- 0.0057
	 dbn LogisticRegression 900 0.0762 +/- 0.0076
	 dbn REPD 900 0.1974 +/- 0.004

	 da DecisionTreeClassifier 900 0.1936 +/- 0.0093
	 da GaussianNB 900 0.3484 +/- 0.0085
	 da KNeighborsClassifier 900 0.2571 +/- 0.0108
	 da LogisticRegression 900 0.0433 +/- 0.0066
	 da REPD 900 0.2671 +/- 0.0072
KNeighborsClassifier REPD
Overlap count: 1

	 ca DecisionTreeClassifier 900 0.1897 +/- 0.0094
	 ca GaussianNB 900 0.3978 +/- 0.01
	 ca KNeighborsClassifier 900 0.1722 +/- 0.01
	 ca LogisticRegression 900 0.0 +/- 0.0
	 ca REPD 900 0.3462 +/- 0.009



ant 1.6
	 dbn DecisionTreeClassifier 900 0.2807 +/- 0.0062
	 dbn GaussianNB 900 0.2512 +/- 0.0066
	 dbn KNeighborsClassifier 900 0.1659 +/- 0.0065
	 dbn LogisticRegression 900 0.2276 +/- 0.0061
	 dbn REPD 900 0.4162 +/- 0.0039

	 da DecisionTreeClassifier 900 0.4969 +/- 0.0065
	 da GaussianNB 900 0.6396 +/- 0