# 1. Reading the results

We read the results from txt files into Pandas DataFrames for analysis.

In [41]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join


# Read all results from txt files
parameter_iter_path = "./parameter_iter/"
parameter_iter_files = [f for f in listdir(parameter_iter_path) if isfile(join(parameter_iter_path, f))]

parameter_e_path = "./parameter_e/"
parameter_e_files = [f for f in listdir(parameter_e_path) if isfile(join(parameter_e_path, f))]

# Parameter Setting (ACO & CLu) - maximum iteration
column_names = ["Data-set", "Size", "Algorithm", "Support", "Run-time", "Memory", "Patterns", "Pattern-count", "Max-iteration", "E"]
df_piter = pd.DataFrame(columns = column_names)

for file in parameter_iter_files:
    f_path = join(parameter_iter_path,file)
    try:
        res = pd.read_csv(f_path, names=["A","B"], sep = ':', header=None, engine='python')
    except Exception:
        print(f_path)
    
    run = float(res['B'][0][0:6]) # run-time
    mem = str(res['B'][1]) # memory
    alg = str(res['B'][2]) # algorithm
    att = int(res['B'][3]) # number of attributes in the data set
    size = int(res['B'][4])  # data set size

    if alg == " GRAANK ":
        evp = 0
        mit = 0
        sup = float(res['B'][5]) # minimum support
        pat = int(res['B'][7]) # patterns
    else:
        evp = float(res['B'][5]) # evaporation-factor / erasure-probability
        mit = int(res['B'][6]) # maximum iteraction count
    
        sup = float(res['B'][7]) # minimum support
        pat = int(res['B'][9]) # patterns
    
    # Fetch patterns
    gps = [""] * pat
    gp_pos = res.loc[res['A'] == 'Pattern '].index[0] + 1 # 1st position
    for i in range(pat):
        pos = i + gp_pos
        gps[i] = [res['A'][pos], float(res['B'][pos])]
    
    if att == 10:
        col = "BCR"
    elif att == 15:
        col = "AQY"
    else:
        col = ""
    df_piter = df_piter.append({"Data-set": col, "Size": size, "Algorithm":alg, "Support": sup, "Run-time":run, "Memory":mem, "Patterns":gps, "Pattern-count":pat, "Max-iteration":mit, "E": evp}, ignore_index=True)
df_piter['Algorithm'].replace({' Clu-GRAD (v1.3)' : 'CluGRAD'}, inplace=True)
df_piter['Algorithm'].replace({' ACO-GRAANK (v4.0)' : 'ACoGRAD'}, inplace=True)




# Parameter Setting (ACO & CLu) - evaporation-factor/erasure-probability
column_names = ["Data-set", "Size", "Algorithm", "Support", "Run-time", "Memory", "Patterns", "Pattern-count", "Misclustered-patterns", "Support-error", "Max-iteration", "E"]
df_pe = pd.DataFrame(columns = column_names)

for file in parameter_e_files:
    f_path = join(parameter_e_path,file)
    try:
        res = pd.read_csv(f_path, names=["A","B"], sep = ':', header=None, engine='python')
    except Exception:
        print(f_path)
    
    run = float(res['B'][0][0:6]) # run-time
    mem = str(res['B'][1]) # memory
    alg = str(res['B'][2]) # algorithm
    att = int(res['B'][3]) # number of attributes in the data set
    size = int(res['B'][4])  # data set size

    if alg == " GRAANK ":
        evp = 0
        mit = 0
        sup = float(res['B'][5]) # minimum support
        pat = int(res['B'][7]) # patterns
    else:
        evp = float(res['B'][5]) # evaporation-factor / erasure-probability
        mit = int(res['B'][6]) # maximum iteraction count
    
        sup = float(res['B'][7]) # minimum support
        pat = int(res['B'][9]) # patterns
    
    # Fetch patterns
    if alg == " Clu-GRAD (v1.4)":
        miss = 0
        gps = [""] * pat
        errs = np.zeros(pat)  #[0.0] * pat
        gp_pos = res.loc[res['A'] == 'Comparison '].index[0] + 1 # 1st position
        for i in range(pat):
            pos = i + gp_pos
            sups = str(res['B'][pos])
            gps[i] = [res['A'][pos], sups]
            sup_arr = sups.split(", ")
            est_sup = float(sup_arr[0])
            true_sup = float(sup_arr[1])
            if true_sup > 0:
                temp_err = (true_sup - est_sup) / true_sup
            else:
                miss += 1
            errs[i] = abs(temp_err)
        # print(errs)
        error_rate = errs.mean()
    else:
        gps = [""] * pat
        gp_pos = res.loc[res['A'] == 'Pattern '].index[0] + 1 # 1st position
        for i in range(pat):
            pos = i + gp_pos
            gps[i] = [res['A'][pos], float(res['B'][pos])]
        error_rate = 0
        miss = 0
    if att == 10:
        col = "BCR"
    elif att == 15:
        col = "AQY"
    elif att == 98:
        col = "C2K"
    else:
        col = ""
    df_pe = df_pe.append({"Data-set": col, "Size": size, "Algorithm":alg, "Support": sup, "Run-time":run, "Memory":mem, "Patterns":gps, "Pattern-count":pat, "Misclustered-patterns":miss, "Support-error":error_rate, "Max-iteration":mit, "E": evp}, ignore_index=True)
df_pe['Algorithm'].replace({' Clu-GRAD (v1.4)' : 'CluGRAD'}, inplace=True)
df_pe['Algorithm'].replace({' ACO-GRAANK (v4.0)' : 'ACoGRAD'}, inplace=True)


# df_pe

# 2. Parameter Setting

We conduct an analysis to determine:

* The optimum maximum-iteration for the ACO-GRAD algorithm and the optimum max-iteration for estimating score-vectors for the Clu-GRAD algorithm.

* The optimum values for evaporation-factor (ACO-GRAD) and erasure-probability (CLu-GRAD).


## 2.1 Parameter Setting (Maximum Iterations)


In [42]:
import numpy as np

# data_sets = ['BCR', 'DIR', 'C2K', 'AQY', 'HPC']
data_sets = ['BCR', 'AQY']
algorithms = ['ACoGRAD', 'CluGRAD', 'GRAANK']
evals_it = [2, 5, 10, 100, 500, 1000]
evals_e = [0.25, 0.5, 0.75, 0.9]

eval_data_iter = []
eval_data_e = []


# Only include these columns
df_res = df_piter[["Data-set", "Algorithm", "Support", "Run-time", "Pattern-count", "Max-iteration", "Patterns", 'E']]

for ds in data_sets:
    df1 = df_res.loc[df_res['Data-set'] == ds]

    for alg in algorithms:
        df2 = df1.loc[df1['Algorithm'].str.contains(alg)]
        # print(df1)
        for ev in evals_it:
            df3 = df2.loc[df2['Max-iteration'] == ev]
            if not df3.empty:
                # print(df2)
                pat_arr = []
                for index,row in df3.iterrows():
                    # 10/100/1000 evaluations
                    pat_arr.append(row['Patterns'])
                    # print(pat_arr)
                eval_data_iter.append([ds, alg, ev, df3['Pattern-count'].mean(), df3['Pattern-count'].std(), df3['Run-time'].mean(), df3['Run-time'].std()])
                       
df_data_1 = pd.DataFrame(data=eval_data_iter, columns=['Dataset', 'Algorithm', 'Iterations', 'Patterns (mean)', 'Patterns (std)', 'Run-time (mean)', 'Run-time (std)'])
df_data_1


Unnamed: 0,Dataset,Algorithm,Iterations,Patterns (mean),Patterns (std),Run-time (mean),Run-time (std)
0,BCR,ACoGRAD,2,2.0,0.0,0.083333,0.013317
1,BCR,ACoGRAD,5,4.333333,0.57735,0.084333,0.002517
2,BCR,ACoGRAD,10,7.666667,0.57735,0.102333,0.001528
3,BCR,ACoGRAD,100,23.666667,1.154701,1.745333,0.224179
4,BCR,ACoGRAD,500,28.666667,0.57735,33.053333,0.677225
5,BCR,ACoGRAD,1000,29.333333,1.154701,113.433333,10.600629
6,BCR,CluGRAD,2,5.0,0.0,1.586333,0.017926
7,BCR,CluGRAD,5,5.0,0.0,3.023,0.030512
8,BCR,CluGRAD,10,5.0,0.0,5.385,0.137022
9,BCR,CluGRAD,100,5.0,0.0,47.94,1.075546


## 2.2 Parameter Setting (Evaporation Factor/Erasure Probability)



In [43]:
import numpy as np

# data_sets = ['BCR', 'DIR', 'C2K', 'AQY', 'HPC']
data_sets = ['BCR', 'AQY', 'C2K']
algorithms = ['ACoGRAD', 'CluGRAD', 'GRAANK']
evals_it = [2, 5, 10, 100, 500, 1000]
evals_e = [0.25, 0.5, 0.75, 0.9]

eval_data_iter = []
eval_data_e = []


# Only include these columns
df_res = df_pe[["Data-set", "Algorithm", "Support", "Run-time", "Pattern-count", "Misclustered-patterns", "Support-error", "Max-iteration", "Patterns", 'E']]

for ds in data_sets:
    df1 = df_res.loc[df_res['Data-set'] == ds]

    for alg in algorithms:
        df2 = df1.loc[df1['Algorithm'].str.contains(alg)]
        # print(df1)
        for ev in evals_e:
            df3 = df2.loc[df2['E'] == ev]
            if not df3.empty:
                # print(df2)
                pat_arr = []
                for index,row in df3.iterrows():
                    # 10/100/1000 evaluations
                    pat_arr.append(row['Patterns'])
                    # print(pat_arr)
                eval_data_e.append([ds, alg, ev, df3['Pattern-count'].mean(), df3['Pattern-count'].std(), df3['Misclustered-patterns'].mean(), df3['Misclustered-patterns'].std(), df3['Support-error'].mean(), df3['Support-error'].std(), df3['Run-time'].mean(), df3['Run-time'].std()])
                       
df_data_2 = pd.DataFrame(data=eval_data_e, columns=['Dataset', 'Algorithm', 'EFactor/ EProbability', 'Patterns (mean)', 'Patterns (std)', 'Misclustered (mean)', 'Misclustered (std)', 'Support-error (mean)', 'Support-error (std)', 'Run-time (mean)', 'Run-time (std)'])
df_data_2



Unnamed: 0,Dataset,Algorithm,EFactor/ EProbability,Patterns (mean),Patterns (std),Misclustered (mean),Misclustered (std),Support-error (mean),Support-error (std),Run-time (mean),Run-time (std)
0,BCR,ACoGRAD,0.25,21.166667,1.722401,0.0,0.0,0.0,0.0,23.836667,6.607325
1,BCR,ACoGRAD,0.5,19.333333,1.505545,0.0,0.0,0.0,0.0,13.6485,4.351064
2,BCR,ACoGRAD,0.75,18.857143,1.345185,0.0,0.0,0.0,0.0,8.5,1.584887
3,BCR,ACoGRAD,0.9,19.0,1.732051,0.0,0.0,0.0,0.0,5.429778,0.812165
4,BCR,CluGRAD,0.25,4.888889,0.600925,0.666667,0.5,0.006222,0.00288,1.15,0.073651
5,BCR,CluGRAD,0.5,5.444444,1.589899,0.777778,0.440959,0.014778,0.006027,0.831,0.07504
6,BCR,CluGRAD,0.75,4.555556,1.509231,0.444444,0.527046,0.037007,0.010533,0.556222,0.053993
7,BCR,CluGRAD,0.9,5.0,1.322876,0.444444,0.527046,0.085282,0.017019,0.457667,0.042152
8,AQY,ACoGRAD,0.25,114.5,3.109126,0.0,0.0,0.0,0.0,65.6475,0.460172
9,AQY,ACoGRAD,0.5,111.333333,6.250333,0.0,0.0,0.0,0.0,65.536667,1.834608
