# 1. Reading the results

We read the results from txt files into Pandas DataFrames for analysis.

In [114]:
import pandas as pd
from os import listdir
from os.path import isfile, join


# Locate the raw result files (one txt file per experiment run).
parameter_iter_path = "./parameter_iter/"
parameter_iter_files = [f for f in listdir(parameter_iter_path) if isfile(join(parameter_iter_path, f))]

parameter_e_path = "./parameter_e/"
parameter_e_files = [f for f in listdir(parameter_e_path) if isfile(join(parameter_e_path, f))]

# Columns shared by both result tables.
column_names = ["Data-set", "Size", "Algorithm", "Support", "Run-time", "Memory", "Patterns", "Pattern-count", "Max-iteration", "E"]


def _parse_result_file(f_path):
    """Parse one result file into a record keyed by ``column_names``.

    The file is read as two ':'-separated columns ("A" = label, "B" = value).
    Values are picked by row position:

    * row 0: run-time (only the first 6 characters of the value are used)
    * row 1: memory, row 2: algorithm name
    * row 3: number of attributes, row 4: data set size
    * " GRAANK " files: row 5 = minimum support, row 7 = pattern count
    * other algorithms: row 5 = evaporation-factor / erasure-probability,
      row 6 = maximum iteration count, row 7 = minimum support,
      row 9 = pattern count

    The patterns themselves are the ``pattern count`` rows that follow the
    row whose label is exactly 'Pattern '.

    Raises whatever ``pd.read_csv`` or the numeric conversions raise when the
    file cannot be read or does not follow this layout.
    """
    res = pd.read_csv(f_path, names=["A", "B"], sep=':', header=None, engine='python')
    labels, values = res['A'], res['B']

    run = float(values[0][0:6])  # run-time
    mem = str(values[1])         # memory
    alg = str(values[2])         # algorithm
    att = int(values[3])         # number of attributes in the data set
    size = int(values[4])        # data set size

    if alg == " GRAANK ":
        # GRAANK has no evaporation/erasure or iteration parameters.
        evp = 0
        mit = 0
        sup = float(values[5])   # minimum support
        pat = int(values[7])     # patterns
    else:
        evp = float(values[5])   # evaporation-factor / erasure-probability
        mit = int(values[6])     # maximum iteration count
        sup = float(values[7])   # minimum support
        pat = int(values[9])     # patterns

    # Fetch patterns: each entry is [pattern label, support value].
    gp_pos = res.loc[labels == 'Pattern '].index[0] + 1  # 1st position
    gps = [[labels[pos], float(values[pos])] for pos in range(gp_pos, gp_pos + pat)]

    # The data set is identified by its attribute count.
    col = {10: "BCR", 15: "AQY"}.get(att, "")

    return {"Data-set": col, "Size": size, "Algorithm": alg, "Support": sup, "Run-time": run, "Memory": mem, "Patterns": gps, "Pattern-count": pat, "Max-iteration": mit, "E": evp}


def _load_results(dir_path, file_names):
    """Build a results DataFrame from every file in ``file_names`` under ``dir_path``.

    Files that cannot be read are reported and skipped, so a failed read can
    never be silently replaced by the previous file's contents. Rows are
    collected in a list and the frame is built once, because
    ``DataFrame.append`` no longer exists in pandas >= 2.0 and row-by-row
    growth is quadratic.
    """
    records = []
    for file in file_names:
        f_path = join(dir_path, file)
        try:
            records.append(_parse_result_file(f_path))
        except (OSError, pd.errors.ParserError, pd.errors.EmptyDataError) as exc:
            # Only read failures are skipped; layout errors still propagate.
            print(f"Skipping unreadable results file {f_path}: {exc}")

    df = pd.DataFrame(records, columns=column_names)
    # Assign the result back instead of a chained inplace replace, which does
    # not modify the frame under pandas Copy-on-Write.
    df['Algorithm'] = df['Algorithm'].replace({' Clu-GRAD (v1.3)': 'CluGRAD', ' ACO-GRAANK (v4.0)': 'ACoGRAD'})
    return df


# Parameter Setting (ACO & Clu) - maximum iteration
df_piter = _load_results(parameter_iter_path, parameter_iter_files)

# Parameter Setting (ACO & Clu) - evaporation-factor/erasure-probability
df_pe = _load_results(parameter_e_path, parameter_e_files)


df_pe

Unnamed: 0,Data-set,Size,Algorithm,Support,Run-time,Memory,Patterns,Pattern-count,Max-iteration,E
0,AQY,1291,CluGRAD,0.5,131.3,"[140.10546875, 139.21875, 145.93359375, 145.9...","[[['2-', '4-', '7-', '9-'] , 0.681], [['3+', '...",6,2,0.5
1,AQY,1291,ACoGRAD,0.5,63.36,"[109.66015625, 109.609375, 127.1875, 128.5937...","[[['10+', '14+', '3+', '5+'] , 0.519], [['8-',...",114,500,0.5
2,BCR,116,ACoGRAD,0.5,21.16,"[103.93359375, 103.91015625, 103.91015625, 10...","[[['6-', '1+'] , 0.581], [['7-', '2-'] , 0.52]...",21,500,0.25
3,AQY,1291,ACoGRAD,0.5,43.67,"[109.546875, 109.49609375, 126.796875, 128.50...","[[['3+', '4+', '2+', '7+', '8-'] , 0.51], [['7...",96,500,0.9
4,AQY,1291,CluGRAD,0.5,67.46,"[142.0625, 142.19921875, 142.734375, 142.7343...","[[['3-', '5-', '6-', '8+', '10-', '11-'] , 0.7...",5,2,0.75
5,BCR,116,ACoGRAD,0.5,14.31,"[103.7734375, 103.75, 103.75, 103.75, 103.75,...","[[['5+', '7+'] , 0.548], [['2+', '5+'] , 0.562...",18,500,0.5
6,BCR,116,CluGRAD,0.5,0.572,"[105.85546875, 105.9140625, 106.08984375, 106...","[[['1-', '5-'] , 0.717], [['3-', '4-'] , 0.893...",5,2,0.75
7,BCR,116,CluGRAD,0.5,1.056,"[106.6640625, 106.72265625, 106.765625, 106.9...","[[['3+', '4+'] , 0.93], [['3-', '4-'] , 0.929]...",4,2,0.25
8,BCR,116,CluGRAD,0.5,0.747,"[104.14453125, 106.19921875, 106.2421875, 106...","[[['3-', '4-'] , 0.919], [['6-', '7+'] , 0.58]...",5,2,0.5
9,BCR,116,CluGRAD,0.5,0.462,"[105.80078125, 105.859375, 105.984375, 105.98...","[[['1-', '5-'] , 0.681], [['1+', '5+'] , 0.656...",4,2,0.9


# 2. Parameter Setting

We conduct an analysis to determine:

* The optimum maximum number of iterations for the ACO-GRAD algorithm, and the optimum maximum number of iterations for estimating score-vectors in the Clu-GRAD algorithm.

* The optimum values for the evaporation-factor (ACO-GRAD) and the erasure-probability (Clu-GRAD).


## 2.1 Parameter Setting (Maximum Iterations)


In [112]:
import numpy as np

# Summarise pattern count and run-time per (data set, algorithm, max-iteration).
# Full list of data sets, kept for reference:
# data_sets = ['BCR', 'DIR', 'C2K', 'AQY', 'HPC']
data_sets = ['BCR', 'AQY']
algorithms = ['ACoGRAD', 'CluGRAD', 'GRAANK']
evals_it = [2, 5, 10, 100, 500, 1000]
evals_e = [0.25, 0.5, 0.75, 0.9]

eval_data_iter = []
eval_data_e = []


# Only include these columns
df_res = df_piter[["Data-set", "Algorithm", "Support", "Run-time", "Pattern-count", "Max-iteration", "Patterns", 'E']]

for ds in data_sets:
    df1 = df_res.loc[df_res['Data-set'] == ds]

    for alg in algorithms:
        # Literal substring match: the algorithm label is not a regex.
        df2 = df1.loc[df1['Algorithm'].str.contains(alg, regex=False)]

        for ev in evals_it:
            df3 = df2.loc[df2['Max-iteration'] == ev]
            if df3.empty:
                # No runs recorded for this combination.
                continue
            # Mean / standard deviation over the repeated runs of this setting.
            eval_data_iter.append([ds, alg, ev, df3['Pattern-count'].mean(), df3['Pattern-count'].std(), df3['Run-time'].mean(), df3['Run-time'].std()])

df_data_1 = pd.DataFrame(data=eval_data_iter, columns=['Dataset', 'Algorithm', 'Iterations', 'Patterns (mean)', 'Patterns (std)', 'Run-time (mean)', 'Run-time (std)'])
df_data_1


Unnamed: 0,Dataset,Algorithm,Iterations,Patterns (mean),Patterns (std),Run-time (mean),Run-time (std)
0,BCR,ACoGRAD,2,2.0,0.0,0.083333,0.013317
1,BCR,ACoGRAD,5,4.333333,0.57735,0.084333,0.002517
2,BCR,ACoGRAD,10,7.666667,0.57735,0.102333,0.001528
3,BCR,ACoGRAD,100,23.666667,1.154701,1.745333,0.224179
4,BCR,ACoGRAD,500,28.666667,0.57735,33.053333,0.677225
5,BCR,ACoGRAD,1000,29.333333,1.154701,113.433333,10.600629
6,BCR,CluGRAD,2,5.0,0.0,1.586333,0.017926
7,BCR,CluGRAD,5,5.0,0.0,3.023,0.030512
8,BCR,CluGRAD,10,5.0,0.0,5.385,0.137022
9,BCR,CluGRAD,100,5.0,0.0,47.94,1.075546


## 2.2 Parameter Setting (Evaporation Factor/Erasure Probability)



In [113]:
import numpy as np

# Summarise pattern count and run-time per (data set, algorithm, E), where E is
# the evaporation-factor / erasure-probability stored in the 'E' column.
# Full list of data sets, kept for reference:
# data_sets = ['BCR', 'DIR', 'C2K', 'AQY', 'HPC']
data_sets = ['BCR', 'AQY']
algorithms = ['ACoGRAD', 'CluGRAD', 'GRAANK']
evals_it = [2, 5, 10, 100, 500, 1000]
evals_e = [0.25, 0.5, 0.75, 0.9]

eval_data_iter = []
eval_data_e = []


# Only include these columns
df_res = df_pe[["Data-set", "Algorithm", "Support", "Run-time", "Pattern-count", "Max-iteration", "Patterns", 'E']]

for ds in data_sets:
    df1 = df_res.loc[df_res['Data-set'] == ds]

    for alg in algorithms:
        # Literal substring match: the algorithm label is not a regex.
        df2 = df1.loc[df1['Algorithm'].str.contains(alg, regex=False)]

        for ev in evals_e:
            df3 = df2.loc[df2['E'] == ev]
            if df3.empty:
                # No runs recorded for this combination.
                continue
            # Mean / standard deviation over the repeated runs of this setting.
            eval_data_e.append([ds, alg, ev, df3['Pattern-count'].mean(), df3['Pattern-count'].std(), df3['Run-time'].mean(), df3['Run-time'].std()])

# NOTE(review): the 'Iterations' column holds the E values here, not iteration
# counts; the name is kept unchanged in case later cells select it by name.
df_data_2 = pd.DataFrame(data=eval_data_e, columns=['Dataset', 'Algorithm', 'Iterations', 'Patterns (mean)', 'Patterns (std)', 'Run-time (mean)', 'Run-time (std)'])
df_data_2



Unnamed: 0,Dataset,Algorithm,Iterations,Patterns (mean),Patterns (std),Run-time (mean),Run-time (std)
0,BCR,ACoGRAD,0.25,21.666667,2.081666,24.863333,5.264659
1,BCR,ACoGRAD,0.5,19.333333,2.309401,12.340333,6.097939
2,BCR,ACoGRAD,0.75,16.0,4.582576,9.084,2.589615
3,BCR,ACoGRAD,0.9,18.0,1.732051,5.26,1.195455
4,BCR,CluGRAD,0.25,6.0,2.645751,1.175333,0.127108
5,BCR,CluGRAD,0.5,4.333333,0.57735,0.766,0.055973
6,BCR,CluGRAD,0.75,4.666667,0.57735,0.574333,0.004041
7,BCR,CluGRAD,0.9,6.0,2.0,0.482333,0.018175
8,AQY,ACoGRAD,0.25,107.0,15.874508,67.92,4.395009
9,AQY,ACoGRAD,0.5,114.0,2.0,64.533333,1.048872
