# 1. Reading the results

We read the results from txt files into Pandas DataFrames for analysis.

In [90]:
import pandas as pd
from os import listdir
from os.path import isfile, join


# Read all results from txt files
parameter_iter_path = "./parameter_iter/"
parameter_iter_files = [f for f in listdir(parameter_iter_path) if isfile(join(parameter_iter_path, f))]

# Parameter Setting (ACO & CLu) - maximum iteration
column_names = ["Data-set", "Size", "Algorithm", "Support", "Run-time", "Memory", "Patterns", "Pattern-count", "Max-iteration", "E"]
df_piter = pd.DataFrame(columns = column_names)

for file in parameter_iter_files:
    f_path = join(parameter_iter_path,file)
    try:
        res = pd.read_csv(f_path, names=["A","B"], sep = ':', header=None, engine='python')
    except Exception:
        print(f_path)
    
    run = float(res['B'][0][0:6]) # run-time
    mem = str(res['B'][1]) # memory
    alg = str(res['B'][2]) # algorithm
    att = int(res['B'][3]) # number of attributes in the data set
    size = int(res['B'][4])  # data set size

    if alg == " GRAANK ":
        evp = 0
        mit = 0
        sup = float(res['B'][5]) # minimum support
        pat = int(res['B'][7]) # patterns
    else:
        evp = float(res['B'][5]) # evaporation-factor / erasure-probability
        mit = int(res['B'][6]) # maximum iteraction count
    
        sup = float(res['B'][7]) # minimum support
        pat = int(res['B'][9]) # patterns
    
    # Fetch patterns
    gps = [""] * pat
    gp_pos = res.loc[res['A'] == 'Pattern '].index[0] + 1 # 1st position
    for i in range(pat):
        pos = i + gp_pos
        gps[i] = [res['A'][pos], float(res['B'][pos])]
    
    col = "AQY"

    df_piter = df_piter.append({"Data-set": col, "Size": size, "Algorithm":alg, "Support": sup, "Run-time":run, "Memory":mem, "Patterns":gps, "Pattern-count":pat, "Max-iteration":mit, "E": evp}, ignore_index=True)

df_piter['Algorithm'].replace({' Clu-GRAD (v1.3)' : 'CluGRAD'}, inplace=True)
df_piter['Algorithm'].replace({' ACO-GRAANK (v4.0)' : 'ACoGRAD'}, inplace=True)

df_piter

Unnamed: 0,Data-set,Size,Algorithm,Support,Run-time,Memory,Patterns,Pattern-count,Max-iteration,E
0,AQY,1291,CluGRAD,0.5,570.8,"[176.45703125, 172.50390625, 172.96875, 172.9...","[[['3+', '5+', '6+', '8-', '10+', '11+'] , 0.7...",5,10,0.0
1,AQY,1291,GRAANK,0.5,84.0,"[111.30859375, 111.2890625, 128.3984375, 131....","[[['12+', '2+'] , 0.532], [['14-', '2+'] , 0.5...",108,0,0.0
2,AQY,1291,ACoGRAD,0.5,1.93,"[107.6640625, 107.64453125, 126.0546875, 127....","[[['9+', '2+', '11+', '4+'] , 0.526], [['9+', ...",2,2,0.0
3,AQY,1291,ACoGRAD,0.5,7.72,"[110.6484375, 109.49609375, 127.30859375, 128...","[[['11+', '10+', '2+', '4+', '3+'] , 0.5], [['...",58,100,0.0
4,AQY,1291,CluGRAD,0.5,249.8,"[174.84765625, 174.875, 175.33984375, 175.339...","[[['3+', '5+', '6+', '8-', '10+', '11+'] , 0.7...",5,2,0.0
5,AQY,1291,CluGRAD,0.5,385.4,"[172.02734375, 172.07421875, 172.72265625, 17...","[[['3+', '5+', '6+', '8-', '10+', '11+'] , 0.7...",5,5,0.0
6,AQY,1291,CluGRAD,0.5,378.9,"[151.7421875, 152.03515625, 152.5703125, 152....","[[['3+', '5+', '6+', '8-', '10+', '11+'] , 0.7...",5,5,0.0
7,AQY,1291,CluGRAD,0.5,4028.0,"[177.85546875, 177.91796875, 178.453125, 178....","[[['3+', '5+', '6+', '8-', '10+', '11+'] , 0.7...",5,100,0.0
8,AQY,1291,ACoGRAD,0.5,2.262,"[107.76953125, 107.71875, 126.07421875, 127.5...","[[['13-', '10+'] , 0.516], [['14-', '6-', '11-...",10,10,0.0
9,AQY,1291,ACoGRAD,0.5,2.246,"[107.76953125, 107.71875, 126.07421875, 127.6...","[[['14-', '6-'] , 0.566], [['10+', '13-'] , 0....",9,10,0.0


# 2. Parameter Setting

We conduct an analysis to determine:

* The optimum maximum-iteration for the ACO-GRAD algorithm and the optimum max-iteration for estimating score-vectors for the Clu-GRAD algorithm.

* The optimum values for evaporation-factor (ACO-GRAD) and erasure-probability (CLu-GRAD).


In [91]:
import numpy as np

algorithms = ['ACoGRAD', 'CluGRAD', 'GRAANK']
evals = [2, 5, 10, 100, 500]
eval_data = []


# Only include these columns
df_res = df_piter[["Algorithm", "Support", "Run-time", "Pattern-count", "Max-iteration", "Patterns"]]

for alg in algorithms:
    df1 = df_res.loc[df_res['Algorithm'].str.contains(alg)]
    # print(df1)
    for ev in evals:
        df2 = df1.loc[df1['Max-iteration'] == ev]
        if not df2.empty:
            # print(df2)
        
            pat_arr = []
            for index,row in df2.iterrows():
                # 10/100/1000 evaluations
                pat_arr.append(row['Patterns'])
                # print(pat_arr)

            eval_data.append([alg, df2['Pattern-count'].mean(), df2['Pattern-count'].std(), df2['Run-time'].mean(), df2['Run-time'].std()])

df_data = pd.DataFrame(data=eval_data, columns=['Algorithm', 'Patterns (mean)', 'Patterns (std)', 'Run-time (mean)', 'Run-time (std)'])
df_data


Unnamed: 0,Algorithm,Patterns (mean),Patterns (std),Run-time (mean),Run-time (std)
0,ACoGRAD,2.0,0.0,1.925,0.006245
1,ACoGRAD,4.666667,0.57735,2.036333,0.01893
2,ACoGRAD,9.0,1.0,2.251667,0.008963
3,ACoGRAD,62.0,5.656854,7.834,0.16122
4,CluGRAD,5.0,0.0,251.566667,3.059956
5,CluGRAD,5.0,0.0,377.6,8.52467
6,CluGRAD,5.0,0.0,568.7,2.594224
7,CluGRAD,5.0,0.0,4082.5,77.074639
