In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import matplotlib as mpl
# Select the matplotlib style sheet used for the figures in this notebook.
mpl.style.use('default') # seaborn-white is also not bad
# Last expression of the cell, so the notebook displays its value:
# the style names matplotlib reports as available.
mpl.style.available

In [None]:
def translate_names(ds):
    """Replace raw dataset identifiers in ``ds["dataset"]`` with short labels.

    Parameters
    ----------
    ds : pandas.DataFrame
        Frame to translate. It is modified in place when it has a
        ``"dataset"`` column; frames without that column are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``ds`` object, so the call can wrap ``pd.read_csv(...)``.
        Values that are not keys of the mapping below are kept as they are.
    """
    ds_names = {
        "viscode_t4_limited" : "VC",
        "viscode_t4_limited_art" : "ART",
        "gen" : "GEN",
        "astnn_t4" : "ASTNN"
    }

    if "dataset" in ds.columns:
        # Assign the translated column back to the frame. Calling
        # ``ds["dataset"].replace(..., inplace=True)`` is a chained in-place
        # update of a selected column: it emits a FutureWarning on pandas 2.2
        # and does not modify ``ds`` at all once Copy-on-Write is active.
        ds["dataset"] = ds["dataset"].replace(ds_names)
    return ds
        
def full_names(ls):
    """Expand short codes into the long labels used in figure titles and ticks.

    ``ls`` may be a single string or an iterable of codes. A string yields one
    label; any other input yields a list with one entry per element, in order.
    Codes that have no long form are passed through unchanged.
    """
    # Short dataset labels (the values produced by translate_names).
    dataset_labels = {
        "VC": "bcb non-clones",
        "ART": "generated non-clones",
        "GEN": "generalization",
    }
    # Codes that this notebook passes in from the "algorithm" column.
    algorithm_labels = {
        "bc": "neural network",
        "knn": "kNN",
        "svm": "SVM",
        "astnn": "ASTNN",
    }
    # Codes that this notebook passes in from the "visualization" column.
    visualization_labels = {
        "st": "simple text",
        "sh": "color highlighting",
        "kp": "geometric keywords",
        "as": "condensed AST",
    }
    fulls = {**dataset_labels, **algorithm_labels, **visualization_labels}

    def expand(code):
        # Fall back to the code itself when no long form is known.
        return fulls.get(code, code)

    if isinstance(ls, str):
        return expand(ls)
    return list(map(expand, ls))

In [None]:
# Template for the per-experiment result files; "%s" takes the experiment number.
filename_pattern = "experiment%s<timestamp>.csv"

# Experiment 1: read the results and shorten the dataset identifiers.
exp1_results = translate_names(pd.read_csv(filename_pattern % 1))
print(exp1_results.head())
print(exp1_results.columns)

# Remove "dataset" and "run" so they are not among the columns that the
# RQ1 plots average per visualization.
exp1_results = exp1_results.drop(columns=["dataset", "run"])
print(len(exp1_results))


In [None]:
# RQ1: one bar chart per algorithm, showing the mean of every remaining
# column for each visualization.
for alg in exp1_results["algorithm"].unique():
    alg_rows = exp1_results.loc[exp1_results["algorithm"] == alg]
    value_columns = alg_rows.columns.difference(["algorithm"])
    visualization_means = alg_rows[value_columns].groupby(["visualization"]).mean()
    print(visualization_means)

    # plot.bar returns the Axes it drew on, so style that object directly.
    ax = visualization_means.plot.bar(figsize=(4.5, 5))
    ax.set_title(
        "RQ1: Visualization influence\non algorithm: %s" % full_names(alg),
        fontweight="bold",
    )
    ax.legend(loc="lower center")
    ax.yaxis.grid(True)
    ax.set_xticklabels(full_names(visualization_means.index), rotation=45, ha="right")
    # Fixed y ticks from 0 to 1 in steps of 0.1.
    ax.set_yticks([step * 0.1 for step in range(11)])
    plt.show()

In [None]:
# Experiment 2: read the results and shorten the dataset identifiers.
exp2_raw = pd.read_csv(filename_pattern % 2)
exp2_results = translate_names(exp2_raw)
print(exp2_results.head())
print(exp2_results.columns)

# Remove "visualization" and "run" so they are not among the columns that
# the RQ2 plots average per algorithm.
exp2_results = exp2_results.drop(columns=["visualization", "run"])
print(len(exp2_results))


In [None]:
# RQ2: one bar chart per dataset, showing the mean of every remaining
# column for each algorithm.
for dataset in exp2_results["dataset"].unique():
    dataset_rows = exp2_results[exp2_results["dataset"] == dataset]
    without_dataset = dataset_rows[dataset_rows.columns.difference(["dataset"])]
    algorithm_means = without_dataset.groupby(["algorithm"]).mean()
    print(algorithm_means)

    ax = algorithm_means.plot.bar(figsize=(4.5, 5))
    title_text = "RQ2: Algorithm importance\non '%s' dataset" % full_names(dataset)
    ax.set_title(title_text, fontweight="bold")
    ax.yaxis.grid(True)
    ax.legend(loc="lower center")
    # Replace the algorithm codes on the x axis with their long names.
    tick_labels = full_names(algorithm_means.index)
    ax.set_xticklabels(tick_labels, rotation=45, ha="right")
    ax.set_yticks([i * 0.1 for i in range(11)])
    plt.show()

In [None]:
# Experiment 3: read the results and shorten the dataset identifiers.
exp3_results = translate_names(pd.read_csv(filename_pattern % 3))
print(exp3_results.head())
print(exp3_results.columns)
exp3_results = exp3_results.drop(columns=["visualization", "run"])
print(len(exp3_results))

# Append the "bc" rows of experiment 2 so that this algorithm is charted
# next to the experiment 3 algorithms in the RQ3 plots.
exp2_bc_rows = exp2_results[exp2_results["algorithm"] == "bc"]
exp3_results = pd.concat([exp3_results, exp2_bc_rows])


In [None]:
# RQ3: one bar chart per dataset, showing the mean of every remaining
# column for each algorithm (experiment 3 rows plus the "bc" rows that were
# appended from experiment 2).
for dataset in exp3_results["dataset"].unique():
    dataset_rows = exp3_results[exp3_results["dataset"] == dataset]
    value_columns = dataset_rows.columns.difference(["dataset"])
    algorithm_means = dataset_rows[value_columns].groupby(["algorithm"]).mean()
    print(algorithm_means)

    # plot.bar returns the Axes it drew on; style that object directly
    # instead of going through the pyplot "current axes" state.
    ax = algorithm_means.plot.bar(figsize=(2.8, 5))
    # Title spelling fixed: "Comparison" (was "Comparision").
    ax.set_title(
        "RQ3: Comparison to SOTA\n on '%s' dataset" % full_names(dataset),
        fontweight="bold",
    )
    ax.yaxis.grid(True)
    ax.legend(loc="lower center")
    ax.set_xticklabels(full_names(algorithm_means.index), rotation=45, ha="right")
    # Fixed y ticks from 0 to 1 in steps of 0.1.
    ax.set_yticks([step * 0.1 for step in range(11)])
    plt.show()

In [None]:
# Experiment 4: read the results and shorten the dataset identifiers.
exp4_path = filename_pattern % 4
exp4_results = translate_names(pd.read_csv(exp4_path))
print(exp4_results.head())
print(exp4_results.columns)

# Remove "visualization" and "run" so they are not among the columns that
# the RQ4 plots average per algorithm.
exp4_results = exp4_results.drop(columns=["visualization", "run"])
print(len(exp4_results))

In [None]:
# RQ4: one bar chart per dataset, showing the mean of every remaining
# column for each algorithm.
for dataset in exp4_results["dataset"].unique():
    is_current_dataset = exp4_results["dataset"] == dataset
    dataset_rows = exp4_results[is_current_dataset]
    kept_columns = dataset_rows.columns.difference(["dataset"])
    algorithm_means = dataset_rows[kept_columns].groupby(["algorithm"]).mean()
    print(algorithm_means)

    ax = algorithm_means.plot.bar(figsize=(9, 5))
    ax.set_title(
        "RQ4: Generalization abilities\n on '%s' dataset" % full_names(dataset),
        fontweight="bold",
    )
    ax.legend(loc="lower center")
    ax.yaxis.grid(True)
    # Replace the algorithm codes on the x axis with their long names.
    ax.set_xticklabels(full_names(algorithm_means.index), rotation=45, ha="right")
    ax.set_yticks([tick * 0.1 for tick in range(11)])
    plt.show()

In [None]:
# Experiment 5: read the results and shorten the dataset identifiers.
exp5_results = translate_names(pd.read_csv(filename_pattern % 5))
print(exp5_results.head())
print(exp5_results.columns)

# Remove "algorithm" and "run" so they are not among the columns that the
# RQ5 plots average per visualization.
exp5_results = exp5_results.drop(columns=["algorithm", "run"])
print(len(exp5_results))

In [None]:
# RQ5: one bar chart per dataset, showing the mean of every remaining
# column for each visualization.
for dataset in exp5_results["dataset"].unique():
    dataset_rows = exp5_results.loc[exp5_results["dataset"] == dataset]
    value_columns = dataset_rows.columns.difference(["dataset"])
    visualization_means = dataset_rows[value_columns].groupby(["visualization"]).mean()
    print(visualization_means)

    ax = visualization_means.plot.bar(figsize=(4.5, 5))
    heading = "RQ5: Code Classification Performance\n on '%s' dataset" % full_names(dataset)
    ax.set_title(heading, fontweight="bold")
    ax.legend(loc="center")
    ax.yaxis.grid(True)
    # Replace the visualization codes on the x axis with their long names.
    x_labels = full_names(visualization_means.index)
    ax.set_xticklabels(x_labels, rotation=45, ha="right")
    ax.set_yticks([k * 0.1 for k in range(11)])
    plt.show()