In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Root directories of the two result trees compared in this notebook:
# the GPT results and the CS results (labelled "CodeScholar" in the plots).
# The last path component looks like the date of the run -- TODO confirm.
GPT_RESULTS_PATH = os.path.join("../gpt", "results", "2023-07-03")
CS_RESULTS_PATH = os.path.join("../results", "2023-07-04")

In [3]:
# Count the number of idioms per API in the GPT and CodeScholar (CS) result trees.
# Both trees are laid out as <results root>/<library>/<api>/...


def _count_gpt_idioms(results_path):
    """Return ``{api: number of entries}`` for a GPT results tree.

    Every entry inside an ``<library>/<api>/`` directory counts as one idiom.
    Entries that are not directories at the library or API level (stray
    files) are skipped instead of crashing ``os.listdir``. If the same API
    name occurs under several libraries, the one listed last wins.

    The returned dict is ordered by API name.
    """
    counts = {}
    for lib in os.listdir(results_path):
        lib_path = os.path.join(results_path, lib)
        if not os.path.isdir(lib_path):
            continue

        for api in os.listdir(lib_path):
            api_path = os.path.join(lib_path, api)
            if not os.path.isdir(api_path):
                continue
            counts[api] = len(os.listdir(api_path))

    return dict(sorted(counts.items()))


def _count_cs_idioms(results_path):
    """Return ``{api: number of selected idioms}`` for a CS results tree.

    Idiom programs are read from ``<library>/<api>/idioms/progs``. File names
    must consist of five ``_``-separated fields; the last two are the
    neighbourhood count and the hole count (the latter followed by the file
    extension). A file is counted when its hole count is 0 and its
    neighbourhood count is positive. Counts for an API name that occurs
    under several libraries are accumulated.

    Non-directory entries at the library or API level are skipped. A
    missing ``idioms/progs`` directory or a malformed file name still raises,
    so incomplete result trees are not silently under-counted.

    The returned dict is ordered by API name.
    """
    counts = {}
    for lib in os.listdir(results_path):
        lib_path = os.path.join(results_path, lib)
        if not os.path.isdir(lib_path):
            continue

        for api in os.listdir(lib_path):
            api_path = os.path.join(lib_path, api)
            if not os.path.isdir(api_path):
                continue

            # Register the API even if none of its programs qualifies.
            counts.setdefault(api, 0)

            for fname in os.listdir(os.path.join(api_path, "idioms", "progs")):
                # The first three fields are not needed for counting.
                _, _, _, nhood_count, hole = fname.split("_")
                hole = hole.split(".")[0]  # strip the file extension

                if int(hole) == 0 and int(nhood_count) > 0:
                    counts[api] += 1

    return dict(sorted(counts.items()))


gpt_idiom_counts = _count_gpt_idioms(GPT_RESULTS_PATH)
cs_idiom_counts = _count_cs_idioms(CS_RESULTS_PATH)

In [4]:
# plt.figure(figsize=(20, 10))
# plt.bar(cs_idiom_counts.keys(), cs_idiom_counts.values(), label="CodeScholar")
# plt.xticks(rotation=90)
# plt.xlabel("API")
# plt.ylabel("Number of idioms")
# plt.title("Number of idioms per API")
# plt.legend()
# plt.show()

In [5]:
# plt.figure(figsize=(20, 10))
# plt.bar(gpt_idiom_counts.keys(), gpt_idiom_counts.values(), label="GPT-3")
# plt.xticks(rotation=90)
# plt.xlabel("API")
# plt.ylabel("Number of idioms")
# plt.title("Number of idioms per API")
# plt.legend()
# plt.show()

In [6]:
# parse EMD result logs (read below from ../emd/emd.single.result and ../emd/emd.multi.result)
# the format is:
# ========== [pandas: pd.read_csv] ==========
# Programs: 20000
# CS idioms: 149
# GPT idioms: 8
# Random idioms: 10
# CS EMD: 0.40105256416916146
# GPT EMD: 0.4697192690205559
# Random EMD: 0.4489479035710553
# =====================================

import regex as re


def parse_emd_log(logs):
    """Extract the EMD scores from the text of an EMD result log.

    The log is scanned for records of the form::

        ========== [<group>: <api>] ==========
        Programs: ...
        CS idioms: ...
        GPT idioms: ...
        Random idioms: ...
        CS EMD: <float>
        GPT EMD: <float>
        Random EMD: <float>
        =====================================

    Text that is not part of such a record is ignored.

    Args:
        logs: Full contents of the log file as one string.

    Returns:
        A nested dict ``{group: {api: (cs_emd, gpt_emd, random_emd)}}`` with
        the three scores converted to ``float``. The program and idiom
        counts are matched but not returned. Empty if no record is found.

    Raises:
        ValueError: If one of the EMD fields cannot be converted to ``float``.
    """
    record_pattern = r"========== \[(.+): (.+)\] ==========\nPrograms: (.+)\nCS idioms: (.+)\nGPT idioms: (.+)\nRandom idioms: (.+)\nCS EMD: (.+)\nGPT EMD: (.+)\nRandom EMD: (.+)\n====================================="

    emd = {}
    for record in re.finditer(record_pattern, logs):
        lib, api = record.group(1, 2)
        # Groups 3-6 hold the program/idiom counts; only the EMD scores
        # (groups 7-9: CS, GPT, random) are kept.
        scores = tuple(float(value) for value in record.group(7, 8, 9))
        emd.setdefault(lib, {})[api] = scores

    return emd

In [7]:
# Load and parse emd.single.result; the context manager closes the file
# handle, which the bare open(...).read() left to the garbage collector.
with open(os.path.join("../emd", "emd.single.result")) as emd_file:
    emd = parse_emd_log(emd_file.read())

In [8]:
# Top-level keys of the parsed result, i.e. the library names found in the log.
emd.keys()

dict_keys(['pandas', 'numpy', 'os', 'sklearn', 'matplotlib', 'torch'])

In [9]:
# Mean EMD per library, one list per idiom source. Each API entry of `emd`
# is a (CS, GPT, random) tuple, so columns 0, 1, 2 select the three sources.
print(list(emd))
avg_cs_emd_lib, avg_gpt_emd_lib, avg_random_emd_lib = (
    [np.mean([scores[col] for scores in apis.values()]) for apis in emd.values()]
    for col in range(3)
)

print(avg_cs_emd_lib, avg_gpt_emd_lib, avg_random_emd_lib, sep="\n")

['pandas', 'numpy', 'os', 'sklearn', 'matplotlib', 'torch']
[0.3958083827801948, 0.4454906900131255, 0.42249569537148246, 0.34591476259962783, 0.3778598258208419, 0.35508324619462783]
[0.4626735489715698, 0.5188748104768457, 0.4973094433083336, 0.4487554271372074, 0.44957474155123683, 0.47480430686455793]
[0.4501101558725966, 0.4982254780190106, 0.49790619287662324, 0.44881009700113594, 0.4193334408839467, 0.38672776583364393]


In [10]:
# Overall averages: unweighted mean of the per-library means (CS, GPT, random).
tuple(np.mean(avgs) for avgs in (avg_cs_emd_lib, avg_gpt_emd_lib, avg_random_emd_lib))

(0.39044210046331673, 0.4753320463849586, 0.45018552174782617)

In [11]:
# Load and parse emd.multi.result; the context manager closes the file
# handle, which the bare open(...).read() left to the garbage collector.
with open(os.path.join("../emd", "emd.multi.result")) as multi_emd_file:
    multi_emd = parse_emd_log(multi_emd_file.read())

In [12]:
# Mean EMD per type (the top-level keys of `multi_emd`), one list per idiom
# source. Columns 0, 1, 2 of each entry are the CS, GPT and random scores.
# NOTE: this reuses the avg_*_emd_lib names and so overwrites the
# per-library averages computed from `emd`.
print(list(multi_emd))
avg_cs_emd_lib, avg_gpt_emd_lib, avg_random_emd_lib = (
    [np.mean([scores[col] for scores in apis.values()]) for apis in multi_emd.values()]
    for col in range(3)
)
print(avg_cs_emd_lib, avg_gpt_emd_lib, avg_random_emd_lib, sep="\n")

['pairs', 'mixpairs', 'triplets']
[0.45090168404697223, 0.46902650682675545, 0.46512558713657565]
[0.5183084022576386, 0.5316574641163595, 0.5246211114152105]
[0.48588109059481643, 0.47140997601739915, 0.48647951108040993]


In [13]:
# Overall averages: unweighted mean of the per-group means (CS, GPT, random).
tuple(np.mean(avgs) for avgs in (avg_cs_emd_lib, avg_gpt_emd_lib, avg_random_emd_lib))

(0.46168459267010115, 0.5248623259297363, 0.4812568592308752)