In [None]:
import json
import os
import random
from typing import List, Dict, Callable

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.ticker import FuncFormatter

from cf_shared.convert_previous import convert_optional_old, convert_optional_original
from cf_shared.convert import convert_optional, convert_paths, ConversionResult


In [None]:
# Change current directory to the scratch disk with all the data on it.
# The default is machine specific; set the CF_DATA_DIR environment variable to
# point the notebook at a different data folder without editing this cell.
os.chdir(os.environ.get('CF_DATA_DIR', '/mnt/mturk/cf_sample_data/'))

# Make sure to download the following zip and extract its files into the folder below
# https://huggingface.co/rgismondi/python-50k-dedup/blob/main/pretrain_dataset.zip
PY_SOURCE_LOCATION = './deduplicated_code_fill_pretrain/'

# Location to use for the output of the conversion for the benchmarks.
BENCHMARKS_LOCATION = './paper-benchmarks'

# Location where the figures will be placed. It is absolute because the working
# directory was changed above; set CF_FIGURES_DIR to override the default.
FIGURES_LOCATION = os.environ.get('CF_FIGURES_DIR', '/home/mturk/rp/codefill/notebooks/')

In [None]:
# Collect [file name, size in bytes] for every regular file in the source folder.
# - The scandir iterator is closed deterministically by the `with` block.
# - Sub-directories are skipped: their own entry size would otherwise be
#   treated as a source file size and could be picked by the sampling below.
# - os.scandir yields entries in arbitrary order, so the listing is sorted by
#   name to make the seeded sampling reproducible across machines and runs.
with os.scandir(PY_SOURCE_LOCATION) as entries:
    name_size_pairs = sorted(
        [entry.name, entry.stat().st_size]
        for entry in entries
        if entry.is_file()
    )

df_sizes = pd.DataFrame(name_size_pairs, columns=['name','size'])

In [None]:
# Arbitrary number, for consistency across runs
RANDOM_SEED = 42

# Conversion implementations under comparison, keyed by the short name that is
# also used in the benchmark configuration and in the result file names.
conversion_functions: Dict[str, Callable[[str, str], ConversionResult]] = dict(
  new=convert_optional,
  second=convert_optional_old,
  original=convert_optional_original,
)

# One entry per benchmarked slice of the dataset:
#   file_amount           number of files sampled for the slice
#   min_fs / max_fs       file size bounds in bytes (min inclusive, max exclusive)
#   conversion_functions  keys of `conversion_functions` to run on the slice
#   label                 tick label used in the figures
#   ytickformat           format string for the y tick values of the size plot
BENCHMARKS = {
  'under-1k': dict(
    file_amount=1000,
    min_fs=400,
    max_fs=600,
    conversion_functions=['new', 'second', 'original'],
    label='400-600',
    ytickformat='{:,.0f}',
  ),
  'under-10k': dict(
    file_amount=1000,
    min_fs=4_000,
    max_fs=6_000,
    conversion_functions=['new', 'second', 'original'],
    label='4K-6K',
    ytickformat='{:,.1f}',
  ),
  'under-100k': dict(
    file_amount=1000,
    min_fs=40_000,
    max_fs=60_000,
    conversion_functions=['new', 'second'],
    label='40K-60K',
    ytickformat='{:,.0f}',
  ),
  'under-1000k': dict(
    file_amount=500,
    min_fs=400_000,
    max_fs=600_000,
    conversion_functions=['new', 'second'],
    label='400K-600K',
    ytickformat='{:,.0f}',
  ),
}

In [None]:
# Seeded random sample of files per benchmark, taken from the files whose size
# lies in [min_fs, max_fs). Each sampled element is a (name, size) named tuple.
files = {}

for name, config in BENCHMARKS.items():
  in_range = (df_sizes['size'] >= config['min_fs']) & (df_sizes['size'] < config['max_fs'])
  all_files = list(df_sizes[in_range].itertuples(index=False))

  # random.sample would also raise ValueError here, but without saying which
  # benchmark is short on files or by how much.
  if len(all_files) < config['file_amount']:
    raise ValueError(
      f"Benchmark '{name}' asks for {config['file_amount']} files but only "
      f"{len(all_files)} files have a size in [{config['min_fs']}, {config['max_fs']}) bytes"
    )

  # Re-seed for every benchmark so each sample is independent of the others
  random.seed(RANDOM_SEED)
  files[name] = random.sample(all_files, config['file_amount'])

In [None]:
# Number of files available in the 400,000 <= size < 600,000 byte band
in_largest_band = df_sizes['size'].ge(400000) & df_sizes['size'].lt(600000)
sum(in_largest_band)

In [None]:
# Sizes in bytes of the sampled files, as one numpy array per benchmark
filesizes = {
  name: np.array([sampled.size for sampled in files[name]])
  for name in BENCHMARKS
}

In [None]:
# Factor by which the y-range is widened beyond [min_fs, max_fs] so the boxes
# do not touch the frame of the plot.
extra_graph_space_factor = 1.02


def _kilo_formatter(fmt):
  """Build a tick formatter that renders byte counts in thousands.

  fmt is a str.format pattern applied to value / 1000; a 'K' suffix is
  appended, e.g. '{:,.1f}' turns 4500 into '4.5K'.
  """
  return FuncFormatter(lambda value, _pos: fmt.format(value / 1000) + 'K')


# One box plot per benchmark, side by side, each with its own y-axis.
fig, axs = plt.subplots(1, len(BENCHMARKS), figsize=(8, 3), sharey=False)
for index, (name, config) in enumerate(BENCHMARKS.items()):
  # sym='' hides the outlier markers
  axs[index].boxplot(filesizes[name], sym='', widths=0.4)
  axs[index].set_xticklabels([config['label']])
  if filesizes[name].max() > 1000:
    # Set the limits first and format ticks through a formatter function.
    # Freezing label strings with set_yticklabels before changing the limits
    # leaves the labels attached to the wrong tick positions once the ticks
    # are recomputed for the new range.
    axs[index].set_ylim(config['min_fs']/extra_graph_space_factor, config['max_fs']*extra_graph_space_factor)
    axs[index].yaxis.set_major_formatter(_kilo_formatter(config['ytickformat']))

fig.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.9,
                    wspace=0.4,
                    hspace=0.4)
fig.suptitle('File sizes of the datasets')

fig.savefig(os.path.join(FIGURES_LOCATION, 'filesizes.png'), dpi=300)

In [None]:
# Conversion results by dataset name, then by conversion function name
results: Dict[str, Dict[str, List[ConversionResult]]] = dict()

In [None]:
# Load previously saved benchmark timings from disk, where they exist.
# A missing file is reported and skipped instead of raising, so the notebook
# can be run top to bottom on a machine where the benchmark has not been
# executed yet.
for name, config in BENCHMARKS.items():
  results[name] = {}

  for conv_name in config['conversion_functions']:
    times_json = os.path.join(BENCHMARKS_LOCATION, f"{name}-{conv_name}.json")
    if not os.path.isfile(times_json):
      print(f"No saved results for dataset {name}, conversion {conv_name} ({times_json})")
      continue
    with open(times_json, 'r') as fd:
      results[name][conv_name] = json.load(fd)

In [None]:

# Run the conversions for every benchmark and record their results.
#
# Results already present in `results` (e.g. loaded from the saved JSON files)
# are reused, so running the whole notebook does not silently discard them and
# repeat the complete benchmark. Set FORCE_RERUN to True to measure again.
FORCE_RERUN = False
# Value passed as n_threads to convert_paths
N_CONVERSION_THREADS = 20

for name, config in BENCHMARKS.items():
  dataset_results = results.setdefault(name, {})

  for conv_name in config['conversion_functions']:
    if conv_name in dataset_results and not FORCE_RERUN:
      print(f"Dataset: {name} Conversion: {conv_name} (reusing existing results)")
      continue

    print(f"Dataset: {name} Conversion: {conv_name}")
    converted_path = os.path.join(BENCHMARKS_LOCATION, name, conv_name)
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(converted_path, exist_ok=True)

    dataset_results[conv_name] = convert_paths(
      [os.path.join(PY_SOURCE_LOCATION, file.name) for file in files[name]],
      converted_path,
      # Set times_json to None if you do not wish to write results to disk
      times_json=os.path.join(BENCHMARKS_LOCATION, f"{name}-{conv_name}.json"),
      n_threads=N_CONVERSION_THREADS,
      convert_optional_function=conversion_functions[conv_name]
    )
    


In [None]:
# Number of recorded results per conversion function for the 'under-1k' dataset
list(map(len, results['under-1k'].values()))

In [None]:
# Times of all successful conversions in microseconds, by dataset and conversion name.
# Each leaf is a numpy array (not a list) so later cells can scale it with `/ 1e6`.
# In a result record, index 2 is the time and index 3 the status ("s" = success).
times_success: Dict[str, Dict[str, np.ndarray]] = {}

for name, config in BENCHMARKS.items():
  times_success[name] = {
    conv_name: np.array([res[2] for res in results[name][conv_name] if res[3] == "s"])
    for conv_name in config['conversion_functions']
  }


In [None]:
# Total conversion time in seconds per dataset for the 'second' and 'new'
# implementations, plus the resulting speedup of 'new' over 'second'.
total_times = {}

for name, config in BENCHMARKS.items():
  total_times[name] = {}
  for conv_name in ('second', 'new'):
    print(f"Dataset: {name} Conversion: {conv_name}")
    # Status "s" at index 3 marks a successful conversion
    n_successful_conversions = sum(el[3] == "s" for el in results[name][conv_name])

    print(f"  Amount of successful conversions: {n_successful_conversions} out of {len(results[name][conv_name])}")
    # Recorded times (index 2) are in microseconds; the sum covers every record
    total_time = sum(res[2] for res in results[name][conv_name]) / 1e6
    total_times[name][conv_name] = total_time
    print(f"  Time taken: {total_time} seconds")

  speedup = total_times[name]['second'] / total_times[name]['new']
  print("Speedup: {:.2f}x".format(speedup))

In [None]:
# Grouped box plot adapted from https://stackoverflow.com/a/20132614/8209335

def set_box_color(bp, color):
    """Give every line element of a box plot the same color.

    bp is the dictionary returned by plt.boxplot; its boxes, whiskers, caps
    and medians are all recolored.
    """
    for part in ('boxes', 'whiskers', 'caps', 'medians'):
        plt.setp(bp[part], color=color)
# Material Design blue and green, one color per implementation
colors = ['#1565C0', '#2E7D32']
labels = ['First', 'Final']
ticks = [bench['label'] for bench in BENCHMARKS.values()]

# Successful conversion times per dataset, converted from microseconds to seconds
data_old = [times_success[name]['second'] / 1e6 for name in BENCHMARKS]
data_new = [times_success[name]['new'] / 1e6 for name in BENCHMARKS]

# Each dataset gets a slot of width 2 on the x-axis; the two boxes of a dataset
# sit left and right of the slot centre.
group_centres = np.arange(len(data_old)) * 2.0
bp_old = plt.boxplot(data_old, positions=group_centres - 0.4, sym='', widths=0.6)
bp_new = plt.boxplot(data_new, positions=group_centres + 0.4, sym='', widths=0.6)
for box_plot, box_color in zip((bp_old, bp_new), colors):
  set_box_color(box_plot, box_color)

# Empty line plots in the box colors serve as legend entries
for box_color, legend_label in zip(colors, labels):
  plt.plot([], c=box_color, label=legend_label)
plt.legend()

# X-Axis: one tick per dataset at the centre of its slot
plt.xticks(range(0, len(ticks) * 2, 2), ticks)
plt.xlim(-2, len(ticks)*2)
plt.xlabel('Dataset')

# Y-Axis
plt.yscale('log')
plt.ylabel('Conversion Time (seconds, log scale)')

plt.tight_layout()
plt.savefig(os.path.join(FIGURES_LOCATION, 'conversion-time.png'), dpi=300)

In [None]:
ticks = [v['label'] for v in BENCHMARKS.values()]

# Despite the names: data_old holds the speedup per dataset (total time of
# 'second' divided by total time of 'new'), data_new the 1x baseline it is
# compared against.
data_old = [x['second']/x['new'] for x in total_times.values()]
data_new = np.ones(len(ticks))


x = np.arange(len(ticks))
width = 0.8
colors = ['#2196F3', '#4CAF50']
fig, ax = plt.subplots()

# Left bar of each pair: baseline (always 1); right bar: measured speedup
bar_old = ax.bar(x*2-0.4, data_new, width, color=colors[0])
bar_new = ax.bar(x*2+0.4, data_old, width, color=colors[1])

# Adapted from https://stackoverflow.com/a/42498711/8209335
def label_speedup(rects, formatter):
  """Write the formatted height of every bar just above the bar.

  rects is the bar container returned by ax.bar; formatter maps a bar height
  to the label text. The text is drawn on the module-level `ax`.
  """
  for rect in rects:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2., 0.05+height,
      formatter(height),
      ha='center', va='bottom')

label_speedup(bar_old, lambda height: '%dx' % int(height))
label_speedup(bar_new, lambda height: '%.2fx' % float(height))

# Empty line plots in the bar colors serve as legend entries
# (`labels` is defined in the conversion-time box plot cell)
plt.plot([], c=colors[0], label=labels[0])
plt.plot([], c=colors[1], label=labels[1])
plt.legend(loc='upper left')

# X-Axis
plt.xticks(range(0, len(ticks) * 2, 2), ticks)
plt.xlim(-2, len(ticks)*2)
plt.xlabel('Dataset')

plt.ylabel('Speedup')
# Keep the original 5.5 ceiling, but grow it when a speedup (plus room for its
# label) would otherwise be cut off at the top of the plot.
plt.ylim(0, max([5.5] + [speedup + 0.5 for speedup in data_old]))

plt.tight_layout()
plt.savefig(os.path.join(FIGURES_LOCATION, 'conversion-time-comparison.png'), dpi=300)

In [None]:
# Results exploration, for debugging the results:
# successful conversions of the largest files that took under 50,000 microseconds
df = pd.DataFrame(
  results['under-1000k']['new'],
  columns=["input", "output", "time", "status", "error"],
)
is_fast_success = df['time'].lt(50000) & df['status'].eq("s")
subdf = df.loc[is_fast_success]
subdf