In [39]:
import glob
import re

import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

## Regex Matching

In [40]:
# Matches one benchmark log record and captures, in order: the operation name,
# the row and column counts encoded in the CSV file name, the file size in
# bytes, and the elapsed time in seconds.
# NOTE(review): the time group only accepts digits and dots, so a value logged
# in scientific notation (e.g. 1e-05) would not match -- confirm the log format.
LOG_REGEX = re.compile(r'INFO:root:([A-Za-z_0-9= ]+): benchmarks\/data\/test-data-([0-9]+)-([0-9]+)\.csv; Size: ([0-9]+) bytes; time: ([0-9.]+)s')


def group_dict(matched):
    """Convert a successful ``LOG_REGEX`` match into a typed record.

    Parameters
    ----------
    matched : re.Match
        Match object produced by ``LOG_REGEX``; must not be ``None``.

    Returns
    -------
    dict
        ``name`` (str), ``rows`` (int), ``cols`` (int), ``bytes`` (int) and
        ``time`` (float), taken from capture groups 1-5 respectively.
    """
    return {
        'name': matched.group(1),
        'rows': int(matched.group(2)),
        'cols': int(matched.group(3)),
        'bytes': int(matched.group(4)),
        'time': float(matched.group(5)),
    }

## Parsing

In [41]:
# Locate the benchmark logs for each library. glob returns paths in arbitrary
# order, so sort them to keep the processing order stable between runs.
modin_files = sorted(glob.glob('../../benchmark-results/modin-*.log'))
pandas_files = sorted(glob.glob('../../benchmark-results/pandas-*.log'))

['../../benchmark-results/modin-arithmetic.log',
 '../../benchmark-results/modin-io.log',
 '../../benchmark-results/modin-groupby.log']

In [42]:
# Pass 1: gather every recorded time, keyed by operation name and then by file
# size in bytes, across ALL modin log files.
modin_timings = {}
for filename in modin_files:
    with open(filename) as f:
        for line in f:
            m = LOG_REGEX.search(line)
            if m is None:
                # Not a benchmark record; ignore the line.
                continue
            data = group_dict(m)
            modin_timings.setdefault(data['name'], {}).setdefault(data['bytes'], []).append(data['time'])

# Pass 2: reduce each bucket to its mean. This must run only after every file
# has been read: averaging inside the file loop replaces the lists with floats,
# and a later file reporting the same (name, bytes) pair then fails on append.
modin_results = {
    name: {size: np.mean(times) for size, times in by_size.items()}
    for name, by_size in modin_timings.items()
}

In [44]:
# Pass 1: gather every recorded time, keyed by operation name and then by file
# size in bytes, across ALL pandas log files.
pandas_timings = {}
for filename in pandas_files:
    with open(filename) as f:
        for line in f:
            m = LOG_REGEX.search(line)
            if m is None:
                # Not a benchmark record; ignore the line.
                continue
            data = group_dict(m)
            pandas_timings.setdefault(data['name'], {}).setdefault(data['bytes'], []).append(data['time'])

# Pass 2: reduce each bucket to its mean. This must run only after every file
# has been read: averaging inside the file loop replaces the lists with floats,
# and a later file reporting the same (name, bytes) pair then fails on append.
pandas_results = {
    name: {size: np.mean(times) for size, times in by_size.items()}
    for name, by_size in pandas_timings.items()
}

In [59]:
# Save one pandas-vs-modin scatter plot per benchmarked operation: x is the
# input file size in MB, y is the mean time in seconds. Each figure is written
# to "<operation name with spaces replaced by underscores>.png".
BYTES_PER_MB = 1024 * 1024

for op, pandas_data in pandas_results.items():
    # An operation present only in the pandas logs previously raised KeyError;
    # fall back to an empty series so the pandas points are still plotted.
    modin_data = modin_results.get(op, {})

    pandas_x = [size / BYTES_PER_MB for size in pandas_data]
    pandas_y = list(pandas_data.values())
    modin_x = [size / BYTES_PER_MB for size in modin_data]
    modin_y = list(modin_data.values())

    # Explicit figure/axes instead of the implicit pyplot state, so exactly
    # this figure is saved and then released.
    fig, ax = plt.subplots()
    ax.scatter(pandas_x, pandas_y, c='b', label='pandas')
    ax.scatter(modin_x, modin_y, c='r', label='modin')
    ax.legend(loc='upper left')
    ax.set_title(op)
    ax.set_xlabel('Data Size (MB)')
    ax.set_ylabel('Time taken (s)')
    fig.savefig(op.replace(" ", "_") + ".png")
    plt.close(fig)