# Data analysis for assignment 2

## Import packages

For this analysis, we need pandas for data manipulation and seaborn for plotting. Matplotlib is sometimes useful in tandem with seaborn because it exposes some lower-level plotting functions.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Load data

In [None]:
# Load the combined-parallelism results, map the raw version identifiers to
# the labels used in the plots, and discard the 'idx' column.
results_combined_parallelism = (
    pd.read_csv('./data/raw/results_combined_parallelism.tsv', sep='\t')
    .replace('parallel_combined', 'Combined task and data parallelism')
    .replace('sequential_with_parallel_mergesort', 'Task parallelism')
    .replace('parallel', 'Data parallelism')
    .replace('sequential', 'No parallelism')
    .drop(columns='idx')
)

# Preview the cleaned table
results_combined_parallelism

In [None]:
# Load the scheduling results, map the raw version identifiers to the labels
# used in the plots, and discard the 'idx' column.
# Every scheduling label follows the pattern '<kind> scheduling, chunk size N'
# so that the legend entries read consistently.
results_scheduling = (
    pd.read_csv('./data/raw/results_scheduling.tsv', sep='\t')
    .replace('sequential', 'No parallelism')
    .replace('parallel_static', 'Static scheduling, chunk size 100')
    .replace('parallel_dynamic_small_chunk_size', 'Dynamic scheduling, chunk size 10')
    .replace('parallel_dynamic_medium_chunk_size', 'Dynamic scheduling, chunk size 50')
    # Was 'Dynamic scheduling, large chunk 100'; aligned with the other labels.
    .replace('parallel_dynamic_large_chunk_size', 'Dynamic scheduling, chunk size 100')
    .replace('parallel_guided_small_chunk_size', 'Guided scheduling, chunk size 10')
    .replace('parallel_guided_medium_chunk_size', 'Guided scheduling, chunk size 50')
    .replace('parallel_guided_large_chunk_size', 'Guided scheduling, chunk size 100')
    .drop(columns='idx')
)

# Preview the cleaned table
results_scheduling

In [None]:
# Load each input-size result file, discard the 'idx' column, and add a
# 'range' column holding inner_max - inner_min for every row.
results_input_size_small = (
    pd.read_csv('./data/raw/results_input_size_small.tsv', sep='\t')
    .drop(columns='idx')
    .assign(range=lambda frame: frame['inner_max'] - frame['inner_min'])
)
results_input_size_medium = (
    pd.read_csv('./data/raw/results_input_size_medium.tsv', sep='\t')
    .drop(columns='idx')
    .assign(range=lambda frame: frame['inner_max'] - frame['inner_min'])
)
results_input_size_large = (
    pd.read_csv('./data/raw/results_input_size_large.tsv', sep='\t')
    .drop(columns='idx')
    .assign(range=lambda frame: frame['inner_max'] - frame['inner_min'])
)

# Stack the three tables into one, renumbering the rows from zero.
results_input_size = pd.concat(
    [
        results_input_size_small,
        results_input_size_medium,
        results_input_size_large,
    ],
    ignore_index=True,
)

# Preview the stacked table
results_input_size

In [None]:
# Load the additional-experiment results, map the raw version identifiers to
# the labels used in the plots, and discard the 'idx' column.
results_extra = (
    pd.read_csv('./data/raw/results_extra.tsv', sep='\t')
    .replace('parallel_presorted', 'Pre-sorted')
    .replace('parallel_autovectorized', 'Autovectorized')
    .drop(columns='idx')
    .replace('sequential', 'Sequential')
    .replace('parallel', 'Parallel (dynamic, 100)')
)

# Preview the cleaned table
results_extra

In [None]:
# Concatenate all of the separate result tables into one frame.
# ignore_index=True renumbers the rows from zero; without it each source
# table keeps its own row labels, so the combined frame would contain
# duplicate index values. This matches how results_input_size is built.
results = pd.concat(
    [
        results_combined_parallelism,
        results_scheduling,
        results_input_size,
        results_extra,
    ],
    ignore_index=True,
)

# Preview combined data
results

In [None]:
# Global plot styling for every figure below: seaborn's "whitegrid" style
# with the "tab10" colour palette.
# NOTE(review): kept as two separate calls; folding the palette into
# set_theme(palette=...) may not be exactly equivalent — confirm before merging.
sns.set_theme(style="whitegrid")
sns.set_palette("tab10")

In [None]:
# Line plot of run time against outer-vector size, one line per parallelism
# type. The 'version' column is renamed so the legend title reads well.
results_combined_parallelism = results_combined_parallelism.rename(
    columns={'version': 'Parallelism type'},
)
g = sns.lineplot(
    data=results_combined_parallelism,
    x='outer',
    y='time',
    hue='Parallelism type',
    marker='o',
)
# Logarithmic x axis on the axes seaborn just drew into
g.set_xscale('log')
g.set_xlabel("Size of outer vector", fontsize=13)
g.set_ylabel("Time (s)", fontsize=13)
plt.savefig(
    './data/graphs/combined_parallelism.pdf',
    dpi=700,
    bbox_inches='tight',
)

In [None]:
# Bar plot of run time against the inner-vector size range, grouped by
# outer-vector size. The 'outer' column is renamed so the legend title reads
# well.
results_input_size = results_input_size.rename(
    columns={'outer': 'Size of outer vector'},
)
g = sns.barplot(
    data=results_input_size,
    x='range',
    y='time',
    hue='Size of outer vector',
)
# Logarithmic y axis on the axes seaborn just drew into
g.set_yscale('log')
g.set_xlabel("Difference between min. and max. inner vector size", fontsize=13)
g.set_ylabel("Time (s)", fontsize=13)
plt.savefig(
    './data/graphs/inner_vector_size.pdf',
    dpi=700,
    bbox_inches='tight',
)

In [None]:
# Bar plot of run time against outer-vector size, grouped by scheduling
# strategy. The 'version' column is renamed so the legend title reads well.
results_scheduling = results_scheduling.rename(
    columns={'version': 'Scheduling strategy'},
)
g = sns.barplot(
    data=results_scheduling,
    x='outer',
    y='time',
    hue='Scheduling strategy',
)
g.set_xlabel("Size of outer vector", fontsize=13)
g.set_ylabel("Time (s)", fontsize=13)
# Anchor the legend's upper-left corner at the axes' upper-right corner
sns.move_legend(g, "upper left", bbox_to_anchor=(1, 1))
# Logarithmic y axis on the axes seaborn just drew into
g.set_yscale('log')
plt.savefig(
    './data/graphs/scheduling.pdf',
    dpi=700,
    bbox_inches='tight',
)

In [None]:
# Bar plot of run time against outer-vector size, grouped by experimental
# condition. The 'version' column is renamed so the legend title reads well.
results_extra = results_extra.rename(
    columns={'version': 'Condition'},
)
g = sns.barplot(
    data=results_extra,
    x='outer',
    y='time',
    hue='Condition',
)
# Logarithmic y axis on the axes seaborn just drew into
g.set_yscale('log')
g.set_xlabel("Size of outer vector", fontsize=13)
g.set_ylabel("Time (s)", fontsize=13)
plt.savefig(
    './data/graphs/additional_research.pdf',
    dpi=700,
    bbox_inches='tight',
)