In [None]:
# If project-local imports fail, the repo root is probably missing from sys.path: uncomment the two lines below and set the path to the root folder of the repo.
# import sys
# sys.path.append('/root/of/repo/folder/')

# Dataset Evaluation

In this notebook we explore and plot the characteristics of the collected dataset. Make sure that you have run `/dataset/dataset_analyzer.py` first, as this notebook depends on its output files.

In [None]:
# Locations of the per-page and per-resource statistics CSV files read below.
page_map_file, resource_file = (
    "../dataset/out/dataset-pages-stats.csv",
    "../dataset/out/dataset-resources-stats.csv",
)

Read both statistics files.

In [None]:
import pandas as pd

# Resource statistics: load the ';'-separated file and keep only the rows
# whose size is strictly positive.
resource_data = (
    pd.read_csv(resource_file, sep=';')
    .loc[lambda frame: frame['size'] > 0]
)

# Page statistics are loaded unfiltered.
page_data = pd.read_csv(page_map_file, sep=';')

## Resource Size Distribution

Here we plot the distribution of resource sizes using a histogram.

In [None]:
# Plain Python list of every (already filtered) resource size, for the histogram.
resource_sizes = resource_data['size'].tolist()

In [None]:
from palettable.colorbrewer.qualitative import Paired_10

# Plot palette: the Paired_10 matplotlib colours followed by one extra hex colour.
colors = [*Paired_10.mpl_colors, '#9CA3AF']

In [None]:
import matplotlib.pyplot as plt
# Imported only for its import-time effects; no name from it is used in this cell.
# NOTE(review): presumably it applies the project's plot styling — confirm.
import experiments.plotter.neat_plotter

# Histogram of all resource sizes (200 bins) with a log-scaled count axis.
hist_fig, hist_ax = plt.subplots(figsize=(14, 4), dpi=300)
hist_ax.grid(which='major', axis='y')
hist_ax.hist(resource_sizes, bins=200)
hist_ax.set_yscale('log')
hist_ax.set_title('Distribution of Resource Size')
hist_ax.set_xlabel('Resource Size (bytes)')
hist_ax.set_ylabel('No. of resources.')
plt.show()

## Resource Violin Plot

In [None]:
# Horizontal violin plot of resource size per resource type. Each tick label
# shows the type name plus its share of the resource count and of the total bytes.
fig, ax = plt.subplots(1, figsize=(7, 7), dpi=300)

# Materialise the groups once so the violins and their labels share one order.
resource_groups = list(resource_data.groupby('type'))
# Loop-invariant totals, computed once instead of once per label.
total_resources = len(resource_data)
total_bytes = float(resource_data['size'].sum())

parts = ax.violinplot([g['size'] for _, g in resource_groups], vert=False, widths=0.8)
for i, pc in enumerate(parts['bodies']):
    # Cycle through the palette so having more types than colours cannot raise IndexError.
    pc.set_facecolor(colors[i % len(colors)])
    pc.set_alpha(0.5)
ax.set_xscale('log')
ax.xaxis.grid(True, which='both')

# One tick per violin plus a tick at 0 that carries an empty label.
ax.set_yticks(range(len(resource_groups) + 1))
ax.set_yticklabels([""] + [
    f"{n}\n({round(len(g) / total_resources * 10000) / 100}%, "
    f"{round(float(g['size'].sum()) / total_bytes * 10000) / 100}%)"
    for n, g in resource_groups
])
ax.set_title("File size distribution by type")
ax.set_xlabel("File size (bytes)")
# Render the figure explicitly instead of leaking the Text repr of the last call.
plt.show()

In [None]:
# Scatter plot of page size (log-scaled x axis) against the number of resources per page.
fig, ax = plt.subplots(1, figsize=(7, 7), dpi=300)
ax.yaxis.grid(which='both', linestyle='dashed')
ax.xaxis.grid(which='both', linestyle='dashed')
ax.set_xscale('log')
# scatter, not hist: two variables are plotted against each other, one point per
# row of page_data. hist would take the second positional argument as `bins`
# and does not accept `marker`.
ax.scatter(page_data['size'], page_data['no_resources'], marker=".", alpha=0.25)
ax.set_title("Page Size vs No. Resources")
ax.set_ylabel("No. Resources")
ax.set_xlabel("Page Size (bytes)")
# Render the figure explicitly instead of leaking the Text repr of the last call.
plt.show()