In [1]:
# Load IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to local source files take effect without
# restarting the kernel.
%reload_ext autoreload
%autoreload 2

import sys

# Add the "src" directory (relative to the current working directory) to the
# module search path so that modules located there can be imported.
sys.path.append("src")

In [None]:
from data_loader import read_experiment_results
import matplotlib.pyplot as plt

results_df = read_experiment_results()
_has_cores = "cores" in results_df.columns


def _mean_by_setting(metric, setting):
    """Return the mean of ``metric`` for every (model_type, ``setting``) pair."""
    grouped = results_df.groupby(["model_type", setting])
    return grouped[metric].mean().reset_index()


# Mean total training time per model type for each experiment parameter.
# NOTE: the "*_by_disk" frames are grouped by driver_memory_size.
avg_time_by_memory = _mean_by_setting("total_time", "executor_memory_size")
avg_time_by_disk = _mean_by_setting("total_time", "driver_memory_size")
# The "cores" column is optional; without it the per-core summaries are None.
avg_time_by_cores = _mean_by_setting("total_time", "cores") if _has_cores else None

# Mean accuracy per model type for each experiment parameter.
avg_acc_by_memory = _mean_by_setting("accuracy", "executor_memory_size")
avg_acc_by_disk = _mean_by_setting("accuracy", "driver_memory_size")
avg_acc_by_cores = _mean_by_setting("accuracy", "cores") if _has_cores else None

# One row of three panels, one per experiment parameter.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6), dpi=300)


def create_dual_axis_time_plot(ax, df, x_col, title):
    """Plot the training time of both models against ``x_col`` on twin y-axes.

    Logistic-regression times are drawn against the left y-axis of ``ax`` and
    GBT times against a twin right y-axis, so each curve gets its own scale.

    Args:
        ax: Matplotlib axes to draw on; a twin axes sharing its x-axis is
            created for the GBT curve.
        df: Frame with ``model_type``, ``total_time`` and ``x_col`` columns.
            Rows whose ``model_type`` is "logistic" or "gbt" are plotted.
        x_col: Name of the column used for the x values; also used verbatim
            as the x-axis label.
        title: Title of the panel.
    """
    logistic_data = df[df["model_type"] == "logistic"]
    gbt_data = df[df["model_type"] == "gbt"]

    # Left axis for logistic regression
    color = "tab:blue"
    ax.set_xlabel(x_col)
    ax.set_ylabel("Logistic Time (ms)", color=color)
    ax.plot(logistic_data[x_col], logistic_data["total_time"], "o-", color=color, label="Logistic")
    ax.tick_params(axis="y", labelcolor=color)

    # Right axis for GBT
    ax2 = ax.twinx()
    color = "tab:red"
    ax2.set_ylabel("GBT Time (ms)", color=color)
    ax2.plot(gbt_data[x_col], gbt_data["total_time"], "o-", color=color, label="GBT")
    ax2.tick_params(axis="y", labelcolor=color)

    # Combined legend for both curves. It is attached to the twin axes because
    # that axes is drawn last; a legend on ``ax`` would be painted over by the
    # GBT line wherever the two overlap.
    lines1, labels1 = ax.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines1 + lines2, labels1 + labels2, loc="upper left")

    ax.set_title(title)
    ax.grid(True, alpha=0.3)


from pathlib import Path

# Left and middle panels: training time vs executor / driver memory size.
create_dual_axis_time_plot(axes[0], avg_time_by_memory, "executor_memory_size", "Training Time vs Executor Memory Size")
create_dual_axis_time_plot(axes[1], avg_time_by_disk, "driver_memory_size", "Training Time vs Driver Memory Size")

# Right panel: training time vs cores, hidden when no per-core data exists.
if avg_time_by_cores is not None:
    create_dual_axis_time_plot(axes[2], avg_time_by_cores, "cores", "Training Time vs Number of Cores")
else:
    axes[2].set_visible(False)

plt.tight_layout()

# savefig does not create missing directories, so make sure "plots/" exists
# before writing the image.
training_time_path = Path("plots/training_time.png")
training_time_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(training_time_path, dpi=600, bbox_inches="tight", format="png")

plt.show()

As we can see, in all cases the training time decreases when the executor or driver memory size is increased from 1g to 2g, and when the number of cores is increased from 1 to 2. Beyond that, however, the training time generally increases, although with some variation.

For logistic regression, the training time decreases between 1-4g executor memory, then increases between 4-6g, decreases between 6-10g, and then starts to increase again between 10-16g. For driver memory, it also decreases between 1-4g, then increases from 4-10g, and then decreases from 10-16g. For number of cores, it decreases from 1 to 2 cores, then increases with 3 cores, then decreases with 4 cores, and then steadily increases with 6 and 8 cores.

For GBT, the pattern for executor memory is very similar to the one seen with logistic regression. For driver memory, the training time also decreases from 1g to 2g, then increases from 2g to 4g, stays almost the same between 4g and 6g, decreases between 6g and 10g, and finally increases again (but by very little) between 10g and 16g. For the number of cores, the training time also decreases from 1 to 2 cores and is almost the same at 3 cores as at 2 cores, but then increases steadily from 3 to 8 cores, ending up longer with 8 cores than with 1 core. That's interesting.

In all cases, the training time is about 4.5 times longer with GBT than with logistic regression.

Note that we have only tested memory sizes (both executor and driver memory) of 1g-16g and 1-8 cores, so further experiments over wider ranges might give more insight.

Of course, the decreases and increases are relatively small. The training time in all plots ranges from min 640ms to max 880ms for logistic regression, and from 3620ms to 3740ms for GBT.

Consequently, changing these parameters does not seem to have had much of an effect on training time, and it is difficult to draw any conclusions with regard to scalability.

It would also be interesting to see if the prediction quality has any connection to the experiment parameters.


In [None]:
# Fresh 1x3 grid of panels for the accuracy plots, one per experiment parameter.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6), dpi=300)


def create_accuracy_plot(ax, df, x_col, title):
    """Plot the accuracy of both models against ``x_col`` on a single axes.

    Args:
        ax: Matplotlib axes to draw on.
        df: Frame with ``model_type``, ``accuracy`` and ``x_col`` columns.
            Rows whose ``model_type`` is "logistic" or "gbt" are plotted.
            The frame is only read, never modified.
        x_col: Name of the column used for the x values; also used verbatim
            as the x-axis label.
        title: Title of the panel.
    """
    logistic_data = df[df["model_type"] == "logistic"]
    gbt_data = df[df["model_type"] == "gbt"]

    ax.plot(logistic_data[x_col], logistic_data["accuracy"], "o-", color="tab:blue", label="Logistic")
    ax.plot(gbt_data[x_col], gbt_data["accuracy"], "o-", color="tab:red", label="GBT")

    ax.set_xlabel(x_col)
    ax.set_ylabel("Accuracy (%)")
    ax.set_title(title)
    ax.grid(True, alpha=0.3)
    ax.legend(loc="lower right")

    # Zoom the y-axis to the observed accuracy range with a 0.1 margin, clamped
    # to [0, 100]. With no accuracy values at all (empty frame or only NaN) the
    # limits would be NaN, which set_ylim rejects, so the default limits are
    # kept in that case.
    # NOTE(review): the margin and clamp presumably assume accuracy is given
    # in percent, as the axis label suggests; confirm against the data.
    accuracy = df["accuracy"].dropna()
    if not accuracy.empty:
        y_min = max(accuracy.min() - 0.1, 0)
        y_max = min(accuracy.max() + 0.1, 100)
        ax.set_ylim(y_min, y_max)


from pathlib import Path

# Left and middle panels: accuracy vs executor / driver memory size. Titles
# name the actual grouping columns, matching the training-time figure.
create_accuracy_plot(axes[0], avg_acc_by_memory, "executor_memory_size", "Accuracy vs Executor Memory Size")
create_accuracy_plot(axes[1], avg_acc_by_disk, "driver_memory_size", "Accuracy vs Driver Memory Size")

# Right panel: accuracy vs cores, hidden when no per-core data exists.
if avg_acc_by_cores is not None:
    create_accuracy_plot(axes[2], avg_acc_by_cores, "cores", "Accuracy vs Number of Cores")
else:
    axes[2].set_visible(False)

plt.tight_layout()

# savefig does not create missing directories, so make sure "plots/" exists
# before writing the image.
accuracy_path = Path("plots/accuracy.png")
accuracy_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(accuracy_path, dpi=600, bbox_inches="tight", format="png")

plt.show()

Well, this was clearly not a meaningful test. The accuracy does not increase or decrease with the experiment parameters - it is constantly 50%. In other words, on average, the outcome cannot be predicted from the features.

Other parameters will clearly affect the prediction accuracy, such as the features used, which leagues to focus on, the size of the training data, etc.

However, the focus of the course is on running computational experiments, not on making accurate predictions. So we will leave it here.
