In [9]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [10]:
# Raw benchmark results for exercise 1.3; kept untouched so later cells can
# always derive their working copies from it.
res_3_df = pd.read_csv(filepath_or_buffer='results_1.3.csv')

In [11]:
# 1.3 Array Analysis
# Every figure produced by this section is written below this directory.
os.makedirs('plots', exist_ok=True)

# --- Normalize dataframe ----------------------------------------------------
# Work on a copy so the raw frame stays untouched and this cell is re-runnable.
df = res_3_df.copy()

time_cols = ["time_init", "time_compute", "time_thread_create", "time_thread_join", "time_cleanup", "time_total"]

# Fail early with a readable message rather than an obscure error further down.
missing_cols = [col for col in ['version', 'size', *time_cols] if col not in df.columns]
if missing_cols:
    raise KeyError(f"res_3_df is missing required column(s): {missing_cols}")

# 'user' is optional. Missing entries get the same 'unknown' label as a missing
# column, so sorting the user names below never compares str with NaN.
if 'user' in df.columns:
    df['user'] = df['user'].fillna('unknown').astype(str)
else:
    df['user'] = 'unknown'

df['version'] = df['version'].astype(str).str.strip().str.lower()

# A row without a usable size cannot be placed on the size axis; drop it
# (with a notice) instead of inventing a fake size-0 bucket.
df['size'] = pd.to_numeric(df['size'], errors='coerce')
invalid_size = df['size'].isna()
if invalid_size.any():
    print(f"Warning: dropping {int(invalid_size.sum())} row(s) with a missing or non-numeric size")
    df = df.loc[~invalid_size].copy()
df['size'] = df['size'].astype(int)

# Unparseable timings stay NaN on purpose: pandas' mean/std skip NaN, whereas
# replacing them with 0 would bias the averages toward zero and put zeros on
# the log-scaled axes and in the speedup ratio.
for col in time_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Get unique users and sizes
unique_users = sorted(df["user"].unique())
unique_sizes = sorted(df["size"].unique())
versions = ["sequential", "parallel"]
# Line colour per implementation; versions not listed fall back to a neutral
# grey at the plotting call site.
colors = {"sequential": "#7f7f7f", "parallel": "#2ca02c"}

def size_label(n):
    """Return a compact tick label for an element count.

    Exact multiples keep the short form ("100K", "5M"). Other values keep up
    to two decimals ("1.5M", "2.5K") so that distinct sizes are not collapsed
    onto the same label by integer division. Values below 1000 are returned
    as plain digits.
    """
    for threshold, suffix in ((1_000_000, "M"), (1000, "K")):
        if n >= threshold:
            whole, remainder = divmod(n, threshold)
            if remainder == 0:
                return f"{whole}{suffix}"
            # Trim trailing zeros (and a dangling dot) left by the fixed-point format.
            return f"{n / threshold:.2f}".rstrip("0").rstrip(".") + suffix
    return str(n)

# ---------------------------------------------------------------------------
# Plot generation: per-user figures first, then figures over all users.
# Both are drawn by the same helpers so the two variants cannot drift apart.
# ---------------------------------------------------------------------------

# Timing columns that get their own mean/std figure.
PLOT_METRICS = ("time_total", "time_compute")


def _aggregate_timings(frame):
    """Return mean and std of every column in `time_cols` per (version, size).

    The two-level columns produced by `agg` are flattened to names such as
    "time_total_mean"; the grouping keys stay "version" and "size".
    """
    stats = frame.groupby(["version", "size"])[time_cols].agg(["mean", "std"]).reset_index()
    stats.columns = ["_".join(c).strip("_") for c in stats.columns]
    return stats


def _plot_metric(stats, metric, title_suffix, file_tag):
    """Save an error-bar plot (mean ± std vs. array size) of `metric`, one line per version.

    stats        -- frame returned by `_aggregate_timings`
    metric       -- timing column name, e.g. "time_total"
    title_suffix -- text appended to the figure title
    file_tag     -- text inserted into the output file name
    """
    metric_title = metric.replace('time_', '').replace('_', ' ').title()
    fig, ax = plt.subplots(figsize=(12, 6))
    fig.suptitle(f"Array Analysis — {metric_title} — {title_suffix}",
                 fontsize=14, fontweight="bold")

    # Tick position of every known size. Points are placed by their own size,
    # so a version (or user) that lacks some size still lines up with the
    # shared x axis instead of producing x/y arrays of different lengths.
    size_position = {size: idx for idx, size in enumerate(unique_sizes)}

    present_versions = sorted(stats["version"].unique())
    # Small horizontal offsets keep overlapping error bars readable.
    jitter_offset = np.linspace(-0.15, 0.15, len(present_versions))

    for offset, version in zip(jitter_offset, present_versions):
        version_stats = stats[stats["version"] == version].sort_values("size")
        if version_stats.empty:
            continue

        x_jitter = version_stats["size"].map(size_position).to_numpy(dtype=float) + offset
        y_mean = version_stats[f"{metric}_mean"].to_numpy()
        y_std = version_stats[f"{metric}_std"].to_numpy()

        ax.errorbar(x_jitter, y_mean, yerr=y_std, marker='o',
                    label=version.capitalize(), linewidth=2, capsize=5,
                    linestyle='-', alpha=0.8, color=colors.get(version, "#444"))

    ax.set_xlabel("Array Size", fontsize=11)
    ax.set_ylabel(f"{metric_title} (s)", fontsize=11)
    ax.set_xticks(range(len(unique_sizes)))
    ax.set_xticklabels([size_label(s) for s in unique_sizes])
    ax.legend(fontsize=10)
    ax.grid(True, alpha=0.3)
    ax.set_yscale("log")

    plt.tight_layout()
    outpath = f"plots/plot_1_3_{file_tag}_{metric}.png"
    fig.savefig(outpath, dpi=300, bbox_inches="tight")
    plt.close(fig)  # figures are only saved; closing keeps memory flat in the loop
    print(f"Saved: {outpath}")


def _plot_speedup(stats, title_suffix, file_tag):
    """Save a plot of sequential / parallel mean total time per array size.

    Only sizes measured for both versions are plotted; nothing is written when
    there is no such size. Returns None.
    """
    seq_mean = stats.loc[stats["version"] == "sequential"].set_index("size")["time_total_mean"]
    par_mean = stats.loc[stats["version"] == "parallel"].set_index("size")["time_total_mean"]

    # Only compute speedup for sizes that exist in both versions
    common_sizes = sorted(set(seq_mean.index) & set(par_mean.index))
    if not common_sizes:
        return

    seq_times = seq_mean.reindex(common_sizes).to_numpy(dtype=float)
    par_times = par_mean.reindex(common_sizes).to_numpy(dtype=float)
    # A zero parallel time would give an infinite ratio and wreck the axis
    # limits; show such points as gaps (NaN) instead.
    with np.errstate(divide="ignore", invalid="ignore"):
        speedup = np.where(par_times > 0, seq_times / par_times, np.nan)

    positions = range(len(common_sizes))
    fig, ax = plt.subplots(figsize=(12, 6))
    fig.suptitle(f"Array Analysis — Speedup — {title_suffix}", fontsize=14, fontweight="bold")

    ax.plot(positions, speedup, marker='o', linewidth=2, markersize=10, color='green')
    ax.fill_between(positions, speedup, alpha=0.3, color='green')
    ax.set_xlabel("Array Size", fontsize=11)
    ax.set_ylabel("Speedup", fontsize=11)
    ax.set_xticks(positions)
    ax.set_xticklabels([size_label(s) for s in common_sizes])
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    outpath = f"plots/plot_1_3_{file_tag}_speedup.png"
    fig.savefig(outpath, dpi=300, bbox_inches="tight")
    plt.close(fig)
    print(f"Saved: {outpath}")


# Create plots for each user
for user in unique_users:
    user_df = df[df["user"] == user].copy()

    if user_df.empty:
        continue

    # Aggregate by version and size
    stats = _aggregate_timings(user_df)

    for metric in PLOT_METRICS:
        _plot_metric(stats, metric, f"User: {user}", user)

    _plot_speedup(stats, f"User: {user}", user)

# Create combined plots with all users (pointless duplicate when there is only one)
if len(unique_users) > 1:
    stats_all = _aggregate_timings(df)

    for metric in PLOT_METRICS:
        _plot_metric(stats_all, metric, "All Users", "all_users")

    _plot_speedup(stats_all, "All Users", "all_users")

print(f"\nUnique users: {list(unique_users)}")
print(f"Unique sizes: {[size_label(s) for s in unique_sizes]}")
print("All plots generated with error bars and means.")

Saved: plots/plot_1_3_marr_time_total.png
Saved: plots/plot_1_3_marr_time_compute.png
Saved: plots/plot_1_3_marr_time_compute.png
Saved: plots/plot_1_3_marr_speedup.png
Saved: plots/plot_1_3_marr_speedup.png
Saved: plots/plot_1_3_phoebus_time_total.png
Saved: plots/plot_1_3_phoebus_time_total.png
Saved: plots/plot_1_3_phoebus_time_compute.png
Saved: plots/plot_1_3_phoebus_time_compute.png
Saved: plots/plot_1_3_phoebus_speedup.png
Saved: plots/plot_1_3_phoebus_speedup.png
Saved: plots/plot_1_3_all_users_time_total.png
Saved: plots/plot_1_3_all_users_time_total.png
Saved: plots/plot_1_3_all_users_time_compute.png
Saved: plots/plot_1_3_all_users_time_compute.png
Saved: plots/plot_1_3_all_users_speedup.png

Unique users: ['marr', 'phoebus']
Unique sizes: ['100K', '1M', '5M']
All plots generated with error bars and means.
Saved: plots/plot_1_3_all_users_speedup.png

Unique users: ['marr', 'phoebus']
Unique sizes: ['100K', '1M', '5M']
All plots generated with error bars and means.


In [13]:
# Debug: Print actual speedup values
# For every user, list the mean sequential and parallel total times and their
# ratio, restricted to the array sizes measured for both versions.
for user in unique_users:
    user_df = df[df["user"] == user]
    stats = user_df.groupby(["version", "size"])[time_cols].agg(["mean", "std"]).reset_index()
    stats.columns = ["_".join(name_parts).strip("_") for name_parts in stats.columns]

    seq_stats = stats[stats["version"] == "sequential"].copy()
    par_stats = stats[stats["version"] == "parallel"].copy()

    common_sizes = sorted(set(seq_stats["size"]).intersection(par_stats["size"]))
    if not common_sizes:
        # Nothing to compare for this user.
        continue

    seq_by_size = seq_stats.set_index("size")["time_total_mean"]
    par_by_size = par_stats.set_index("size")["time_total_mean"]
    seq_times = seq_by_size.reindex(common_sizes).to_numpy()
    par_times = par_by_size.reindex(common_sizes).to_numpy()
    speedup = seq_times / par_times

    print(f"\nUser: {user}")
    for size, seq_t, par_t, ratio in zip(common_sizes, seq_times, par_times, speedup):
        print(f"  Size {size_label(size)}: seq={seq_t:.4f}s, par={par_t:.4f}s, speedup={ratio:.4f}x")


User: marr
  Size 100K: seq=0.0145s, par=0.0343s, speedup=0.4238x
  Size 1M: seq=0.1563s, par=0.1294s, speedup=1.2074x
  Size 5M: seq=0.3334s, par=0.3809s, speedup=0.8754x

User: phoebus
  Size 100K: seq=0.0069s, par=0.0069s, speedup=1.0082x
  Size 1M: seq=0.0631s, par=0.0639s, speedup=0.9886x
  Size 5M: seq=0.3128s, par=0.3158s, speedup=0.9905x
