In [1]:
# Imports and CSV load
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Input CSV and plot output directory both live under the current working directory
CSV_PATH = os.path.join(os.getcwd(), 'results_2.2.csv')
PLOTS_DIR = os.path.join(os.getcwd(), 'plots')
os.makedirs(PLOTS_DIR, exist_ok=True)

# Load the results; a file without a 'user' column is attributed to 'unknown'
res = pd.read_csv(CSV_PATH)
if 'user' not in res.columns:
    res = res.assign(user='unknown')

# Cast the integer-valued columns in a single pass
res = res.astype({'n': int, 'sparsity': int, 'reps': int})

# Keep the raw threads value as text so a 'sequential' marker survives the
# numeric conversion applied to the 'threads' column afterwards
res = res.assign(threads_label=res['threads'].astype(str))

def threads_to_int(x):
    """Convert a raw ``threads`` value to an integer thread count.

    Parameters
    ----------
    x : object
        Raw value, e.g. ``'sequential'``, ``'8'``, ``8`` or ``'8.0'``.

    Returns
    -------
    int
        * ``1`` for the label 'sequential' (case-insensitive, surrounding
          whitespace ignored);
        * ``int(x)`` when that conversion succeeds;
        * the integer value of a float-formatted whole number such as
          ``'8.0'``;
        * ``1`` for anything else (non-numeric text, NaN, fractional text).
    """
    if str(x).strip().lower() == 'sequential':
        return 1
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        pass
    # A float-typed column stringifies whole counts as '8.0', which int()
    # rejects; without this fallback every such count would collapse to 1.
    try:
        as_float = float(x)
    except (TypeError, ValueError):
        return 1
    # is_integer() is False for NaN and +/-inf, so those fall back to 1 too.
    return int(as_float) if as_float.is_integer() else 1

# Replace the raw 'threads' column with its numeric form, one label at a time
res['threads'] = res['threads_label'].map(threads_to_int)

# Helper to create a filesystem-safe username
def sanitize_user(u):
    """Return ``u`` as text that is safe to embed in a file name.

    The value is converted with ``str``, surrounding whitespace is removed,
    and every character other than ASCII letters, digits, '.', '_' and '-'
    is replaced by an underscore.
    """
    trimmed = str(u).strip()
    return re.sub(r"[^0-9A-Za-z._-]", '_', trimmed)

In [None]:
# Compact grouped bar plotting function
def plot_compact_timing(df, user, n, sparsity):
    """Save a grouped bar chart of mean timings per thread count.

    Rows of ``df`` matching ``user``, ``n`` and ``sparsity`` are grouped by
    'threads'. For each thread count three bars are drawn (CSR construction,
    CSR SpMV, dense) at the group mean, with the group standard deviation as
    the error bar (0 where the standard deviation is undefined).

    The figure is written to ``PLOTS_DIR`` as
    ``timing_n{n}_s{sparsity}_user_{sanitized user}.png`` and then closed.
    Returns ``None``; nothing is drawn or saved when no rows match.
    """
    mask = (df['user'] == user) & (df['n'] == n) & (df['sparsity'] == sparsity)
    selected = df[mask].copy()
    if selected.empty:
        return

    # Per-thread mean and spread, with the spread table aligned row-for-row to the means
    per_thread = selected.groupby('threads')
    mean_tbl = per_thread.mean(numeric_only=True).reset_index().sort_values('threads')
    std_tbl = per_thread.std(numeric_only=True).reset_index().fillna(0)
    std_tbl = std_tbl.set_index('threads').reindex(mean_tbl['threads']).reset_index(drop=True)

    threads = mean_tbl['threads'].values
    width = 0.25
    x = np.arange(len(threads))

    # (horizontal offset, column, legend label, colour) for each bar series
    series = (
        (-width, 'time_csr_construct', 'CSR construct', '#4C72B0'),
        (0.0, 'time_spmv_total', 'CSR SpMV', '#55A868'),
        (width, 'time_dense_total', 'Dense', '#C44E52'),
    )

    fig, ax = plt.subplots(figsize=(6,3.5))
    for offset, column, label, colour in series:
        ax.bar(
            x + offset,
            mean_tbl[column].values,
            width,
            yerr=std_tbl[column].values,
            capsize=3,
            label=label,
            color=colour,
        )
    ax.set_xticks(x)
    ax.set_xticklabels(threads)
    ax.set(xlabel='Threads', ylabel='Time (s)', title=f'n={n} s={sparsity}%')
    ax.grid(axis='y', alpha=0.25)
    ax.legend(fontsize=8)
    fig.tight_layout()

    out_path = os.path.join(
        PLOTS_DIR, f'timing_n{n}_s{sparsity}_user_{sanitize_user(user)}.png'
    )
    fig.savefig(out_path, dpi=200)
    plt.close(fig)

# Speedup plot: baseline is threads==1
def plot_speedup(df, user, n):
    """Plot CSR SpMV speedup versus thread count for one user and matrix size.

    One error-bar line is drawn per sparsity level. Speedup is the mean
    baseline 'time_spmv_total' divided by the mean 'time_spmv_total' at each
    thread count. The baseline comes from rows whose 'threads_label' is
    'sequential' (case-insensitive); when there are none, rows with
    ``threads == 1`` are used. A sparsity level with no baseline rows, or with
    a zero/NaN baseline mean, is skipped.

    Error bars are the first-order propagation of the sample standard
    deviations of the baseline and of each thread count's timings. Both are
    computed from the raw, un-aggregated rows; a group with a single
    measurement contributes zero spread.

    The figure is written to ``PLOTS_DIR`` as
    ``speedup_n{n}_user_{sanitized user}.png`` and then closed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'user', 'n', 'sparsity', 'threads',
        'threads_label' and 'time_spmv_total'.
    user : object
        Value matched against ``df['user']``.
    n : object
        Value matched against ``df['n']``.

    Returns
    -------
    None
        Returns early, without creating a figure, when no rows match.
    """
    data = df[(df['user'] == user) & (df['n'] == n)].copy()
    if data.empty:
        return

    fig, ax = plt.subplots(figsize=(6,3.5))
    plotted = False
    for s, raw_s in data.groupby('sparsity'):
        times = raw_s['time_spmv_total']

        # Prefer explicitly labelled sequential runs; otherwise use threads == 1
        baseline = times[raw_s['threads_label'].str.lower() == 'sequential']
        if baseline.empty:
            baseline = times[raw_s['threads'] == 1]
        if baseline.empty:
            continue
        b = baseline.mean()
        if b == 0 or np.isnan(b):
            continue
        # std() of a single measurement is NaN; treat that as "no spread"
        # rather than letting NaN poison every error bar of this line.
        sb = baseline.std()
        if np.isnan(sb):
            sb = 0.0

        # Mean and spread per thread count, taken from the raw rows so the
        # spread reflects repeated measurements (an already-averaged table
        # has one row per thread count and therefore no spread at all).
        stats = raw_s.groupby('threads')['time_spmv_total'].agg(['mean', 'std']).sort_index()
        m = stats['mean'].values
        sm = stats['std'].fillna(0).values

        speedup_mean = b / m
        # propagate uncertainty from baseline and measured times:
        # var(b/m) ~= sb^2 / m^2 + b^2 * sm^2 / m^4
        var = (sb**2) / (m**2) + (b**2) * (sm**2) / (m**4)
        speedup_std = np.sqrt(var)
        ax.errorbar(stats.index.values, speedup_mean, yerr=speedup_std, marker='o', label=f's={s}%', capsize=4)
        plotted = True

    # Show integer thread counts on the x-axis (use actual thread values)
    all_threads = sorted(data['threads'].unique())
    ax.set_xticks(all_threads)
    ax.set_xticklabels([str(int(t)) for t in all_threads])
    ax.set_xlabel('Threads')
    ax.set_ylabel('Speedup (x)')
    ax.set_title(f'Speedup (CSR SpMV) n={n}')
    ax.grid(True, alpha=0.25)
    if plotted:
        # legend() warns when there are no labelled artists to show
        ax.legend(fontsize=8)
    plt.tight_layout()

    uname = sanitize_user(user)
    fname = os.path.join(PLOTS_DIR, f'speedup_n{n}_user_{uname}.png')
    fig.savefig(fname, dpi=200)
    plt.close(fig)

In [3]:
# Generate plots per user: one timing chart per (n, sparsity) and one speedup
# chart per n
for user in sorted(res['user'].unique()):
    user_df = res[res['user'] == user]
    print(f'Generating plots for user: {user}')

    for n in sorted(user_df['n'].unique()):
        for s in sorted(user_df[user_df['n']==n]['sparsity'].unique()):
            plot_compact_timing(res, user, n, s)
        plot_speedup(res, user, n)

# Summary plot: average relative improvement (dense - csr_spmv) / dense
# for threads = 1 and threads = max(>1), averaged over sparsity levels
for user in sorted(res['user'].unique()):
    user_df = res[res['user'] == user]
    if user_df.empty: continue
    uname = sanitize_user(user)
    for n in sorted(user_df['n'].unique()):
        svals = []
        improvements_seq = []
        improvements_par = []
        for s in sorted(user_df[user_df['n']==n]['sparsity'].unique()):
            d = user_df[(user_df['n']==n) & (user_df['sparsity']==s)]
            if d.empty: continue
            # sequential (threads==1); sparsity levels without it are skipped entirely
            d1 = d[d['threads']==1]
            if d1.empty: continue
            dense_seq = d1['time_dense_total'].mean()
            csr_seq = d1['time_spmv_total'].mean()
            if dense_seq > 0:
                improvements_seq.append((dense_seq - csr_seq) / dense_seq)
            # parallel: use max threads available in this subset, but only when
            # it really is a multi-threaded run; otherwise the "parallel" bar
            # would just repeat the sequential measurement
            max_th = d['threads'].max()
            if max_th > 1:
                dmax = d[d['threads']==max_th]
                dense_par = dmax['time_dense_total'].mean()
                csr_par = dmax['time_spmv_total'].mean()
                if dense_par > 0:
                    improvements_par.append((dense_par - csr_par) / dense_par)
            svals.append(s)
        if not svals: continue
        # 0.0 stands in for "no usable measurements"
        avg_seq = np.mean(improvements_seq) if improvements_seq else 0.0
        avg_par = np.mean(improvements_par) if improvements_par else 0.0

        fig, ax = plt.subplots(figsize=(6,3.0))
        ax.bar([0,1], [avg_seq, avg_par], color=['#4C72B0','#55A868'], width=0.5)
        ax.set_xticks([0,1]); ax.set_xticklabels(['sequential','parallel'])
        ax.set_ylabel('Avg relative improvement')
        ax.set_title(f'Avg CSR vs Dense improvement n={n} user={user}')
        if min(avg_seq, avg_par) < 0:
            # CSR was slower than dense: keep the negative bar visible and
            # mark the zero line instead of clipping the axis at 0
            ax.axhline(0, color='black', linewidth=0.8)
        else:
            ax.set_ylim(bottom=0)
        ax.grid(axis='y', alpha=0.25)
        plt.tight_layout()
        fname = os.path.join(PLOTS_DIR, f'avg_improv_n{n}_user_{uname}.png')
        fig.savefig(fname, dpi=200)
        plt.close(fig)

print('All plots generated and saved to', PLOTS_DIR)

Generating plots for user: marr
All plots generated and saved to /home/marr/threads/Thread-Experiments/2_2_sparse_array_vector_multiplication/plots
