# Parallel Linear Regression - Performance Analysis

This notebook analyzes the performance results from Strong Scaling experiments of parallel linear regression implementations (OLS and GD) using MPI.

**Analysis Goals:**
- Calculate Speedup and Efficiency metrics
- Compare OLS vs GD parallel performance
- Fit Amdahl's Law to quantify serial fraction
- Generate publication-quality plots for the technical report

## 1. Setup and Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import glob
import os

# Shared matplotlib look-and-feel for every figure produced below.
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'figure.dpi': 300,
    'font.size': 11,
    'axes.labelsize': 12,
    'axes.titlesize': 14,
    'legend.fontsize': 10,
    'lines.linewidth': 2,
    'lines.markersize': 8,
})

# Sanity output: confirms the imports ran and shows where relative paths resolve.
print("Libraries imported successfully")
print(f"Current directory: {os.getcwd()}")

## 2. Load Experimental Data

Load CSV files from the `results/` directory. Expected filename format:
- `ols_n{N}_d{D}_p{P}_run{R}.csv` for OLS
- `gd_n{N}_d{D}_p{P}_run{R}.csv` for GD

Where:
- N = number of samples
- D = number of features
- P = number of processes
- R = run number

In [None]:
# Define data directories.
# Both values end with a slash because paths elsewhere in this notebook are
# built by plain string formatting (e.g. f"{RESULTS_DIR}..." / f"{PLOTS_DIR}...").
RESULTS_DIR = 'results/'
PLOTS_DIR = 'plots/'

# Create plots directory if it doesn't exist (exist_ok makes re-running the cell harmless)
os.makedirs(PLOTS_DIR, exist_ok=True)

def load_experimental_data(algorithm='ols', n=100000, d=100, results_dir=None):
    """
    Load experimental results from CSV files.

    Files are looked up as ``{algorithm}_n{n}_d{d}_p{P}_run{R}.csv``; the
    process count P and run number R are parsed from the file name.  The
    timing value is the first entry of the 'time' column, or the first cell
    of the file when there is no 'time' column.  Files whose name or content
    cannot be interpreted are skipped with a printed message instead of
    aborting the whole load.

    Parameters:
    - algorithm: 'ols' or 'gd' (any other prefix used in the file names works too)
    - n: number of samples
    - d: number of features
    - results_dir: directory to search; defaults to the module-level RESULTS_DIR

    Returns:
    - DataFrame with columns: [nprocs, run, time], sorted by (nprocs, run),
      or None when no usable file was found
    """
    import re  # function-scope import: only the file-name parser needs it

    if results_dir is None:
        results_dir = RESULTS_DIR

    prefix = f"{algorithm}_n{n}_d{d}_"
    pattern = os.path.join(results_dir, f"{prefix}p*_run*.csv")
    files = glob.glob(pattern)

    if not files:
        print(f"‚ö†Ô∏è  No files found matching pattern: {pattern}")
        return None

    # Anchor on the known prefix so an algorithm name that itself starts with
    # 'p' or 'run' (e.g. 'pgd') is never mistaken for the p{P}/run{R} fields.
    name_re = re.compile(re.escape(prefix) + r"p(\d+)_(?:.+_)?run(\d+)\.csv")

    data = []
    for path in files:
        basename = os.path.basename(path)
        match = name_re.fullmatch(basename)
        if match is None:
            print(f"Skipping {basename}: cannot parse process count / run number from the name")
            continue
        nprocs = int(match.group(1))
        run = int(match.group(2))

        try:
            df = pd.read_csv(path)
        except (pd.errors.EmptyDataError, pd.errors.ParserError) as err:
            print(f"Skipping {basename}: unreadable CSV ({err})")
            continue
        if df.empty:
            print(f"Skipping {basename}: file contains no data rows")
            continue

        # Prefer the documented 'time' column; otherwise fall back to the first cell.
        raw_time = df['time'].iloc[0] if 'time' in df.columns else df.iloc[0, 0]
        try:
            elapsed = float(raw_time)
        except (TypeError, ValueError):
            print(f"Skipping {basename}: timing value {raw_time!r} is not numeric")
            continue

        data.append({
            'nprocs': nprocs,
            'run': run,
            'time': elapsed
        })

    if not data:
        print(f"None of the {len(files)} files matching {pattern} could be used")
        return None

    df = pd.DataFrame(data)
    print(f"‚úÖ Loaded {len(data)} files for {algorithm.upper()}")
    print(f"   Process counts: {sorted(df['nprocs'].unique())}")
    print(f"   Runs per configuration: {df.groupby('nprocs').size().values}")

    # Sorting on both keys makes the row order independent of glob's listing order.
    return df.sort_values(['nprocs', 'run'])

# Try to load data (will show warning if files don't exist yet).
# With the default arguments this looks for the n=100000, d=100 result files;
# each name is None when no matching file was found.
ols_data = load_experimental_data('ols')
gd_data = load_experimental_data('gd')

# Display sample data.
# NOTE(review): `display` is not imported in this notebook — presumably the
# IPython/Jupyter built-in; confirm before running as a plain script.
if ols_data is not None:
    print("\nSample OLS data:")
    display(ols_data.head(10))

## 3. Data Aggregation and Statistics

Aggregate multiple runs for each configuration to compute mean and standard deviation.

In [None]:
def aggregate_data(df):
    """
    Collapse repeated runs into one summary row per process count.

    Parameters:
    - df: per-run DataFrame with 'nprocs' and 'time' columns, or None

    Returns:
    - DataFrame with columns: [nprocs, time_mean, time_std, runs_count],
      or None when df is None
    """
    if df is None:
        return None

    times_by_procs = df.groupby('nprocs')['time']
    summary = times_by_procs.agg(
        time_mean='mean',
        time_std='std',
        runs_count='count',
    )
    return summary.reset_index()

# Aggregate data: one summary table per algorithm.
# aggregate_data returns None for a None input, so a data set that failed to
# load simply stays None here.
ols_agg = aggregate_data(ols_data)
gd_agg = aggregate_data(gd_data)

# Display aggregated data (only for the algorithms that actually have data)
if ols_agg is not None:
    print("OLS Aggregated Results:")
    display(ols_agg)

if gd_agg is not None:
    print("\nGD Aggregated Results:")
    display(gd_agg)

## 4. Compute Performance Metrics

Calculate Speedup and Efficiency:
- **Speedup**: $S_p = T_1 / T_p$
- **Efficiency**: $E_p = S_p / p = T_1 / (p \cdot T_p)$

In [None]:
def compute_metrics(agg_df):
    """
    Compute Speedup and Efficiency metrics.

    Speedup is S_p = T_1 / T_p and efficiency is E_p = S_p / p, where T_1 is
    the mean time of the single-process (nprocs == 1) row.

    Parameters:
    - agg_df: Aggregated DataFrame with 'nprocs' and 'time_mean' columns
      (and optionally 'time_std'); None is passed through

    Returns:
    - Copy of agg_df with added columns [speedup, efficiency] and, when
      'time_std' is present, [speedup_std, efficiency_std]; None if agg_df is None

    Raises:
    - ValueError: if there is no nprocs == 1 row to serve as the baseline
    """
    if agg_df is None:
        return None

    df = agg_df.copy()

    # Get baseline time (p=1); fail with an explicit message when it is missing
    # instead of the bare IndexError that indexing an empty array would give.
    baseline = df[df['nprocs'] == 1]
    if baseline.empty:
        raise ValueError(
            "compute_metrics needs a single-process (nprocs == 1) row "
            "to use as the T1 baseline"
        )
    T1 = baseline['time_mean'].values[0]

    # Calculate speedup and efficiency
    df['speedup'] = T1 / df['time_mean']
    df['efficiency'] = df['speedup'] / df['nprocs']

    # Calculate speedup error propagation (if std available)
    if 'time_std' in df.columns:
        T1_std = baseline['time_std'].values[0]
        # First-order error propagation for S = T1/Tp with independent T1 and Tp
        df['speedup_std'] = df['speedup'] * np.sqrt(
            (T1_std / T1)**2 + (df['time_std'] / df['time_mean'])**2
        )
        # At p = 1 numerator and denominator are the same measurement, so
        # S_1 = 1 exactly; the independent-error formula would overstate it.
        df.loc[df['nprocs'] == 1, 'speedup_std'] = 0.0
        df['efficiency_std'] = df['speedup_std'] / df['nprocs']

    return df

# Compute metrics (a None aggregate yields None metrics)
ols_metrics = compute_metrics(ols_agg)
gd_metrics = compute_metrics(gd_agg)

# Display metrics — only the headline columns; the *_std columns are left
# out of this view.
if ols_metrics is not None:
    print("OLS Performance Metrics:")
    display(ols_metrics[['nprocs', 'time_mean', 'speedup', 'efficiency']])

if gd_metrics is not None:
    print("\nGD Performance Metrics:")
    display(gd_metrics[['nprocs', 'time_mean', 'speedup', 'efficiency']])

## 5. Speedup Plot

Plot Speedup vs Number of Processes with ideal linear speedup line.

In [None]:
def plot_speedup(ols_df, gd_df=None, save_path=None):
    """
    Create Speedup vs Processes plot.

    Draws one line per available algorithm (with error bars when a
    'speedup_std' column exists) plus the ideal linear-speedup diagonal.
    X-axis ticks are placed at the measured process counts only, so the axis
    stays readable for large process counts.

    Parameters:
    - ols_df: OLS metrics DataFrame with 'nprocs' and 'speedup' columns, or None
    - gd_df: GD metrics DataFrame with the same columns, or None
    - save_path: if given, the figure is also written to this path at 300 DPI
    """
    fig, ax = plt.subplots(figsize=(10, 7))

    # (metrics, legend label, marker/line format, colour) per algorithm
    series = [
        (ols_df, 'OLS', 'o-', '#2E86AB'),
        (gd_df, 'GD', 's-', '#A23B72'),
    ]

    measured_p = set()
    for df, label, fmt, color in series:
        if df is None:
            continue
        ax.plot(df['nprocs'], df['speedup'],
                fmt, label=label, color=color, linewidth=2.5, markersize=10)
        if 'speedup_std' in df.columns:
            ax.errorbar(df['nprocs'], df['speedup'],
                        yerr=df['speedup_std'], fmt='none',
                        color=color, alpha=0.3, capsize=5)
        measured_p.update(int(p) for p in df['nprocs'])

    # Plot ideal speedup line up to the largest measured process count
    max_p = max(measured_p, default=1)
    ideal_p = np.arange(1, max_p + 1)
    ax.plot(ideal_p, ideal_p, '--', label='Ideal (Linear)',
            color='gray', linewidth=2, alpha=0.7)

    # Formatting
    ax.set_xlabel('Number of Processes', fontsize=13, fontweight='bold')
    ax.set_ylabel('Speedup', fontsize=13, fontweight='bold')
    ax.set_title('Strong Scaling: Speedup vs Number of Processes',
                 fontsize=15, fontweight='bold', pad=20)
    ax.legend(loc='upper left', frameon=True, shadow=True)
    ax.grid(True, alpha=0.3, linestyle='--')
    # One tick per measured configuration rather than one per integer in 1..max_p
    ax.set_xticks(sorted(measured_p) if measured_p else [1])

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"‚úÖ Speedup plot saved to: {save_path}")

    plt.show()

# Generate plot
plot_speedup(ols_metrics, gd_metrics, save_path=f'{PLOTS_DIR}speedup.png')

## 6. Efficiency Plot

Plot Efficiency vs Number of Processes with reference lines at 100% (ideal) and 80% (good).

In [None]:
def plot_efficiency(ols_df, gd_df=None, save_path=None):
    """
    Create Efficiency vs Processes plot.

    Efficiency is shown in percent, with reference lines at 100% and 80%.
    The y-axis spans 0-110% by default and grows when a measured efficiency
    (including its error bar) exceeds that, so super-linear results are not
    clipped.

    Parameters:
    - ols_df: OLS metrics DataFrame with 'nprocs' and 'efficiency' columns, or None
    - gd_df: GD metrics DataFrame with the same columns, or None
    - save_path: if given, the figure is also written to this path at 300 DPI
    """
    fig, ax = plt.subplots(figsize=(10, 7))

    # (metrics, legend label, marker/line format, colour) per algorithm
    series = [
        (ols_df, 'OLS', 'o-', '#2E86AB'),
        (gd_df, 'GD', 's-', '#A23B72'),
    ]

    highest = 100.0  # tallest thing that must fit on the y-axis, in percent
    for df, label, fmt, color in series:
        if df is None:
            continue
        eff_pct = df['efficiency'] * 100
        ax.plot(df['nprocs'], eff_pct,
                fmt, label=label, color=color, linewidth=2.5, markersize=10)
        upper = eff_pct
        if 'efficiency_std' in df.columns:
            err_pct = df['efficiency_std'] * 100
            ax.errorbar(df['nprocs'], eff_pct,
                        yerr=err_pct, fmt='none',
                        color=color, alpha=0.3, capsize=5)
            # A missing std (single run) contributes no extra height
            upper = eff_pct + err_pct.fillna(0)
        finite_upper = upper[np.isfinite(upper)]
        if len(finite_upper):
            highest = max(highest, float(finite_upper.max()))

    # Reference lines
    ax.axhline(y=100, color='gray', linestyle='--', linewidth=2,
               label='100% (Ideal)', alpha=0.7)
    ax.axhline(y=80, color='orange', linestyle=':', linewidth=1.5,
               label='80% (Good)', alpha=0.5)

    # Formatting
    ax.set_xlabel('Number of Processes', fontsize=13, fontweight='bold')
    ax.set_ylabel('Efficiency (%)', fontsize=13, fontweight='bold')
    ax.set_title('Strong Scaling: Parallel Efficiency vs Number of Processes',
                 fontsize=15, fontweight='bold', pad=20)
    ax.legend(loc='lower left', frameon=True, shadow=True)
    ax.grid(True, alpha=0.3, linestyle='--')
    # 110 is the normal ceiling; only data above ~105% pushes it higher
    ax.set_ylim(0, max(110, highest * 1.05))

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"‚úÖ Efficiency plot saved to: {save_path}")

    plt.show()

# Generate plot
plot_efficiency(ols_metrics, gd_metrics, save_path=f'{PLOTS_DIR}efficiency.png')

## 7. Amdahl's Law Analysis

Fit Amdahl's Law to experimental data:

$$S_p = \frac{1}{f + \frac{1-f}{p}}$$

Where:
- $f$ = serial fraction
- $S_p$ = speedup with $p$ processes

Theoretical maximum speedup: $S_{\infty} = 1/f$

In [None]:
def amdahl_law(p, f):
    """Amdahl's Law: predicted speedup on p processes for serial fraction f."""
    return 1.0 / (f + (1 - f) / p)

def fit_amdahl(df):
    """
    Fit Amdahl's Law to speedup data.

    The serial fraction f is estimated by least squares (scipy curve_fit,
    constrained to 0 <= f <= 1) and the fit quality is printed as R^2.

    Parameters:
    - df: metrics DataFrame with 'nprocs' and 'speedup' columns, or None

    Returns:
    - f: serial fraction
    - S_inf: theoretical maximum speedup (1/f, infinite when f is 0)
    Both are None when df is None or when no measurement with more than one
    process is available.
    """
    if df is None:
        return None, None

    p_values = df['nprocs'].to_numpy(dtype=float)
    speedup_values = df['speedup'].to_numpy(dtype=float)

    # amdahl_law(1, f) equals 1 for every f, so p = 1 carries no information
    # about f.  Without any p > 1 point the optimiser would just hand back the
    # initial guess and it would be reported as if it were a measurement.
    if not np.any(p_values > 1):
        print("Amdahl fit skipped: need at least one measurement with more than 1 process")
        return None, None

    # Fit using curve_fit
    popt, _ = curve_fit(amdahl_law, p_values, speedup_values,
                        p0=[0.05], bounds=(0, 1))

    f = float(popt[0])
    # f can sit on the lower bound for (super-)ideal scaling; avoid 1/0
    S_inf = 1.0 / f if f > 0 else float('inf')

    # Calculate R-squared (undefined when all speedups are identical)
    residuals = speedup_values - amdahl_law(p_values, f)
    ss_res = float(np.sum(residuals**2))
    ss_tot = float(np.sum((speedup_values - np.mean(speedup_values))**2))
    r_squared = 1 - (ss_res / ss_tot) if ss_tot > 0 else float('nan')

    print(f"Serial Fraction (f): {f:.4f} ({f*100:.2f}%)")
    print(f"Theoretical Max Speedup (S‚àû): {S_inf:.2f}√ó")
    print(f"R¬≤ (goodness of fit): {r_squared:.4f}")

    return f, S_inf

# Fit Amdahl's Law for OLS.
# NOTE: ols_f / ols_sinf are only assigned when ols_metrics exists; the later
# cells test ols_metrics before reading them.
if ols_metrics is not None:
    print("OLS Amdahl's Law Fit:")
    ols_f, ols_sinf = fit_amdahl(ols_metrics)
    print()

# Fit for GD (same conditional assignment of gd_f / gd_sinf)
if gd_metrics is not None:
    print("GD Amdahl's Law Fit:")
    gd_f, gd_sinf = fit_amdahl(gd_metrics)

## 8. Amdahl's Law Plot

In [None]:
def plot_amdahl(df, f, S_inf, algorithm='OLS', save_path=None):
    """
    Plot Amdahl's Law fit with experimental data.

    Shows the measured speedups, the fitted Amdahl curve (extended to 1.5x
    the largest measured process count), the ideal linear speedup and the
    fitted serial fraction / maximum speedup as a text box.  The horizontal
    S_inf limit line is drawn only when it is finite and close enough to the
    plotted range; otherwise it would stretch the y-axis until the measured
    points collapse onto the x-axis.

    Parameters:
    - df: metrics DataFrame with 'nprocs' and 'speedup' columns, or None
    - f: fitted serial fraction, or None
    - S_inf: theoretical maximum speedup from the fit
    - algorithm: label used in the legend and title
    - save_path: if given, the figure is also written to this path at 300 DPI
    """
    if df is None or f is None:
        print("‚ö†Ô∏è  No data available for Amdahl plot")
        return

    fig, ax = plt.subplots(figsize=(10, 7))

    # Experimental data
    ax.plot(df['nprocs'], df['speedup'], 'o',
            label=f'{algorithm} (Measured)', color='#2E86AB', markersize=10)

    # Amdahl's Law fit
    p_fit = np.linspace(1, df['nprocs'].max() * 1.5, 100)
    speedup_fit = amdahl_law(p_fit, f)
    ax.plot(p_fit, speedup_fit, '-',
            label=f"Amdahl's Law (f={f:.4f})", color='#F18F01', linewidth=2.5)

    # Ideal speedup
    ax.plot(p_fit, p_fit, '--', label='Ideal (Linear)',
            color='gray', linewidth=2, alpha=0.7)

    summary = f'Serial Fraction: {f*100:.2f}%\nMax Speedup: {S_inf:.2f}√ó'
    box_style = dict(boxstyle='round', facecolor='wheat', alpha=0.5)

    # Heuristic: the ideal line tops out at p_fit.max(); a limit more than
    # twice that (or infinite) would dominate the axis, so it is reported in
    # the text box only.
    limit_fits_axis = bool(np.isfinite(S_inf)) and S_inf <= 2 * p_fit.max()

    if limit_fits_axis:
        # Theoretical limit
        ax.axhline(y=S_inf, color='red', linestyle=':', linewidth=2,
                   label=f'Theoretical Limit (S‚àû={S_inf:.1f}√ó)', alpha=0.7)
        # Annotation just below the limit line
        ax.text(df['nprocs'].max() * 0.6, S_inf * 0.9, summary,
                fontsize=11, bbox=box_style)
    else:
        # Annotation pinned to the top-left corner in axes coordinates
        ax.text(0.03, 0.97, summary, transform=ax.transAxes,
                ha='left', va='top', fontsize=11, bbox=box_style)

    # Formatting
    ax.set_xlabel('Number of Processes', fontsize=13, fontweight='bold')
    ax.set_ylabel('Speedup', fontsize=13, fontweight='bold')
    ax.set_title(f"{algorithm}: Amdahl's Law Analysis",
                 fontsize=15, fontweight='bold', pad=20)
    ax.legend(loc='lower right', frameon=True, shadow=True)
    ax.grid(True, alpha=0.3, linestyle='--')

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"‚úÖ Amdahl plot saved to: {save_path}")

    plt.show()

# Generate Amdahl plots.
# The `*_metrics is not None` test comes first on purpose: the fit results
# are only assigned when the metrics exist, so short-circuiting keeps them
# from being read while undefined.
if ols_metrics is not None and ols_f is not None:
    plot_amdahl(ols_metrics, ols_f, ols_sinf, 'OLS', 
                save_path=f'{PLOTS_DIR}amdahl_ols.png')

if gd_metrics is not None and gd_f is not None:
    plot_amdahl(gd_metrics, gd_f, gd_sinf, 'GD', 
                save_path=f'{PLOTS_DIR}amdahl_gd.png')

## 9. Summary Statistics Table

In [None]:
def create_summary_table(ols_df, gd_df=None):
    """
    Build the report-ready summary table.

    Each metrics row becomes one record with pre-formatted text columns
    (time as mean and std, speedup, efficiency in percent).  OLS rows come
    first, then GD rows; a None input contributes no rows.

    Returns:
    - DataFrame with columns: [Algorithm, Processes, Time (s), Speedup, Efficiency]
      (an empty DataFrame when both inputs are None)
    """
    def _records(label, metrics):
        # Format every metrics row of one algorithm as a display record.
        if metrics is None:
            return []
        return [
            {
                'Algorithm': label,
                'Processes': int(rec['nprocs']),
                'Time (s)': f"{rec['time_mean']:.3f} ¬± {rec.get('time_std', 0):.3f}",
                'Speedup': f"{rec['speedup']:.2f}√ó",
                'Efficiency': f"{rec['efficiency']*100:.1f}%",
            }
            for _, rec in metrics.iterrows()
        ]

    return pd.DataFrame(_records('OLS', ols_df) + _records('GD', gd_df))

# Create and display summary table
summary_table = create_summary_table(ols_metrics, gd_metrics)

# An empty table means neither algorithm produced metrics; in that case both
# the printout and the CSV export are skipped.
if not summary_table.empty:
    print("\n" + "="*70)
    print("PERFORMANCE SUMMARY TABLE")
    print("="*70)
    display(summary_table)

    # Save to CSV (written into RESULTS_DIR, next to the raw result files)
    summary_table.to_csv(f'{RESULTS_DIR}summary_table.csv', index=False)
    print(f"\n‚úÖ Summary table saved to: {RESULTS_DIR}summary_table.csv")

## 10. Key Findings and Insights

This section summarizes the main observations from the analysis.

In [None]:
# Generate key findings.
# Every statement below is derived from the measured data: the process count
# quoted next to the maximum speedup is the one where that maximum occurred,
# and the OLS-vs-GD comparison names whichever algorithm actually leads.
print("="*70)
print("KEY FINDINGS")
print("="*70)

if ols_metrics is not None:
    max_p_ols = ols_metrics['nprocs'].max()
    max_speedup_ols = ols_metrics['speedup'].max()
    # The best speedup is not necessarily reached at the largest process count
    best_p_ols = ols_metrics.loc[ols_metrics['speedup'].idxmax(), 'nprocs']
    eff_at_max_ols = ols_metrics[ols_metrics['nprocs'] == max_p_ols]['efficiency'].values[0]

    print("\nüìä OLS Performance:")
    print(f"  ‚Ä¢ Maximum speedup: {max_speedup_ols:.2f}√ó (with {best_p_ols} processes)")
    print(f"  ‚Ä¢ Efficiency at max processes: {eff_at_max_ols*100:.1f}%")
    # `is not None` rather than truthiness: a fitted serial fraction of 0.0 is a valid result
    if ols_f is not None:
        print(f"  ‚Ä¢ Serial fraction: {ols_f*100:.2f}%")
        print(f"  ‚Ä¢ Theoretical maximum: {ols_sinf:.2f}√ó")

if gd_metrics is not None:
    max_p_gd = gd_metrics['nprocs'].max()
    max_speedup_gd = gd_metrics['speedup'].max()
    best_p_gd = gd_metrics.loc[gd_metrics['speedup'].idxmax(), 'nprocs']
    eff_at_max_gd = gd_metrics[gd_metrics['nprocs'] == max_p_gd]['efficiency'].values[0]

    print("\nüìä GD Performance:")
    print(f"  ‚Ä¢ Maximum speedup: {max_speedup_gd:.2f}√ó (with {best_p_gd} processes)")
    print(f"  ‚Ä¢ Efficiency at max processes: {eff_at_max_gd*100:.1f}%")
    if gd_f is not None:
        print(f"  ‚Ä¢ Serial fraction: {gd_f*100:.2f}%")
        print(f"  ‚Ä¢ Theoretical maximum: {gd_sinf:.2f}√ó")

if ols_metrics is not None and gd_metrics is not None:
    print("\nüîÑ OLS vs GD Comparison:")
    speedup_diff = max_speedup_ols - max_speedup_gd
    eff_diff = (eff_at_max_ols - eff_at_max_gd) * 100

    # Name the leader from the sign of each difference instead of assuming OLS wins
    if speedup_diff == 0:
        speedup_note = "both reach the same maximum speedup"
    else:
        speedup_note = f"{'OLS' if speedup_diff > 0 else 'GD'} reaches the higher speedup"
    if eff_diff == 0:
        eff_note = "both are equally efficient"
    else:
        eff_note = f"{'OLS' if eff_diff > 0 else 'GD'} is more efficient"

    print(f"  ‚Ä¢ Speedup difference: {abs(speedup_diff):.2f}√ó ({speedup_note})")
    print(f"  ‚Ä¢ Efficiency difference: {abs(eff_diff):.1f}% ({eff_note})")
    # The communication-overhead explanation only applies when GD is the less efficient one
    if eff_diff > 0:
        print("  ‚Ä¢ Reason: GD has higher communication overhead due to iterative updates")

print("\n" + "="*70)

## 11. Export All Plots

Summary of all generated plots (ready for inclusion in technical report).

In [None]:
import os

# Report which of the expected figure files exist in PLOTS_DIR and how large
# they are.  Nothing is (re)generated here; the files come from the plotting
# cells above.
print("\nüìÅ Generated Plots:")
print("="*70)

# File names must match the save_path values used by the plotting cells
plot_files = [
    'speedup.png',
    'efficiency.png',
    'amdahl_ols.png',
    'amdahl_gd.png'
]

for plot_file in plot_files:
    full_path = os.path.join(PLOTS_DIR, plot_file)
    if os.path.exists(full_path):
        size = os.path.getsize(full_path) / 1024  # KB
        print(f"‚úÖ {plot_file:25s} ({size:.1f} KB)")
    else:
        print(f"‚ö†Ô∏è  {plot_file:25s} (not generated)")

# NOTE: the closing message is printed unconditionally, even if some files
# were reported as not generated above.
print("\nüí° All plots are saved in high resolution (300 DPI) and ready for report inclusion.")
print(f"üìÇ Location: {os.path.abspath(PLOTS_DIR)}")

---

## Notes

**Usage Instructions:**
1. After running experiments on the cluster, download all CSV files to `results/` directory
2. Run all cells in this notebook (Cell → Run All)
3. Review generated plots in `plots/` directory
4. Use the summary table and key findings for your technical report

**Data Format Expected:**
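- CSV files should contain at least a `time` column with execution time in seconds; only the first row is read, and if there is no `time` column the first value of the first column is used instead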
- Filename format: `{algorithm}_n{N}_d{D}_p{P}_run{R}.csv`

**Customization:**
- Adjust plot colors, styles, and sizes in the plotting functions
- Modify figure sizes by changing `figsize` parameter
- Add more metrics or plots as needed

---
*Analysis completed on: January 10, 2026*