In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import os

# Maps each profiling CSV path to the list of display names to keep from that
# file; load_and_filter_data() discards rows whose display name is not listed.
FILES_TO_PROCESS = {
    'data/results_nas_profiling_20250815_000950.csv': ['NAS-BT'],
    'data/results_hpc_profiling_20250815_105543.csv': ['SIM-pulse_spikes'] 
}

def get_display_name(original_name):
    """Map a raw job name to its standardized display name.

    The lookup is a case-insensitive substring match: the first mapping key
    (in declaration order) contained in the lower-cased name wins.

    Args:
        original_name: Raw job name, normally a string. Non-string values
            (e.g. NaN produced by an empty CSV cell) are returned unchanged
            instead of raising ``AttributeError``.

    Returns:
        The display name for the first matching key, or ``original_name``
        itself when nothing matches.
    """
    # Empty CSV cells arrive as float NaN, which has no .lower().
    if not isinstance(original_name, str):
        return original_name

    lowered = original_name.lower()
    name_mapping = {
        'nas-bt': 'NAS-BT',
        'nas-cg': 'NAS-CG',
        'nas-ep': 'NAS-EP',
        'nas-mg': 'NAS-MG',
        'nas-sp': 'NAS-SP',
        'hpc-pulse-spikes': 'SIM-pulse_spikes',
        'hpc-sawtooth': 'SIM-sawtooth',
        'hpc-staircase': 'SIM-staircase',
        'hpc-epochs': 'SIM-epochs',
    }

    for key, value in name_mapping.items():
        if key in lowered:
            return value
    return original_name

def load_and_filter_data():
    """Load every CSV in FILES_TO_PROCESS and keep only the selected apps.

    For each file the column names are stripped of surrounding whitespace, a
    'Display Name' column is derived from 'Job Name' via get_display_name(),
    and only rows whose display name is listed for that file are kept.

    Returns:
        A single DataFrame with the kept rows of all files (index reset), or
        an empty DataFrame when no file contributed any row.
    """
    frames = []

    for file_path, apps_to_keep in FILES_TO_PROCESS.items():
        # Only the read can raise FileNotFoundError; keep the try minimal so
        # unrelated errors are not masked.
        try:
            df = pd.read_csv(file_path)
        except FileNotFoundError:
            # Best-effort: skip the file, but say so instead of silently
            # producing charts with missing applications.
            print(f"Warning: '{file_path}' not found; skipping.")
            continue

        df.columns = df.columns.str.strip()
        df['Display Name'] = df['Job Name'].apply(get_display_name)
        kept = df[df['Display Name'].isin(apps_to_keep)].copy()

        if not kept.empty:
            frames.append(kept)

    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

def preprocess_data(df):
    """Clean the raw samples and add per-application elapsed time.

    The frame is modified in place and also returned.

    Steps:
      * 'Timestamp' is parsed to datetime; the usage and VPA columns are
        coerced to numbers (unparseable values become NaN/NaT).
      * If the largest 'VPA Target Mem (Mi)' value exceeds 1024**2, the whole
        column is divided by 1024**2 (presumably the recommendation was
        reported in bytes rather than MiB -- confirm against the collector).
      * Missing VPA targets are replaced by 0.
      * Rows lacking a timestamp, display name, or usage value are dropped.
      * Rows are sorted by application and time, and 'Start Time' /
        'Elapsed Time (s)' are computed relative to each application's first
        sample.

    Args:
        df: Frame with 'Timestamp', 'Display Name', 'Pod CPU Usage (m)',
            'Pod Memory Usage (Mi)', 'VPA Target CPU (m)' and
            'VPA Target Mem (Mi)' columns.

    Returns:
        The same DataFrame object, cleaned; an empty input is returned as is.
    """
    if df.empty:
        return df

    df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')

    usage_cols = ['Pod CPU Usage (m)', 'Pod Memory Usage (Mi)']
    vpa_cols = ['VPA Target CPU (m)', 'VPA Target Mem (Mi)']
    for col in usage_cols + vpa_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # An all-NaN column has max() == NaN, and NaN > x is False, so no extra
    # emptiness check is needed here.
    if df['VPA Target Mem (Mi)'].max() > 1024**2:
        df['VPA Target Mem (Mi)'] = df['VPA Target Mem (Mi)'] / 1024**2

    # Assign the result back: `df[col].fillna(0, inplace=True)` operates on a
    # temporary Series and does not update `df` under pandas Copy-on-Write.
    for col in vpa_cols:
        df[col] = df[col].fillna(0)

    df.dropna(subset=['Timestamp', 'Display Name'] + usage_cols, inplace=True)

    df.sort_values(by=['Display Name', 'Timestamp'], inplace=True)
    df['Start Time'] = df.groupby('Display Name')['Timestamp'].transform('min')
    df['Elapsed Time (s)'] = (df['Timestamp'] - df['Start Time']).dt.total_seconds()

    return df

def _save_usage_pdf(df, app_names, filename, usage_col, target_col,
                    usage_label, usage_color, target_color, y_label):
    """Plot usage vs. VPA target, one subplot per app, into a one-page PDF."""
    num_apps = len(app_names)

    with PdfPages(filename) as pdf:
        fig, axes = plt.subplots(num_apps, 1, figsize=(8, 4 * num_apps), squeeze=False)
        try:
            for ax, app_name in zip(axes[:, 0], app_names):
                app_data = df[df['Display Name'] == app_name]
                elapsed = app_data['Elapsed Time (s)']

                ax.fill_between(elapsed, app_data[usage_col],
                                color=usage_color, alpha=0.7, label=usage_label)
                ax.plot(elapsed, app_data[target_col],
                        color=target_color, linestyle='--', linewidth=2,
                        label='VPA Recommendation')

                ax.set_xlabel('Elapsed Time (s)', fontweight='bold')
                ax.set_ylabel(y_label, fontweight='bold')
                ax.legend(frameon=True, fancybox=True, shadow=True)
                ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)
                ax.set_ylim(bottom=0)
                ax.set_xlim(left=0)

            fig.tight_layout()
            pdf.savefig(fig, bbox_inches='tight', dpi=300)
        finally:
            # Release the figure even if plotting or saving fails.
            plt.close(fig)


def create_charts(df, cpu_filename='cpu_analysis.pdf', mem_filename='memory_analysis.pdf'):
    """Write CPU and memory usage charts, each as a single-page PDF.

    Each PDF holds one subplot per application (sorted by display name)
    showing measured pod usage as a filled area and the VPA target as a
    dashed line, both against elapsed time.

    Note: this sets the global matplotlib rcParams 'font.size' and
    'axes.linewidth', which remain in effect after the call.

    Args:
        df: Preprocessed frame with 'Display Name', 'Elapsed Time (s)' and
            the pod usage / VPA target columns.
        cpu_filename: Output path of the CPU chart PDF.
        mem_filename: Output path of the memory chart PDF.

    Returns:
        None. Nothing is written when ``df`` is empty.
    """
    if df.empty:
        return

    app_names = sorted(df['Display Name'].unique())
    if not app_names:
        return

    plt.rcParams['font.size'] = 16
    plt.rcParams['axes.linewidth'] = 0.8

    _save_usage_pdf(
        df, app_names, cpu_filename,
        usage_col='Pod CPU Usage (m)', target_col='VPA Target CPU (m)',
        usage_label='CPU Usage', usage_color='#2E86AB', target_color='#F24236',
        y_label='CPU (mCores)',
    )
    _save_usage_pdf(
        df, app_names, mem_filename,
        usage_col='Pod Memory Usage (Mi)', target_col='VPA Target Mem (Mi)',
        usage_label='Memory Usage', usage_color='#A23B72', target_color='#F18F01',
        y_label='Memory (MiB)',
    )

def main():
    """Load, clean and chart the profiling data.

    Writes 'vpa_cpu.pdf' and 'vpa_memory.pdf' when there is data to plot and
    does nothing otherwise.
    """
    prepared = preprocess_data(load_and_filter_data())
    if prepared.empty:
        return

    create_charts(prepared, cpu_filename='vpa_cpu.pdf', mem_filename='vpa_memory.pdf')

# Run the charting pipeline only when this code is executed directly, not
# when it is imported as a module.
if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import warnings
warnings.filterwarnings('ignore')



# Display names of all applications; create_success_chart() uses the length
# of this list as the "Total Submitted" reference line.
APP_DISPLAY_NAMES = [
    'NAS-BT', 'NAS-CG', 'NAS-EP', 'NAS-MG', 'NAS-SP',
    'SIM-pulse_spikes', 'SIM-sawtooth', 'SIM-staircase', 'SIM-epochs'
]

# Scenario label -> path of the results CSV read for that scenario.
SCENARIOS_TO_PROCESS = {
    'Guaranteed': 'data/results_combined_guaranteed_20250815_153205.csv',
    'Extreme': 'data/results_combined_extreme_20250815_144056.csv',
    'Clairvoyant': 'data/results_combined_clairvoyant_20250818_101541.csv'
}

def get_display_name(original_name):
    """Standardize job names for consistent filtering.

    The lookup is a case-insensitive substring match: the first mapping key
    (in declaration order) contained in the lower-cased name wins.

    Args:
        original_name: Raw job name, normally a string. Non-string values
            (e.g. NaN produced by an empty CSV cell) are returned unchanged
            instead of raising ``AttributeError``.

    Returns:
        The display name for the first matching key, or ``original_name``
        itself when nothing matches.
    """
    # Empty CSV cells arrive as float NaN, which has no .lower().
    if not isinstance(original_name, str):
        return original_name

    lowered = original_name.lower()
    name_mapping = {
        'nas-bt': 'NAS-BT',
        'nas-cg': 'NAS-CG',
        'nas-ep': 'NAS-EP',
        'nas-mg': 'NAS-MG',
        'nas-sp': 'NAS-SP',
        'hpc-pulse-spikes': 'SIM-pulse_spikes',
        'hpc-sawtooth': 'SIM-sawtooth',
        'hpc-staircase': 'SIM-staircase',
        'hpc-epochs': 'SIM-epochs',
    }

    for key, value in name_mapping.items():
        if key in lowered:
            return value
    return original_name


def find_successful_apps(file_path, scenario_name):
    """List the jobs that reached the 'Succeeded' pod status in one scenario.

    Args:
        file_path: Path of the scenario's results CSV.
        scenario_name: Label stored in the 'Scenario' column of the result.

    Returns:
        A DataFrame with one row per distinct succeeded job ('Job_Name',
        'Scenario'), or None when the file does not exist.
    """
    try:
        records = pd.read_csv(file_path)
    except FileNotFoundError:
        return None

    # Headers may carry stray whitespace around the column names.
    records.columns = records.columns.str.strip()
    succeeded = records['Pod Status'] == 'Succeeded'
    job_names = records.loc[succeeded, 'Job Name'].unique()
    return pd.DataFrame({'Job_Name': job_names, 'Scenario': scenario_name})

def process_all_scenarios():
    """Collect the succeeded applications of every configured scenario.

    Returns:
        A DataFrame with 'Job_Name' (standardized display name) and
        'Scenario', holding at most one row per scenario/application pair;
        an empty DataFrame when no scenario file could be read.
    """
    per_scenario = (
        find_successful_apps(path, label)
        for label, path in SCENARIOS_TO_PROCESS.items()
    )
    # find_successful_apps() signals a missing file with None.
    frames = [frame for frame in per_scenario if frame is not None]

    if not frames:
        return pd.DataFrame()

    merged = pd.concat(frames, ignore_index=True)
    merged['Job_Name'] = merged['Job_Name'].apply(get_display_name)
    # Different raw job names can collapse to the same display name.
    return merged.drop_duplicates(subset=['Scenario', 'Job_Name'])


def create_success_chart(df, output_filename='success_analysis.pdf'):
    """Create a bar chart of successfully executed applications per scenario.

    One bar per scenario (Guaranteed, Clairvoyant, Extreme) shows how many
    distinct applications succeeded; a dashed line marks the total number of
    applications, taken from ``len(APP_DISPLAY_NAMES)``.

    Note: this changes the global matplotlib style and 'font.size' rcParam.

    Args:
        df: Frame with 'Scenario' and 'Job_Name' columns.
        output_filename: Path of the PDF to write.

    Returns:
        A status message string.
    """
    if df.empty:
        return "No data available for visualization"

    total_submitted_apps = len(APP_DISPLAY_NAMES)
    scenario_order = ['Guaranteed', 'Clairvoyant', 'Extreme']
    # fill_value=0: a scenario in which nothing succeeded is absent from the
    # groupby result; without a fill it becomes NaN and int(height) below
    # raises ValueError.
    counts_per_scenario = (
        df.groupby('Scenario')['Job_Name']
        .nunique()
        .reindex(scenario_order, fill_value=0)
    )

    plt.style.use('seaborn-v0_8-whitegrid')
    plt.rcParams['font.size'] = 16

    with PdfPages(output_filename) as pdf:
        fig, ax = plt.subplots(figsize=(10, 7))
        try:
            scenario_names = counts_per_scenario.index.tolist()
            executed_counts = counts_per_scenario.values.tolist()
            colors = sns.color_palette('Blues', n_colors=len(scenario_names))

            bars = ax.bar(scenario_names, executed_counts, color=colors,
                          edgecolor='black', width=0.7)

            ax.axhline(y=total_submitted_apps, color='grey', linestyle='--',
                       linewidth=2, label=f'Total Submitted ({total_submitted_apps})')

            # Print the count just above each bar.
            for bar in bars:
                height = bar.get_height()
                ax.annotate(f'{int(height)}',
                            xy=(bar.get_x() + bar.get_width() / 2, height),
                            xytext=(0, 3), textcoords="offset points",
                            ha='center', va='bottom', fontsize=16, fontweight='bold')

            ax.set_ylabel('Number of Executed Apps', fontsize=16, fontweight='bold')
            ax.tick_params(axis='x', labelsize=16)
            ax.tick_params(axis='y', labelsize=16)
            # Leave headroom above the reference line for the labels.
            ax.set_ylim(0, total_submitted_apps * 1.2)
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)
            ax.legend(fontsize=14)

            fig.tight_layout()
            pdf.savefig(fig, bbox_inches='tight', dpi=300)
        finally:
            # Release the figure even if plotting or saving fails.
            plt.close(fig)

    return f"Success analysis chart saved to '{output_filename}'"


def main():
    """Build the success chart and return a status message."""
    successes = process_all_scenarios()

    if successes.empty:
        return "No successful applications found in any scenario."
    return create_success_chart(successes)

# Run the analysis and print the status message returned by main(); the
# guard keeps this from executing when the code is imported.
if __name__ == "__main__":
    result = main()
    print(result)

Success analysis chart saved to 'success_analysis.pdf'


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import warnings
warnings.filterwarnings('ignore')



# Scenario label -> path of the results CSV from which pending times are
# computed.
SCENARIOS_TO_PROCESS = {
    'Guaranteed': 'data/results_combined_guaranteed_20250815_153205.csv',
    'Extreme': 'data/results_combined_extreme_20250815_144056.csv',
    'Clairvoyant': 'data/results_combined_clairvoyant_20250818_101541.csv'
}



def calculate_pending_times(file_path, scenario_name):
    """Measure how long each job of a scenario stayed in the 'Pending' state.

    For every job, the pending time is the span between its first 'Pending'
    sample and the first later sample with any other status. Jobs that were
    never pending, or never left pending, are skipped.

    Args:
        file_path: Path of the scenario's results CSV.
        scenario_name: Label stored in the 'Scenario' column of the result.

    Returns:
        A DataFrame with 'Job_Name', 'Pending_Time_sec' and 'Scenario', or
        None when the file is missing or no job yields a pending time.
    """
    try:
        samples = pd.read_csv(file_path)
        samples.columns = samples.columns.str.strip()
        samples['Timestamp'] = pd.to_datetime(samples['Timestamp'], errors='coerce')
        samples.dropna(subset=['Timestamp', 'Job Name', 'Pod Status'], inplace=True)

        records = []

        # sort=False keeps jobs in order of first appearance in the file.
        for job_name, job_rows in samples.groupby('Job Name', sort=False):
            job_rows = job_rows.sort_values('Timestamp')
            is_pending = job_rows['Pod Status'] == 'Pending'

            if not is_pending.any():
                continue

            pending_start = job_rows.loc[is_pending, 'Timestamp'].iloc[0]
            # First sample strictly after the pending start that is no
            # longer 'Pending'.
            after_start = job_rows['Timestamp'] > pending_start
            exit_times = job_rows.loc[after_start & ~is_pending, 'Timestamp']

            if exit_times.empty:
                continue

            records.append({
                'Job_Name': job_name,
                'Pending_Time_sec': (exit_times.iloc[0] - pending_start).total_seconds()
            })

        if not records:
            return None

        result = pd.DataFrame(records)
        result['Scenario'] = scenario_name
        return result

    except FileNotFoundError:
        return None

def process_all_scenarios():
    """Compute pending times for every configured scenario.

    Returns:
        The concatenated per-scenario results (index reset), or an empty
        DataFrame when no scenario produced any pending time.
    """
    per_scenario = (
        calculate_pending_times(path, label)
        for label, path in SCENARIOS_TO_PROCESS.items()
    )
    # calculate_pending_times() returns None when it has nothing to report.
    frames = [frame for frame in per_scenario if frame is not None]

    if frames:
        return pd.concat(frames, ignore_index=True)
    return pd.DataFrame()


def create_wait_time_chart(df, output_filename='wait_time_analysis.pdf'):
    """Create a bar chart of the average pending (wait) time per scenario.

    One bar per scenario (Guaranteed, Clairvoyant, Extreme) shows the mean of
    'Pending_Time_sec'. A scenario without any measurement keeps its slot on
    the x axis, gets a zero-height bar and is labelled 'n/a'.

    Note: this changes the global matplotlib style and 'font.size' rcParam.

    Args:
        df: Frame with 'Scenario' and 'Pending_Time_sec' columns.
        output_filename: Path of the PDF to write.

    Returns:
        A status message string.
    """
    if df.empty:
        return "No data available for visualization"

    scenario_order = ['Guaranteed', 'Clairvoyant', 'Extreme']
    # reindex() introduces NaN for scenarios with no measurements.
    average_wait_times = (
        df.groupby('Scenario')['Pending_Time_sec'].mean().reindex(scenario_order)
    )

    plt.style.use('seaborn-v0_8-whitegrid')
    plt.rcParams['font.size'] = 16

    with PdfPages(output_filename) as pdf:
        fig, ax = plt.subplots(figsize=(10, 7))
        try:
            scenario_names = average_wait_times.index.tolist()
            has_data = average_wait_times.notna().tolist()
            # Plot missing scenarios as 0 so neither the bars nor the axis
            # limit ever see NaN (max() over NaN is order-dependent and
            # set_ylim rejects NaN limits).
            avg_times = average_wait_times.fillna(0).tolist()
            colors = sns.color_palette('Blues', n_colors=len(scenario_names))

            bars = ax.bar(scenario_names, avg_times, color=colors,
                          edgecolor='black', width=0.8)

            # Print the value just above each bar.
            for bar, present in zip(bars, has_data):
                height = bar.get_height()
                label = f'{height:.2f}s' if present else 'n/a'
                ax.annotate(label,
                            xy=(bar.get_x() + bar.get_width() / 2, height),
                            xytext=(0, 3), textcoords="offset points",
                            ha='center', va='bottom', fontsize=16, fontweight='bold')

            ax.set_ylabel('Average Wait Time (seconds)', fontsize=16, fontweight='bold')
            ax.tick_params(axis='x', labelsize=16)
            ax.tick_params(axis='y', labelsize=16)
            # Headroom for the labels; fall back to 1 when every bar is 0 to
            # avoid a degenerate (0, 0) axis range.
            peak = max(avg_times)
            ax.set_ylim(0, peak * 1.15 if peak > 0 else 1)
            ax.spines['top'].set_visible(False)
            ax.spines['right'].set_visible(False)

            fig.tight_layout()
            pdf.savefig(fig, bbox_inches='tight', dpi=300)
        finally:
            # Release the figure even if plotting or saving fails.
            plt.close(fig)

    return f"Wait time analysis chart saved to '{output_filename}'"

def main():
    """Build the wait-time chart and return a status message."""
    pending = process_all_scenarios()

    if pending.empty:
        return "No pending time data found in any scenario."
    return create_wait_time_chart(pending)


# Run the wait-time analysis and print the status message returned by
# main(); the guard keeps this from executing when the code is imported.
if __name__ == "__main__":
    result = main()
    print(result)

Wait time analysis chart saved to 'wait_time_analysis.pdf'


In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
import warnings
warnings.filterwarnings('ignore')



# CSVs of the runs used as the baseline: process_isolated_runs() averages the
# execution time per application over these files.
ISOLATED_RUN_FILES = [
    'data/results_hpc_profiling_20250815_105543.csv',
    'data/results_nas_profiling_20250815_000950.csv'
]

# Scenario label -> path of the results CSV read for that scenario.
SCENARIO_FILES = {
    'Guaranteed': 'data/results_combined_guaranteed_20250815_153205.csv',
    'Clairvoyant': 'data/results_combined_clairvoyant_20250818_101541.csv',
    'Extreme': 'data/results_combined_extreme_20250815_144056.csv'
}

# Log file whose "Time in seconds =" value, when present, replaces the
# CSV-derived execution time of NAS-EP in the Clairvoyant scenario
# (see process_scenario_runs()).
CLAIRVOYANT_EP_LOG_FILE = 'data/logs_clairvoyant/run-nas-ep-d-x.log'

# Display names in plotting order; create_slowdown_charts() uses this list
# for bar ordering and for assigning one colour per application.
APP_DISPLAY_NAMES = [
    'NAS-BT', 'NAS-CG', 'NAS-EP', 'NAS-MG', 'NAS-SP',
    'SIM-pulse_spikes', 'SIM-sawtooth', 'SIM-staircase', 'SIM-epochs' 
]

def get_display_name(original_name):
    """Standardize job names for consistent filtering.

    The lookup is a case-insensitive substring match: the first mapping key
    (in declaration order) contained in the lower-cased name wins.

    Args:
        original_name: Raw job name, normally a string. Non-string values
            (e.g. NaN produced by an empty CSV cell) are returned unchanged
            instead of raising ``AttributeError``.

    Returns:
        The display name for the first matching key, or ``original_name``
        itself when nothing matches.
    """
    # Empty CSV cells arrive as float NaN, which has no .lower().
    if not isinstance(original_name, str):
        return original_name

    lowered = original_name.lower()
    name_mapping = {
        'nas-bt': 'NAS-BT',
        'nas-cg': 'NAS-CG',
        'nas-ep': 'NAS-EP',
        'nas-mg': 'NAS-MG',
        'nas-sp': 'NAS-SP',
        'hpc-pulse-spikes': 'SIM-pulse_spikes',
        'hpc-sawtooth': 'SIM-sawtooth',
        'hpc-staged_plateau': 'SIM-staged_plateau',
        'hpc-staircase': 'SIM-staircase',
        'hpc-epochs': 'SIM-epochs',
    }

    for key, value in name_mapping.items():
        if key in lowered:
            return value
    return original_name

def get_execution_times(file_path):
    """Extract per-job execution times from a results CSV.

    A job's execution time is the span between its earliest 'Running' sample
    and its earliest 'Succeeded' sample; jobs lacking either are skipped.

    Args:
        file_path: Path of the results CSV.

    Returns:
        A DataFrame with the columns 'Job_Name' (standardized display name),
        'Execution_Time_sec' and 'Completion_Time'. The columns are present
        even when the result is empty (missing file or no completed job), so
        callers can concatenate and group the result without a KeyError.
    """
    columns = ['Job_Name', 'Execution_Time_sec', 'Completion_Time']

    # Only the read can raise FileNotFoundError; keep the try minimal.
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        return pd.DataFrame(columns=columns)

    df.columns = df.columns.str.strip()
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
    df = df.dropna(subset=['Timestamp', 'Job Name', 'Pod Status'])

    metrics = []
    # sort=False keeps jobs in order of first appearance in the file. No
    # per-job sort is needed because only min() timestamps are used.
    for raw_app_name, app_df in df.groupby('Job Name', sort=False):
        status = app_df['Pod Status']
        running_time = app_df.loc[status == 'Running', 'Timestamp'].min()
        succeeded_time = app_df.loc[status == 'Succeeded', 'Timestamp'].min()

        # min() of an empty selection is NaT, i.e. the status never occurred.
        if pd.notna(running_time) and pd.notna(succeeded_time):
            metrics.append({
                'Job_Name': get_display_name(raw_app_name),
                'Execution_Time_sec': (succeeded_time - running_time).total_seconds(),
                'Completion_Time': succeeded_time
            })

    return pd.DataFrame(metrics, columns=columns)

def parse_ep_log_file(file_path):
    """Read the execution time reported in an EP log file.

    Only the first line containing 'Time in seconds =' is considered; the
    text between its first and second '=' is parsed as a float.

    Returns:
        The time in seconds, or None when the file is missing, has no such
        line, or the value cannot be parsed.
    """
    marker = 'Time in seconds ='
    try:
        with open(file_path, 'r') as log:
            first_match = next((entry for entry in log if marker in entry), None)
            if first_match is None:
                return None
            fields = first_match.split('=')
            return float(fields[1].strip())
    except (FileNotFoundError, ValueError, IndexError):
        return None

def process_isolated_runs():
    """Compute the baseline execution time of each application.

    Returns:
        A Series indexed by 'Job_Name' holding the mean 'Execution_Time_sec'
        over all ISOLATED_RUN_FILES. When none of the files yields a result,
        an empty Series is returned (instead of raising KeyError on the
        missing 'Job_Name' column) so the caller's ``.empty`` check works.
    """
    frames = [
        frame
        for frame in (get_execution_times(path) for path in ISOLATED_RUN_FILES)
        if not frame.empty
    ]
    if not frames:
        return pd.Series(dtype=float, name='Execution_Time_sec')

    isolated_df = pd.concat(frames, ignore_index=True)
    return isolated_df.groupby('Job_Name')['Execution_Time_sec'].mean()

def process_scenario_runs():
    """Gather execution times for all scenarios.

    For each scenario/application pair only the run with the latest
    completion time is kept. If the Clairvoyant EP log provides a time, it
    replaces the CSV-derived Clairvoyant NAS-EP entry.

    Returns:
        A DataFrame with 'Job_Name', 'Execution_Time_sec', 'Completion_Time'
        and 'Scenario', or an empty DataFrame when no scenario has data.
    """
    per_scenario = []

    for label, path in SCENARIO_FILES.items():
        times = get_execution_times(path)
        if times.empty:
            continue
        times['Scenario'] = label
        per_scenario.append(times)

    if not per_scenario:
        return pd.DataFrame()

    # Sorting by completion time makes keep='last' select the latest run.
    combined = (
        pd.concat(per_scenario, ignore_index=True)
        .sort_values('Completion_Time')
        .drop_duplicates(subset=['Scenario', 'Job_Name'], keep='last')
    )

    ep_seconds = parse_ep_log_file(CLAIRVOYANT_EP_LOG_FILE)
    if ep_seconds is None:
        return combined

    is_clairvoyant_ep = (
        (combined['Scenario'] == 'Clairvoyant') & (combined['Job_Name'] == 'NAS-EP')
    )
    log_entry = pd.DataFrame([{
        'Scenario': 'Clairvoyant',
        'Job_Name': 'NAS-EP',
        'Execution_Time_sec': ep_seconds,
        'Completion_Time': pd.NaT
    }])
    return pd.concat([combined[~is_clairvoyant_ep], log_entry], ignore_index=True)

def calculate_slowdown(scenario_df, isolated_times_map):
    """Calculate slowdown relative to isolated runs.

    Adds 'Isolated_Time_sec' (looked up by 'Job_Name') and 'Slowdown'
    (execution time divided by isolated time; 0 when the isolated time is not
    positive). Rows without a baseline are dropped. ``scenario_df`` is
    modified in place and also returned.

    Args:
        scenario_df: Frame with 'Job_Name' and 'Execution_Time_sec' columns.
        isolated_times_map: Mapping (e.g. a Series indexed by job name) from
            job name to isolated execution time in seconds.

    Returns:
        The same DataFrame object with the two added columns; it may be empty
        when no job has a baseline.
    """
    scenario_df['Isolated_Time_sec'] = scenario_df['Job_Name'].map(isolated_times_map)
    scenario_df.dropna(subset=['Isolated_Time_sec'], inplace=True)

    # Vectorized on purpose: a row-wise apply(axis=1) on an empty frame
    # returns a DataFrame, and assigning that to a single column raises.
    isolated = scenario_df['Isolated_Time_sec']
    ratio = scenario_df['Execution_Time_sec'] / isolated
    scenario_df['Slowdown'] = ratio.where(isolated > 0, 0)
    return scenario_df


def create_slowdown_charts(df):
    """Create one slowdown bar chart per scenario.

    For each of Guaranteed, Clairvoyant and Extreme a PDF named
    'slowdown_<scenario>.pdf' is written with one bar per application and a
    dashed reference line at slowdown 1 (isolated performance). Applications
    are ordered as in APP_DISPLAY_NAMES; any application not in that list is
    placed after the known ones (alphabetically) and drawn in grey.

    Note: this changes the global matplotlib style and 'font.size' rcParam.

    Args:
        df: Frame with 'Scenario', 'Job_Name' and 'Slowdown' columns.

    Returns:
        A status message string.
    """
    if df.empty:
        return "No data available for visualization"

    scenario_order = ['Guaranteed', 'Clairvoyant', 'Extreme']
    colors = sns.color_palette("Greens", n_colors=len(APP_DISPLAY_NAMES))
    app_color_map = dict(zip(APP_DISPLAY_NAMES, colors))
    known_rank = {app: rank for rank, app in enumerate(APP_DISPLAY_NAMES)}

    plt.style.use('seaborn-v0_8-whitegrid')
    plt.rcParams['font.size'] = 12

    generated_files = []

    for scenario in scenario_order:
        output_filename = f'slowdown_{scenario.lower()}.pdf'
        subset_df = df[df['Scenario'] == scenario]

        # Order bars explicitly instead of via pd.Categorical: a categorical
        # turns names missing from APP_DISPLAY_NAMES into NaN, which would
        # put NaN on the x axis and defeat the grey fallback colour.
        rows = sorted(
            zip(subset_df['Job_Name'], subset_df['Slowdown']),
            key=lambda row: (known_rank.get(row[0], len(known_rank)), str(row[0])),
        )
        apps = [str(app) for app, _ in rows]
        slowdowns = [value for _, value in rows]
        bar_colors = [app_color_map.get(app, '#CCCCCC') for app in apps]

        with PdfPages(output_filename) as pdf:
            fig, ax = plt.subplots(figsize=(14, 8))
            try:
                bars = ax.bar(apps, slowdowns, color=bar_colors or None)
                ax.axhline(y=1, color='black', linestyle='--', linewidth=2,
                           label='Isolated Performance')

                # Show at least up to 2x; a scenario without rows has a NaN
                # maximum and falls back to that default.
                max_slowdown = subset_df['Slowdown'].max()
                y_max = max(2.0, max_slowdown * 1.1) if pd.notna(max_slowdown) else 2.0
                ax.set_ylim(0, y_max)

                ax.set_ylabel('Slowdown', fontsize=16, fontweight='bold')
                ax.tick_params(axis='x', rotation=45, labelsize=16)
                ax.tick_params(axis='y', labelsize=16)
                # Without a legend the reference line's label is never shown.
                ax.legend(fontsize=14)

                # Print the slowdown value just above each bar.
                for bar, slowdown in zip(bars, slowdowns):
                    height = bar.get_height()
                    ax.text(bar.get_x() + bar.get_width() / 2., height + 0.02,
                            f'{slowdown:.2f}', ha='center', va='bottom',
                            fontsize=16, fontweight='bold')

                fig.tight_layout()
                pdf.savefig(fig, bbox_inches='tight', dpi=300)
            finally:
                # Release the figure even if plotting or saving fails.
                plt.close(fig)

        generated_files.append(output_filename)

    files_list = ', '.join(generated_files)
    return f"Slowdown analysis charts saved to: {files_list}"


def main():
    """Run the slowdown analysis and return a status message."""
    baseline = process_isolated_runs()
    scenario_runs = process_scenario_runs()

    if scenario_runs.empty or baseline.empty:
        return "No data available for slowdown analysis"

    slowdowns = calculate_slowdown(scenario_runs, baseline)

    if slowdowns.empty:
        return "No valid slowdown data could be calculated"
    return create_slowdown_charts(slowdowns)


# Run the slowdown analysis and print the status message returned by main();
# the guard keeps this from executing when the code is imported.
if __name__ == "__main__":
    result = main()
    print(result)

Slowdown analysis charts saved to: slowdown_guaranteed.pdf, slowdown_clairvoyant.pdf, slowdown_extreme.pdf
