In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Location (relative to this notebook) of the CSV that is plotted by the next two cells.
# Alternative path, left disabled:
#   '../Analysis_Results/storage_server/All_models_res/error_analysis/final_results_all_models_top_n_err.csv'
top_n_err_save_path = '../storage_server/COLM_res_update/All_Models_res/error_analysis/final_results_all_models_top_n_err_by_model.csv'
# The plotting cells read `model_name`, `error_category` and `error_percentage` from this frame.
df = pd.read_csv(top_n_err_save_path)
# Bare last expression so the notebook renders the frame.
df

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns  # Import Seaborn for enhanced visualization

# Draw one bar chart per model (2-column grid) showing `error_percentage` for
# every `error_category`, with one shared colour per category.
sns.set_style("whitegrid")

# `top_n_errors` is an alias (not a copy) of the DataFrame currently bound to `df`.
groupby_keys = ['model_name']  # no longer read in this cell; left defined in case other cells use it
n = 5  # Number of top categories; in this cell it only appears in the subplot titles
top_n_errors = df

# Specific order for the models
model_order = [
    "gpt-4-turbo", "claude-3-opus-20240229", "claude-3-sonnet-20240229",
    "claude-3-haiku-20240307", "Meta-Llama-3-70B-Instruct", "gpt-3.5-turbo-0125",
    "Meta-Llama-3-8B-Instruct", "deepseek-coder-7b-instruct", "deepseek-coder-1.3b-instruct",
    "phi-3-mini-4k"
]

# Plot only the models that actually occur in the data, in `model_order` order.
# Sizing the grid from this list keeps the number of axes and the number of
# plotted models in sync (they were previously derived from different sources).
models_present = set(top_n_errors['model_name'].unique())
models_to_plot = [name for name in model_order if name in models_present]

# Alphabetical category order is shared by every subplot, so colour i always
# means category i.
sorted_error_categories = sorted(top_n_errors['error_category'].unique())
palette = sns.color_palette("husl", n_colors=len(sorted_error_categories))

# Determine the layout of subplots (always at least one row)
num_groups = len(models_to_plot)
cols = 2  # Number of columns in the subplot grid
rows = max(1, (num_groups + cols - 1) // cols)

# squeeze=False always returns a 2-D array, so flatten() is valid for any grid
# shape (a single-row grid used to be wrapped in a list, which broke the loop).
fig, axs = plt.subplots(rows, cols, figsize=(20, 7 * rows), dpi=120, facecolor='white',
                        sharey=True, squeeze=False)
fig.subplots_adjust(hspace=0.4, wspace=0.2)  # Adjust space between plots
axs = axs.flatten()

for ax, model_name in zip(axs, models_to_plot):
    group_df = top_n_errors.loc[top_n_errors['model_name'] == model_name]

    # `order` fixes the x positions to the shared category list, so every
    # subplot shows the same categories in the same place.
    sns.barplot(x='error_category', y='error_percentage', data=group_df,
                order=sorted_error_categories, ax=ax, palette=palette)

    ax.set_ylabel('Percentage', fontsize=14)
    ax.set_xlabel('Output Category', fontsize=14)
    ax.set_title(f'Top {n} Output Categories for {model_name}', fontsize=16)

    ax.tick_params(axis='x', rotation=45, labelsize=12)
    ax.tick_params(axis='y', labelsize=12)

# Hide any unused axes if the number of plots is not a perfect fill of the grid
for ax in axs[num_groups:]:
    ax.set_visible(False)

# Legend entries: one round marker per category, coloured like its bars.
legend_handles = [
    plt.Line2D([0], [0], marker='o', color='w', label=category,
               markerfacecolor=palette[i], markersize=10)
    for i, category in enumerate(sorted_error_categories)
]
# Attach the legend to the last *visible* subplot; a legend on a hidden axis is not drawn.
if num_groups:
    axs[num_groups - 1].legend(handles=legend_handles, title="Error Categories",
                               bbox_to_anchor=(1.05, 1), loc='upper left',
                               fontsize=12, title_fontsize=14)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Per-model bar charts of the top `n` output categories, saved to 'plot_output.png'.
# Expects `top_n_errors` (columns: model_name, error_category, error_percentage)
# to be defined by an earlier cell.
n = 5  # number of categories shown per model ('success' included when present)

# Set the aesthetic style of the plots
sns.set_style("whitegrid")

# One fixed colour per category, shared by every subplot and by the legend.
# 'success' is ordered first, the remaining categories alphabetically.
unique_categories = top_n_errors['error_category'].unique()
unique_categories_sorted = sorted(unique_categories, key=lambda x: (x != 'success', x))
palette = sns.color_palette("husl", len(unique_categories))
category_color_map = dict(zip(unique_categories_sorted, palette))

# Specific order for the models
model_order = [
    "gpt-4-turbo", "claude-3-opus-20240229", "claude-3-sonnet-20240229",
    "claude-3-haiku-20240307", "Meta-Llama-3-70B-Instruct", "gpt-3.5-turbo-0125",
    "Meta-Llama-3-8B-Instruct", "deepseek-coder-7b-instruct", "deepseek-coder-1.3b-instruct",
    "phi-3-mini-4k"
]

# Keep only the listed models. The explicit copy means the column assignment
# below neither warns about chained assignment nor touches the source frame.
top_n_errors = top_n_errors[top_n_errors['model_name'].isin(model_order)].copy()
top_n_errors['model_name'] = pd.Categorical(top_n_errors['model_name'], categories=model_order, ordered=True)
top_n_errors = top_n_errors.sort_values('model_name')

# Determine the layout of subplots: one axis per entry of `model_order`
num_groups = len(model_order)
cols = 2  # Number of columns in the subplot grid
rows = max(1, (num_groups + cols - 1) // cols)

# squeeze=False always yields a 2-D array, so flatten() works for any grid shape.
fig, axs = plt.subplots(rows, cols, figsize=(20, 7 * rows), dpi=120, facecolor='white',
                        sharey=True, squeeze=False)
fig.subplots_adjust(hspace=0.4, wspace=0.2)  # Adjust space between plots
axs = axs.flatten()

for ax, model_name in zip(axs, model_order):
    group_df = top_n_errors[top_n_errors['model_name'] == model_name]

    # Drop zero and missing percentages (NaN > 0 is False, so one test covers both).
    group_df = group_df[group_df['error_percentage'] > 0]

    # Pick the categories to show: 'success' is always kept when present and the
    # remaining slots go to the largest totals. (The selection used to be cut
    # alphabetically, which could discard a larger category in favour of a smaller one.)
    totals = group_df.groupby('error_category')['error_percentage'].sum().sort_values(ascending=False)
    ranked = list(totals.index)
    selected = [c for c in ranked if c == 'success'] + [c for c in ranked if c != 'success']
    # Display order: 'success' first, then alphabetical.
    top_categories_sorted = sorted(selected[:n], key=lambda x: (x != 'success', x))

    group_df = group_df[group_df['error_category'].isin(top_categories_sorted)]

    if top_categories_sorted:
        # Colours are looked up by category name, so they stay correct even if a
        # category has more than one row.
        sns.barplot(x='error_category', y='error_percentage', data=group_df,
                    order=top_categories_sorted, ax=ax, palette=category_color_map)

    ax.set_ylabel('Percentage', fontsize=20)
    ax.set_xlabel('Output Category', fontsize=20)
    ax.set_title(f'{model_name}', fontsize=23)

    ax.tick_params(axis='x', rotation=45, labelsize=15)
    ax.tick_params(axis='y', labelsize=15)

# Hide any unused axes if the number of plots is not a perfect fill of the grid
for ax in axs[num_groups:]:
    ax.set_visible(False)

# Build the legend from this cell's own colour map so legend and bars agree
# (and so the cell does not depend on handles created by another cell).
legend_handles = [
    plt.Line2D([0], [0], marker='o', color='w', label=category,
               markerfacecolor=category_color_map[category], markersize=10)
    for category in unique_categories_sorted
]
# Attach to the last visible subplot; a legend on a hidden axis is not drawn.
if num_groups:
    axs[num_groups - 1].legend(handles=legend_handles, title="Output Categories",
                               bbox_to_anchor=(1.05, 1), loc='upper left',
                               fontsize=20, title_fontsize=20)

plt.tight_layout()
plt.savefig('plot_output.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# Load a second CSV and rebind `df` to it: from here on `df` no longer refers to
# the table read from `top_n_err_save_path`. The `plot_errors(df)` calls further
# down receive this frame.
all_err_save_path = '../storage_server/COLM_res_update/All_Models_res/error_analysis/final_results_all_models_all_err_by_model.csv' 
df = pd.read_csv(all_err_save_path)
# Bare last expression so the notebook renders the frame.
df

In [None]:
import pandas as pd

# Lift pandas' display truncation. These are process-wide options: they stay in
# effect for every later cell in this kernel session, not just this one.
pd.set_option('display.max_rows', None)  # None means show all rows
pd.set_option('display.max_columns', None)  # None means show all columns

# Render the whole frame (no row/column elision now that the limits are removed).
df

In [None]:
# Plain-text dump of the same frame; how much is printed depends on the pandas
# display options currently in effect.
print(df)

In [None]:
# Adjusted code snippet with requested changes
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.patches as mpatches
import re

# Function to extract average and error bar from the 'error_percentage' column
def extract_avg_and_err_bar(s):
    """Split an ``error_percentage`` cell into ``(average, error_bar)``.

    Accepted inputs:
      * ``"12.5 ± 1.3"`` -> ``(12.5, 1.3)``; integers (``"5 ± 1"``) and missing
        or extra whitespace around ``±`` are accepted too.
      * a bare number, either as text (``"7.0"``) or already numeric (``7.0``)
        -> ``(7.0, 0.0)``.

    Args:
        s: A string or a real number.

    Returns:
        Tuple ``(average, error_bar)`` of floats; ``error_bar`` is ``0.0`` when
        the cell carries no ``±`` part.

    Raises:
        ValueError: If ``s`` is text that is neither ``"a ± b"`` nor a number.
    """
    # A column without any "±" text is parsed by pandas as numeric, so cells can
    # arrive as floats/ints; re.search would raise TypeError on those.
    if not isinstance(s, str):
        return float(s), 0.0

    # (?:\.\d+)? makes the fractional part optional so whole numbers also match.
    match = re.search(r"(\d+(?:\.\d+)?)\s*±\s*(\d+(?:\.\d+)?)", s)
    if match:
        return float(match.group(1)), float(match.group(2))
    return float(s), 0.0  # No error bar, return value as is

def plot_errors(df, report_err_bar=True, save_path='error_analysis_by_type.png'):
    """Plot each model's percentage for four selected error categories.

    One subplot is drawn per entry of ``errors_of_interest`` on a two-column
    grid. Models are placed on the x axis in the order of
    ``model_name_mapping``; rows whose ``model_name`` is not in that mapping
    are ignored. The figure is written to ``save_path`` and then shown.

    Args:
        df: DataFrame with ``model_name``, ``error_category`` and
            ``error_percentage`` columns. ``error_percentage`` cells are parsed
            with ``extract_avg_and_err_bar``. The frame is not modified.
        report_err_bar: If True, draw one point per model with ± whiskers;
            if False, draw plain bars.
        save_path: Destination of the saved PNG.

    Returns:
        None.
    """
    # Category labels are matched exactly as spelled here (presumably the
    # spelling used in the data -- verify against the CSV); the dictionary
    # below supplies the human-readable subplot titles.
    errors_of_interest = ["CompilationError", "IdetatioError", "SytaxError", "NameError"]
    error_title_correction = {
        "CompilationError": "Compilation Error",
        "IdetatioError": "Indentation Error",
        "SytaxError": "Syntax Error",
        "NameError": "Name Error"
    }

    # Original model names and their simplified display names (also the x order)
    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }
    display_names = list(model_name_mapping.values())
    # Fixed slot per model, so a model missing from one subplot leaves a gap
    # instead of shifting every other model to the left.
    x_position = {name: slot for slot, name in enumerate(display_names)}

    sns.set_style("whitegrid")

    # Keep only the rows that can end up in the figure, then parse them once.
    plot_df = df.copy()
    plot_df['model_display_name'] = plot_df['model_name'].map(model_name_mapping)
    plot_df = plot_df.dropna(subset=['model_display_name'])
    plot_df = plot_df[plot_df['error_category'].isin(errors_of_interest)].copy()
    parsed = [extract_avg_and_err_bar(value) for value in plot_df['error_percentage']]
    # Explicit float Series so an empty selection still gives numeric columns.
    plot_df['avg_error_percentage'] = pd.Series([avg for avg, _ in parsed], index=plot_df.index, dtype=float)
    plot_df['error_bar'] = pd.Series([err for _, err in parsed], index=plot_df.index, dtype=float)

    # One colour per display name, shared by all subplots and the legend
    palette = sns.color_palette("husl", len(display_names))
    model_color_map = dict(zip(display_names, palette))

    # Two columns; round the row count up
    num_rows = (len(errors_of_interest) + 1) // 2
    fig, axs = plt.subplots(num_rows, 2, figsize=(20, 7 * num_rows), dpi=120,
                            facecolor='white', squeeze=False)
    fig.subplots_adjust(hspace=0.4, wspace=0.2)  # Adjust space between plots
    axs = axs.flatten()

    for ax, error_type in zip(axs, errors_of_interest):
        error_df = plot_df[plot_df['error_category'] == error_type]

        if report_err_bar:
            for name, avg, err in zip(error_df['model_display_name'],
                                      error_df['avg_error_percentage'],
                                      error_df['error_bar']):
                ax.errorbar(x_position[name], avg, yerr=err, fmt='o',
                            color=model_color_map[name], label=name)
            # Points are placed at numeric slots, so label the slots by hand.
            ax.set_xticks(list(range(len(display_names))))
            ax.set_xticklabels(display_names)
            ax.set_xlim(-0.5, len(display_names) - 0.5)
        else:
            sns.barplot(x='model_display_name', y='avg_error_percentage', data=error_df,
                        order=display_names, ax=ax, palette=model_color_map)

        ax.set_ylabel('Percentage', fontsize=20)
        ax.set_xlabel('')  # Hide the x-label as requested
        # Fall back to the raw category name if no title is registered
        ax.set_title(f'{error_title_correction.get(error_type, error_type)}', fontsize=23)

        ax.tick_params(axis='x', rotation=45, labelsize=15)
        ax.tick_params(axis='y', labelsize=15)

        # Per-subplot y range: largest average plus a 5-point buffer. An empty
        # subset has no maximum (NaN), which set_ylim would reject.
        max_error_percentage = error_df['avg_error_percentage'].max()
        if pd.isna(max_error_percentage):
            max_error_percentage = 0.0
        ax.set_ylim(0, max_error_percentage + 5)

    # Hide any unused axes if the number of plots is not a perfect fill of the grid
    for ax in axs[len(errors_of_interest):]:
        ax.set_visible(False)

    legend_handles = [mpatches.Patch(color=model_color_map[name], label=name) for name in display_names]

    # Figure-level legend, centred below the grid
    fig.legend(handles=legend_handles, title="Model Names", loc='upper center',
               bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=5,
               fontsize=20, title_fontsize=20)

    plt.tight_layout()
    # bbox_inches='tight' keeps the legend, which sits outside the axes area, in the file
    plt.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()

# Render the figure for the frame currently bound to `df` (report_err_bar defaults to True).
plot_errors(df)

In [None]:
# Adjusted code snippet with requested changes
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.patches as mpatches
import re

# Function to extract average and error bar from the 'error_percentage' column
def extract_avg_and_err_bar(s):
    """Split an ``error_percentage`` cell into ``(average, error_bar)``.

    Accepted inputs:
      * ``"12.5 ± 1.3"`` -> ``(12.5, 1.3)``; integers (``"5 ± 1"``) and missing
        or extra whitespace around ``±`` are accepted too.
      * a bare number, either as text (``"7.0"``) or already numeric (``7.0``)
        -> ``(7.0, 0.0)``.

    Args:
        s: A string or a real number.

    Returns:
        Tuple ``(average, error_bar)`` of floats; ``error_bar`` is ``0.0`` when
        the cell carries no ``±`` part.

    Raises:
        ValueError: If ``s`` is text that is neither ``"a ± b"`` nor a number.
    """
    # A column without any "±" text is parsed by pandas as numeric, so cells can
    # arrive as floats/ints; re.search would raise TypeError on those.
    if not isinstance(s, str):
        return float(s), 0.0

    # (?:\.\d+)? makes the fractional part optional so whole numbers also match.
    match = re.search(r"(\d+(?:\.\d+)?)\s*±\s*(\d+(?:\.\d+)?)", s)
    if match:
        return float(match.group(1)), float(match.group(2))
    return float(s), 0.0  # No error bar, return value as is

def plot_errors(df, report_err_bar=True, save_path='error_analysis_by_type.png'):
    """Bar chart per error category comparing models, optionally with error bars.

    One subplot is drawn per entry of ``errors_of_interest`` on a two-column
    grid. Each model gets one bar, ordered and coloured according to
    ``model_name_mapping``; rows whose ``model_name`` is not in that mapping
    are ignored. The figure is written to ``save_path`` and then shown.

    Args:
        df: DataFrame with ``model_name``, ``error_category`` and
            ``error_percentage`` columns. ``error_percentage`` cells are parsed
            with ``extract_avg_and_err_bar``. The frame is not modified.
        report_err_bar: If True, overlay a marker with ± whiskers on each bar.
        save_path: Destination of the saved PNG.

    Returns:
        None.
    """
    # Category labels are matched exactly as spelled here (presumably the
    # spelling used in the data -- verify against the CSV); the dictionary
    # below supplies the human-readable subplot titles.
    errors_of_interest = ["CompilationError", "IdetatioError", "SytaxError", "NameError"]
    error_title_correction = {
        "CompilationError": "Compilation Error",
        "IdetatioError": "Indentation Error",
        "SytaxError": "Syntax Error",
        "NameError": "Name Error"
    }

    # Original model names and their simplified display names (also the x order)
    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }
    display_names = list(model_name_mapping.values())
    # seaborn puts the i-th entry of `order` at x == i; the overlay uses the same slots.
    x_position = {name: slot for slot, name in enumerate(display_names)}

    sns.set_style("whitegrid")

    # Keep only the rows that can end up in the figure, then parse them once.
    plot_df = df.copy()
    plot_df['model_display_name'] = plot_df['model_name'].map(model_name_mapping)
    plot_df = plot_df.dropna(subset=['model_display_name'])
    plot_df = plot_df[plot_df['error_category'].isin(errors_of_interest)].copy()
    parsed = [extract_avg_and_err_bar(value) for value in plot_df['error_percentage']]
    # Explicit float Series so an empty selection still gives numeric columns.
    plot_df['avg_error_percentage'] = pd.Series([avg for avg, _ in parsed], index=plot_df.index, dtype=float)
    plot_df['error_bar'] = pd.Series([err for _, err in parsed], index=plot_df.index, dtype=float)

    # One colour per display name, shared by all subplots and the legend
    palette = sns.color_palette("husl", len(display_names))
    model_color_map = dict(zip(display_names, palette))

    # Two columns; round the row count up
    num_rows = (len(errors_of_interest) + 1) // 2
    fig, axs = plt.subplots(num_rows, 2, figsize=(20, 7 * num_rows), dpi=120,
                            facecolor='white', squeeze=False)
    fig.subplots_adjust(hspace=0.4, wspace=0.2)  # Adjust space between plots
    axs = axs.flatten()

    for ax, error_type in zip(axs, errors_of_interest):
        error_df = plot_df[plot_df['error_category'] == error_type]

        # Bars are drawn exactly once. The whiskers are added with matplotlib
        # directly rather than by passing `yerr` through seaborn, which only
        # works when seaborn happens to forward one array to a single bar() call.
        sns.barplot(x='model_display_name', y='avg_error_percentage', data=error_df,
                    order=display_names, ax=ax, palette=model_color_map)
        if report_err_bar:
            for name, avg, err in zip(error_df['model_display_name'],
                                      error_df['avg_error_percentage'],
                                      error_df['error_bar']):
                ax.errorbar(x_position[name], avg, yerr=err, fmt='o',
                            color=model_color_map[name], ecolor='black', label=name)

        ax.set_ylabel('Percentage', fontsize=20)
        ax.set_xlabel('')  # Hide the x-label as requested
        # Fall back to the raw category name if no title is registered
        ax.set_title(f'{error_title_correction.get(error_type, error_type)}',
                     fontsize=28, fontweight='bold')

        ax.tick_params(axis='x', rotation=45, labelsize=15)
        ax.tick_params(axis='y', labelsize=15)

        # Per-subplot y range: largest average plus a 5-point buffer. An empty
        # subset has no maximum (NaN), which set_ylim would reject.
        max_error_percentage = error_df['avg_error_percentage'].max()
        if pd.isna(max_error_percentage):
            max_error_percentage = 0.0
        ax.set_ylim(0, max_error_percentage + 5)

    # Hide any unused axes if the number of plots is not a perfect fill of the grid
    for ax in axs[len(errors_of_interest):]:
        ax.set_visible(False)

    legend_handles = [mpatches.Patch(color=model_color_map[name], label=name) for name in display_names]

    # Figure-level legend, centred below the grid
    fig.legend(handles=legend_handles, title="Model Names", loc='upper center',
               bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=5,
               fontsize=20, title_fontsize=20)

    plt.tight_layout()
    # bbox_inches='tight' keeps the legend, which sits outside the axes area, in the file
    plt.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()

# Render the figure for the frame currently bound to `df` (report_err_bar defaults to True).
plot_errors(df)

In [None]:
# Adjusted code snippet with requested changes
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.patches as mpatches
import re

# Function to extract average and error bar from the 'error_percentage' column
def extract_avg_and_err_bar(s):
    """Split an ``error_percentage`` cell into ``(average, error_bar)``.

    Accepted inputs:
      * ``"12.5 ± 1.3"`` -> ``(12.5, 1.3)``; integers (``"5 ± 1"``) and missing
        or extra whitespace around ``±`` are accepted too.
      * a bare number, either as text (``"7.0"``) or already numeric (``7.0``)
        -> ``(7.0, 0.0)``.

    Args:
        s: A string or a real number.

    Returns:
        Tuple ``(average, error_bar)`` of floats; ``error_bar`` is ``0.0`` when
        the cell carries no ``±`` part.

    Raises:
        ValueError: If ``s`` is text that is neither ``"a ± b"`` nor a number.
    """
    # A column without any "±" text is parsed by pandas as numeric, so cells can
    # arrive as floats/ints; re.search would raise TypeError on those.
    if not isinstance(s, str):
        return float(s), 0.0

    # (?:\.\d+)? makes the fractional part optional so whole numbers also match.
    match = re.search(r"(\d+(?:\.\d+)?)\s*±\s*(\d+(?:\.\d+)?)", s)
    if match:
        return float(match.group(1)), float(match.group(2))
    return float(s), 0.0  # No error bar, return value as is

def plot_errors(df, report_err_bar=True, save_path='error_analysis_by_type.png'):
    """Bar chart per error category for a four-model subset, optionally with error bars.

    One subplot is drawn per entry of ``errors_of_interest`` on a two-column
    grid. Only the models listed in ``model_name_mapping`` are drawn (one bar
    each, in mapping order); all other rows are ignored, including when the
    y range is computed. The figure is written to ``save_path`` and then shown.

    Args:
        df: DataFrame with ``model_name``, ``error_category`` and
            ``error_percentage`` columns. ``error_percentage`` cells are parsed
            with ``extract_avg_and_err_bar``. The frame is not modified.
        report_err_bar: If True, overlay a marker with ± whiskers on each bar.
        save_path: Destination of the saved PNG.

    Returns:
        None.
    """
    # Category labels are matched exactly as spelled here (presumably the
    # spelling used in the data -- verify against the CSV); the dictionary
    # below supplies the human-readable subplot titles.
    errors_of_interest = ["CompilationError", "IdetatioError", "SytaxError", "NameError"]
    error_title_correction = {
        "CompilationError": "Compilation Error",
        "IdetatioError": "Indentation Error",
        "SytaxError": "Syntax Error",
        "NameError": "Name Error"
    }

    # Models included in this figure and their display names (also the x order)
    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
    }
    display_names = list(model_name_mapping.values())
    # seaborn puts the i-th entry of `order` at x == i; the overlay uses the same slots.
    x_position = {name: slot for slot, name in enumerate(display_names)}

    sns.set_style("whitegrid")

    # Select the mapped models by name instead of slicing a fixed number of
    # rows, so the selection follows the mapping if it is edited.
    plot_df = df.copy()
    plot_df['model_display_name'] = plot_df['model_name'].map(model_name_mapping)
    plot_df = plot_df.dropna(subset=['model_display_name'])
    plot_df = plot_df[plot_df['error_category'].isin(errors_of_interest)].copy()
    parsed = [extract_avg_and_err_bar(value) for value in plot_df['error_percentage']]
    # Explicit float Series so an empty selection still gives numeric columns.
    plot_df['avg_error_percentage'] = pd.Series([avg for avg, _ in parsed], index=plot_df.index, dtype=float)
    plot_df['error_bar'] = pd.Series([err for _, err in parsed], index=plot_df.index, dtype=float)

    # One colour per display name, shared by all subplots and the legend
    palette = sns.color_palette("husl", len(display_names))
    model_color_map = dict(zip(display_names, palette))

    # Two columns; round the row count up
    num_rows = (len(errors_of_interest) + 1) // 2
    fig, axs = plt.subplots(num_rows, 2, figsize=(20, 7 * num_rows), dpi=120,
                            facecolor='white', squeeze=False)
    fig.subplots_adjust(hspace=0.4, wspace=0.2)  # Adjust space between plots
    axs = axs.flatten()

    for ax, error_type in zip(axs, errors_of_interest):
        error_df = plot_df[plot_df['error_category'] == error_type]

        # Bars are drawn exactly once. The whiskers are added with matplotlib
        # directly rather than by passing `yerr` through seaborn, which needs
        # the array length to match whatever seaborn hands to bar().
        sns.barplot(x='model_display_name', y='avg_error_percentage', data=error_df,
                    order=display_names, ax=ax, palette=model_color_map)
        if report_err_bar:
            for name, avg, err in zip(error_df['model_display_name'],
                                      error_df['avg_error_percentage'],
                                      error_df['error_bar']):
                ax.errorbar(x_position[name], avg, yerr=err, fmt='o',
                            color=model_color_map[name], ecolor='black', label=name)

        ax.set_ylabel('Percentage', fontsize=20)
        ax.set_xlabel('')  # Hide the x-label as requested
        # Fall back to the raw category name if no title is registered
        ax.set_title(f'{error_title_correction.get(error_type, error_type)}', fontsize=23)

        ax.tick_params(axis='x', rotation=45, labelsize=15)
        ax.tick_params(axis='y', labelsize=15)

        # Per-subplot y range from the plotted models only: largest average plus
        # a 5-point buffer. An empty subset has no maximum (NaN).
        max_error_percentage = error_df['avg_error_percentage'].max()
        if pd.isna(max_error_percentage):
            max_error_percentage = 0.0
        ax.set_ylim(0, max_error_percentage + 5)

    # Hide any unused axes if the number of plots is not a perfect fill of the grid
    for ax in axs[len(errors_of_interest):]:
        ax.set_visible(False)

    legend_handles = [mpatches.Patch(color=model_color_map[name], label=name) for name in display_names]

    # Figure-level legend, centred below the grid
    fig.legend(handles=legend_handles, title="Model Names", loc='upper center',
               bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=4,
               fontsize=20, title_fontsize=20)

    plt.tight_layout()
    # bbox_inches='tight' keeps the legend, which sits outside the axes area, in the file
    plt.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()

# Render the figure for the frame currently bound to `df` (report_err_bar defaults to True).
plot_errors(df)

In [None]:

# Adjusted code snippet with requested changes
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.patches as mpatches
import re

# Function to extract average and error bar from the 'error_percentage' column
def extract_avg_and_err_bar(s):
    """Split an ``error_percentage`` cell into ``(average, error_bar)``.

    Accepted inputs:
      * ``"12.5 ± 1.3"`` -> ``(12.5, 1.3)``; integers (``"5 ± 1"``) and missing
        or extra whitespace around ``±`` are accepted too.
      * a bare number, either as text (``"7.0"``) or already numeric (``7.0``)
        -> ``(7.0, 0.0)``.

    Args:
        s: A string or a real number.

    Returns:
        Tuple ``(average, error_bar)`` of floats; ``error_bar`` is ``0.0`` when
        the cell carries no ``±`` part.

    Raises:
        ValueError: If ``s`` is text that is neither ``"a ± b"`` nor a number.
    """
    # A column without any "±" text is parsed by pandas as numeric, so cells can
    # arrive as floats/ints; re.search would raise TypeError on those.
    if not isinstance(s, str):
        return float(s), 0.0

    # (?:\.\d+)? makes the fractional part optional so whole numbers also match.
    match = re.search(r"(\d+(?:\.\d+)?)\s*±\s*(\d+(?:\.\d+)?)", s)
    if match:
        return float(match.group(1)), float(match.group(2))
    return float(s), 0.0  # No error bar, return value as is

def plot_errors(df, report_err_bar=True, save_path='v2_error_analysis_by_type.png'):
    """Bar chart per error category comparing models, without x tick labels.

    One subplot is drawn per entry of ``errors_of_interest`` on a two-column
    grid. Each model gets one bar, ordered and coloured according to
    ``model_name_mapping``; models are identified through the figure legend
    because the x tick labels are blanked. Rows whose ``model_name`` is not in
    the mapping are ignored. The figure is written to ``save_path`` and shown.

    Args:
        df: DataFrame with ``model_name``, ``error_category`` and
            ``error_percentage`` columns. ``error_percentage`` cells are parsed
            with ``extract_avg_and_err_bar``. The frame is not modified.
        report_err_bar: If True, overlay a marker with ± whiskers on each bar.
        save_path: Destination of the saved PNG.

    Returns:
        None.
    """
    # Category labels are matched exactly as spelled here (presumably the
    # spelling used in the data -- verify against the CSV); the dictionary
    # below supplies the human-readable subplot titles.
    errors_of_interest = ["CompilationError", "IdetatioError", "SytaxError", "NameError"]
    error_title_correction = {
        "CompilationError": "Compilation Error",
        "IdetatioError": "Indentation Error",
        "SytaxError": "Syntax Error",
        "NameError": "Name Error"
    }

    # Original model names and their simplified display names (also the x order)
    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }
    display_names = list(model_name_mapping.values())
    # seaborn puts the i-th entry of `order` at x == i; the overlay uses the same slots.
    x_position = {name: slot for slot, name in enumerate(display_names)}

    sns.set_style("whitegrid")

    # Keep only the rows that can end up in the figure, then parse them once.
    plot_df = df.copy()
    plot_df['model_display_name'] = plot_df['model_name'].map(model_name_mapping)
    plot_df = plot_df.dropna(subset=['model_display_name'])
    plot_df = plot_df[plot_df['error_category'].isin(errors_of_interest)].copy()
    parsed = [extract_avg_and_err_bar(value) for value in plot_df['error_percentage']]
    # Explicit float Series so an empty selection still gives numeric columns.
    plot_df['avg_error_percentage'] = pd.Series([avg for avg, _ in parsed], index=plot_df.index, dtype=float)
    plot_df['error_bar'] = pd.Series([err for _, err in parsed], index=plot_df.index, dtype=float)

    # One colour per display name, shared by all subplots and the legend
    palette = sns.color_palette("husl", len(display_names))
    model_color_map = dict(zip(display_names, palette))

    # Two columns; round the row count up
    num_rows = (len(errors_of_interest) + 1) // 2
    fig, axs = plt.subplots(num_rows, 2, figsize=(20, 7 * num_rows), dpi=120,
                            facecolor='white', squeeze=False)
    fig.subplots_adjust(hspace=0.4, wspace=0.2)  # Adjust space between plots
    axs = axs.flatten()

    for ax, error_type in zip(axs, errors_of_interest):
        error_df = plot_df[plot_df['error_category'] == error_type]

        # Bars are drawn exactly once. The whiskers are added with matplotlib
        # directly rather than by passing `yerr` through seaborn, which only
        # works when seaborn happens to forward one array to a single bar() call.
        sns.barplot(x='model_display_name', y='avg_error_percentage', data=error_df,
                    order=display_names, ax=ax, palette=model_color_map)
        if report_err_bar:
            for name, avg, err in zip(error_df['model_display_name'],
                                      error_df['avg_error_percentage'],
                                      error_df['error_bar']):
                ax.errorbar(x_position[name], avg, yerr=err, fmt='o',
                            color=model_color_map[name], ecolor='black', label=name)

        ax.set_ylabel('Percentage', fontsize=20, fontweight='bold')

        # The legend identifies the models, so the x axis carries no text.
        ax.set_xlabel('')
        ax.set_xticklabels([])

        ax.tick_params(axis='y', labelsize=20)

        # Fall back to the raw category name if no title is registered
        ax.set_title(f'{error_title_correction.get(error_type, error_type)}',
                     fontsize=28, fontweight='bold')

        # Per-subplot y range: largest average plus a 5-point buffer. An empty
        # subset has no maximum (NaN), which set_ylim would reject.
        max_error_percentage = error_df['avg_error_percentage'].max()
        if pd.isna(max_error_percentage):
            max_error_percentage = 0.0
        ax.set_ylim(0, max_error_percentage + 5)

    # Hide any unused axes if the number of plots is not a perfect fill of the grid
    for ax in axs[len(errors_of_interest):]:
        ax.set_visible(False)

    legend_handles = [mpatches.Patch(color=model_color_map[name], label=name) for name in display_names]

    # Figure-level legend, centred below the grid
    fig.legend(handles=legend_handles, title="Model Names", loc='upper center',
               bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=5,
               fontsize=23, title_fontsize=23)

    plt.tight_layout()
    # bbox_inches='tight' keeps the legend, which sits outside the axes area, in the file
    plt.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()

# Render the figure for the frame currently bound to `df` (report_err_bar defaults to True).
plot_errors(df)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import re  # Import the re module for regular expressions

# Function to extract average and error bar from the 'error_percentage' column
def extract_avg_and_err_bar(s):
    """Split an ``error_percentage`` cell into ``(average, error_bar)``.

    Accepted inputs:
      * ``"12.5 ± 1.3"`` -> ``(12.5, 1.3)``; integers (``"5 ± 1"``) and missing
        or extra whitespace around ``±`` are accepted too.
      * a bare number, either as text (``"7.0"``) or already numeric (``7.0``)
        -> ``(7.0, 0.0)``.

    Args:
        s: A string or a real number.

    Returns:
        Tuple ``(average, error_bar)`` of floats; ``error_bar`` is ``0.0`` when
        the cell carries no ``±`` part.

    Raises:
        ValueError: If ``s`` is text that is neither ``"a ± b"`` nor a number.
    """
    # A column without any "±" text is parsed by pandas as numeric, so cells can
    # arrive as floats/ints; re.search would raise TypeError on those.
    if not isinstance(s, str):
        return float(s), 0.0

    # (?:\.\d+)? makes the fractional part optional so whole numbers also match.
    match = re.search(r"(\d+(?:\.\d+)?)\s*±\s*(\d+(?:\.\d+)?)", s)
    if match:
        return float(match.group(1)), float(match.group(2))
    return float(s), 0.0  # No error bar, return value as is

def plot_errors(df):
    """Draw one stacked bar per model showing the share of each outcome category.

    Only the categories in ``errors_of_interest`` and the models in
    ``model_name_mapping`` are used. Models are laid out left to right in the
    reverse of the mapping order, and the segments are stacked in the order of
    ``errors_of_interest``. A model with no row for a category contributes a
    zero-height segment, so every stack stays aligned with its own model.

    Args:
        df: DataFrame with ``model_name``, ``error_category`` and
            ``error_percentage`` columns. ``error_percentage`` cells are parsed
            with ``extract_avg_and_err_bar`` (only the average is used). The
            frame is not modified.

    Returns:
        None. The figure is shown, not saved.
    """
    # Category labels are matched exactly as spelled here (presumably the
    # spelling used in the data -- verify against the CSV); the dictionary
    # supplies the legend labels.
    errors_of_interest = ["CompilationError", "IdetatioError", "SytaxError", "NameError", "Output Mismatch", "Success"]
    error_title_correction = {
        "CompilationError": "Compilation Error",
        "IdetatioError": "Indentation Error",
        "SytaxError": "Syntax Error",
        "NameError": "Name Error",
        "Output Mismatch": "Tests Failed",
        "Success": "Test Passed"
    }

    # Model name mapping with the desired order
    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }

    # x slots run through the mapping back to front
    reversed_model_names = list(model_name_mapping.values())[::-1]
    positions = list(range(len(reversed_model_names)))

    # Explicit copy: the column assignments below must not act on a slice of the caller's frame.
    wanted = df['error_category'].isin(errors_of_interest) & df['model_name'].isin(list(model_name_mapping))
    plot_df = df.loc[wanted].copy()
    plot_df['error_category'] = plot_df['error_category'].map(error_title_correction)
    plot_df['model_display_name'] = plot_df['model_name'].map(model_name_mapping)
    plot_df['avg_error_percentage'] = pd.Series(
        [extract_avg_and_err_bar(value)[0] for value in plot_df['error_percentage']],
        index=plot_df.index, dtype=float,
    )

    # Set the style
    sns.set_style("whitegrid")

    fig, ax = plt.subplots(figsize=(10, 6))

    # Running top of each stack, indexed by model so additions align by label
    # rather than by row position.
    bottom = pd.Series(0.0, index=reversed_model_names)
    for error in errors_of_interest:
        label = error_title_correction[error]
        # One height per model slot; absent models get 0. If a model has several
        # rows for one category they are averaged (assumes duplicates are
        # repeated measurements -- confirm against the data).
        heights = (
            plot_df.loc[plot_df['error_category'] == label]
            .groupby('model_display_name')['avg_error_percentage']
            .mean()
            .reindex(reversed_model_names, fill_value=0.0)
            .fillna(0.0)
        )
        ax.bar(positions, heights.to_numpy(), label=label, bottom=bottom.to_numpy())
        bottom = bottom + heights

    ax.set_xticks(positions)
    ax.set_xticklabels(reversed_model_names, rotation=45, ha="right")
    ax.set_ylabel('Error Percentage')
    ax.set_xlabel('Model Name')
    ax.set_title('Error Analysis by Category')
    ax.legend(title='Error Category')
    fig.tight_layout()
    plt.show()

# Draw the stacked chart for the frame currently bound to `df`; it must provide
# the `model_name`, `error_category` and `error_percentage` columns.
plot_errors(df)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import re  # Import the re module for regular expressions

def extract_avg_and_err_bar(s):
    """Split an ``error_percentage`` cell into its mean and error-bar parts.

    Args:
        s: Cell value, either a ``"mean ± err"`` string (for example
            ``"12.5 ± 0.3"`` or ``"5 ± 1"``) or a plain number / numeric string.

    Returns:
        tuple: ``(mean, err)`` as floats. ``err`` is ``0.0`` when the cell
        carries no ``±`` part.

    Raises:
        ValueError: If the cell has no ``mean ± err`` pair and cannot be parsed
            as a single float either.
    """
    # Coerce first so numeric cells (a column pandas parsed as float) work too.
    text = str(s)
    # The fractional part is optional so integer cells such as "5 ± 1" match;
    # the optional sign keeps a negative mean from silently losing its minus.
    match = re.search(r"([-+]?\d+(?:\.\d+)?)\s*±\s*(\d+(?:\.\d+)?)", text)
    if match:
        return float(match.group(1)), float(match.group(2))
    # No error bar present: treat the whole cell as the mean.
    return float(text), 0.0

def plot_errors(df):
    """Plot one line per outcome category showing its percentage for each model.

    Args:
        df: DataFrame with ``model_name``, ``error_category`` and
            ``error_percentage`` columns; each ``error_percentage`` cell is
            parsed with ``extract_avg_and_err_bar``. The caller's frame is not
            modified -- all derived columns are written to a filtered copy.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Raw category keys and the legend titles they are shown under. The keys
    # (including spellings such as "IdetatioError") are compared verbatim with
    # the 'error_category' column, so they must stay exactly as written.
    errors_of_interest = ["CompilationError", "IdetatioError", "SytaxError", "NameError", "Output Mismatch", "Success"]
    error_title_correction = {
        "CompilationError": "Compilation Error",
        "IdetatioError": "Indentation Error",
        "SytaxError": "Syntax Error",
        "NameError": "Name Error",
        "Output Mismatch": "Tests Failed",
        "Success": "Test Passed"
    }

    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the caller's data (SettingWithCopyWarning).
    df = df[df['error_category'].isin(errors_of_interest)].copy()
    df['error_category'] = df['error_category'].map(error_title_correction)

    # Parse every cell once; building plain lists also works when the filter
    # left no rows, where a two-column assignment from .apply() would fail.
    parsed = [extract_avg_and_err_bar(cell) for cell in df['error_percentage']]
    df['avg_error_percentage'] = [mean for mean, _ in parsed]
    # The error bar is parsed alongside the mean but is not drawn below.
    df['error_bar'] = [err for _, err in parsed]

    # Raw model ids -> display names. Dict order defines the axis order.
    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }

    # The mapping is reversed so its last entry sits at x = 0 (leftmost).
    reversed_model_names = list(model_name_mapping.values())[::-1]
    reversed_model_order = {name: i for i, name in enumerate(reversed_model_names)}

    # Models missing from the mapping get a NaN rank and are not plotted.
    df['model_rank'] = df['model_name'].map(model_name_mapping).map(reversed_model_order)

    sns.set_style("whitegrid")
    plt.figure(figsize=(10, 6))

    # One line per category, points ordered left to right by model rank.
    for error in errors_of_interest:
        title = error_title_correction[error]
        error_df = df[df['error_category'] == title].sort_values(by='model_rank')
        plt.plot(error_df['model_rank'], error_df['avg_error_percentage'], marker='o', label=title)

    plt.xticks(range(len(reversed_model_names)), reversed_model_names, rotation=45, ha="right")
    plt.ylabel('Error Percentage')
    plt.xlabel('Model Name')
    plt.title('Error Analysis by Category')
    plt.legend(title='Error Category')
    plt.tight_layout()
    plt.show()

# Draw the per-category line chart for the notebook-level `df`, using the
# plot_errors defined in this cell.
plot_errors(df)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

def plot_errors_stacked(df):
    """Draw a stacked-area chart of outcome percentages for each model.

    Args:
        df: DataFrame with ``model_name``, ``error_category`` and
            ``error_percentage`` columns, where ``error_percentage`` is either
            a plain number or a ``"mean ± err"`` string (only the mean is
            used). The caller's frame is not modified.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Raw category keys and their legend titles. The keys are matched verbatim
    # against the 'error_category' column, so their spelling must not change.
    errors_of_interest = ["CompilationError", "IdetatioError", "SytaxError", "NameError", "Output Mismatch", "Success"]
    error_title_correction = {
        "CompilationError": "Compilation Error",
        "IdetatioError": "Indentation Error",
        "SytaxError": "Syntax Error",
        "NameError": "Name Error",
        "Output Mismatch": "Tests Failed",
        "Success": "Test Passed"
    }

    # Work on an independent copy so the assignments below never write into a
    # slice of the caller's frame (SettingWithCopyWarning).
    df = df[df['error_category'].isin(errors_of_interest)].copy()
    df['error_category'] = df['error_category'].map(error_title_correction)

    # Keep only the mean of "mean ± err"; str() lets numeric cells through.
    df['avg_error_percentage'] = df['error_percentage'].apply(lambda x: float(str(x).split(' ± ')[0]))

    # Raw model ids -> display names. Dict order defines the axis order.
    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }

    # The mapping is reversed so its last entry sits at x = 0 (leftmost).
    reversed_model_names = list(model_name_mapping.values())[::-1]
    reversed_model_order = {name: i for i, name in enumerate(reversed_model_names)}

    # Models missing from the mapping get a NaN rank and are not plotted.
    df['model_rank'] = df['model_name'].map(model_name_mapping).map(reversed_model_order)
    df_sorted = df.sort_values(by=['model_rank', 'error_category'])

    # Index the values once instead of re-filtering the frame for every
    # (rank, category) pair; the first row in sorted order wins, as before.
    value_at = {}
    for rank, title, value in zip(df_sorted['model_rank'],
                                  df_sorted['error_category'],
                                  df_sorted['avg_error_percentage']):
        if pd.notna(rank):
            value_at.setdefault((int(rank), title), value)

    sns.set_style("whitegrid")
    plt.figure(figsize=(10, 6))

    x_positions = range(len(reversed_model_names))
    # Running top edge of the stack; each band is drawn between the previous
    # and the new cumulative sum.
    prev_sum = np.zeros(len(reversed_model_names))

    # Reversed so the last category ("Success") forms the bottom band.
    for error in reversed(errors_of_interest):
        title = error_title_correction[error]
        # A model without a row for this category contributes 0 to the stack.
        current_values = [value_at.get((rank, title), 0) for rank in x_positions]
        current_sum = prev_sum + np.array(current_values)
        plt.fill_between(x_positions, prev_sum, current_sum, label=title)
        prev_sum = current_sum

    plt.xticks(range(len(reversed_model_names)), reversed_model_names, rotation=45, ha="right")
    plt.ylabel('Cumulative Error Percentage')
    plt.xlabel('Model Name')
    plt.title('Cumulative Error Analysis by Category')
    plt.legend(title='Error Category', loc='upper left')
    plt.tight_layout()
    plt.show()

# Draw the stacked chart for the notebook-level `df` with the
# plot_errors_stacked defined in this cell (later cells redefine the name).
plot_errors_stacked(df)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

def plot_errors_stacked(df):
    """Draw a stacked-area chart of outcome percentages per model with an "Others" band.

    "Others" is computed per model as 100 minus the sum of every parsed
    percentage found for that model in ``df``.

    Args:
        df: DataFrame with ``model_name``, ``error_category`` and
            ``error_percentage`` columns, where ``error_percentage`` is either
            a plain number or a ``"mean ± err"`` string (only the mean is
            used). The caller's frame is not modified.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Raw category keys and their legend titles. The keys are matched verbatim
    # against the 'error_category' column, so their spelling must not change.
    errors_of_interest = ["CompilationError", "IdetatioError", "SytaxError", "NameError", "Output Mismatch", "Success", "Others"]
    error_title_correction = {
        "CompilationError": "Compilation Error",
        "IdetatioError": "Indentation Error",
        "SytaxError": "Syntax Error",
        "NameError": "Name Error",
        "Output Mismatch": "Tests Failed",
        "Success": "Test Passed",
        "Others": "Others"
    }

    # Copy first: without it the derived column would be added to the caller's
    # (notebook-level) frame as a hidden side effect of plotting.
    df = df.copy()
    # Keep only the mean of "mean ± err"; str() lets numeric cells through.
    df['avg_error_percentage'] = df['error_percentage'].apply(lambda x: float(str(x).split(' ± ')[0]))

    # Remainder up to 100% for every model, appended as extra "Others" rows.
    # NOTE(review): the total includes rows whose category is not listed in
    # error_title_correction; those rows shrink "Others" but are not drawn, so
    # the stack can end below 100 -- confirm this is intended.
    total_by_model = df.groupby('model_name')['avg_error_percentage'].sum().reset_index()
    others_df = pd.DataFrame({
        'model_name': total_by_model['model_name'],
        'error_category': 'Others',
        'avg_error_percentage': 100 - total_by_model['avg_error_percentage'],
    })
    df = pd.concat([df, others_df])

    # Categories missing from the mapping become NaN and are skipped below.
    df['error_category'] = df['error_category'].map(error_title_correction)

    # Raw model ids -> display names. Dict order defines the axis order.
    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }

    # The mapping is reversed so its last entry sits at x = 0 (leftmost).
    reversed_model_names = list(model_name_mapping.values())[::-1]
    reversed_model_order = {name: i for i, name in enumerate(reversed_model_names)}

    # Models missing from the mapping get a NaN rank and are not plotted.
    df['model_rank'] = df['model_name'].map(model_name_mapping).map(reversed_model_order)
    df_sorted = df.sort_values(by=['model_rank', 'error_category'])

    # Index the values once instead of re-filtering the frame for every
    # (rank, category) pair; the first row in sorted order wins, as before.
    value_at = {}
    for rank, title, value in zip(df_sorted['model_rank'],
                                  df_sorted['error_category'],
                                  df_sorted['avg_error_percentage']):
        if pd.notna(rank):
            value_at.setdefault((int(rank), title), value)

    sns.set_style("whitegrid")

    # Large canvas at high resolution.
    plt.figure(figsize=(12, 10), dpi=600)

    x_positions = range(len(reversed_model_names))
    # Running top edge of the stack.
    prev_sum = np.zeros(len(reversed_model_names))

    # Reversed so the last category ("Others") forms the bottom band.
    for error in reversed(errors_of_interest):
        title = error_title_correction[error]
        # A model without a row for this category contributes 0 to the stack.
        current_values = [value_at.get((rank, title), 0) for rank in x_positions]
        current_sum = prev_sum + np.array(current_values)
        plt.fill_between(x_positions, prev_sum, current_sum, label=title)
        prev_sum = current_sum

    # Large bold text throughout.
    plt.xticks(range(len(reversed_model_names)), reversed_model_names, rotation=45, ha="right", fontsize=20, fontweight='bold')
    plt.yticks(fontsize=20, fontweight='bold')
    plt.xlabel('Model Name', fontsize=20, fontweight='bold')
    plt.ylabel('Cumulative Error Percentage', fontsize=20, fontweight='bold')
    plt.title('Cumulative Error Analysis by Category', fontsize=23, fontweight='bold')

    # Legend sits below the axes, in three columns.
    plt.legend(title='Error Category', loc='upper center', bbox_to_anchor=(0.5, -0.15), fancybox=True, shadow=True, ncol=3, fontsize=20, title_fontsize='20')

    plt.tight_layout()
    plt.show()

# Draw the stacked chart (with the "Others" band) for the notebook-level `df`,
# using the plot_errors_stacked defined in this cell.
plot_errors_stacked(df)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

def plot_errors_stacked(df):
    """Draw a stacked-area chart of outcome percentages per model, green band first.

    Bands are coloured from an ``hls`` palette rotated so that its green entry
    comes first; the first band drawn is "Test Passed".

    Args:
        df: DataFrame with ``model_name``, ``error_category`` and
            ``error_percentage`` columns, where ``error_percentage`` is either
            a plain number or a ``"mean ± err"`` string (only the mean is
            used). The caller's frame is not modified.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Raw category keys and their legend titles. The keys are matched verbatim
    # against the 'error_category' column, so their spelling must not change.
    errors_of_interest = ["CompilationError", "IdetatioError", "SytaxError", "NameError", "Output Mismatch", "Success"]
    error_title_correction = {
        "CompilationError": "Compilation Error",
        "IdetatioError": "Indentation Error",
        "SytaxError": "Syntax Error",
        "NameError": "Name Error",
        "Output Mismatch": "Tests Failed",
        "Success": "Test Passed",
    }

    # Filter into an independent copy: the derived columns must not be added
    # to the caller's (notebook-level) frame as a side effect of plotting.
    # NOTE(review): an "Other" remainder (100 minus the per-model total) used
    # to be appended here, but its label is absent from error_title_correction,
    # so it was mapped to NaN and never drawn. Add "Other" to both tables above
    # if that band is wanted; as it stands the stack can end below 100.
    df = df[df['error_category'].isin(errors_of_interest)].copy()
    df['error_category'] = df['error_category'].map(error_title_correction)
    # Keep only the mean of "mean ± err"; str() lets numeric cells through.
    df['avg_error_percentage'] = df['error_percentage'].apply(lambda x: float(str(x).split(' ± ')[0]))

    # Raw model ids -> display names. Dict order defines the axis order.
    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }

    # The mapping is reversed so its last entry sits at x = 0 (leftmost).
    reversed_model_names = list(model_name_mapping.values())[::-1]
    reversed_model_order = {name: i for i, name in enumerate(reversed_model_names)}

    # Models missing from the mapping get a NaN rank and are not plotted.
    df['model_rank'] = df['model_name'].map(model_name_mapping).map(reversed_model_order)
    df_sorted = df.sort_values(by=['model_rank', 'error_category'])

    # Index the values once instead of re-filtering the frame for every
    # (rank, category) pair; the first row in sorted order wins, as before.
    value_at = {}
    for rank, title, value in zip(df_sorted['model_rank'],
                                  df_sorted['error_category'],
                                  df_sorted['avg_error_percentage']):
        if pd.notna(rank):
            value_at.setdefault((int(rank), title), value)

    sns.set_style("white")

    # Rotate the palette so the entry at index 2 (green in "hls") comes first.
    base_palette = list(sns.color_palette("hls", 7))
    palette = base_palette[2:] + base_palette[:2]

    # Large canvas at high resolution.
    plt.figure(figsize=(20, 12), dpi=600)

    x_positions = range(len(reversed_model_names))
    # Running top edge of the stack.
    prev_sum = np.zeros(len(reversed_model_names))

    # Reversed so "Success" is the bottom band. Colours follow drawing order,
    # which hands the leading green to "Test Passed"; the palette used to be
    # built but never passed to fill_between.
    for band_index, error in enumerate(reversed(errors_of_interest)):
        title = error_title_correction[error]
        # A model without a row for this category contributes 0 to the stack.
        current_values = [value_at.get((rank, title), 0) for rank in x_positions]
        current_sum = prev_sum + np.array(current_values)
        plt.fill_between(x_positions, prev_sum, current_sum, label=title, color=palette[band_index])
        prev_sum = current_sum

    # Large bold text throughout.
    plt.xticks(range(len(reversed_model_names)), reversed_model_names, rotation=45, ha="right", fontsize=20, fontweight='bold')
    plt.yticks(fontsize=20, fontweight='bold')
    plt.xlabel('Model Name', fontsize=20, fontweight='bold')
    plt.ylabel('Cumulative Error Percentage', fontsize=20, fontweight='bold')
    plt.title('Cumulative Error Analysis by Category', fontsize=23, fontweight='bold')

    # Legend sits below the axes, in three columns.
    plt.legend(title='Error Category', loc='upper center', bbox_to_anchor=(0.5, -0.2), fancybox=True, shadow=True, ncol=3, fontsize=20, title_fontsize='20')

    # NOTE(review): tight_layout() is left disabled here, presumably because
    # the legend is anchored outside the axes -- confirm.
    plt.show()

# Draw the stacked chart for the notebook-level `df`, using the
# plot_errors_stacked defined in this cell ("hls" palette variant).
plot_errors_stacked(df)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

def plot_errors_stacked(df):
    """Draw a stacked-area chart of outcome percentages per model in Set2 colours.

    Args:
        df: DataFrame with ``model_name``, ``error_category`` and
            ``error_percentage`` columns, where ``error_percentage`` is either
            a plain number or a ``"mean ± err"`` string (only the mean is
            used). The caller's frame is not modified.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Raw category keys and their legend titles. The keys are matched verbatim
    # against the 'error_category' column, so their spelling must not change.
    errors_of_interest = ["CompilationError", "IdetatioError", "SytaxError", "NameError", "Output Mismatch", "Success"]
    error_title_correction = {
        "CompilationError": "Compilation Error",
        "IdetatioError": "Indentation Error",
        "SytaxError": "Syntax Error",
        "NameError": "Name Error",
        "Output Mismatch": "Tests Failed",
        "Success": "Test Passed",
    }

    # Filter into an independent copy: the derived columns must not be added
    # to the caller's (notebook-level) frame as a side effect of plotting.
    # NOTE(review): an "Other" remainder (100 minus the per-model total) used
    # to be appended here, but its label is absent from error_title_correction,
    # so it was mapped to NaN and never drawn. Add "Other" to both tables above
    # if that band is wanted; as it stands the stack can end below 100.
    df = df[df['error_category'].isin(errors_of_interest)].copy()
    df['error_category'] = df['error_category'].map(error_title_correction)
    # Keep only the mean of "mean ± err"; str() lets numeric cells through.
    df['avg_error_percentage'] = df['error_percentage'].apply(lambda x: float(str(x).split(' ± ')[0]))

    # Raw model ids -> display names. Dict order defines the axis order.
    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }

    # The mapping is reversed so its last entry sits at x = 0 (leftmost).
    reversed_model_names = list(model_name_mapping.values())[::-1]
    reversed_model_order = {name: i for i, name in enumerate(reversed_model_names)}

    # Models missing from the mapping get a NaN rank and are not plotted.
    df['model_rank'] = df['model_name'].map(model_name_mapping).map(reversed_model_order)
    df_sorted = df.sort_values(by=['model_rank', 'error_category'])

    # Index the values once instead of re-filtering the frame for every
    # (rank, category) pair; the first row in sorted order wins, as before.
    value_at = {}
    for rank, title, value in zip(df_sorted['model_rank'],
                                  df_sorted['error_category'],
                                  df_sorted['avg_error_percentage']):
        if pd.notna(rank):
            value_at.setdefault((int(rank), title), value)

    sns.set_style("white")

    # Set2 palette rotated left by two entries. Colours are looked up by each
    # category's position in errors_of_interest, so "Success" (position 5)
    # receives the palette's original first colour.
    base_palette = list(sns.color_palette("Set2", 7))
    palette = base_palette[2:] + base_palette[:2]

    # Large canvas at high resolution.
    plt.figure(figsize=(20, 12), dpi=600)

    x_positions = range(len(reversed_model_names))
    # Running top edge of the stack.
    prev_sum = np.zeros(len(reversed_model_names))

    # Reversed so the last category ("Success") forms the bottom band.
    for error in reversed(errors_of_interest):
        title = error_title_correction[error]
        # A model without a row for this category contributes 0 to the stack.
        current_values = [value_at.get((rank, title), 0) for rank in x_positions]
        current_sum = prev_sum + np.array(current_values)
        color = palette[errors_of_interest.index(error)]
        plt.fill_between(x_positions, prev_sum, current_sum, label=title, color=color)
        prev_sum = current_sum

    # Large bold text throughout.
    plt.xticks(range(len(reversed_model_names)), reversed_model_names, rotation=45, ha="right", fontsize=20, fontweight='bold')
    plt.yticks(fontsize=20, fontweight='bold')
    plt.xlabel('Model Name', fontsize=20, fontweight='bold')
    plt.ylabel('Cumulative Error Percentage', fontsize=20, fontweight='bold')
    plt.title('Cumulative Error Analysis by Category', fontsize=23, fontweight='bold')

    # Legend sits below the axes, in three columns.
    plt.legend(title='Error Category', loc='upper center', bbox_to_anchor=(0.5, -0.2), fancybox=True, shadow=True, ncol=3, fontsize=20, title_fontsize='20')

    # NOTE(review): tight_layout() is left disabled here, presumably because
    # the legend is anchored outside the axes -- confirm.
    plt.show()

# Draw the stacked chart for the notebook-level `df`, using the
# plot_errors_stacked defined in this cell ("Set2" palette variant).
plot_errors_stacked(df)

In [None]:
# Display the notebook-level `df` as it stands after the plotting cells above.
df

In [None]:
# Inventory of the model identifiers present in `df`: how many there are,
# their names, and how many rows each one has.

# Distinct values of the 'model_name' column, in order of first appearance
unique_models = df['model_name'].unique()

# Print the number of unique models
print(f"Found {len(unique_models)} unique models:")

# Print the unique model names (sorted alphabetically), numbered from 1
# NOTE(review): sorted() raises TypeError if the column mixes strings with NaN.
sorted_unique_models = sorted(unique_models)
for idx, model in enumerate(sorted_unique_models, 1):
    print(f"{idx}. {model}")
    
# One-column DataFrame holding the sorted unique names (not used in this cell)
unique_models_df = pd.DataFrame({'model_name': sorted_unique_models})

# Row count per model; the column names are set explicitly because the names
# produced by value_counts().reset_index() differ between pandas versions
model_counts = df['model_name'].value_counts().reset_index()
model_counts.columns = ['model_name', 'count']
print("\nModel occurrence counts:")
print(model_counts)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

def plot_errors_stacked(df, output_path='cumulative_output_by_category.png'):
    """Draw and save a stacked-area chart of outcome percentages per model.

    Bands are stacked bottom to top in the order of ``errors_of_interest``
    ("Tests Passed" first) and are separated by white edges.

    Args:
        df: DataFrame with ``model_name``, ``error_category`` and
            ``error_percentage`` columns, where ``error_percentage`` is either
            a plain number or a ``"mean ± err"`` string (only the mean is
            used). The caller's frame is not modified.
        output_path: File the figure is written to before it is shown.
            Defaults to the file name this cell has always used.

    Returns:
        None. The figure is saved with ``plt.savefig`` and then rendered with
        ``plt.show()``.
    """
    # Raw category keys in stacking order, and their legend titles. The keys
    # are matched verbatim against the 'error_category' column, so their
    # spelling must not change.
    errors_of_interest = ["Success", "Output Mismatch", "SytaxError", "NameError", "IdetatioError", "CompilationError"]
    error_title_correction = {
        "CompilationError": "Compilation Error",
        "IdetatioError": "Indentation Error",
        "SytaxError": "Syntax Error",
        "NameError": "Name Error",
        "Output Mismatch": "Tests Failed",
        "Success": "Tests Passed",
    }

    # Filter into an independent copy: the derived columns must not be added
    # to the caller's (notebook-level) frame as a side effect of plotting.
    # NOTE(review): an "Other" remainder (100 minus the per-model total) used
    # to be appended here, but its label is absent from error_title_correction,
    # so it was mapped to NaN and never drawn. Add "Other" to both tables above
    # and to the palette if that band is wanted.
    df = df[df['error_category'].isin(errors_of_interest)].copy()
    df['error_category'] = df['error_category'].map(error_title_correction)
    # Keep only the mean of "mean ± err"; str() lets numeric cells through.
    df['avg_error_percentage'] = df['error_percentage'].apply(lambda x: float(str(x).split(' ± ')[0]))

    # Raw model ids -> display names. Dict order defines the axis order.
    model_name_mapping = {
        "claude-3-7-sonnet-latest-ext16k": "Claude3.7Sonnet-ET",
        "o3-mini-2025-01-31": "o3-Mini(high)",
        "gpt-4o-2024-08-06": "GPT-4o",
        "DeepSeek-R1": "DeepSeek R1",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-7-sonnet-latest": "Claude-3.7 Sonnet",
        "claude-3-5-haiku-latest": "Claude-3.5 Haiku",
        "Qwen2-5-Coder-32B-Instruct": "Qwen2.5 Coder 32B",

        "Meta-Llama-3-3-70B-Instruct": "Llama-3.3 70B",
        "Qwen-QwQ-32B": "Qwen QwQ 32B",
        "deepseek-ai-DeepSeek-R1-Distill-Qwen-14B": "R1 Distill QWen 14B",
        "Meta-Llama-3-1-8B-Instruct-Turbo": "Llama-3.1 8B Turbo",
    }

    # The mapping is reversed so its last entry sits at x = 0 (leftmost).
    reversed_model_names = list(model_name_mapping.values())[::-1]
    reversed_model_order = {name: i for i, name in enumerate(reversed_model_names)}

    # Models missing from the mapping get a NaN rank and are not plotted.
    df['model_rank'] = df['model_name'].map(model_name_mapping).map(reversed_model_order)
    df_sorted = df.sort_values(by=['model_rank', 'error_category'])

    # Index the values once instead of re-filtering the frame for every
    # (rank, category) pair; the first row in sorted order wins, as before.
    value_at = {}
    for rank, title, value in zip(df_sorted['model_rank'],
                                  df_sorted['error_category'],
                                  df_sorted['avg_error_percentage']):
        if pd.notna(rank):
            value_at.setdefault((int(rank), title), value)

    sns.set_style("white")
    # Hand-picked Set2 entries, one per category in errors_of_interest order.
    set2_colors = sns.color_palette("Set2", 8)
    custom_palette = [set2_colors[i] for i in [0, 3, 4, 5, 2, 1]]

    # Large canvas at high resolution.
    plt.figure(figsize=(20, 12), dpi=600)

    x_positions = range(len(reversed_model_names))
    # Running top edge of the stack.
    prev_sum = np.zeros(len(reversed_model_names))

    # Stack in list order: "Success" is the bottom band.
    for band_index, error in enumerate(errors_of_interest):
        title = error_title_correction[error]
        # A model without a row for this category contributes 0 to the stack.
        current_values = [value_at.get((rank, title), 0) for rank in x_positions]
        current_sum = prev_sum + np.array(current_values)
        # White edges visually separate neighbouring bands.
        plt.fill_between(x_positions, prev_sum, current_sum, label=title,
                         color=custom_palette[band_index], edgecolor='white', linewidth=2)
        prev_sum = current_sum

    # Large bold text throughout.
    plt.xticks(range(len(reversed_model_names)), reversed_model_names, rotation=45, ha="right", fontsize=20, fontweight='bold')
    plt.yticks(fontsize=20, fontweight='bold')
    plt.xlabel('Model Name', fontsize=20, fontweight='bold')
    plt.ylabel('Cumulative Output Percentage', fontsize=20, fontweight='bold')
    plt.title('Cumulative Output by Category', fontsize=23, fontweight='bold')

    # Legend sits below the axes, in three columns.
    plt.legend(title='Output Category', loc='upper center', bbox_to_anchor=(0.5, -0.25), fancybox=True, shadow=True, ncol=3, fontsize=20, title_fontsize='20')

    # Save before show(). NOTE(review): 20x12 in at 1200 dpi is a
    # 24000x14400-pixel image -- confirm that size is really needed.
    plt.savefig(output_path, bbox_inches='tight', dpi=1200)

    plt.show()

# Draw the stacked chart for the notebook-level `df` and write it to
# 'cumulative_output_by_category.png', using the plot_errors_stacked defined in this cell.
plot_errors_stacked(df)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

def plot_errors_stacked(df, output_path='cumulative_output_by_category.png'):
    """Draw and save the stacked-area chart of outcome percentages per model.

    Bands are stacked bottom to top in the order of ``errors_of_interest``
    ("Tests Passed" first), separated by white edges, over a light dashed
    horizontal grid.

    Args:
        df: DataFrame with ``model_name``, ``error_category`` and
            ``error_percentage`` columns, where ``error_percentage`` is either
            a plain number or a ``"mean ± err"`` string (only the mean is
            used). The caller's frame is not modified.
        output_path: File the figure is written to before it is shown.
            Defaults to the file name this cell has always used.

    Returns:
        None. The figure is saved with ``plt.savefig`` and then rendered with
        ``plt.show()``.
    """
    # Raw category keys in stacking order, and their legend titles. The keys
    # are matched verbatim against the 'error_category' column, so their
    # spelling must not change.
    errors_of_interest = ["Success", "Output Mismatch", "SytaxError", "NameError", "IdetatioError", "CompilationError"]
    error_title_correction = {
        "CompilationError": "Compilation Error",
        "IdetatioError": "Indentation Error",
        "SytaxError": "Syntax Error",
        "NameError": "Var Hallucination",
        "Output Mismatch": "Tests Failed",
        "Success": "Tests Passed",
    }

    # Filter into an independent copy: the derived columns must not be added
    # to the caller's (notebook-level) frame as a side effect of plotting.
    # NOTE(review): an "Other" remainder (100 minus the per-model total) used
    # to be appended here, but its label is absent from error_title_correction,
    # so it was mapped to NaN and never drawn. Add "Other" to both tables above
    # and to the palette if that band is wanted.
    df = df[df['error_category'].isin(errors_of_interest)].copy()
    df['error_category'] = df['error_category'].map(error_title_correction)
    # Keep only the mean of "mean ± err"; str() lets numeric cells through.
    df['avg_error_percentage'] = df['error_percentage'].apply(lambda x: float(str(x).split(' ± ')[0]))

    # Raw model ids -> display names. Dict order defines the axis order: the
    # first entry ends up rightmost because the list is reversed below.
    model_name_mapping = {
        "claude-3-7-sonnet-latest-ext16k": "Claude3.7Sonnet-ET",
        "o3-mini-2025-01-31": "o3-Mini(high)",
        "gpt-4o-2024-08-06": "GPT-4o",
        "DeepSeek-R1": "DeepSeek R1",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-7-sonnet-latest": "Claude-3.7 Sonnet",
        "claude-3-5-haiku-latest": "Claude-3.5 Haiku",
        "Qwen2-5-Coder-32B-Instruct": "Qwen2.5 Coder 32B",

        "Meta-Llama-3-3-70B-Instruct": "Llama-3.3 70B",
        "Qwen-QwQ-32B": "Qwen QwQ 32B",
        "deepseek-ai-DeepSeek-R1-Distill-Qwen-14B": "R1 Distill QWen 14B",
        "Meta-Llama-3-1-8B-Instruct-Turbo": "Llama-3.1 8B Turbo",
    }
    reversed_model_names = list(model_name_mapping.values())[::-1]
    reversed_model_order = {name: i for i, name in enumerate(reversed_model_names)}

    # Models missing from the mapping get a NaN rank and are not plotted.
    df['model_rank'] = df['model_name'].map(model_name_mapping).map(reversed_model_order)
    df_sorted = df.sort_values(by=['model_rank', 'error_category'])

    # Index the values once instead of re-filtering the frame for every
    # (rank, category) pair; the first row in sorted order wins, as before.
    value_at = {}
    for rank, title, value in zip(df_sorted['model_rank'],
                                  df_sorted['error_category'],
                                  df_sorted['avg_error_percentage']):
        if pd.notna(rank):
            value_at.setdefault((int(rank), title), value)

    sns.set_style("whitegrid")
    # Hand-picked Set2 entries, one per category in errors_of_interest order.
    set2_colors = sns.color_palette("Set2", 8)
    custom_palette = [set2_colors[i] for i in [0, 3, 4, 5, 2, 1]]

    # NOTE(review): 20x12 in at 1200 dpi is a 24000x14400-pixel canvas --
    # confirm that size is really needed for the on-screen figure.
    plt.figure(figsize=(20, 12), dpi=1200)

    x_positions = range(len(reversed_model_names))
    # Running top edge of the stack.
    prev_sum = np.zeros(len(reversed_model_names))

    # Stack in list order: "Success" is the bottom band.
    for band_index, error in enumerate(errors_of_interest):
        title = error_title_correction[error]
        # A model without a row for this category contributes 0 to the stack.
        current_values = [value_at.get((rank, title), 0) for rank in x_positions]
        current_sum = prev_sum + np.array(current_values)
        plt.fill_between(x_positions, prev_sum, current_sum,
                         label=title,
                         color=custom_palette[band_index],
                         edgecolor='white',   # separates neighbouring bands
                         linewidth=1.5,
                         alpha=0.95)
        prev_sum = current_sum

    # Light dashed horizontal guides only.
    plt.grid(axis='y', linestyle='--', alpha=0.3)

    # Large bold text throughout.
    plt.xticks(range(len(reversed_model_names)), reversed_model_names, rotation=45, ha="right", fontsize=22, fontweight='bold')
    plt.yticks(fontsize=22, fontweight='bold')
    plt.xlabel('Model Name', fontsize=24, fontweight='bold')
    plt.ylabel('Cumulative Output Percentage', fontsize=24, fontweight='bold')
    plt.title('Cumulative Output by Category', fontsize=26, fontweight='bold')

    # Legend sits below the axes, in three columns.
    plt.legend(title='Output Category',
               loc='upper center',
               bbox_to_anchor=(0.5, -0.4),
               fancybox=True,
               shadow=True,
               ncol=3,
               fontsize=22,
               title_fontsize=24)

    # Save before show(); bbox_inches='tight' keeps the outside legend in the
    # image and pad_inches adds a margin around it.
    plt.savefig(output_path,
                bbox_inches='tight',
                dpi=1200,
                pad_inches=0.5)

    plt.show()

# Draw the stacked chart for the notebook-level `df` and write it to
# 'cumulative_output_by_category.png' (the same file the previous cell writes),
# using the plot_errors_stacked defined in this cell.
plot_errors_stacked(df)

In [None]:
# Rebind the notebook-level `df` to the "all_err_by_model" results CSV; every
# cell below this point reads this table instead of the one loaded at the top.
# NOTE(review): this path is under '../Analysis_Results/', unlike the active
# path in the first cell -- confirm it still exists.
all_err_save_path = '../Analysis_Results/storage_server/All_models_res/error_analysis/final_results_all_models_all_err_by_model.csv' 
df = pd.read_csv(all_err_save_path)
df

In [None]:
# Display the freshly loaded `df` again (same frame as the previous cell's output).
df

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

def plot_errors_stacked(df):
    """Draw a stacked-area chart of per-model outcome percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``model_name``, ``error_category`` and
        ``error_percentage``. ``error_percentage`` holds strings shaped like
        ``"<mean> ± <spread>"``; only the mean part is plotted.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Notes
    -----
    * ``df`` is not modified; derived columns live on an internal copy.
    * "Other" is 100 minus the sum of the listed categories (floored at 0)
      and is drawn as its own band, so each stack accounts for the full 100 %.
    * Rows whose model is missing from ``model_name_mapping`` are not drawn.
    * If a (model, category) pair occurs more than once, the first row wins.
    """
    # NOTE(review): "IdetatioError" / "SytaxError" are kept verbatim on the
    # assumption that they match the spelling stored in the data -- confirm.
    errors_of_interest = ["CompilationError", "IdetatioError", "SytaxError", "NameError", "Output Mismatch", "Success", "Other"]
    error_title_correction = {
        "CompilationError": "Compilation Error",
        "IdetatioError": "Indentation Error",
        "SytaxError": "Syntax Error",
        "NameError": "Name Error",
        "Output Mismatch": "Tests Failed",
        "Success": "Test Passed",
        "Other": "Other",
    }
    # One colour per entry of errors_of_interest, in the same order:
    # "Output Mismatch" (Tests Failed) is red, "Success" (Test Passed) is green.
    palette = ["#3498db", "#f1c40f", "#34495e", "#9b59b6", "#e74c3c", "#2ecc71", "#95a5a6"]

    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }

    # The x axis walks the mapping in reverse insertion order.
    reversed_model_names = list(model_name_mapping.values())[::-1]
    reversed_model_order = {name: i for i, name in enumerate(reversed_model_names)}

    # `assign` returns a new frame, so the caller's DataFrame stays untouched.
    data = df.assign(
        avg_error_percentage=df['error_percentage'].apply(lambda x: float(x.split(' ± ')[0]))
    )
    # Keep only the listed categories; everything else is folded into "Other".
    data = data[data['error_category'].isin(errors_of_interest[:-1])]
    known_total = data.groupby('model_name')['avg_error_percentage'].sum()
    others = pd.DataFrame({
        'model_name': known_total.index,
        'error_category': 'Other',
        'avg_error_percentage': (100 - known_total).clip(lower=0).to_numpy(),
    })
    data = pd.concat(
        [data[['model_name', 'error_category', 'avg_error_percentage']], others],
        ignore_index=True,
    )

    # Translate raw names into display titles and x positions.
    data = (
        data.assign(
            error_category=data['error_category'].map(error_title_correction),
            model_rank=data['model_name'].map(model_name_mapping).map(reversed_model_order),
        )
        .dropna(subset=['model_rank'])
        .astype({'model_rank': int})
    )

    # rows = x positions, columns = display titles; missing cells count as 0.
    titles = [error_title_correction[error] for error in errors_of_interest]
    stacked = (
        data.pivot_table(index='model_rank', columns='error_category',
                         values='avg_error_percentage', aggfunc='first')
        .reindex(index=range(len(reversed_model_names)), columns=titles)
        .fillna(0)
    )

    sns.set_style("white")
    plt.figure(figsize=(20, 12), dpi=600)

    # Stack bottom-to-top in reverse list order, carrying the running total.
    x_positions = range(len(reversed_model_names))
    prev_sum = np.zeros(len(reversed_model_names))
    for error in reversed(errors_of_interest):
        title = error_title_correction[error]
        current_sum = prev_sum + stacked[title].to_numpy()
        plt.fill_between(x_positions, prev_sum, current_sum, label=title,
                         color=palette[errors_of_interest.index(error)])
        prev_sum = current_sum

    plt.xticks(x_positions, reversed_model_names, rotation=45, ha="right", fontsize=20, fontweight='bold')
    plt.yticks(fontsize=20, fontweight='bold')
    plt.xlabel('Model Name', fontsize=20, fontweight='bold')
    plt.ylabel('Cumulative Error Percentage', fontsize=20, fontweight='bold')
    plt.title('Cumulative Error Analysis by Category', fontsize=23, fontweight='bold')

    # Legend sits below the axes so it does not overlap the rotated model names.
    plt.legend(title='Error Category', loc='upper center', bbox_to_anchor=(0.5, -0.3), fancybox=True, shadow=True, ncol=3, fontsize=20, title_fontsize=20)

    plt.show()
# Draw the stacked chart for the DataFrame currently bound to `df`.
plot_errors_stacked(df)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

def plot_errors_stacked(df):
    """Draw a stacked-area chart of per-model outcome percentages ("hls" colours).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``model_name``, ``error_category`` and
        ``error_percentage`` (strings shaped like ``"<mean> ± <spread>"``;
        only the mean part is plotted).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Notes
    -----
    * Bands are looked up by display title, which is what ``error_category``
      holds after the title mapping; looking them up by raw key matched
      nothing and produced an empty chart.
    * "Other" is 100 minus the sum of the listed categories (floored at 0).
    * ``df`` is not modified; models missing from the mapping are not drawn.
    """
    # NOTE(review): "IdetatioError" / "SytaxError" are kept verbatim on the
    # assumption that they match the spelling stored in the data -- confirm.
    errors_of_interest = ["CompilationError", "IdetatioError", "SytaxError", "NameError", "Output Mismatch", "Success", "Other"]
    error_title_correction = {
        "CompilationError": "Compilation Error",
        "IdetatioError": "Indentation Error",
        "SytaxError": "Syntax Error",
        "NameError": "Name Error",
        "Output Mismatch": "Tests Failed",
        "Success": "Test Passed",
        "Other": "Other",  # without this entry the "Other" rows map to NaN
    }

    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }
    reversed_model_names = list(model_name_mapping.values())[::-1]
    reversed_model_order = {name: i for i, name in enumerate(reversed_model_names)}
    n_models = len(reversed_model_names)

    # Build everything on a fresh frame so the caller's `df` is left alone.
    means = df['error_percentage'].apply(lambda x: float(x.split(' ± ')[0]))
    listed = df.assign(avg_error_percentage=means)
    listed = listed[listed['error_category'].isin(errors_of_interest[:-1])]
    listed = listed[['model_name', 'error_category', 'avg_error_percentage']]

    # Remainder per model, appended as the "Other" category.
    per_model_total = listed.groupby('model_name')['avg_error_percentage'].sum()
    remainder = pd.DataFrame({
        'model_name': per_model_total.index,
        'error_category': 'Other',
        'avg_error_percentage': (100 - per_model_total).clip(lower=0).to_numpy(),
    })
    combined = pd.concat([listed, remainder], ignore_index=True)
    combined = (
        combined.assign(
            error_category=combined['error_category'].map(error_title_correction),
            model_rank=combined['model_name'].map(model_name_mapping).map(reversed_model_order),
        )
        .dropna(subset=['model_rank'])
        .astype({'model_rank': int})
    )

    # Table of x position by display title; absent combinations become 0.
    table = (
        combined.pivot_table(index='model_rank', columns='error_category',
                             values='avg_error_percentage', aggfunc='first')
        .reindex(index=range(n_models),
                 columns=[error_title_correction[e] for e in errors_of_interest])
        .fillna(0)
    )

    sns.set_style("white")
    plt.figure(figsize=(20, 12), dpi=600)
    plt.gca().set_facecolor('white')  # white plot area for contrast

    # One "hls" colour per category, "Other" included.
    palette = sns.color_palette("hls", len(errors_of_interest))

    prev_sum = np.zeros(n_models)
    for error in reversed(errors_of_interest):
        title = error_title_correction[error]
        current_sum = prev_sum + table[title].to_numpy()
        # Legend entries use the display title rather than the raw key.
        plt.fill_between(range(n_models), prev_sum, current_sum, label=title,
                         color=palette[errors_of_interest.index(error)])
        prev_sum = current_sum

    plt.xticks(range(n_models), reversed_model_names, rotation=45, ha="right", fontsize=20, fontweight='bold')
    plt.yticks(fontsize=20, fontweight='bold')
    plt.xlabel('Model Name', fontsize=20, fontweight='bold')
    plt.ylabel('Cumulative Error Percentage', fontsize=20, fontweight='bold')
    plt.title('Cumulative Error Analysis by Category', fontsize=23, fontweight='bold')

    plt.legend(title='Error Category', loc='upper center', bbox_to_anchor=(0.5, -0.2), fancybox=True, shadow=True, ncol=3, fontsize=20, title_fontsize=20)

    plt.show()

# Draw the "hls"-coloured stacked chart for the DataFrame currently bound to `df`.
plot_errors_stacked(df)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

def plot_errors_stacked(df):
    """Draw a stacked-area chart of per-model outcome percentages.

    "Test Passed" is forced to green; the other bands use the "hls" palette.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``model_name``, ``error_category`` and
        ``error_percentage`` (strings shaped like ``"<mean> ± <spread>"``;
        only the mean part is plotted).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Notes
    -----
    * Bands are selected by display title. Selecting by raw key after the
      title mapping matched only "Other", so nothing else was drawn.
    * "Other" is 100 minus the sum of the listed categories (floored at 0).
    * ``df`` is not modified; models missing from the mapping are not drawn.
    """
    errors_of_interest = ["CompilationError", "IndentationError", "SyntaxError", "NameError", "Output Mismatch", "Success"]
    error_title_correction = {
        "CompilationError": "Compilation Error",
        "IndentationError": "Indentation Error",
        "SyntaxError": "Syntax Error",
        "NameError": "Name Error",
        "Output Mismatch": "Tests Failed",
        "Success": "Test Passed",
    }
    # NOTE(review): sibling cells read this same frame with the keys
    # "IdetatioError" / "SytaxError"; presumably that is the spelling stored
    # in the data, so both spellings are accepted here -- confirm.
    raw_to_title = dict(error_title_correction,
                        IdetatioError="Indentation Error",
                        SytaxError="Syntax Error")

    # "Other" joins the stack as the last list entry.
    errors_of_interest.append("Other")
    error_title_correction["Other"] = "Other"
    titles = [error_title_correction[error] for error in errors_of_interest]

    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }
    reversed_model_names = list(model_name_mapping.values())[::-1]
    reversed_model_order = {name: i for i, name in enumerate(reversed_model_names)}
    x_positions = range(len(reversed_model_names))

    # Parse the mean and translate raw categories on a new frame; rows whose
    # category is not listed get a NaN title and are dropped here.
    data = df.assign(
        avg_error_percentage=df['error_percentage'].apply(lambda x: float(x.split(' ± ')[0])),
        error_category=df['error_category'].map(raw_to_title),
    ).dropna(subset=['error_category'])
    data = data[['model_name', 'error_category', 'avg_error_percentage']]

    # "Other" = share of each model not covered by the listed categories.
    covered = data.groupby('model_name')['avg_error_percentage'].sum()
    other_rows = pd.DataFrame({
        'model_name': covered.index,
        'error_category': 'Other',
        'avg_error_percentage': (100 - covered).clip(lower=0).to_numpy(),
    })
    data = pd.concat([data, other_rows], ignore_index=True)
    data = (
        data.assign(model_rank=data['model_name'].map(model_name_mapping).map(reversed_model_order))
        .dropna(subset=['model_rank'])
        .astype({'model_rank': int})
    )

    # x position by display title, zero where a model lacks a category.
    table = (
        data.pivot_table(index='model_rank', columns='error_category',
                         values='avg_error_percentage', aggfunc='first')
        .reindex(index=x_positions, columns=titles)
        .fillna(0)
    )

    sns.set_style("white")
    palette = sns.color_palette("hls", len(errors_of_interest))
    # Override the palette slot of "Success" so "Test Passed" is green.
    palette[errors_of_interest.index("Success")] = (0.4, 0.8, 0.4)

    plt.figure(figsize=(20, 12), dpi=600)

    # Stacked areas without error bars, built from a running total.
    prev_sum = np.zeros(len(reversed_model_names))
    for error in reversed(errors_of_interest):
        title = error_title_correction[error]
        current_sum = prev_sum + table[title].to_numpy()
        plt.fill_between(x_positions, prev_sum, current_sum, label=title,
                         color=palette[errors_of_interest.index(error)])
        prev_sum = current_sum

    plt.xticks(x_positions, reversed_model_names, rotation=45, ha="right", fontsize=20, fontweight='bold')
    plt.yticks(fontsize=20, fontweight='bold')
    plt.xlabel('Model Name', fontsize=20, fontweight='bold')
    plt.ylabel('Cumulative Error Percentage', fontsize=20, fontweight='bold')
    plt.title('Cumulative Error Analysis by Category', fontsize=23, fontweight='bold')
    plt.legend(title='Error Category', loc='upper center', bbox_to_anchor=(0.5, -0.2), fancybox=True, shadow=True, ncol=3, fontsize=20, title_fontsize=20)

    plt.show()
# Draw the stacked chart (green "Test Passed" variant) for the DataFrame currently bound to `df`.
plot_errors_stacked(df)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

def plot_errors_stacked(df):
    """Draw a stacked-area chart of per-model outcome percentages.

    Colours come from a rotated "hls" palette; the rotation puts the green
    hue on "Test Passed".

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``model_name``, ``error_category`` and
        ``error_percentage`` (strings shaped like ``"<mean> ± <spread>"``;
        only the mean part is plotted).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Notes
    -----
    * The palette is passed to every band; it used to be built and then
      ignored, so matplotlib's default colours were drawn.
    * "Others" is 100 minus the sum of the listed categories (floored at 0)
      and is drawn as its own band.
    * ``df`` is not modified; models missing from the mapping are not drawn.
    """
    # NOTE(review): "IdetatioError" / "SytaxError" are kept verbatim on the
    # assumption that they match the spelling stored in the data -- confirm.
    errors_of_interest = ["CompilationError", "IdetatioError", "SytaxError", "NameError", "Output Mismatch", "Success", "Others"]
    error_title_correction = {
        "CompilationError": "Compilation Error",
        "IdetatioError": "Indentation Error",
        "SytaxError": "Syntax Error",
        "NameError": "Name Error",
        "Output Mismatch": "Tests Failed",
        "Success": "Test Passed",
        "Others": "Others",
    }

    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }
    reversed_model_names = list(model_name_mapping.values())[::-1]
    reversed_model_order = {name: i for i, name in enumerate(reversed_model_names)}

    # Derived columns go on a new frame; the caller's `df` is left as is.
    rows = df.assign(
        avg_error_percentage=df['error_percentage'].apply(lambda x: float(x.split(' ± ')[0]))
    )
    rows = rows.loc[rows['error_category'].isin(errors_of_interest[:-1]),
                    ['model_name', 'error_category', 'avg_error_percentage']]

    # "Others" = what the listed categories leave uncovered, per model.
    listed_total = rows.groupby('model_name')['avg_error_percentage'].sum()
    others_df = pd.DataFrame({
        'model_name': listed_total.index,
        'error_category': 'Others',
        'avg_error_percentage': (100 - listed_total).clip(lower=0).to_numpy(),
    })
    rows = pd.concat([rows, others_df], ignore_index=True)
    rows = (
        rows.assign(
            error_category=rows['error_category'].map(error_title_correction),
            model_rank=rows['model_name'].map(model_name_mapping).map(reversed_model_order),
        )
        .dropna(subset=['model_rank'])
        .astype({'model_rank': int})
    )

    # x position by display title, zero-filled where data is missing.
    table = (
        rows.pivot_table(index='model_rank', columns='error_category',
                         values='avg_error_percentage', aggfunc='first')
        .reindex(index=range(len(reversed_model_names)),
                 columns=[error_title_correction[e] for e in errors_of_interest])
        .fillna(0)
    )

    sns.set_style("white")
    palette = list(sns.color_palette("hls", len(errors_of_interest)))

    # Rotate so the hue at index 3 (the one the original code treated as
    # green) lands on the slot of "Success".
    green_index = 3
    shift = (green_index - errors_of_interest.index("Success")) % len(palette)
    palette = palette[shift:] + palette[:shift]

    plt.figure(figsize=(20, 12), dpi=600)

    # Stack bottom-to-top in reverse list order, carrying the running total.
    prev_sum = np.zeros(len(reversed_model_names))
    for error in reversed(errors_of_interest):
        title = error_title_correction[error]
        current_sum = prev_sum + table[title].to_numpy()
        plt.fill_between(range(len(reversed_model_names)), prev_sum, current_sum,
                         label=title, color=palette[errors_of_interest.index(error)])
        prev_sum = current_sum

    plt.xticks(range(len(reversed_model_names)), reversed_model_names, rotation=45, ha="right", fontsize=20, fontweight='bold')
    plt.yticks(fontsize=20, fontweight='bold')
    plt.xlabel('Model Name', fontsize=20, fontweight='bold')
    plt.ylabel('Cumulative Error Percentage', fontsize=20, fontweight='bold')
    plt.title('Cumulative Error Analysis by Category', fontsize=23, fontweight='bold')

    # Legend sits below the axes.
    plt.legend(title='Error Category', loc='upper center', bbox_to_anchor=(0.5, -0.3), fancybox=True, shadow=True, ncol=3, fontsize=20, title_fontsize=20)

    plt.tight_layout()
    plt.show()

# Draw the stacked chart (rotated-palette variant) for the DataFrame currently bound to `df`.
plot_errors_stacked(df)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.patches as mpatches
import re

# Results with post-processing (per the file name): path first, then the frame.
all_w_post_process = '../Analysis_Results/storage_server/All_models_res/post_process/all_post.csv'
df_post = pd.read_csv(all_w_post_process)

# Results without post-processing (per the file name).
all_no_post_process = '../Analysis_Results/storage_server/All_models_res/post_process/all_no_post.csv'
df_no_post = pd.read_csv(all_no_post_process)

# Bare last expression so the notebook renders the post-processing frame.
df_post

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.patches as mpatches

def plot_all_pass_ratio(df):
    """Plot one error-bar marker per model for the "All Pass Ratio (%)" column.

    Reads ``Model Name``, ``All Pass Ratio (%)`` and ``Error Bar`` from ``df``
    and writes a ``model_display_name`` column back onto ``df``. Rows are
    grouped by display name, so rows whose model is not in the mapping (NaN
    display name) are left out by the grouping.
    """
    sns.set_style("whitegrid")

    # Raw model identifier -> label used on the chart.
    display_names = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }
    df['model_display_name'] = df['Model Name'].map(display_names)

    # One fixed "husl" colour per display label, in mapping order.
    colours = dict(zip(display_names.values(),
                       sns.color_palette("husl", len(display_names))))

    plt.figure(figsize=(10, 8))
    for label, rows in df.groupby('model_display_name'):
        plt.errorbar(
            rows['model_display_name'],
            rows['All Pass Ratio (%)'],
            yerr=rows['Error Bar'],
            fmt='o',
            color=colours[label],
            label=label,
        )

    plt.ylabel('All Pass Ratio (%)', fontsize=14)
    plt.title('All Pass Ratio (%) by Model with Error Bars', fontsize=16)
    plt.xticks(rotation=45, ha="right", fontsize=12)

    # The legend lists every mapped model as a colour patch.
    handles = []
    for label in display_names.values():
        handles.append(mpatches.Patch(color=colours[label], label=label))
    plt.legend(handles=handles, title="Model Names", loc='upper right', fancybox=True, shadow=True, fontsize=12, title_fontsize=14)

    plt.tight_layout()
    plt.show()

# NOTE(review): the last visible assignment to `df` loads the error-analysis CSV, while
# plot_all_pass_ratio reads 'Model Name', 'All Pass Ratio (%)' and 'Error Bar' -- presumably
# df_post or df_no_post was intended here; confirm before relying on this cell.
plot_all_pass_ratio(df)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.patches as mpatches

def plot_all_pass_ratio_bar_chart(df):
    """Bar chart of "All Pass Ratio (%)" per model with manual error bars.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``Model Name``, ``All Pass Ratio (%)`` and ``Error Bar``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Notes
    -----
    * Bars keep the order in which models first appear in ``df``.
    * Each model has one fixed colour taken from the mapping order, so bars
      and legend patches always agree, whatever the row order of ``df``.
    * Error bars are placed at the bar position of the row's model, not at
      the row's index label.
    * ``df`` is not modified; rows whose model is not in the mapping are
      skipped.
    """
    sns.set_style("whitegrid")

    # Raw model identifier -> label used on the chart.
    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }

    # Fixed colour per display name, shared by bars and legend.
    palette = sns.color_palette("husl", len(model_name_mapping))
    model_color_map = dict(zip(model_name_mapping.values(), palette))

    # `assign` returns a new frame, so the caller's DataFrame stays untouched.
    data = df.assign(model_display_name=df['Model Name'].map(model_name_mapping))
    data = data.dropna(subset=['model_display_name'])
    # Bar order = first appearance in the data.
    bar_order = list(data['model_display_name'].unique())

    fig, ax = plt.subplots(figsize=(10, 8))

    sns.barplot(x='model_display_name', y='All Pass Ratio (%)', data=data,
                palette=[model_color_map[name] for name in bar_order],
                ci=None, ax=ax, order=bar_order)

    # Place each error bar on the bar of the row's model.
    for _, row in data.iterrows():
        ax.errorbar(x=bar_order.index(row['model_display_name']), y=row['All Pass Ratio (%)'],
                    yerr=row['Error Bar'], fmt='none', color='black', capsize=5)

    ax.set_ylabel('All Pass Ratio (%)', fontsize=14)
    ax.set_title('All Pass Ratio (%) by Model with Error Bars', fontsize=16)
    # Pin the tick positions before labelling them.
    ax.set_xticks(range(len(bar_order)))
    ax.set_xticklabels(bar_order, rotation=45, ha="right", fontsize=12)

    legend_handles = [mpatches.Patch(color=color, label=display_name)
                      for display_name, color in model_color_map.items()]

    # Figure-level legend placed below the axes.
    fig.legend(handles=legend_handles, title="Model Names", loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=5, fontsize=20, title_fontsize=20)

    plt.tight_layout()
    plt.show()

# NOTE(review): the last visible assignment to `df` loads the error-analysis CSV, while
# plot_all_pass_ratio_bar_chart reads 'Model Name', 'All Pass Ratio (%)' and 'Error Bar' --
# presumably df_post or df_no_post was intended here; confirm before relying on this cell.
plot_all_pass_ratio_bar_chart(df)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.patches as mpatches

# Specific order for the models
model_order = [
    "gpt-4-turbo",
    "claude-3-opus-20240229",
    "claude-3-sonnet-20240229",
    "claude-3-haiku-20240307",
    "Meta-Llama-3-70B-Instruct",
    "gpt-3.5-turbo-0125",
    "Meta-Llama-3-8B-Instruct",
    "deepseek-coder-7b-instruct",
    "deepseek-coder-1.3b-instruct",
    "phi-3-mini-4k",
]

def plot_all_pass_ratio_bar_chart_ordered(df, model_order):
    """Bar chart of "All Pass Ratio (%)" with bars arranged as in ``model_order``.

    Reads ``Model Name``, ``All Pass Ratio (%)`` and ``Error Bar`` from ``df``
    and writes a ``model_display_name`` column back onto ``df``. Every entry
    of ``model_order`` must be a key of the internal name mapping. Error bars
    are drawn only for rows whose model appears in ``model_order``.
    """
    sns.set_style("whitegrid")

    # Raw model identifier -> label used on the chart.
    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }
    df['model_display_name'] = df['Model Name'].map(model_name_mapping)

    # One "husl" colour per mapped model; consumed positionally below.
    palette = sns.color_palette("husl", len(model_name_mapping))

    fig, ax = plt.subplots(figsize=(10, 8))

    # Display labels in the requested order; reused for bars and tick labels.
    ordered_labels = [model_name_mapping[model] for model in model_order]
    sns.barplot(
        x='model_display_name',
        y='All Pass Ratio (%)',
        data=df,
        palette=palette,
        ci=None,
        ax=ax,
        order=ordered_labels,
    )

    # Error bars: the x position is the model's slot in model_order.
    selected = df[df['Model Name'].isin(model_order)]
    for raw_name, ratio, spread in zip(selected['Model Name'],
                                       selected['All Pass Ratio (%)'],
                                       selected['Error Bar']):
        ax.errorbar(x=model_order.index(raw_name), y=ratio, yerr=spread,
                    fmt='none', color='black', capsize=5)

    ax.set_ylabel('All Pass Ratio (%)', fontsize=14)
    ax.set_title('All Pass Ratio (%) by Model with Error Bars', fontsize=16)
    ax.set_xticklabels(ordered_labels, rotation=45, ha="right", fontsize=12)

    # Legend patches pair palette colours with labels in model_order sequence.
    legend_handles = []
    for label, color in zip(ordered_labels, palette):
        legend_handles.append(mpatches.Patch(color=color, label=label))

    # Figure-level legend placed below the axes.
    fig.legend(handles=legend_handles, title="Model Names", loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=5, fontsize=12, title_fontsize=14)

    plt.tight_layout()
    plt.show()

# Plot the frame read from all_no_post.csv, with bars arranged as in `model_order`.
plot_all_pass_ratio_bar_chart_ordered(df_no_post, model_order)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.patches as mpatches
import numpy as np

# Specific order for the models
model_order = [
    "gpt-4-turbo",
    "claude-3-opus-20240229",
    "claude-3-sonnet-20240229",
    "claude-3-haiku-20240307",
    "Meta-Llama-3-70B-Instruct",
    "gpt-3.5-turbo-0125",
    "Meta-Llama-3-8B-Instruct",
    "deepseek-coder-7b-instruct",
    "deepseek-coder-1.3b-instruct",
    "phi-3-mini-4k",
]

def plot_all_pass_ratio_side_by_side(df_no_post, df_post, model_order):
    """Grouped bar chart comparing "All Pass Ratio (%)" with and without post-processing.

    Parameters
    ----------
    df_no_post, df_post : pandas.DataFrame
        Each must provide ``Model Name`` and ``All Pass Ratio (%)``.
    model_order : list of str
        Raw model names in the order the bar groups should appear; every
        entry must be a key of the internal name mapping.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Notes
    -----
    * Neither input frame is modified; the helper columns exist only on the
      combined frame.
    * The combined frame gets a fresh index, so the two inputs do not
      contribute duplicate index labels.
    * seaborn's automatic hue legend is removed, so only the figure-level
      legend below the axes remains.
    """
    sns.set_style("whitegrid")

    # Raw model identifier -> label used on the chart.
    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }

    # Tag each frame with its condition on a copy, then stack them.
    df_combined = pd.concat(
        [
            df_no_post.assign(
                model_display_name=df_no_post['Model Name'].map(model_name_mapping),
                Type='No Post-Processing',
            ),
            df_post.assign(
                model_display_name=df_post['Model Name'].map(model_name_mapping),
                Type='With Post-Processing',
            ),
        ],
        ignore_index=True,
    )

    # Two "husl" colours, keyed by the values of the 'Type' column.
    lighter_color, darker_color = sns.color_palette("husl", 2)
    combined_palette = {'No Post-Processing': lighter_color, 'With Post-Processing': darker_color}

    fig, ax = plt.subplots(figsize=(12, 9), dpi=400)

    display_order = [model_name_mapping[model] for model in model_order]
    sns.barplot(x='model_display_name', y='All Pass Ratio (%)', hue='Type', data=df_combined,
                palette=combined_palette, hue_order=list(combined_palette),
                ci=None, ax=ax, order=display_order)

    # Drop the automatic hue legend; the figure-level legend replaces it.
    auto_legend = ax.get_legend()
    if auto_legend is not None:
        auto_legend.remove()

    ax.set_ylabel('All Pass Ratio (%)', fontsize=14)
    ax.set_title('All Pass Ratio (%) by Model with and without Post Processing', fontsize=16)
    # Pin the tick positions before labelling them.
    ax.set_xticks(range(len(display_order)))
    ax.set_xticklabels(display_order, rotation=45, ha="right", fontsize=12)

    legend_handles = [mpatches.Patch(color=color, label=label) for label, color in combined_palette.items()]

    # Figure-level legend placed below the axes.
    fig.legend(handles=legend_handles, title="Types", loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=5, fontsize=12, title_fontsize=14)

    plt.tight_layout()
    plt.show()

# Compare the frames read from all_no_post.csv and all_post.csv, grouped per model in `model_order`.
plot_all_pass_ratio_side_by_side(df_no_post, df_post, model_order)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.patches as mpatches
import numpy as np

# Specific order for the models
model_order = [
    "gpt-4-turbo",
    "claude-3-opus-20240229",
    "claude-3-sonnet-20240229",
    "claude-3-haiku-20240307",
    "Meta-Llama-3-70B-Instruct",
    "gpt-3.5-turbo-0125",
    "Meta-Llama-3-8B-Instruct",
    "deepseek-coder-7b-instruct",
    "deepseek-coder-1.3b-instruct",
    "phi-3-mini-4k",
]

def plot_all_pass_ratio_side_by_side(df_no_post, df_post, model_order):
    """Draw one bar per (model, condition) pair and overlay per-row error bars.

    Bars follow ``model_order``; within each model the 'No Post' bar comes first
    and the 'Post' bar second. A single ordered list of labels drives the bar
    positions, the error-bar x coordinates and the tick labels, so the three
    always line up regardless of the row order of the input frames.

    Args:
        df_no_post: DataFrame of results without post-processing. Must contain
            the columns 'Model Name', 'All Pass Ratio (%)' and 'Error Bar'.
        df_post: DataFrame of results with post-processing; same columns.
        model_order: Raw model names (values of 'Model Name') in display order.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Note:
        The input frames are not modified; helper columns are added to copies.
        Rows whose model is not listed in ``model_order`` are left off the chart.
    """
    sns.set_style("whitegrid")

    # Original model names and their simplified display names
    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }
    conditions = ('No Post', 'Post')

    # .assign() returns new frames, so the caller's DataFrames keep their columns.
    df_combined = pd.concat(
        [df_no_post.assign(Condition='No Post'), df_post.assign(Condition='Post')],
        ignore_index=True,
    )
    # Names missing from the mapping fall back to the raw name instead of NaN.
    display_names = df_combined['Model Name'].map(model_name_mapping).fillna(df_combined['Model Name'])
    df_combined['model_display_name'] = display_names + ' - ' + df_combined['Condition']

    # One base hue per model; each condition gets its own shade of that hue.
    base_palette = sns.color_palette("husl", len(model_order))
    color_mapping = {}
    for base_color, model in zip(base_palette, model_order):
        display_name = model_name_mapping.get(model, model)
        shades = {
            # Middle step of seaborn's light-to-colour ramp.
            'No Post': sns.light_palette(base_color, n_colors=3)[1],
            # Last step of seaborn's dark-to-colour ramp.
            'Post': sns.dark_palette(base_color, n_colors=3)[2],
        }
        for condition in conditions:
            color_mapping[f"{display_name} - {condition}"] = shades[condition]

    # Single source of truth for bar positions, error bars and tick labels.
    bar_order = list(color_mapping)
    bar_position = {label: position for position, label in enumerate(bar_order)}

    fig, ax = plt.subplots(figsize=(12, 9), dpi=400)
    sns.barplot(
        x='model_display_name',
        y='All Pass Ratio (%)',
        data=df_combined,
        palette=color_mapping,
        order=bar_order,
        ci=None,
        ax=ax,
    )

    # Error bars are drawn only for rows that have a bar on the chart, at the
    # x position of that bar.
    plotted = df_combined[df_combined['model_display_name'].isin(bar_order)]
    if not plotted.empty:
        ax.errorbar(
            x=plotted['model_display_name'].map(bar_position).to_numpy(),
            y=plotted['All Pass Ratio (%)'].to_numpy(),
            yerr=plotted['Error Bar'].to_numpy(),
            fmt='none',
            color='black',
            capsize=5,
        )

    ax.set_ylabel('All Pass Ratio (%)', fontsize=14)
    ax.set_title('All Pass Ratio (%) by Model with Error Bars', fontsize=16)
    # Pin the tick positions before labelling them so label i belongs to bar i.
    ax.set_xticks(range(len(bar_order)))
    ax.set_xticklabels(bar_order, rotation=45, ha="right", fontsize=12)

    plt.tight_layout()
    plt.show()

# Render the error-bar chart from the frames and model order held in the
# notebook namespace.
plot_all_pass_ratio_side_by_side(
    df_no_post=df_no_post,
    df_post=df_post,
    model_order=model_order,
)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.patches as mpatches
import numpy as np

# Raw model identifiers, listed in the left-to-right order used for plotting.
model_order = [
    "gpt-4-turbo",
    "claude-3-opus-20240229",
    "claude-3-sonnet-20240229",
    "claude-3-haiku-20240307",
    "Meta-Llama-3-70B-Instruct",
    "gpt-3.5-turbo-0125",
    "Meta-Llama-3-8B-Instruct",
    "deepseek-coder-7b-instruct",
    "deepseek-coder-1.3b-instruct",
    "phi-3-mini-4k",
]

def plot_all_pass_ratio_side_by_side(df_no_post, df_post, model_order):
    """Plot grouped bars comparing pass ratios with and without post-processing.

    For every model in ``model_order`` two adjacent bars are drawn, one per
    'Type' ('No Post' and 'Post'). A single figure-level legend titled "Types"
    is placed below the axes.

    Args:
        df_no_post: DataFrame of results without post-processing. Must contain
            the columns 'Model Name' and 'All Pass Ratio (%)'.
        df_post: DataFrame of results with post-processing; same columns.
        model_order: Raw model names in display order. Every entry must be a
            key of the internal display-name mapping, otherwise ``KeyError``
            is raised.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Note:
        The input frames are not modified; helper columns are added to copies.
    """
    sns.set_style("whitegrid")

    # Original model names and their simplified display names
    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }

    def _with_plot_columns(frame, type_label):
        # .assign() returns a new frame, leaving the caller's DataFrame untouched.
        return frame.assign(
            model_display_name=frame['Model Name'].map(model_name_mapping),
            Type=type_label,
        )

    df_combined = pd.concat([
        _with_plot_columns(df_no_post, 'No Post'),
        _with_plot_columns(df_post, 'Post'),
    ])

    # Two hues only: one per 'Type' value.
    lighter_color, darker_color = sns.color_palette("husl", 2)
    combined_palette = {'No Post': lighter_color, 'Post': darker_color}

    fig, ax = plt.subplots(figsize=(12, 9), dpi=400)

    # Side-by-side bars in the requested model order.
    sns.barplot(
        x='model_display_name',
        y='All Pass Ratio (%)',
        hue='Type',
        data=df_combined,
        palette=combined_palette,
        ci=None,
        ax=ax,
        order=[model_name_mapping[model] for model in model_order],
    )

    ax.set_ylabel('All Pass Ratio (%)', fontsize=14)
    ax.set_title('All Pass Ratio (%) by Model with and without Post Processing', fontsize=16)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=12)

    # Reuse the handles seaborn attached to the axes: barplot desaturates the
    # palette by default, so patches built from the raw palette colours would
    # not match the bars.
    handles, labels = ax.get_legend_handles_labels()

    # Drop the per-axes legend so the figure-level legend is the only one shown.
    axes_legend = ax.get_legend()
    if axes_legend is not None:
        axes_legend.remove()

    fig.legend(handles=handles, labels=labels, title="Types", loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=5, fontsize=12, title_fontsize=14)

    plt.tight_layout()
    plt.show()

# Render the grouped-bar chart from the frames and model order held in the
# notebook namespace.
plot_all_pass_ratio_side_by_side(
    df_no_post=df_no_post,
    df_post=df_post,
    model_order=model_order,
)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.patches as mpatches
import re

# Post-processing result CSVs: each path is followed by the frame read from it.
# Combined results
all_w_post_process = '../Analysis_Results/storage_server/All_models_res/post_process/all_post.csv'
df_post = pd.read_csv(all_w_post_process)
all_no_post_process = '../Analysis_Results/storage_server/All_models_res/post_process/all_no_post.csv'
df_no_post = pd.read_csv(all_no_post_process)

# Python-only results
python_w_post_process = '../Analysis_Results/storage_server/All_models_res/post_process/python_res_with_post.csv'
python_w_post = pd.read_csv(python_w_post_process)
python_no_post_process = '../Analysis_Results/storage_server/All_models_res/post_process/python_res_no_post.csv'
python_no_post = pd.read_csv(python_no_post_process)

# Java-only results
java_w_post_process = '../Analysis_Results/storage_server/All_models_res/post_process/java_res_with_post.csv'
java_w_post = pd.read_csv(java_w_post_process)
java_no_post_process = '../Analysis_Results/storage_server/All_models_res/post_process/java_res_no_post.csv'
java_no_post = pd.read_csv(java_no_post_process)

# Show the four per-language frames, in the same order as before.
display(python_w_post, python_no_post, java_w_post, java_no_post)

In [None]:
# Raw model identifiers, listed in the left-to-right order used for plotting.
model_order = [
    "gpt-4-turbo",
    "claude-3-opus-20240229",
    "claude-3-sonnet-20240229",
    "claude-3-haiku-20240307",
    "Meta-Llama-3-70B-Instruct",
    "gpt-3.5-turbo-0125",
    "Meta-Llama-3-8B-Instruct",
    "deepseek-coder-7b-instruct",
    "deepseek-coder-1.3b-instruct",
    "phi-3-mini-4k",
]

def plot_all_pass_ratio_side_by_side(df_no_post, df_post, model_order, program_type):
    """Plot and save grouped bars of pass rate with vs. without post-processing.

    For every model in ``model_order`` two adjacent bars are drawn, one per
    'Type' ('No Post-Processing' and 'With Post-Processing'). The legend is
    placed below the axes and the figure is written to
    ``'{program_type}_pass_rate_comparison.png'`` in the working directory.

    Args:
        df_no_post: DataFrame of results without post-processing. Must contain
            the columns 'Model Name' and 'All Pass Ratio (%)'.
        df_post: DataFrame of results with post-processing; same columns.
        model_order: Raw model names in display order. Every entry must be a
            key of the internal display-name mapping, otherwise ``KeyError``
            is raised.
        program_type: Label used in the plot title and in the output filename.

    Returns:
        None. The figure is saved and then displayed with ``plt.show()``.

    Note:
        The input frames are not modified; helper columns are added to copies.
    """
    sns.set_style("whitegrid")

    # Original model names and their simplified display names
    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }

    def _with_plot_columns(frame, type_label):
        # .assign() returns a new frame, leaving the caller's DataFrame untouched.
        return frame.assign(
            model_display_name=frame['Model Name'].map(model_name_mapping),
            Type=type_label,
        )

    df_combined = pd.concat([
        _with_plot_columns(df_no_post, 'No Post-Processing'),
        _with_plot_columns(df_post, 'With Post-Processing'),
    ])

    # Two hues only: one per 'Type' value.
    no_post_color, with_post_color = sns.color_palette("husl", 2)
    combined_palette = {'No Post-Processing': no_post_color, 'With Post-Processing': with_post_color}

    # Create a figure for the plot with increased resolution
    fig, ax = plt.subplots(figsize=(12, 9), dpi=400)

    # Side-by-side bars in the requested model order.
    sns.barplot(
        x='model_display_name',
        y='All Pass Ratio (%)',
        hue='Type',
        data=df_combined,
        palette=combined_palette,
        ci=None,
        ax=ax,
        order=[model_name_mapping[model] for model in model_order],
    )

    ax.set_title(f'{program_type} - Pass Rate with vs. without Post-Processing', fontsize=23)
    ax.set_ylabel('Percentage', fontsize=20)
    ax.set_xlabel('')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=15)

    # ax.legend() replaces the legend seaborn attached to the axes, so no
    # separate removal step is needed; the new legend sits below the plot area.
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), fancybox=True, shadow=True, ncol=3)

    plt.tight_layout()

    # bbox_inches='tight' keeps the legend below the axes inside the saved image.
    fig.savefig(f'{program_type}_pass_rate_comparison.png', bbox_inches='tight')
    plt.show()

# Python results: chart with vs. without post-processing.
plot_all_pass_ratio_side_by_side(
    df_no_post=python_no_post,
    df_post=python_w_post,
    model_order=model_order,
    program_type="Python",
)

In [None]:
# Java results: chart with vs. without post-processing.
plot_all_pass_ratio_side_by_side(
    df_no_post=java_no_post,
    df_post=java_w_post,
    model_order=model_order,
    program_type="Java",
)

In [None]:
# Each language name maps to its (no post-processing, with post-processing) frames.
df_dict = dict(
    Python=(python_no_post, python_w_post),
    Java=(java_no_post, java_w_post),
)

def plot_pass_ratio_for_program_types_combined(df_dict, model_order):
    """Plot one stacked panel per program type comparing pass rates.

    Each panel shows, for every model in ``model_order``, two adjacent bars:
    'No Post-Processing' and 'With Post-Processing'. Only the bottom panel
    carries a legend. The figure is saved to
    ``'<types joined by " & ">_pass_rate_comparison.png'`` in the working
    directory (``'Python & Java_pass_rate_comparison.png'`` for a dict with the
    keys 'Python' and 'Java', in that order).

    Args:
        df_dict: Mapping of program-type name to a
            ``(df_no_post, df_post)`` tuple of DataFrames. Each frame must
            contain the columns 'Model Name' and 'All Pass Ratio (%)'. Panels
            are drawn in the mapping's iteration order.
        model_order: Raw model names in display order. Every entry must be a
            key of the internal display-name mapping, otherwise ``KeyError``
            is raised.

    Returns:
        None. The figure is saved and then displayed with ``plt.show()``.

    Raises:
        ValueError: If ``df_dict`` is empty.

    Note:
        The input frames are not modified; helper columns are added to copies.
    """
    if not df_dict:
        raise ValueError("df_dict must contain at least one program type")

    sns.set_style("whitegrid")

    # Panels follow the keys of df_dict rather than a hard-coded list.
    program_types = list(df_dict)

    # Original model names and their simplified display names
    model_name_mapping = {
        "gpt-4-turbo": "GPT-4 Turbo",
        "claude-3-opus-20240229": "Claude-3 Opus",
        "claude-3-sonnet-20240229": "Claude-3 Sonnet",
        "claude-3-haiku-20240307": "Claude-3 Haiku",
        "Meta-Llama-3-70B-Instruct": "Meta-Llama-3 70B",
        "gpt-3.5-turbo-0125": "GPT-3.5 Turbo",
        "Meta-Llama-3-8B-Instruct": "Meta-Llama-3 8B",
        "deepseek-coder-7b-instruct": "DeepSeek Coder 7B",
        "deepseek-coder-1.3b-instruct": "DeepSeek Coder 1.3B",
        "phi-3-mini-4k": "Phi-3 Mini 4K"
    }
    display_order = [model_name_mapping[model] for model in model_order]

    # Two hues only, shared by every panel.
    palette = sns.color_palette("husl", 2)
    combined_palette = {'No Post-Processing': palette[0], 'With Post-Processing': palette[1]}

    def _with_plot_columns(frame, type_label):
        # .assign() returns a new frame, leaving the caller's DataFrame untouched.
        return frame.assign(
            model_display_name=frame['Model Name'].map(model_name_mapping),
            Type=type_label,
        )

    # squeeze=False always yields a 2-D array of axes, so a single program type
    # works too; 9 inches of height per panel gives 18 for the two-panel case.
    panel_count = len(program_types)
    fig, axs = plt.subplots(panel_count, 1, figsize=(12, 9 * panel_count), dpi=400, squeeze=False)
    axs = axs.ravel()

    for i, program_type in enumerate(program_types):
        df_no_post, df_post = df_dict[program_type]
        df_combined = pd.concat([
            _with_plot_columns(df_no_post, 'No Post-Processing'),
            _with_plot_columns(df_post, 'With Post-Processing'),
        ])

        ax = axs[i]
        # Side-by-side bars in the requested model order.
        sns.barplot(
            x='model_display_name',
            y='All Pass Ratio (%)',
            hue='Type',
            data=df_combined,
            palette=combined_palette,
            ci=None,
            ax=ax,
            order=display_order,
        )

        ax.set_title(f'{program_type} - Pass Rate with vs. without Post-Processing', fontsize=16)
        ax.set_ylabel('Percentage', fontsize=14)
        ax.set_xlabel('Model Name', fontsize=14)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", fontsize=12)

        # Keep a legend on the bottom panel only.
        panel_legend = ax.get_legend()
        if i < panel_count - 1 and panel_legend is not None:
            panel_legend.remove()

    # Re-create the bottom legend from the axes' own handles: barplot desaturates
    # the palette by default, so patches built from the raw palette colours
    # would not match the bars.
    axs[-1].legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), fancybox=True, shadow=True, ncol=5, fontsize=12)

    plt.tight_layout()
    # Filename lists the plotted program types.
    fig.savefig(f"{' & '.join(program_types)}_pass_rate_comparison.png")
    plt.show()

# Combined figure with one panel per entry of df_dict.
plot_pass_ratio_for_program_types_combined(
    df_dict=df_dict,
    model_order=model_order,
)