In [2]:
# Jupyter Notebook Code
from code_gen_result_analysis import CodeAnalysis, cal_err_bar, bootstrap_resampling  # Assuming code_analysis.py is the name of the file
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Function to style the DataFrame
def highlight_cols(df):
     # We will create a custom style for the headers and vertical lines
     styles = [
               dict(selector="th.col_heading",
                    props=[("text-align", "center"),
                              ("border-right", "2px solid #6c6c6c"),
                              ("border-left", "2px solid #6c6c6c")]),
               dict(selector="th.col_heading.level0",
                    props=[("border-top", "2px solid #6c6c6c")]),
               dict(selector="th",
                    props=[("font-size", "12px")]),
               dict(selector="td",
                    props=[("text-align", "center")]),
               # Customize the boundary for your specific DataFrame structure
               # Adjust "4" and "10" according to your DataFrame's column indices
               dict(selector=f"th:nth-child(5), td:nth-child(5)",
                    props=[("border-left", "2px solid #6c6c6c")]),
               dict(selector=f"th:nth-child(11), td:nth-child(11)",
                    props=[("border-left", "2px solid #6c6c6c")])
          ]
     return df.style.set_table_styles(styles).set_properties(**{'width': '120px', 'text-align': 'center'}).hide_index()


# Lift pandas' display limits (column count, total width, per-cell width) so
# that wide summary tables are rendered without truncation.
for _display_option in ('display.max_columns', 'display.width', 'display.max_colwidth'):
    pd.set_option(_display_option, None)


In [12]:
# ---------------------------------------------------------------------------
# Result set selection: keep exactly one (directory_path, save_file_name)
# pair active. directory_path is the folder handed to the analysis below and
# save_file_name is the CSV the grouped table is written to.
# ---------------------------------------------------------------------------

# Python / Completion (active)
directory_path = '../Analysis_Results/storage_server/Python_all_res/Completion/4th_post_process_reason_update/Update_labels'
save_file_name = 'Python_Completion_grouped.csv'

# Python / Infilling
# directory_path = '../Analysis_Results/storage_server/Python_all_res/Infilling/4th_post_process_reason_update/Update_labels'
# save_file_name = 'Python_Infilling_grouped.csv'

# Java / Completion
# directory_path = '../Analysis_Results/storage_server/Java_all_res/Completion/4th_post_process_reason_update/Update_labels'
# save_file_name = 'Java_Completion_grouped.csv'

# Java / Infilling
# directory_path = '../Analysis_Results/storage_server/Java_all_res/Infilling/4th_post_process_reason_update/Update_labels'
# save_file_name = 'Java_Infilling_grouped.csv'


# Fine-grained label -> coarse label. Written as (coarse label, members) groups
# and flattened, so every member label maps to the group it is listed under.
# Note that 'If Condition' is deliberately folded into 'If Body'.
label_mapping = {
    fine_label: coarse_label
    for coarse_label, fine_labels in (
        ('If Body', ('If Condition', 'Elif Condition', 'If Body', 'Elif Body', 'Else Reasoning')),
        ('Loop Body', ('Loop Body', 'Define Stop Criteria')),
        ('List Comprehension', ('List Comprehension', 'Lambda Expressions', 'Generator Expressions')),
        ('Function', ('Class', 'Function', 'Library')),
        ('Variable', ('Variable', 'Global_Variable')),
    )
    for fine_label in fine_labels
}



# Name of the result column CodeAnalysis is told to use as the pass ratio.
# Alternative seen in earlier runs: 'gen_code_pass_ratio'.
gen_code_pass_col = 'post_process_pass_ratio'

# Non-default CodeAnalysis settings are spelled out explicitly here.
analysis = CodeAnalysis(
    weighted=False,
    use_max_range=True,
    gen_code_pass_col=gen_code_pass_col,
)

# Build the per-run summary for the selected folder, merging labels through
# label_mapping, then order the rows by model.
summary_df = analysis.analyze_results_in_folder(directory_path, label_mapping=label_mapping)
summary_df_sorted = summary_df.sort_values(by='Model Name')

# Two-level column header: (section, column). Underscores in the labels
# reported by the analysis object are shown as spaces.
reasons_columns = [
    ('Reason Categories Pass Ratio', label.replace('_', ' '))
    for label in analysis.labels_reasons
]
horizons_columns = [
    ('Horizon Categories Pass Ratio', label.replace('_', ' '))
    for label in analysis.labels_horizons
]
columns = [
    ('General', general_name)
    for general_name in ('Model Name', 'Generation Mode', 'Code Task', 'All Pass Ratio')
]
columns += reasons_columns
columns += horizons_columns
summary_df_sorted.columns = pd.MultiIndex.from_tuples(columns)

# Styled view of the summary (kept in a variable; not displayed by this cell).
styled_df = highlight_cols(summary_df_sorted)

phi-3-mini-4k no_afterlines RL Motion Planning 06_03_04_01
Meta-Llama-3-70B-Instruct no_afterlines simplex method 06_03_17_33
deepseek-coder-1.3b-instruct no_afterlines Image Transformation 06_03_04_00
claude-3-opus-20240229 no_afterlines simplex method 06_03_04_01
Meta-Llama-3-70B-Instruct no_afterlines GAN model 06_03_17_33
claude-3-haiku-20240307 no_afterlines GAN model 06_03_04_01
gpt-3.5-turbo-0125 no_afterlines Image Transformation 06_03_04_01
deepseek-coder-1.3b-instruct no_afterlines RL Motion Planning 06_03_04_01
deepseek-coder-7b-instruct no_afterlines RL Motion Planning 06_03_04_01
Meta-Llama-3-8B-Instruct no_afterlines Credit Scoring Fairness 06_03_04_01
Meta-Llama-3-8B-Instruct no_afterlines GAN model 06_03_04_01
claude-3-opus-20240229 no_afterlines Timeseries Clustering 06_03_04_00
gpt-4-turbo no_afterlines Credit Scoring Fairness 06_03_04_01
deepseek-coder-1.3b-instruct no_afterlines simplex method 06_03_04_00
phi-3-mini-4k no_afterlines simplex method 06_03_04_01
claude

In [13]:
# Function to split the ratio column into 'Passed Count' and 'Total Count'
def split_ratio_for_general(df, debug_print=False):
    """Parse ('General', 'All Pass Ratio') into integer passed/total count columns.

    Each ratio cell is searched for a ``(<passed>/<total>)`` fragment; the two
    numbers are stored in ('General', 'Passed Count') and
    ('General', 'Total Count').

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with two-level columns containing ('General', 'All Pass Ratio')
        as strings.
    debug_print : bool, optional
        When True, print the ratio column before parsing and the two count
        columns afterwards.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in: the columns are added in place.

    Notes
    -----
    A cell without the ``(<passed>/<total>)`` fragment yields a missing value
    and the integer cast then raises.
    """
    ratio_col = ('General', 'All Pass Ratio')
    passed_col = ('General', 'Passed Count')
    total_col = ('General', 'Total Count')

    if debug_print:
        print(df[ratio_col])  # Print before split

    # expand=False always returns a Series. The previous extract(...).squeeze()
    # collapsed a one-row frame to a bare string, which has no .astype().
    df[passed_col] = df[ratio_col].str.extract(r'\((\d+)/', expand=False).astype(int)
    df[total_col] = df[ratio_col].str.extract(r'/(\d+)\)', expand=False).astype(int)

    if debug_print:
        print(df[passed_col])
        print(df[total_col])
    return df

# Function to analyze the DataFrame by 'Model Name' and 'Generation Mode'

def analyze_by_model_and_mode_for_general(df, report_err_bar=True):
    """Aggregate overall pass counts per (model, generation mode) and format the pass ratio.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary frame with two-level columns; must contain
        ('General', 'Model Name'), ('General', 'Generation Mode') and
        ('General', 'All Pass Ratio'). It is passed through
        ``split_ratio_for_general``, which adds the count columns to it in place.
    report_err_bar : bool, optional
        True (default): ('General', 'All Pass Ratio') becomes ``"<pct> ± <err>"``
        using the values returned by ``cal_err_bar``. False: it becomes the plain
        ratio formatted as ``"<pct>%"`` with three decimals.

    Returns
    -------
    pandas.DataFrame
        One row per (model, generation mode) with the summed
        'Passed Count' / 'Total Count' and the formatted 'All Pass Ratio'.
    """
    df = split_ratio_for_general(df)
    group_keys = [('General', 'Model Name'), ('General', 'Generation Mode')]
    # numeric_only=True keeps only the count columns. Since pandas 2.0 a bare
    # sum() no longer drops the string columns but concatenates them per group.
    grouped_df = df.groupby(group_keys).sum(numeric_only=True)

    total_counts = grouped_df[('General', 'Total Count')]
    pass_counts = grouped_df[('General', 'Passed Count')]

    if report_err_bar:
        # cal_err_bar presumably returns fractions (they are scaled by 100
        # below) with `percentages` indexed like the grouped frame - TODO confirm.
        percentages, err_bar = cal_err_bar(pass_counts, total_counts)
        err_bar = pd.Series(err_bar, index=grouped_df.index)
        grouped_df[('General', 'All Pass Ratio')] = [
            f"{percentages.loc[key]*100:.2f} ± {err_bar.loc[key]*100:.2f}"
            for key in grouped_df.index
        ]
    else:
        percentages = pass_counts / total_counts * 100
        grouped_df[('General', 'All Pass Ratio')] = percentages.map("{:.3f}%".format)

    return grouped_df

# Usage: overall pass ratio per (model, generation mode).
# Note: the call adds ('General', 'Passed Count') and ('General', 'Total Count')
# to summary_df_sorted itself, because split_ratio_for_general assigns the
# columns on the frame it is given.
general_analyzed_df = analyze_by_model_and_mode_for_general(summary_df_sorted)
general_analyzed_df

Unnamed: 0_level_0,Unnamed: 1_level_0,General,General,General
Unnamed: 0_level_1,Unnamed: 1_level_1,Passed Count,Total Count,All Pass Ratio
"(General, Model Name)","(General, Generation Mode)",Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Meta-Llama-3-70B-Instruct,no_afterlines,96,212,45.35 ± 6.62
Meta-Llama-3-8B-Instruct,no_afterlines,67,212,31.57 ± 6.22
claude-3-haiku-20240307,no_afterlines,58,212,27.38 ± 6.01
claude-3-opus-20240229,no_afterlines,51,212,24.09 ± 5.77
claude-3-sonnet-20240229,no_afterlines,57,212,26.91 ± 5.95
deepseek-coder-1.3b-instruct,no_afterlines,26,212,12.22 ± 4.36
deepseek-coder-7b-instruct,no_afterlines,96,212,45.26 ± 6.72
gpt-3.5-turbo-0125,no_afterlines,112,212,52.81 ± 6.73
gpt-4-turbo,no_afterlines,118,212,55.72 ± 6.67
phi-3-mini-4k,no_afterlines,17,212,8.03 ± 3.69


In [4]:
# #Comment Style
# styled_df
# summary_df_sorted

In [14]:
# Function to split the ratio column into 'Passed Count' and 'Total Count'
# for Reason Categories Pass Ratio and Horizon Categories Pass Ratio
def _extract_count(ratio_series, pattern):
    """Return the integer captured by ``pattern`` in each cell; 0 where it does not match."""
    # expand=False always yields a Series, even for a one-row frame (the old
    # extract(...).squeeze() collapsed that case to a scalar and crashed).
    captured = ratio_series.str.extract(pattern, expand=False)
    # Convert before filling so the fill value never has to be mixed into a
    # string-typed column.
    return pd.to_numeric(captured).fillna(0).astype(int)


def split_ratio(df):
    """Add per-category '<category> Passed Count' / '<category> Total Count' columns.

    For every hard-coded reason and horizon category, the ratio cell is searched
    for a ``(<passed>/<total>)`` fragment and the two numbers are stored as
    integer columns under the same top-level header. Cells without the fragment
    (including missing values) count as 0/0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with two-level columns that contains a string column for every
        category listed below; a missing column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in: the columns are added in place.
    """
    # Fine-grained variant used in earlier runs, kept for reference:
    # ['List Comprehension', 'Lambda Expressions', 'Generator Expressions', 'If Condition', 'If Body',
    #  'Elif Condition', 'Elif Body', 'Else Reasoning', 'Stream Operations', 'Loop Body',
    #  'Define Stop Criteria', 'Super Call']
    reason_categories = ['List Comprehension', 'If Condition', 'If Body', 'Stream Operations', 'Loop Body', 'Super Call']
    horizon_categories = ['Short-Range', 'Medium-Range', 'Long-Range', 'Variable', 'Global Variable', 'Function', 'Class', 'Library', 'Interface']

    category_groups = (
        ('Reason Categories Pass Ratio', reason_categories),
        ('Horizon Categories Pass Ratio', horizon_categories),
    )
    for group_name, categories in category_groups:
        for category in categories:
            ratio_cells = df[(group_name, category)]
            df[(group_name, category + ' Passed Count')] = _extract_count(ratio_cells, r'\((\d+)/')
            df[(group_name, category + ' Total Count')] = _extract_count(ratio_cells, r'/(\d+)\)')

    return df

#     return grouped_df
def _format_category_ratios(grouped_df, group_name, categories, report_err_bar):
    """Replace each category's summed count columns in ``grouped_df`` by one formatted ratio column."""
    for category in categories:
        passed_key = (group_name, category + ' Passed Count')
        total_key = (group_name, category + ' Total Count')
        pass_counts = grouped_df[passed_key]
        total_counts = grouped_df[total_key]

        if report_err_bar:
            # cal_err_bar presumably returns fractions (they are scaled by 100
            # below) with `percentages` indexed like the grouped frame - TODO confirm.
            percentages, err_bar = cal_err_bar(pass_counts, total_counts)
            err_bar = pd.Series(err_bar, index=grouped_df.index)
            grouped_df[(group_name, category)] = [
                f"{percentages.loc[key]*100:.2f} ± {err_bar.loc[key]*100:.2f}"
                for key in grouped_df.index
            ]
        else:
            pass_ratio = pass_counts / total_counts * 100
            grouped_df[(group_name, category)] = (
                pass_ratio.map("{:.3f}%".format)
                + '(' + pass_counts.astype(str) + '/' + total_counts.astype(str) + ')'
            )

        # The raw counts are only intermediates; keep the formatted column.
        grouped_df = grouped_df.drop(columns=[passed_key, total_key])
    return grouped_df


def analyze_by_model_and_mode(df, report_err_bar=True):
    """Aggregate per-category pass counts per (model, generation mode) and format them.

    Parameters
    ----------
    df : pandas.DataFrame
        Summary frame with two-level columns. It is passed through
        ``split_ratio``, which adds the per-category count columns to it in
        place, and must contain ('General', 'Model Name') and
        ('General', 'Generation Mode').
    report_err_bar : bool, optional
        True (default): every category column becomes ``"<pct> ± <err>"`` using
        the values returned by ``cal_err_bar``. False: it becomes
        ``"<pct>%(<passed>/<total>)"``, which is ``"nan%(0/0)"`` for a category
        without samples.

    Returns
    -------
    pandas.DataFrame
        One row per (model, generation mode). Horizon category columns are
        added first, then reason category columns; the per-category count
        columns are removed again.
    """
    df = split_ratio(df)
    group_keys = [('General', 'Model Name'), ('General', 'Generation Mode')]
    # numeric_only=True keeps only the count columns. Since pandas 2.0 a bare
    # sum() no longer drops the string columns but concatenates them per group.
    grouped_df = df.groupby(group_keys).sum(numeric_only=True)

    # These lists must match the ones used by split_ratio.
    horizon_categories = ['Short-Range', 'Medium-Range', 'Long-Range', 'Variable', 'Global Variable', 'Function', 'Class', 'Library', 'Interface']
    reason_categories = ['List Comprehension', 'If Condition', 'If Body', 'Stream Operations', 'Loop Body', 'Super Call']

    grouped_df = _format_category_ratios(
        grouped_df, 'Horizon Categories Pass Ratio', horizon_categories, report_err_bar
    )
    grouped_df = _format_category_ratios(
        grouped_df, 'Reason Categories Pass Ratio', reason_categories, report_err_bar
    )
    return grouped_df

def clean_df(df, drop_general=False, drop_range=True, drop_nan=True):
    """Return ``df`` with selected columns removed.

    drop_general: remove columns whose key tuple contains the element
        'Passed Count' or 'Total Count' (the overall count columns).
    drop_range: remove the Short-/Medium-/Long-Range horizon columns.
    drop_nan: remove columns in which every cell equals 'nan%(0/0)'.

    Columns that are requested for removal but absent are ignored.
    """
    if drop_general:
        count_columns = []
        for column_key in df.columns:
            # Tuple membership: matches a whole level value, not a substring.
            if 'Passed Count' in column_key or 'Total Count' in column_key:
                count_columns.append(column_key)
        df = df.drop(columns=count_columns, errors='ignore')

    if drop_range:
        range_columns = [
            ('Horizon Categories Pass Ratio', range_name)
            for range_name in ('Short-Range', 'Medium-Range', 'Long-Range')
        ]
        df = df.drop(columns=range_columns, errors='ignore')

    if drop_nan:
        # Only the 'nan%(0/0)' spelling is recognised as "no samples".
        all_cells_empty = (df == 'nan%(0/0)').all()
        df = df.loc[:, ~all_cells_empty]

    return df

# Per-category pass ratios per (model, generation mode). The call also adds the
# per-category count columns to summary_df_sorted itself (split_ratio assigns
# them on the frame it is given).
analyzed_df = analyze_by_model_and_mode(summary_df_sorted)
# Default clean-up: drops the Short-/Medium-/Long-Range columns and any column
# that is entirely 'nan%(0/0)'.
cleaned_df = clean_df(analyzed_df)
# cleaned_df
# Write the grouped table to the CSV name chosen in the configuration cell.
cleaned_df.to_csv(save_file_name)

In [35]:
# Display the cleaned per-model table that was written to save_file_name.
cleaned_df

Unnamed: 0_level_0,Unnamed: 1_level_0,General,General,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Reason Categories Pass Ratio,Reason Categories Pass Ratio,Reason Categories Pass Ratio,Reason Categories Pass Ratio,Reason Categories Pass Ratio,Reason Categories Pass Ratio
Unnamed: 0_level_1,Unnamed: 1_level_1,Passed Count,Total Count,Variable,Global Variable,Function,Class,Library,Interface,List Comprehension,If Condition,If Body,Stream Operations,Loop Body,Super Call
"(General, Model Name)","(General, Generation Mode)",Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
Meta-Llama-3-70B-Instruct,no_afterlines,154,286,53.01 ± 6.04,0.00 ± 0.00,54.14 ± 9.28,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,63.39 ± 8.66,0.00 ± 0.00,50.96 ± 9.83,0.00 ± 0.00
Meta-Llama-3-8B-Instruct,no_afterlines,77,286,26.48 ± 5.30,0.00 ± 0.00,20.19 ± 7.48,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,29.18 ± 8.15,0.00 ± 0.00,29.41 ± 8.94,0.00 ± 0.00
claude-3-haiku-20240307,no_afterlines,140,286,50.02 ± 5.97,0.00 ± 0.00,51.36 ± 9.38,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,49.19 ± 8.84,0.00 ± 0.00,43.06 ± 9.49,0.00 ± 0.00
claude-3-opus-20240229,no_afterlines,195,286,69.03 ± 5.49,0.00 ± 0.00,66.03 ± 8.78,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,63.37 ± 8.52,0.00 ± 0.00,58.76 ± 9.54,0.00 ± 0.00
claude-3-sonnet-20240229,no_afterlines,159,286,55.23 ± 5.94,0.00 ± 0.00,47.82 ± 9.28,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,57.48 ± 8.84,0.00 ± 0.00,49.97 ± 9.61,0.00 ± 0.00
deepseek-coder-1.3b-instruct,no_afterlines,46,286,16.06 ± 4.38,0.00 ± 0.00,18.31 ± 7.31,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,19.96 ± 7.10,0.00 ± 0.00,21.58 ± 7.96,0.00 ± 0.00
deepseek-coder-7b-instruct,no_afterlines,120,286,42.57 ± 5.88,0.00 ± 0.00,37.60 ± 9.10,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,43.31 ± 8.84,0.00 ± 0.00,45.10 ± 9.73,0.00 ± 0.00
gpt-3.5-turbo-0125,no_afterlines,122,286,42.90 ± 5.93,0.00 ± 0.00,34.84 ± 8.87,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,53.33 ± 8.90,0.00 ± 0.00,45.03 ± 9.74,0.00 ± 0.00
gpt-4-turbo,no_afterlines,176,286,62.29 ± 5.80,0.00 ± 0.00,56.91 ± 9.30,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,63.32 ± 8.62,0.00 ± 0.00,62.67 ± 9.35,0.00 ± 0.00
phi-3-mini-4k,no_afterlines,30,286,10.82 ± 3.72,0.00 ± 0.00,12.89 ± 6.32,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,0.00 ± 0.00,10.82 ± 5.52,0.00 ± 0.00,10.75 ± 5.97,0.00 ± 0.00


In [None]:
# Second pass over the same folder with weighted=True and no label merging.
# NOTE: this cell rebinds analysis, summary_df, summary_df_sorted, the column
# lists and styled_df that earlier cells created.
analysis = CodeAnalysis(weighted=True)
# directory_path = '../Analysis_Results/LLMs_code_gen_results/COMP215/Prompt_no_examples/'  # Adjust this path as necessary

# Per-run summary for directory_path, ordered by model.
summary_df = analysis.analyze_results_in_folder(directory_path)
summary_df_sorted = summary_df.sort_values(by='Model Name')

# Two-level column header: (section, column). Underscores in the labels
# reported by the analysis object are shown as spaces.
reasons_columns = [
    ('Reason Categories Pass Ratio', label.replace('_', ' '))
    for label in analysis.labels_reasons
]
horizons_columns = [
    ('Horizon Categories Pass Ratio', label.replace('_', ' '))
    for label in analysis.labels_horizons
]
columns = [
    ('General', general_name)
    for general_name in ('Model Name', 'Generation Mode', 'Code Task', 'All Pass Ratio')
]
columns += reasons_columns
columns += horizons_columns
summary_df_sorted.columns = pd.MultiIndex.from_tuples(columns)

# Render the styled summary as this cell's output.
styled_df = highlight_cols(summary_df_sorted)
styled_df

## Results 