In [1]:
# Jupyter Notebook Code
from code_gen_result_analysis import CodeAnalysis, cal_err_bar, bootstrap_resampling  # Assuming code_analysis.py is the name of the file
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Function to style the DataFrame
def highlight_cols(df):
     # We will create a custom style for the headers and vertical lines
     styles = [
               dict(selector="th.col_heading",
                    props=[("text-align", "center"),
                              ("border-right", "2px solid #6c6c6c"),
                              ("border-left", "2px solid #6c6c6c")]),
               dict(selector="th.col_heading.level0",
                    props=[("border-top", "2px solid #6c6c6c")]),
               dict(selector="th",
                    props=[("font-size", "12px")]),
               dict(selector="td",
                    props=[("text-align", "center")]),
               # Customize the boundary for your specific DataFrame structure
               # Adjust "4" and "10" according to your DataFrame's column indices
               dict(selector=f"th:nth-child(5), td:nth-child(5)",
                    props=[("border-left", "2px solid #6c6c6c")]),
               dict(selector=f"th:nth-child(11), td:nth-child(11)",
                    props=[("border-left", "2px solid #6c6c6c")])
          ]
     return df.style.set_table_styles(styles).set_properties(**{'width': '120px', 'text-align': 'center'}).hide_index()

# Maps each fine-grained label to the coarser category it is reported under.
# Written as (category, member labels) pairs; the comprehension flattens them
# into the {label: category} dict, keeping the original key order.
label_mapping = {
    fine_label: category
    for category, fine_labels in (
        ('If Condition', ('If Condition', 'Elif Condition')),
        ('If Body', ('If Body', 'Elif Body', 'Else Reasoning')),
        ('Loop Body', ('Loop Body', 'Define Stop Criteria')),
        ('List Comprehension', ('List Comprehension', 'Lambda Expressions', 'Generator Expressions')),
    )
    for fine_label in fine_labels
}
def get_summary_df(directory_path, label_mapping, weighted=False, use_max_range=True,gen_code_pass_col = 'post_process_pass_ratio'):
    """Build the per-run summary table for one results folder.

    Runs ``CodeAnalysis.analyze_results_in_folder`` on ``directory_path``,
    sorts the result by 'Model Name', and relabels the columns with a
    two-level header: four 'General' columns followed by the analyzer's
    reason labels and then its horizon labels (underscores shown as spaces).

    Args:
        directory_path: Folder handed to ``analyze_results_in_folder``.
        label_mapping: Forwarded as ``label_mapping`` to the analyzer.
        weighted, use_max_range, gen_code_pass_col: Forwarded unchanged to
            the ``CodeAnalysis`` constructor.

    Returns:
        ``(summary, styled)``: the sorted DataFrame and its ``highlight_cols``
        Styler.
    """
    analyzer = CodeAnalysis(
        weighted=weighted,
        use_max_range=use_max_range,
        gen_code_pass_col=gen_code_pass_col,
    )
    summary = analyzer.analyze_results_in_folder(
        directory_path, label_mapping=label_mapping
    ).sort_values(by='Model Name')

    # Column labels are assigned by position, so this order must match the
    # column order produced by the analyzer.
    header = [
        ('General', name)
        for name in ('Model Name', 'Generation Mode', 'Code Task', 'All Pass Ratio')
    ]
    label_groups = (
        ('Reason Categories Pass Ratio', analyzer.labels_reasons),
        ('Horizon Categories Pass Ratio', analyzer.labels_horizons),
    )
    for group_name, labels in label_groups:
        header.extend((group_name, label.replace('_', ' ')) for label in labels)
    summary.columns = pd.MultiIndex.from_tuples(header)

    return summary, highlight_cols(summary)

# Function to split the ratio column into 'Passed Count' and 'Total Count'
def split_ratio_for_general(df, debug_print=False):
    """Add integer pass/total count columns parsed from the overall ratio text.

    Reads ``('General', 'All Pass Ratio')``, whose values are expected to
    contain a ``(<passed>/<total>)`` fragment, and writes the two numbers to
    ``('General', 'Passed Count')`` and ``('General', 'Total Count')``.

    Args:
        df: DataFrame with two-level columns. It is modified in place.
        debug_print: When true, print the ratio column before the split and
            the two count columns after it.

    Returns:
        The same ``df`` object, with the two count columns added.

    Raises:
        ValueError: If a ratio string has no parsable count (the missing
            value cannot be cast to int).
    """
    ratio_key = ('General', 'All Pass Ratio')
    passed_key = ('General', 'Passed Count')
    total_key = ('General', 'Total Count')

    if debug_print:
        print(df[ratio_key])  # Print before split

    ratio_text = df[ratio_key]
    # expand=False returns a Series for any number of rows. The previous
    # extract(...).squeeze() collapsed a one-row frame to a bare scalar, which
    # has no .astype and raised AttributeError.
    df[passed_key] = ratio_text.str.extract(r'\((\d+)/', expand=False).astype(int)
    df[total_key] = ratio_text.str.extract(r'/(\d+)\)', expand=False).astype(int)

    if debug_print:
        print(df[passed_key])
        print(df[total_key])
    return df

# Function to analyze the DataFrame by 'Model Name' and 'Generation Mode'
def analyze_by_model_and_mode_for_general(df, report_err_bar=True):
    """Aggregate overall pass counts per (model, generation mode).

    Splits the overall ratio text into counts (this adds the count columns to
    ``df`` in place), sums the numeric columns per
    ``('General', 'Model Name')`` / ``('General', 'Generation Mode')`` group,
    and writes a formatted ``('General', 'All Pass Ratio')`` column.

    Args:
        df: Summary DataFrame with two-level columns.
        report_err_bar: If true, format as ``"<value> ± <error>"`` using
            ``cal_err_bar``; otherwise as a plain percentage with three
            decimals.

    Returns:
        The grouped DataFrame indexed by (model name, generation mode).
    """
    df = split_ratio_for_general(df)
    group_keys = [('General', 'Model Name'), ('General', 'Generation Mode')]
    # numeric_only=True keeps only the count columns. pandas 1.x did this by
    # default; pandas 2+ would otherwise concatenate the text columns.
    grouped_df = df.groupby(group_keys).sum(numeric_only=True)

    total_counts = grouped_df[('General', 'Total Count')]
    pass_counts = grouped_df[('General', 'Passed Count')]

    if report_err_bar:
        # cal_err_bar's outputs are scaled by 100 below, so they are
        # presumably fractions in [0, 1] -- confirm against its definition.
        percentages, err_bar = cal_err_bar(pass_counts, total_counts)
        err_bar = pd.Series(err_bar, index=grouped_df.index)
        grouped_df[('General', 'All Pass Ratio')] = grouped_df.apply(
            lambda row: f"{percentages.loc[row.name]*100:.1f} ± {err_bar.loc[row.name]*100:.1f}",
            axis=1,
        )
    else:
        percentages = pass_counts / total_counts * 100
        grouped_df[('General', 'All Pass Ratio')] = percentages.map("{:.3f}%".format)

    return grouped_df

# Function to split the ratio column into 'Passed Count' and 'Total Count'
# for Reason Categories Pass Ratio and Horizon Categories Pass Ratio
def split_ratio(df):
    """Add integer pass/total count columns for every reason and horizon category.

    For each category column ``(group, category)`` whose text contains a
    ``(<passed>/<total>)`` fragment, adds ``(group, category + ' Passed Count')``
    and ``(group, category + ' Total Count')``. Values with no such fragment
    count as 0.

    Args:
        df: DataFrame with two-level columns containing every category listed
            below. It is modified in place.

    Returns:
        The same ``df`` object, with the count columns added.

    Raises:
        KeyError: If one of the expected category columns is missing.
    """
    reason_categories = ['List Comprehension', 'If Condition','If Body', 'Stream Operations', 'Loop Body', 'Super Call']
    horizon_categories = ['Short-Range', 'Medium-Range', 'Long-Range', 'Variable', 'Global Variable', 'Function', 'Class', 'Library', 'Interface']

    category_groups = (
        ('Reason Categories Pass Ratio', reason_categories),
        ('Horizon Categories Pass Ratio', horizon_categories),
    )
    for group, categories in category_groups:
        for category in categories:
            ratio_text = df[(group, category)]
            # expand=False always yields a Series. The previous
            # extract(...).squeeze() became a scalar for one-row frames and
            # then failed on .fillna/.astype.
            passed = ratio_text.str.extract(r'\((\d+)/', expand=False)
            total = ratio_text.str.extract(r'/(\d+)\)', expand=False)
            # Convert to numbers before filling so missing matches become 0
            # whatever string dtype the column uses.
            df[(group, category + ' Passed Count')] = pd.to_numeric(passed, errors='coerce').fillna(0).astype(int)
            df[(group, category + ' Total Count')] = pd.to_numeric(total, errors='coerce').fillna(0).astype(int)

    return df

def _add_category_ratio_columns(grouped_df, group, categories, report_err_bar):
    """Replace each category's count columns under ``group`` with one formatted ratio column."""
    for category in categories:
        passed_key = (group, category + ' Passed Count')
        total_key = (group, category + ' Total Count')
        pass_counts = grouped_df[passed_key]
        total_counts = grouped_df[total_key]

        if report_err_bar:
            # cal_err_bar's outputs are scaled by 100 below, so they are
            # presumably fractions in [0, 1] -- confirm against its definition.
            percentages, err_bar = cal_err_bar(pass_counts, total_counts)
            err_bar = pd.Series(err_bar, index=grouped_df.index)
            grouped_df[(group, category)] = grouped_df.apply(
                lambda row: f"{percentages.loc[row.name]*100:.1f} ± {err_bar.loc[row.name]*100:.1f}",
                axis=1,
            )
        else:
            pass_ratio = pass_counts / total_counts * 100
            grouped_df[(group, category)] = (
                pass_ratio.map("{:.3f}%".format)
                + '(' + pass_counts.astype(str)
                + '/' + total_counts.astype(str) + ')'
            )

        grouped_df.drop([passed_key, total_key], axis=1, inplace=True)


def analyze_by_model_and_mode(df, report_err_bar=True):
    """Aggregate per-category pass ratios per (model, generation mode).

    Splits every reason/horizon ratio column into counts (adding those
    columns to ``df`` in place), sums the numeric columns per
    ``('General', 'Model Name')`` / ``('General', 'Generation Mode')`` group,
    and replaces each category's count pair with a formatted ratio column.
    Horizon categories are appended first, then reason categories.

    Args:
        df: Summary DataFrame with two-level columns.
        report_err_bar: If true, format each ratio as ``"<value> ± <error>"``
            using ``cal_err_bar``; otherwise as
            ``"<percent>%(<passed>/<total>)"`` with three decimals.

    Returns:
        The grouped DataFrame indexed by (model name, generation mode).
    """
    df = split_ratio(df)
    group_keys = [('General', 'Model Name'), ('General', 'Generation Mode')]
    # numeric_only=True keeps only the count columns. pandas 1.x did this by
    # default; pandas 2+ would otherwise concatenate the text columns.
    grouped_df = df.groupby(group_keys).sum(numeric_only=True)

    horizon_categories = ['Short-Range', 'Medium-Range', 'Long-Range', 'Variable', 'Global Variable', 'Function', 'Class', 'Library', 'Interface']
    reason_categories = ['List Comprehension', 'If Condition','If Body', 'Stream Operations', 'Loop Body', 'Super Call']

    _add_category_ratio_columns(grouped_df, 'Horizon Categories Pass Ratio', horizon_categories, report_err_bar)
    _add_category_ratio_columns(grouped_df, 'Reason Categories Pass Ratio', reason_categories, report_err_bar)

    return grouped_df

def clean_df(df, drop_general=False, drop_range=True, drop_nan=True, drop_zero=True):
    """Drop columns that are not wanted in the final report.

    Args:
        df: Aggregated DataFrame to trim. It is not modified.
        drop_general: Drop columns whose label contains 'Passed Count' or
            'Total Count'.
        drop_range: Drop the Short-/Medium-/Long-Range horizon columns.
        drop_nan: Drop columns where every value equals ``'nan%(0/0)'``.
        drop_zero: Drop columns where every value equals ``'0.0 ± 0.0'``.

    Returns:
        A DataFrame containing only the remaining columns.
    """
    if drop_general:
        count_columns = [
            label for label in df.columns
            if 'Passed Count' in label or 'Total Count' in label
        ]
        df = df.drop(columns=count_columns, errors='ignore')

    if drop_range:
        range_columns = [
            ('Horizon Categories Pass Ratio', span)
            for span in ('Short-Range', 'Medium-Range', 'Long-Range')
        ]
        df = df.drop(columns=range_columns, errors='ignore')

    # Each enabled filter removes columns made up entirely of its placeholder.
    placeholder_filters = ((drop_nan, 'nan%(0/0)'), (drop_zero, '0.0 ± 0.0'))
    for enabled, placeholder in placeholder_filters:
        if enabled:
            only_placeholder = (df == placeholder).all()
            df = df.loc[:, ~only_placeholder]

    return df

# def analyze_and_save(directory_path, label_mapping, save_file_name, gen_code_pass_col='post_process_pass_ratio'):
def analyze_and_save(directory_path, label_mapping, save_file_name, gen_code_pass_col='gen_code_pass_ratio'):
    """Analyze one results folder, display both tables and save the second.

    Displays the overall pass table and the cleaned per-category table, then
    writes the per-category table to ``save_file_name`` as CSV.

    Args:
        directory_path: Results folder passed to ``get_summary_df``.
        label_mapping: Label mapping passed to ``get_summary_df``.
        save_file_name: Destination path for the CSV file.
        gen_code_pass_col: Forwarded to ``get_summary_df``.
    """
    summary, _styled = get_summary_df(
        directory_path, label_mapping, gen_code_pass_col=gen_code_pass_col
    )

    # Run the overall analysis first: it adds the 'General' count columns to
    # `summary` in place, and the per-category pass below reuses that frame.
    display(analyze_by_model_and_mode_for_general(summary))

    per_category = clean_df(analyze_by_model_and_mode(summary))
    display(per_category)
    per_category.to_csv(save_file_name)

# Set pandas display options so the wide summary tables are shown in full
pd.options.display.max_columns = None
pd.options.display.width = None
pd.options.display.max_colwidth = None


In [2]:
def analyze_and_display_results_for_settings():
    """Run ``analyze_and_save`` for every language / task-type combination.

    Covers Python and Java, each for Completion and Infilling, in that order.
    Each run reads its own results directory and writes
    ``<language>_<type>_grouped.csv`` to the working directory, printing a
    progress line before and after.
    """
    settings = [
        {
            "language": language,
            "type": task_type,
            "directory": f"../Analysis_Results/storage_server/{language}_all_res/{task_type}/4th_post_process_reason_update/Update_labels",
            "save_file": f"{language}_{task_type}_grouped.csv",
        }
        for language in ("Python", "Java")
        for task_type in ("Completion", "Infilling")
    ]

    for setting in settings:
        title = f"{setting['language']} {setting['type']}"
        print(f"Analyzing {title}")
        analyze_and_save(setting['directory'], label_mapping, setting['save_file'])
        print(f"Analysis and saving completed for {title}\n\n")

# Run the analysis for every configured setting; the tables printed below are this call's output
analyze_and_display_results_for_settings()

Analyzing Python Completion


Unnamed: 0_level_0,Unnamed: 1_level_0,General,General,General
Unnamed: 0_level_1,Unnamed: 1_level_1,Passed Count,Total Count,All Pass Ratio
"(General, Model Name)","(General, Generation Mode)",Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Meta-Llama-3-70B-Instruct,no_afterlines,39,212,18.4 ± 5.3
Meta-Llama-3-8B-Instruct,no_afterlines,14,212,6.6 ± 3.3
claude-3-haiku-20240307,no_afterlines,14,212,6.6 ± 3.3
claude-3-opus-20240229,no_afterlines,8,212,3.8 ± 2.6
claude-3-sonnet-20240229,no_afterlines,7,212,3.3 ± 2.4
deepseek-coder-1.3b-instruct,no_afterlines,5,212,2.4 ± 2.0
deepseek-coder-7b-instruct,no_afterlines,46,212,21.7 ± 5.5
gpt-3.5-turbo-0125,no_afterlines,68,212,32.0 ± 6.3
gpt-4-turbo,no_afterlines,96,212,45.2 ± 6.6
phi-3-mini-4k,no_afterlines,2,212,0.9 ± 1.3


Unnamed: 0_level_0,Unnamed: 1_level_0,General,General,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Reason Categories Pass Ratio,Reason Categories Pass Ratio,Reason Categories Pass Ratio,Reason Categories Pass Ratio
Unnamed: 0_level_1,Unnamed: 1_level_1,Passed Count,Total Count,Variable,Function,Class,Library,List Comprehension,If Condition,If Body,Loop Body
"(General, Model Name)","(General, Generation Mode)",Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
Meta-Llama-3-70B-Instruct,no_afterlines,39,212,18.0 ± 5.2,11.0 ± 20.5,10.5 ± 14.0,13.0 ± 5.9,11.3 ± 20.9,7.2 ± 13.5,20.6 ± 9.2,12.2 ± 9.2
Meta-Llama-3-8B-Instruct,no_afterlines,14,212,6.8 ± 3.4,0.0 ± 0.0,5.3 ± 10.1,4.1 ± 3.5,33.0 ± 30.8,7.3 ± 13.7,1.4 ± 2.7,2.0 ± 4.0
claude-3-haiku-20240307,no_afterlines,14,212,6.8 ± 3.4,0.0 ± 0.0,0.0 ± 0.0,9.0 ± 5.1,0.0 ± 0.0,0.0 ± 0.0,8.2 ± 6.3,10.3 ± 8.5
claude-3-opus-20240229,no_afterlines,8,212,3.9 ± 2.6,0.0 ± 0.0,5.3 ± 10.1,4.1 ± 3.5,0.0 ± 0.0,0.0 ± 0.0,4.1 ± 4.6,2.1 ± 4.0
claude-3-sonnet-20240229,no_afterlines,7,212,3.4 ± 2.4,0.0 ± 0.0,5.3 ± 10.0,3.2 ± 3.1,0.0 ± 0.0,0.0 ± 0.0,1.4 ± 2.6,2.0 ± 3.9
deepseek-coder-1.3b-instruct,no_afterlines,5,212,2.4 ± 2.1,0.0 ± 0.0,0.0 ± 0.0,1.6 ± 2.2,0.0 ± 0.0,0.0 ± 0.0,1.4 ± 2.7,0.0 ± 0.0
deepseek-coder-7b-instruct,no_afterlines,46,212,22.4 ± 5.7,0.0 ± 0.0,10.6 ± 13.9,16.3 ± 6.6,0.0 ± 0.0,7.1 ± 13.4,13.8 ± 7.9,16.3 ± 10.3
gpt-3.5-turbo-0125,no_afterlines,68,212,32.5 ± 6.4,11.1 ± 20.5,10.7 ± 13.8,36.6 ± 8.5,22.2 ± 26.9,14.2 ± 18.1,32.9 ± 10.8,22.4 ± 11.8
gpt-4-turbo,no_afterlines,96,212,45.6 ± 6.8,22.0 ± 27.0,26.1 ± 19.5,39.0 ± 8.6,66.8 ± 31.3,28.6 ± 23.9,42.5 ± 11.2,48.9 ± 14.0
phi-3-mini-4k,no_afterlines,2,212,1.0 ± 1.3,0.0 ± 0.0,0.0 ± 0.0,0.8 ± 1.6,0.0 ± 0.0,0.0 ± 0.0,1.4 ± 2.7,0.0 ± 0.0


Analysis and saving completed for Python Completion


Analyzing Python Infilling


Unnamed: 0_level_0,Unnamed: 1_level_0,General,General,General
Unnamed: 0_level_1,Unnamed: 1_level_1,Passed Count,Total Count,All Pass Ratio
"(General, Model Name)","(General, Generation Mode)",Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Meta-Llama-3-70B-Instruct,with_afterlines,109,382,28.6 ± 4.5
Meta-Llama-3-8B-Instruct,with_afterlines,23,382,6.0 ± 2.4
claude-3-haiku-20240307,with_afterlines,43,382,11.2 ± 3.2
claude-3-opus-20240229,with_afterlines,114,382,29.8 ± 4.6
claude-3-sonnet-20240229,with_afterlines,94,382,24.6 ± 4.3
deepseek-coder-1.3b-instruct,with_afterlines,9,382,2.3 ± 1.5
deepseek-coder-7b-instruct,with_afterlines,12,382,3.1 ± 1.7
gpt-3.5-turbo-0125,with_afterlines,60,382,15.7 ± 3.6
gpt-4-turbo,with_afterlines,219,382,57.3 ± 4.9
phi-3-mini-4k,with_afterlines,4,382,1.0 ± 1.0


Unnamed: 0_level_0,Unnamed: 1_level_0,General,General,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Reason Categories Pass Ratio,Reason Categories Pass Ratio,Reason Categories Pass Ratio,Reason Categories Pass Ratio
Unnamed: 0_level_1,Unnamed: 1_level_1,Passed Count,Total Count,Variable,Function,Class,Library,List Comprehension,If Condition,If Body,Loop Body
"(General, Model Name)","(General, Generation Mode)",Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
Meta-Llama-3-70B-Instruct,with_afterlines,109,382,28.3 ± 4.6,40.5 ± 15.8,0.0 ± 0.0,27.7 ± 6.2,19.9 ± 20.0,10.4 ± 13.9,8.1 ± 6.9,15.3 ± 8.4
Meta-Llama-3-8B-Instruct,with_afterlines,23,382,5.7 ± 2.3,10.8 ± 9.8,0.0 ± 0.0,6.9 ± 3.5,13.4 ± 17.2,5.3 ± 10.2,4.9 ± 5.4,6.9 ± 5.9
claude-3-haiku-20240307,with_afterlines,43,382,10.8 ± 3.2,32.4 ± 15.1,0.0 ± 0.0,10.9 ± 4.3,0.0 ± 0.0,5.4 ± 10.2,1.6 ± 3.2,8.4 ± 6.4
claude-3-opus-20240229,with_afterlines,114,382,29.4 ± 4.7,48.6 ± 16.1,0.0 ± 0.0,25.8 ± 6.0,26.8 ± 22.6,5.2 ± 9.9,8.2 ± 6.8,9.8 ± 6.8
claude-3-sonnet-20240229,with_afterlines,94,382,24.6 ± 4.4,37.9 ± 15.8,0.0 ± 0.0,21.3 ± 5.6,0.0 ± 0.0,0.0 ± 0.0,6.5 ± 6.2,12.5 ± 7.6
deepseek-coder-1.3b-instruct,with_afterlines,9,382,2.4 ± 1.6,0.0 ± 0.0,0.0 ± 0.0,1.5 ± 1.7,0.0 ± 0.0,0.0 ± 0.0,1.6 ± 3.2,1.4 ± 2.7
deepseek-coder-7b-instruct,with_afterlines,12,382,3.0 ± 1.7,0.0 ± 0.0,0.0 ± 0.0,2.5 ± 2.1,0.0 ± 0.0,5.3 ± 10.1,6.6 ± 6.2,4.2 ± 4.6
gpt-3.5-turbo-0125,with_afterlines,60,382,15.6 ± 3.7,24.4 ± 13.8,13.1 ± 17.1,12.4 ± 4.6,6.7 ± 12.6,5.3 ± 10.1,13.2 ± 8.5,9.8 ± 6.9
gpt-4-turbo,with_afterlines,219,382,57.9 ± 5.1,54.2 ± 16.2,33.5 ± 23.8,50.5 ± 6.9,73.4 ± 22.5,26.3 ± 20.0,55.8 ± 12.3,55.5 ± 11.4
phi-3-mini-4k,with_afterlines,4,382,1.1 ± 1.1,2.7 ± 5.2,6.7 ± 12.7,0.5 ± 1.0,6.7 ± 12.6,0.0 ± 0.0,1.7 ± 3.2,0.0 ± 0.0


Analysis and saving completed for Python Infilling


Analyzing Java Completion


Unnamed: 0_level_0,Unnamed: 1_level_0,General,General,General
Unnamed: 0_level_1,Unnamed: 1_level_1,Passed Count,Total Count,All Pass Ratio
"(General, Model Name)","(General, Generation Mode)",Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Meta-Llama-3-70B-Instruct,no_afterlines,36,286,12.6 ± 3.8
Meta-Llama-3-8B-Instruct,no_afterlines,6,286,2.1 ± 1.7
claude-3-haiku-20240307,no_afterlines,18,286,6.3 ± 2.8
claude-3-opus-20240229,no_afterlines,79,286,27.7 ± 5.2
claude-3-sonnet-20240229,no_afterlines,32,286,11.2 ± 3.7
deepseek-coder-1.3b-instruct,no_afterlines,3,286,1.0 ± 1.2
deepseek-coder-7b-instruct,no_afterlines,7,286,2.4 ± 1.8
gpt-3.5-turbo-0125,no_afterlines,10,286,3.5 ± 2.1
gpt-4-turbo,no_afterlines,27,286,9.4 ± 3.3
phi-3-mini-4k,no_afterlines,1,286,0.4 ± 0.7


Unnamed: 0_level_0,Unnamed: 1_level_0,General,General,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Reason Categories Pass Ratio,Reason Categories Pass Ratio,Reason Categories Pass Ratio
Unnamed: 0_level_1,Unnamed: 1_level_1,Passed Count,Total Count,Variable,Global Variable,Function,Class,Library,If Condition,If Body,Loop Body
"(General, Model Name)","(General, Generation Mode)",Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
Meta-Llama-3-70B-Instruct,no_afterlines,36,286,10.6 ± 4.1,12.0 ± 4.6,12.5 ± 6.9,12.5 ± 11.5,0.0 ± 0.0,6.1 ± 8.2,16.7 ± 6.6,4.9 ± 4.2
Meta-Llama-3-8B-Instruct,no_afterlines,6,286,1.8 ± 1.8,2.2 ± 2.1,2.3 ± 3.1,0.0 ± 0.0,0.0 ± 0.0,0.0 ± 0.0,2.5 ± 2.8,2.0 ± 2.7
claude-3-haiku-20240307,no_afterlines,18,286,4.1 ± 2.6,7.1 ± 3.7,7.9 ± 5.7,3.1 ± 6.0,10.0 ± 18.6,6.1 ± 8.2,3.3 ± 3.2,2.9 ± 3.3
claude-3-opus-20240229,no_afterlines,79,286,25.2 ± 5.8,30.5 ± 6.6,25.0 ± 9.1,21.8 ± 14.3,40.0 ± 30.2,24.2 ± 14.5,15.8 ± 6.5,14.7 ± 6.8
claude-3-sonnet-20240229,no_afterlines,32,286,9.6 ± 3.9,10.9 ± 4.5,10.2 ± 6.3,3.1 ± 6.0,9.9 ± 18.7,6.0 ± 8.2,12.5 ± 6.0,8.8 ± 5.5
deepseek-coder-1.3b-instruct,no_afterlines,3,286,1.4 ± 1.5,0.0 ± 0.0,3.4 ± 3.8,0.0 ± 0.0,0.0 ± 0.0,0.0 ± 0.0,0.8 ± 1.6,2.9 ± 3.3
deepseek-coder-7b-instruct,no_afterlines,7,286,1.8 ± 1.8,3.3 ± 2.6,2.3 ± 3.1,0.0 ± 0.0,0.0 ± 0.0,0.0 ± 0.0,0.0 ± 0.0,2.0 ± 2.7
gpt-3.5-turbo-0125,no_afterlines,10,286,3.7 ± 2.5,2.2 ± 2.1,2.3 ± 3.1,0.0 ± 0.0,0.0 ± 0.0,3.1 ± 5.9,5.9 ± 4.2,1.9 ± 2.7
gpt-4-turbo,no_afterlines,27,286,10.6 ± 4.2,8.7 ± 4.0,8.0 ± 5.6,12.4 ± 11.3,20.0 ± 24.7,9.1 ± 9.9,10.9 ± 5.5,6.9 ± 4.9
phi-3-mini-4k,no_afterlines,1,286,0.5 ± 0.9,0.0 ± 0.0,1.1 ± 2.2,0.0 ± 0.0,0.0 ± 0.0,0.0 ± 0.0,0.0 ± 0.0,0.0 ± 0.0


Analysis and saving completed for Java Completion


Analyzing Java Infilling


Unnamed: 0_level_0,Unnamed: 1_level_0,General,General,General
Unnamed: 0_level_1,Unnamed: 1_level_1,Passed Count,Total Count,All Pass Ratio
"(General, Model Name)","(General, Generation Mode)",Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Meta-Llama-3-70B-Instruct,with_afterlines,119,283,42.0 ± 5.8
Meta-Llama-3-8B-Instruct,with_afterlines,21,283,7.4 ± 3.1
claude-3-haiku-20240307,with_afterlines,19,283,6.7 ± 2.9
claude-3-opus-20240229,with_afterlines,159,283,56.2 ± 5.8
claude-3-sonnet-20240229,with_afterlines,103,283,36.4 ± 5.6
deepseek-coder-1.3b-instruct,with_afterlines,4,283,1.4 ± 1.4
deepseek-coder-7b-instruct,with_afterlines,3,283,1.1 ± 1.2
gpt-3.5-turbo-0125,with_afterlines,38,283,13.4 ± 3.9
gpt-4-turbo,with_afterlines,181,283,64.0 ± 5.6
phi-3-mini-4k,with_afterlines,11,283,3.9 ± 2.3


Unnamed: 0_level_0,Unnamed: 1_level_0,General,General,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Horizon Categories Pass Ratio,Reason Categories Pass Ratio,Reason Categories Pass Ratio,Reason Categories Pass Ratio
Unnamed: 0_level_1,Unnamed: 1_level_1,Passed Count,Total Count,Variable,Global Variable,Function,Class,Library,If Condition,If Body,Loop Body
"(General, Model Name)","(General, Generation Mode)",Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
Meta-Llama-3-70B-Instruct,with_afterlines,119,283,41.0 ± 6.4,34.4 ± 7.4,43.6 ± 9.3,37.0 ± 12.9,66.7 ± 38.1,25.0 ± 11.4,36.7 ± 10.7,36.3 ± 9.3
Meta-Llama-3-8B-Instruct,with_afterlines,21,283,6.5 ± 3.2,5.7 ± 3.6,9.1 ± 5.3,5.6 ± 6.1,16.8 ± 30.0,8.9 ± 7.5,10.1 ± 6.6,6.9 ± 5.0
claude-3-haiku-20240307,with_afterlines,19,283,6.5 ± 3.2,5.7 ± 3.6,8.2 ± 5.1,13.0 ± 9.0,0.0 ± 0.0,8.9 ± 7.5,6.3 ± 5.4,4.9 ± 4.2
claude-3-opus-20240229,with_afterlines,159,283,56.0 ± 6.3,57.4 ± 7.8,58.2 ± 9.3,53.7 ± 13.3,83.4 ± 29.7,62.5 ± 12.6,59.5 ± 10.8,51.1 ± 9.6
claude-3-sonnet-20240229,with_afterlines,103,283,34.0 ± 6.1,35.0 ± 7.5,40.0 ± 9.2,37.0 ± 12.9,100.0 ± 0.0,23.2 ± 11.1,31.7 ± 10.4,30.4 ± 9.0
deepseek-coder-1.3b-instruct,with_afterlines,4,283,1.7 ± 1.7,1.3 ± 1.8,0.9 ± 1.8,1.8 ± 3.5,0.0 ± 0.0,0.0 ± 0.0,1.3 ± 2.5,4.0 ± 3.8
deepseek-coder-7b-instruct,with_afterlines,3,283,1.3 ± 1.5,0.6 ± 1.2,1.8 ± 2.5,0.0 ± 0.0,0.0 ± 0.0,0.0 ± 0.0,1.3 ± 2.5,2.9 ± 3.3
gpt-3.5-turbo-0125,with_afterlines,38,283,12.1 ± 4.2,14.7 ± 5.4,13.6 ± 6.4,9.3 ± 7.7,16.7 ± 29.8,19.6 ± 10.5,24.1 ± 9.5,12.8 ± 6.5
gpt-4-turbo,with_afterlines,181,283,63.4 ± 6.2,51.7 ± 7.8,64.6 ± 8.9,57.4 ± 13.2,83.3 ± 29.6,44.6 ± 12.9,60.8 ± 10.7,65.6 ± 9.2
phi-3-mini-4k,with_afterlines,11,283,4.3 ± 2.6,1.9 ± 2.1,5.4 ± 4.3,5.6 ± 6.0,16.6 ± 29.7,8.9 ± 7.5,5.0 ± 4.8,7.8 ± 5.2


Analysis and saving completed for Java Infilling


