## Import Utilities

In [None]:
import os
import sys
from IPython.display import display, HTML
# from matplotlib.figure import Figure

utils_path = os.path.abspath(os.path.join('..'))
sys.path.append(utils_path)

from Utils.models_analysis_class import ModelsAnalysis
from Utils.statistics_plots_analysis_utils import *

EXPERT_BOXPLOT_COLOR = '#E74C3C'
MODELS_BOXPLOT_COLOR = '#1F77B4'

def display_wrapped(text):
    """Show ``text`` in the notebook output with long lines wrapped.

    The text is placed inside a ``<div>`` styled with ``white-space: pre-wrap``
    so existing line breaks are kept while long lines wrap. The text is
    HTML-escaped first, so characters such as ``<``, ``>`` and ``&`` in a
    transcript or model response are shown literally instead of being
    interpreted as markup.

    Args:
        text: Value to display; it is converted with ``str()`` before escaping.
    """
    import html  # local import so the cell does not depend on another edit

    escaped_text = html.escape(str(text), quote=False)
    display(HTML("<div style='white-space: pre-wrap;'>{}</div>".format(escaped_text)))

def questionwise_agreement_heatmap(stat_group_df: pd.DataFrame, figsize=None, title=''):
    """Draw an annotated heatmap of ``stat_group_df`` with row and column means.

    The figure is a 2x2 grid: the main heatmap (top-left), the per-row means
    labelled "Question Avg" (top-right, the only panel with a colour bar), the
    per-column means labelled "Model Avg" (bottom-left) and a hidden
    bottom-right panel. All panels share a fixed 0-1 colour scale.

    Args:
        stat_group_df: Values to plot; its columns label the bottom panel.
        figsize: Optional pair ``(m, n)``. It is NOT used directly as the
            figure size: the figure is ``n`` wide and about ``0.5 * m`` high,
            and ``m``/``n`` also set the grid height/width ratios. When None,
            the frame's shape (rows, columns) is used instead.
        title: Text for the figure's suptitle.

    Returns:
        The matplotlib figure.
    """
    rows, cols = stat_group_df.shape if figsize is None else figsize

    fig_width = cols
    fig_height = fig_width * (0.5 * rows / float(cols))
    fig, axes = plt.subplots(
        2, 2,
        figsize=(fig_width, fig_height),
        gridspec_kw={'height_ratios': [rows, 1], 'width_ratios': [cols, 1]},
    )
    plt.subplots_adjust(top=0.90, wspace=0.05, hspace=0.1)

    main_ax, row_avg_ax = axes[0]
    col_avg_ax, unused_ax = axes[1]

    shared_style = dict(vmin=0, vmax=1, cmap='coolwarm', annot=True, fmt='.2f',
                        linewidths=.5, linecolor='black')

    # Main panel: the raw values
    sns.heatmap(stat_group_df, ax=main_ax, xticklabels=False, cbar=False, **shared_style)

    # Right panel: mean of every row, as a single column
    row_means = stat_group_df.mean(axis=1).values.reshape(-1, 1).round(2)
    sns.heatmap(row_means, ax=row_avg_ax, yticklabels=False, **shared_style)
    row_avg_ax.set_xticklabels(['Question Avg'], rotation=90)

    # Bottom panel: mean of every column, as a single row
    column_means = stat_group_df.mean(axis=0).values.reshape(1, -1).round(2)
    sns.heatmap(column_means, ax=col_avg_ax, cbar=False, **shared_style)
    col_avg_ax.set_xticklabels(stat_group_df.columns, rotation=90)
    col_avg_ax.set_yticklabels(['Model Avg'])

    # The bottom-right cell of the grid carries no data
    unused_ax.axis('off')

    fig.suptitle(title, fontsize=16)
    return fig

# Plot a single comparison of ZS and GS agreement values
def plot_comparison_bars(zs_agreement: pd.DataFrame, gs_agreement: pd.DataFrame, 
                         analysis_instance, ax=None, figsize=(10, 6), annotate=True, filename=None, dpi=500):
    """Plot ZS and GS agreement values as paired bars, one pair per column.

    The y-axis uses a broken scale: tick positions 0..7 are labelled
    -2, -1, 0, 0.2, ..., 1.0. Every bar starts at position 2 (the "0" label);
    negative values are drawn at unit scale while non-negative values are
    multiplied by 5 so that 0.2 spans exactly one tick. Annotations always
    show the original, unscaled value.

    Args:
        zs_agreement: Frame whose first row holds the ZS value for every
            column named in ``gs_agreement``.
        gs_agreement: Frame whose first row holds the GS values; its columns
            define the x tick labels and their order.
        analysis_instance: Object whose ``agreement_coef`` attribute is used
            as the y-axis label.
        ax: Axes to draw on. A new figure of size ``figsize`` is created when
            None.
        figsize: Size of the new figure; ignored when ``ax`` is given.
        annotate: Whether to write each value above its bar.
        filename: Optional output path. The figure that owns ``ax`` is saved
            there, with the format taken from the file extension.
        dpi: Resolution used when saving.

    Returns:
        The axes that were drawn on.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=figsize)

    xticklabels = gs_agreement.columns
    x = np.arange(len(xticklabels))  # the label locations
    width = 0.35  # the width of the bars

    light_blue = '#AEC7E8'
    dark_blue = '#1F77B4'

    zero_position = 2   # tick position that carries the "0" label
    positive_scale = 5  # one tick per 0.2 above zero

    def to_bar_height(value):
        # Negative values keep unit scale; the rest is stretched to the 0.2 ticks
        return value if value < 0 else value * positive_scale

    # Stay None when there is nothing to plot, so the legend can be skipped
    bar_zs = bar_gs = None
    for i, model in enumerate(xticklabels):
        zs_original_value = zs_agreement[model].iloc[0]
        gs_original_value = gs_agreement[model].iloc[0]

        zs_bar_value = to_bar_height(zs_original_value)
        gs_bar_value = to_bar_height(gs_original_value)

        bar_zs = ax.bar(x[i] - width/2, zs_bar_value, width, bottom=zero_position, color=light_blue)
        bar_gs = ax.bar(x[i] + width/2, gs_bar_value, width, bottom=zero_position, color=dark_blue)

        if annotate:
            ax.annotate(f'{zs_original_value:.2f}', xy=(x[i] - width/2, zs_bar_value + zero_position),
                        xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')
            ax.annotate(f'{gs_original_value:.2f}', xy=(x[i] + width/2, gs_bar_value + zero_position),
                        xytext=(0, 3), textcoords="offset points", ha='center', va='bottom')

    set_plot_properties(ax, ylabel=analysis_instance.agreement_coef)
    ax.set_xticks(x)
    ax.set_xticklabels(xticklabels, rotation=0)

    # The loop variable is named `tick` so it does not shadow the x locations above
    y_ticks = ['-2', '-1', '0'] + [str(round(tick, 1)) for tick in np.arange(0.2, 1.2, 0.2)]
    ax.set_yticks(np.arange(len(y_ticks)))
    ax.set_yticklabels(y_ticks)

    # Marker on the y-axis signalling the change of scale below zero
    ax.annotate('↯', xy=(0, 0), xytext=(0, 0.22),
                fontsize=15, va='top', ha='center', color='black',
                xycoords='axes fraction', textcoords='axes fraction')

    if bar_zs is not None:
        ax.legend([bar_zs, bar_gs], ['ZS', 'GS'])

    # plt.tight_layout()
    if filename:
        # Save the figure that owns `ax`; the "current" pyplot figure may be a
        # different one when the caller supplied the axes.
        extension = os.path.splitext(filename)[1].lstrip('.')
        ax.figure.savefig(filename, format=extension or None, dpi=dpi)

    return ax

# Plot one ZS-vs-GS comparison panel per model
def plot_multiple_comparisons(zs_qw_agreement, gs_qw_agreement, gs_analysis, ncols=2, filename=None, dpi=500):
    """Draw a grid of ZS/GS comparison bar charts, one panel per model.

    For every name in ``gs_analysis.model_names`` the matching column of both
    agreement frames is transposed into a single-row frame and passed to
    ``plot_comparison_bars`` (without value annotations). Panels left over in
    the grid are hidden.

    Args:
        zs_qw_agreement: Frame with one column per model holding ZS values.
        gs_qw_agreement: Frame with one column per model holding GS values.
        gs_analysis: Object providing ``model_names``; it is also forwarded to
            ``plot_comparison_bars``.
        ncols: Number of panel columns in the grid.
        filename: Optional output path; the format is taken from its extension.
        dpi: Resolution used when saving.

    Raises:
        ValueError: If ``gs_analysis.model_names`` is empty.
    """
    model_names = gs_analysis.model_names
    num_plots = len(model_names)
    if num_plots == 0:
        raise ValueError('gs_analysis.model_names is empty; there is nothing to plot')

    nrows = int(np.ceil(num_plots / ncols))

    # Each panel gets a 10 x 5 inch slot
    fig_width = 10 * ncols
    fig_height = 5 * nrows

    # squeeze=False always returns a 2-D array, so flattening also works for a
    # 1x1 grid (where plt.subplots would otherwise return a bare Axes object).
    fig, axs = plt.subplots(nrows, ncols, figsize=(fig_width, fig_height), squeeze=False)
    axs = axs.flatten()

    for ax, model_name in zip(axs, model_names):
        zs_data = zs_qw_agreement[[model_name]].T
        gs_data = gs_qw_agreement[[model_name]].T

        plot_comparison_bars(zs_data, gs_data, gs_analysis, ax, annotate=False)
        ax.set_title(f'Model: {model_name}')

    # Hide the panels of the grid that received no model
    for unused_ax in axs[num_plots:]:
        unused_ax.axis('off')

    if filename:
        # Save this figure explicitly rather than whatever pyplot considers current
        extension = os.path.splitext(filename)[1].lstrip('.')
        fig.savefig(filename, format=extension or None, dpi=dpi)
    plt.show()

#### Prepare and Save Expert Scores

In [None]:
# videos_info_file = '../../Getting_Transcripts/merged_filtered_videos_transcripts.csv'
# videos_info_df = pd.read_csv(videos_info_file, usecols=['Video ID'], encoding='utf-8')
# original_experts_file = '../../../Videos_and_DISCERN_data/videos_info_and_scores.xlsx'
# score_columns_to_read = ['Video ID', 'Topic', 'DISCERN1', 'DISCERN2'] + [f'DISCERN1 Q{i}' for i in range(1, 16)] + [f'DISCERN2 Q{i}' for i in range(1, 16)]
# experts_df = pd.read_excel(original_experts_file, usecols=score_columns_to_read)
# experts_df = merge_dataframes(videos_info_df, experts_df, experts_df.columns)

# experts_df.rename(columns=lambda x: x.replace('DISCERN', 'Expert'), inplace=True)

# # Find the indices of rows where any of the Q1 to Q15 columns have non-null values, to fill the NaN with 1
# # This is to fill Cluster Headache N/A questions with 1
# indices = experts_df[EXPERT1_COLUMNS].notna().any(axis=1)
# experts_df.loc[indices, EXPERT1_COLUMNS] = experts_df.loc[indices, EXPERT1_COLUMNS].fillna(1)
# # Sum the total after filling Nan with 1
# experts_df.loc[indices, 'Expert1'] = experts_df.loc[indices, EXPERT1_COLUMNS].sum(axis=1)

# # Calculate the mean of 'Expert1' and 'Expert2' columns where 'Expert2' is not NaN, and round it up
# mean_discern = np.where(experts_df['Expert2'].notnull(), 
#                         experts_df[['Expert1', 'Expert2']].mean(axis=1),
#                         experts_df['Expert1'])
# experts_df.insert(4, 'Experts_Avg', mean_discern)

# experts_df.to_csv('../../../Videos_and_DISCERN_data/filtered_experts_scores.csv', index=False)
# print(experts_df.shape)
# experts_df.head()

## Load Data

In [None]:
# IDs of the videos that have transcripts
videos_info_file = '../../Getting_Transcripts/merged_filtered_videos_transcripts.csv'
videos_info_df = pd.read_csv(videos_info_file, usecols=['Video ID'], encoding='utf-8')

# Expert workbook: totals plus the 15 per-question scores of both DISCERN raters
original_experts_file = '../../../Videos_and_DISCERN_data/videos_info_and_scores.xlsx'
discern_question_columns = [
    f'DISCERN{rater} Q{question}'
    for rater in (1, 2)
    for question in range(1, 16)
]
score_columns_to_read = ['Video ID', 'Topic', 'DISCERN1', 'DISCERN2'] + discern_question_columns
original_experts_df = pd.read_excel(original_experts_file, usecols=score_columns_to_read)

# Quick sanity check: frame size and number of rows per topic
print(original_experts_df.shape)
print(original_experts_df['Topic'].value_counts())

Zero-Shot Prompting: 7 topics

In [None]:
# Display name of each model -> file name of its zero-shot (ZS) response CSV.
# The dict is passed to ModelsAnalysis together with `models_dir`
# (presumably the folder these file names are relative to; confirm in ModelsAnalysis).
zs_model_files_dict = {
    'BioMistral': 'BioMistral-response.csv',
    'Claude-3 Sonnet': 'claude-3-sonnet-20240229-response.csv',
    'Falcon': 'falcon-40b-instruct-response.csv',
    'GPT-4 Turbo': 'gpt-4-turbo-response.csv',
    'GPT-4o': 'gpt-4o-last_5_topics-ZS_prompting-response.csv',
    'Gemini-1.0 Pro': 'gemini-1.0-pro-latest-response.csv',
    'KTO Mistral': 'KTO_Mistral_PairRM-response.csv',
    'Llama-3 70B': 'Llama-3-70B-Instruct-response.csv',
    'Llama-3 8B': 'Llama-3-8B-Instruct-response user-only-prompt.csv',
    'Meerkat': 'meerkat-response.csv',
    'Mistral': 'Mistral-7B-Instruct-response.csv',
    'Mixtral 8x22B': 'Mixtral-8x22B-Instruct-response.csv',
    'Mixtral 8x7B': 'Mixtral-8x7B-Instruct-response.csv',
    'MultiVerse': 'MultiVerse_70B-response.csv',
    'Orca-2': 'Orca-2-do_sample=false-response.csv',
    'Phi-3 mini': 'Phi-3-mini-4k-instruct-response.csv',
    'Qwen-1.5': 'Qwen1_5-72B-response.csv',
    'Rhea': 'Rhea-response.csv',
    'Vicuna': 'vicuna-33b-response.csv',
    'Yi': 'Yi-34B-response.csv',
}

# Locations used by the rest of the notebook
figures_dir = '../../../Lancet_Paper/Figures'  # where figures are saved
experts_file = '../../../Videos_and_DISCERN_data/filtered_experts_scores.csv'  # expert scores CSV
models_dir = '../LLMs_Responses'  # folder holding the model response files
# Topic keys handed to ModelsAnalysis
# NOTE(review): presumably abbreviations of the topics to include; confirm their meaning
topics_keys = ['SB', 'FF', 'CH', 'TF', 'PN']

Zero-shot Prompting: ISA topic

In [None]:
# zs_model_files_dict = {
#     'Claude-3 Sonnet': 'claude-3-sonnet-20240229-diabetes-ZS_prompting-response.csv',
#     # 'Gemini-1.0 Pro': 'gemini-1.0-pro-latest-diabetes-ZS_prompting-response.csv',
#     # 'Gemini-1.5 Pro': 'gemini-1.5-pro-latest-diabetes-ZS_prompting-response.csv',
#     'Gemini-1.5 Flash': 'gemini-1.5-flash-diabetes-ZS_prompting-response.csv',
#     'GPT-4o': 'gpt-4o-diabetes-ZS_prompting-response.csv',
# }

# figures_dir = '../../../ISA_Paper/Figures'
# experts_file = '../../../ISA_Paper/Data/diabetes_experts_scores.csv'
# models_dir = '../../../ISA_Paper/Data/Results'
# topics_keys = ['ISA']

Zero-shot: binary questions: first 2 topics

In [None]:
# zs_model_files_dict = {
#     # 'Claude-3 Sonnet': 'claude-3-sonnet-20240229-diabetes-zero_shot_prompting-response.csv',
#     'Gemini-1.0 Pro': 'gemini-1.0-pro-first_2_topics-ZS_prompting-binary_questions-response.csv',
#     'Gemini-1.5 Pro': 'gemini-1.5-pro-latest-first_2_topics-ZS_prompting-binary_questions-response.csv',
#     'GPT-4o': 'gpt-4o-first_2_topics-ZS-binary_questions-response.csv',
# }

In [None]:
# Make sure the figures folder exists. exist_ok=True makes this a no-op when
# the folder is already there and avoids the check-then-create race of a
# separate os.path.exists() test.
os.makedirs(figures_dir, exist_ok=True)
fig_format = 'pdf'
fig_dpi = 500

# Rating scale and agreement settings handed to ModelsAnalysis
categories = [1, 2, 3, 4, 5]
agreement_coef = 'Brennan-Prediger Kappa'
weights_type = 'quadratic'

zs_analysis = ModelsAnalysis(zs_model_files_dict, experts_file, models_dir, topics_keys,
                             categories, agreement_coef, weights_type)

# Load the expert scores, then the model responses
zs_analysis.load_experts_data()
zs_analysis.process_models()

print('Data shape:', zs_analysis.group_df.shape)
zs_analysis.group_df.head(1)

## Zero-Shot Analysis

### Total Score Analysis
Total scores out of 75.

Experts Agreement

In [None]:
# Agreement between the two experts on total scores, shown with two decimals
experts_total_agreement = zs_analysis.calculate_total_experts_agreement()
print('Total score agreement between Expert 1 and Expert 2:', round(experts_total_agreement, 2))

Distribution of Total Experts Scores

In [None]:
# Histogram of the experts' average total scores.
# Bin edges run from 15 to 75 in steps of 5.
total_bins = list(range(15,80,5))
fig, _ = create_plot('histplot', zs_analysis.group_df, x='Experts_Avg', bins=total_bins,
            xlabel='Experts_Avg Scores', ylabel='Frequency',
            figsize=(10,5))

# fig_title = "Frequency distribution of experts' total average scores"
# fig_file_name = os.path.join(figures_dir, fig_title + '.' + fig_format)
# fig.savefig(fig_file_name, format=fig_format, dpi=fig_dpi)

Distribution of Scores Across Questions

In [None]:
# One histogram panel per question, laid out as a 3 x 5 grid with shared axes
fig, axes = plt.subplots(3, 5, figsize=(15, 10), sharex=True, sharey=True)
axes = axes.flatten()  # 1-D, so a panel can be picked by question position

# Bin edges half a unit either side of every category value
qw_bins = [category - 0.5 for category in categories]
qw_bins.append(categories[-1] + 0.5)

# Common y range: number of rows plus some headroom
max_y_lim = len(zs_analysis.group_df) + 5

question_pairs = zip(EXPERTS_AVG_COLUMNS, QUESTIONS_COLUMNS)
for panel_idx, (expert_column, q_column) in enumerate(question_pairs):
    create_plot(
        'histplot', zs_analysis.group_df,
        x=expert_column, ax=axes[panel_idx],
        title=q_column, xlabel='Score', ylabel='Frequency',
        ylim=(0, max_y_lim), bins=qw_bins,
    )

# fig.suptitle("Distribution of experts' average scores of individual questions")

plt.tight_layout()
# plt.show()

# fig_title = "Distribution of experts' average scores of individual questions"
# fig_file_name = os.path.join(figures_dir, fig_title + '.' + fig_format)
# fig.savefig(fig_file_name, format=fig_format, dpi=fig_dpi)

#### Descriptive Statistics

In [None]:
zs_total_expert_models_agreement = zs_analysis.calculate_total_expert_models_agreement(models_in_order=True)

# Reverse the row order before drawing the horizontal bars
zs_descriptive_stat_df = zs_analysis.generate_descriptive_stat().iloc[::-1]

mean_color = '#1F77B4'  # dark blue
std_color = '#FFA07A'   # Light salmon

# Horizontal bar chart, one colour per statistic column
bars = zs_descriptive_stat_df.plot(kind='barh', figsize=(14, 10), color=[mean_color, std_color])

# Write every bar's value just to the right of its end
for bar in bars.patches:
    width = bar.get_width()
    bar_middle = bar.get_y() + bar.get_height() / 2
    plt.annotate(f'{width:.2f}', xy=(width, bar_middle),
                 xytext=(3, 0), textcoords='offset points', ha='left', va='center')

set_plot_properties(bars, xlabel='Values', ylabel='Models')

# fig_title = "Mean and standard deviation of total scores by expert and LLMs for ZS prompting"
# fig_file_name = os.path.join(figures_dir, fig_title + '.' + fig_format)
# plt.savefig(fig_file_name, format=fig_format, dpi=fig_dpi)

Box Plot

In [None]:
# Every model box shares one colour; the experts' box gets its own
palette = dict.fromkeys(zs_analysis.model_names, MODELS_BOXPLOT_COLOR)
palette['Experts_Avg'] = EXPERT_BOXPLOT_COLOR

# Experts first, then the models
boxplot_columns = ['Experts_Avg'] + zs_analysis.model_names
fig, _ = create_plot(
    'boxplot',
    data=zs_analysis.group_df[boxplot_columns],
    palette=palette,
    figsize=(12,6),
    ylim=(10, 80),
    xlabel='Models',
    ylabel='Scores',
    xticks_rotation=90,
)

# fig_title = "Box-and-whisker plot of expert and models total scores for ZS prompting"
# fig_file_name = os.path.join(figures_dir, fig_title + '.' + fig_format)
# fig.savefig(fig_file_name, format=fig_format, dpi=fig_dpi)

#### Expert-Models Inter-Rater Agreement

In [None]:
# Bar chart of the expert-model agreement on total scores
total_agreement_plot_options = dict(ylabel=agreement_coef, xticks_rotation=90, figsize=(14, 5))
fig, _ = create_plot('bar', zs_total_expert_models_agreement, **total_agreement_plot_options)

# fig_title = "Expert-Model inter-rater agreement on total scores for ZS prompting"
# fig_file_name = os.path.join(figures_dir, fig_title + '.' + fig_format)
# fig.savefig(fig_file_name, format=fig_format, dpi=fig_dpi)

In [None]:
# group_df_sorted = zs_analysis.group_df.sort_values(by='Experts_Avg')

# fig, ax = plt.subplots(figsize=(15, 6))

# # Scatter plot for Experts_Avg
# ax.scatter(group_df_sorted['Video ID'], group_df_sorted['Experts_Avg'], color='blue', label='Experts_Avg')

# models_to_plot = ['Gemini-1.0 Pro']#, 'MultiVerse', 'GPT-4 Turbo']
# # Scatter plot for each model
# for model in models_to_plot:
#     ax.scatter(group_df_sorted['Video ID'], group_df_sorted[model], label=model)
        
# set_plot_properties(ax, xlabel='Video ID', ylabel='Scores',
#                     title='Scatter Plot of Experts_Avg and Models')
# plt.legend()
# plt.grid(True)

### Question-Wise Analysis
Scores for each of the 15 individual questions

Experts Inter-Rater Agreement

In [None]:
# Question-wise agreement between the two experts
qw_experts_agreement = zs_analysis.calculate_qw_experts_agreement()

experts_plot_options = dict(
    figsize=(10, 5),
    xlabel=agreement_coef,
    ylabel='Questions',
    xticks_rotation=0,
)
fig, _ = create_plot('barh', data=qw_experts_agreement, **experts_plot_options)

# fig_title = "Expert-Expert question-wise agreement (before removing discrepancy)"
# fig_file_name = os.path.join(figures_dir, fig_title + '.' + fig_format)
# fig.savefig(fig_file_name, format=fig_format, dpi=fig_dpi)

Expert-Model Inter-Rater Agreement

In [None]:
# zs_qw_expert_models_agreement = zs_analysis.calculate_qw_expert_models_agreement()

# _ = questionwise_agreement_heatmap(zs_qw_expert_models_agreement, figsize=(14,14))

#### Removing distant ratings between Expert 1 and Expert 2

In [None]:
# NOTE(review): presumably discards question ratings where the two experts
# differ by more than `max_diff`; confirm in ModelsAnalysis.remove_distant_ratings
zs_distant_ratings_result = zs_analysis.remove_distant_ratings(
    QUESTIONS_COLUMNS, EXPERT1_COLUMNS, EXPERT2_COLUMNS, max_diff=1
)
zs_cleaned_group_df, num_of_distant_ratings_per_q = zs_distant_ratings_result

In [None]:
# Non-null expert-average entries per question in the cleaned frame
count_notna = zs_cleaned_group_df[EXPERTS_AVG_COLUMNS].notna().sum()

# One-column frame indexed by question label, shown transposed as a single row
videos_count_df = pd.DataFrame(count_notna.values, index=QUESTIONS_COLUMNS, columns=['Count'])
videos_count_df.T

Experts Inter-Rater Agreement

In [None]:
# Question-wise expert agreement, recomputed on the cleaned frame
qw_experts_agreement = zs_analysis.calculate_qw_experts_agreement(zs_cleaned_group_df)

cleaned_experts_plot_options = dict(
    figsize=(10, 5),
    xlabel=agreement_coef,
    ylabel='Questions',
    xlim=(-0.32,1),
    xticks_rotation=0,
)
fig, _ = create_plot('barh', data=qw_experts_agreement, **cleaned_experts_plot_options)

# fig_title = "Expert-Expert question-wise agreement (after removing discrepancy)"
# fig_file_name = os.path.join(figures_dir, fig_title + '.' + fig_format)
# fig.savefig(fig_file_name, format=fig_format, dpi=fig_dpi)

#### Expert-Model Inter-Rater Agreement

In [None]:
# Question-wise agreement between the experts and every model (ZS prompting)
zs_qw_expert_models_agreement = zs_analysis.calculate_qw_expert_models_agreement()

# NOTE: `title` is not passed to the heatmap, so the figure has no suptitle;
# in this cell it only feeds the commented-out file name.
title = "Expert-Model inter-rater agreement on individual question scores for ZS prompting"
fig = questionwise_agreement_heatmap(zs_qw_expert_models_agreement, figsize=(14,14))

# fig_file_name = os.path.join(figures_dir, title + '.' + fig_format)
# fig.savefig(fig_file_name, format=fig_format, dpi=fig_dpi)

In [None]:
def filtering_qw_agreement(qw_agreement_df, avg_threshold=0.3):
    '''Return only the columns whose mean exceeds `avg_threshold`.

    Column means are rounded to two decimals before the comparison, and the
    comparison is strict: a column whose rounded mean equals the threshold is
    dropped.
    '''
    rounded_column_means = qw_agreement_df.mean(axis=0).round(2)
    passes_threshold = rounded_column_means.gt(avg_threshold).to_numpy()
    kept_columns = rounded_column_means.index[passes_threshold].tolist()
    return qw_agreement_df[kept_columns]

# cleaned_qw_agreement_filtered = filtering_qw_agreement(zs_qw_expert_models_agreement)

# _ = questionwise_agreement_heatmap(cleaned_qw_agreement_filtered, figsize=(10,6))

Percent of Correct Scores

In [None]:
# percent_of_correct_scores_df = pd.DataFrame(index=QUESTIONS_COLUMNS, columns=zs_analysis.model_names, dtype=int)

# diff = 1.5
# model_columns = []
# for model_name in zs_analysis.model_names:
#     for question, expert_col in zip(QUESTIONS_COLUMNS, EXPERTS_AVG_COLUMNS):
#         model_col = ' '.join([model_name, question])
#         valid_rows = zs_cleaned_group_df[expert_col].notna()
        
#         diff_series = abs(zs_cleaned_group_df.loc[valid_rows, expert_col] - zs_analysis.group_df.loc[valid_rows, model_col])
        
#         # Fraction of differences that are smaller than `diff`
#         correct_scores_percent = diff_series.lt(diff).sum() / diff_series.count()
#         # Store the result in the DataFrame
#         percent_of_correct_scores_df.at[question, model_name] = correct_scores_percent

# _ = questionwise_agreement_heatmap(percent_of_correct_scores_df, title=f'Percent of differences < {diff}')#, figsize=(14,6))

## Models Inter-Rater Agreement

In [None]:
# good_threshold = 0.6
# bad_threshold = 0.2

# good_models = zs_total_expert_models_agreement.columns[zs_total_expert_models_agreement.loc['Brennan-Prediger Kappa'] > good_threshold].tolist()
# moderate_models = zs_total_expert_models_agreement.columns[(zs_total_expert_models_agreement.loc['Brennan-Prediger Kappa'] <= good_threshold) 
#                                         & (zs_total_expert_models_agreement.loc['Brennan-Prediger Kappa'] > bad_threshold)].tolist()
# bad_models = zs_total_expert_models_agreement.columns[zs_total_expert_models_agreement.loc['Brennan-Prediger Kappa'] <= bad_threshold].tolist()

In [None]:
# # models = good_models + moderate_models + bad_models
# zs_models_agreement_df = zs_analysis.calculate_models_agreement()

# _ = create_plot('heatmap', data=zs_models_agreement_df,
#                 figsize=(12,6),
#                 title=f'Models Total Score Agreement Heatmap',
#                 xticks_rotation=90)

## Chain-of-Thought

In [None]:
# Display name of each model -> file name of its GS-prompting response CSV.
# The keys are the same display names used in `zs_model_files_dict`.
gs_model_files_dict = {
    'GPT-4o': 'gpt-4o-last_5_topics-GS_prompting-response.csv',
    'Gemini-1.0 Pro': 'gemini-1.0-pro-latest-last_5_topics-GS_prompting-response.csv',
    'MultiVerse': 'MultiVerse_70B-last_5_topics-GS_prompting-response.csv',
    'GPT-4 Turbo': 'gpt-4-turbo-last_5_topics-GS_prompting-response.csv',
    # 'Llama-3 70B': 'Llama-3-70B-Instruct-last_5_topics-GS_prompting-response.csv',
    'Claude-3 Sonnet': 'claude-3-sonnet-20240229-last_5_topics-GS_prompting-response.csv',
    'Orca-2': 'Orca-2-last_5_topics-GS_prompting-response.csv',
    'KTO Mistral': 'KTO_Mistral_PairRM-last_5_topics-GS_prompting-response.csv',
    'Phi-3 mini': 'Phi-3-mini-128k-instruct-last_5_topics-GS_prompting-response.csv',
}

In [None]:
# gs_model_files_dict = {
#     'Claude-3 Sonnet': 'claude-3-sonnet-20240229-diabetes-GS_prompting-response.csv',
#     # 'Gemini-1.0 Pro': 'gemini-1.0-pro-latest-diabetes-GS_prompting-response.csv',
#     # 'Gemini-1.5 Pro': 'gemini-1.5-pro-latest-diabetes-GS_prompting-response.csv',
#     'Gemini-1.5 Flash': 'gemini-1.5-flash-diabetes-GS_prompting-response.csv',
#     'GPT-4o': 'gpt-4o-diabetes-GS_prompting-response.csv',
# }

Chain-of-Thought Prompting: last 3 topics

In [None]:
def reorder_model_names(base_analysis, target_analysis):
    """
    Put ``target_analysis.model_names`` in the order used by ``base_analysis``.

    The new list is built by walking the base names and keeping those that the
    target also has. Target names that do not occur in the base are therefore
    dropped. ``target_analysis`` is modified in place; nothing is returned.

    Args:
        base_analysis: Instance of ModelsAnalysis whose order is followed.
        target_analysis: Instance of ModelsAnalysis whose ``model_names`` is replaced.
    """
    names_in_target = target_analysis.model_names
    reordered_names = []
    for candidate in base_analysis.model_names:
        if candidate in names_in_target:
            reordered_names.append(candidate)
    target_analysis.model_names = reordered_names

# Build the GS analysis with the same experts file, topics and agreement settings as ZS
gs_analysis = ModelsAnalysis(gs_model_files_dict, experts_file, models_dir, topics_keys,
                         categories, agreement_coef, weights_type)

gs_analysis.load_experts_data()
gs_analysis.process_models()

# Reordering models in GS to be same order as ZS, for comparison
reorder_model_names(zs_analysis, gs_analysis)

# NOTE(review): the ZS counterpart is computed with models_in_order=True; confirm
# that the default used here yields the same column order.
gs_total_expert_models_agreement = gs_analysis.calculate_total_expert_models_agreement()

# This rebinds `num_of_distant_ratings_per_q`, which the ZS cell also assigns.
gs_cleaned_group_df, num_of_distant_ratings_per_q = gs_analysis.remove_distant_ratings(
    QUESTIONS_COLUMNS, EXPERT1_COLUMNS, EXPERT2_COLUMNS, max_diff=1)

# NOTE(review): `gs_cleaned_group_df` is not used here; the agreement is computed on
# `gs_analysis.group_df`, whereas the ZS cell calls this method without arguments.
# Confirm that both calls operate on the cleaned ratings.
gs_qw_expert_models_agreement = gs_analysis.calculate_qw_expert_models_agreement(group_df=gs_analysis.group_df)

In [None]:
# Descriptive statistics for the GS run; rounding applies only to the displayed
# copy, the stored frame keeps full precision.
gs_descriptive_stat = gs_analysis.generate_descriptive_stat()
gs_descriptive_stat.round(2)

In [None]:
# Every model box shares one colour; the experts' box gets its own
palette = dict.fromkeys(gs_analysis.model_names, MODELS_BOXPLOT_COLOR)
palette['Experts_Avg'] = EXPERT_BOXPLOT_COLOR

# Experts first, then the GS models
gs_boxplot_columns = ['Experts_Avg'] + gs_analysis.model_names
_, _ = create_plot(
    'boxplot',
    data=gs_analysis.group_df[gs_boxplot_columns],
    palette=palette,
    figsize=(10,5),
    ylim=(10, 80),
    xlabel='Models',
    ylabel='Scores',
    title='Box plot of expert and models total scores after GS prompting',
    xticks_rotation=90,
)

In [None]:
# fig_title = 'Expert-Model inter-rater agreement on total scores for ZS vs GS prompting'
# fig_file_name = os.path.join(figures_dir, fig_title + '.' + fig_format)
# fig.savefig(fig_file_name, format=fig_format, dpi=fig_dpi)

# Total-score agreement, ZS vs GS. The ZS frame is restricted to (and ordered by)
# the GS model names so that each pair of bars refers to the same model.
_ = plot_comparison_bars(zs_total_expert_models_agreement[gs_analysis.model_names], 
                         gs_total_expert_models_agreement, 
                         gs_analysis, 
                        #  filename=fig_file_name, dpi=fig_dpi,
                         figsize=(12,6))

In [None]:
# fig_title = 'Expert-Model inter-rater agreement on individual question scores for ZS vs GS prompting'
# fig_file_name = os.path.join(figures_dir, fig_title + '.' + fig_format)
# fig.savefig(fig_file_name, format=fig_format, dpi=fig_dpi)

# Question-wise agreement, ZS vs GS: one panel per model in gs_analysis.model_names
plot_multiple_comparisons(zs_qw_expert_models_agreement, 
                          gs_qw_expert_models_agreement, 
                          gs_analysis, 
                        #   filename=fig_file_name, dpi=fig_dpi,
                          ncols=2)

In [None]:
# fig_title = "Average of models' question-wise ZS and GS prompting agreement"
# fig_file_name = os.path.join(figures_dir, fig_title + '.' + fig_format)
# fig.savefig(fig_file_name, format=fig_format, dpi=fig_dpi)

# Average each question's agreement across models, then transpose so that the
# questions become columns of a single 'mean' row.
# NOTE(review): the ZS mean is taken over every column of
# `zs_qw_expert_models_agreement`, while the GS mean covers only the GS models.
# Confirm whether ZS should be restricted to `gs_analysis.model_names` first.
zs_mean_df = zs_qw_expert_models_agreement.mean(axis=1).to_frame(name='mean').T
gs_mean_df = gs_qw_expert_models_agreement.mean(axis=1).to_frame(name='mean').T

_ = plot_comparison_bars(zs_mean_df, 
                         gs_mean_df, 
                         gs_analysis, 
                        #  filename=fig_file_name, dpi=fig_dpi,
                         figsize=(20,7))

In [None]:
# _ = questionwise_agreement_heatmap(gs_qw_expert_models_agreement, figsize=(12,6))

## Displaying samples of responses by LLMs

In [None]:
# Model whose individual responses are inspected in the cells that follow
model_name = 'GPT-4o'

def get_model_responses(model_name, models_dir, analysis_instance, model_files_dict) -> pd.DataFrame:
    """Read one model's response CSV and merge it with the analysis group frame.

    The CSV at ``models_dir / model_files_dict[model_name]`` is read as UTF-8
    and handed to ``merge_dataframes`` together with
    ``analysis_instance.group_df`` and the column list
    ``['Video ID'] + EXPERTS_AVG_COLUMNS``.
    NOTE(review): presumably those are the columns taken over from the group
    frame; confirm against ``merge_dataframes``.
    """
    responses_csv_path = os.path.join(models_dir, model_files_dict[model_name])
    raw_responses_df = pd.read_csv(responses_csv_path, encoding='utf-8')
    merge_columns = ['Video ID'] + EXPERTS_AVG_COLUMNS
    return merge_dataframes(raw_responses_df, analysis_instance.group_df, merge_columns)

zs_model_responses_df = get_model_responses(
    model_name, models_dir, zs_analysis, zs_model_files_dict)
gs_model_responses_df = get_model_responses(
    model_name, models_dir, gs_analysis, gs_model_files_dict)

In [None]:
def filter_and_display_examples(zs_model_responses_df: pd.DataFrame, q_num, expert_score, example_num, diff, model_name):
    """Show one model response for a given question, expert score and score gap.

    Rows are kept when ``Experts_Avg Q{q_num}`` equals ``expert_score`` or
    ``expert_score - 0.5`` and the absolute gap between ``Q{q_num}`` and that
    expert average equals ``diff`` or ``diff - 0.5``. For the chosen match the
    Video ID, transcript, both scores and the model's response are shown.

    Args:
        zs_model_responses_df: Frame with the columns ``Video ID``,
            ``Transcript``, ``Q{q_num}``, ``Response_{q_num}`` and
            ``Experts_Avg Q{q_num}``.
        q_num: Question number used to build the column names.
        expert_score: Upper value of the accepted expert-average pair.
        example_num: 1-based position of the match to display.
        diff: Upper value of the accepted absolute-gap pair.
        model_name: Name used in the printed header.

    Returns:
        None. Results and problems are reported through ``print``/``display``.
    """
    expert_col = f'Experts_Avg Q{q_num}'
    score_col = f'Q{q_num}'

    # Define the range for expert scores and differences
    expert_score_range = [expert_score, expert_score - 0.5]
    diff_range = [diff, diff - 0.5]

    # Keep rows whose expert score is one of the accepted values
    filtered_to_expert_score_df = zs_model_responses_df[
        zs_model_responses_df[expert_col].isin(expert_score_range)
    ]
    if filtered_to_expert_score_df.empty:
        print(f'No examples found where the expert score is within {expert_score_range}')
        return

    # Keep rows whose |model score - expert score| is one of the accepted gaps
    # (column-wise comparison instead of a per-row apply)
    abs_gap = (filtered_to_expert_score_df[score_col] - filtered_to_expert_score_df[expert_col]).abs()
    has_wanted_gap = abs_gap.eq(diff_range[0]) | abs_gap.eq(diff_range[1])
    filtered_to_diff_df = filtered_to_expert_score_df[has_wanted_gap]
    if filtered_to_diff_df.empty:
        print(f'No example found where the expert score is within {expert_score_range} and the difference is within {diff_range}')
        return

    number_of_examples = len(filtered_to_diff_df)
    print(f'There are {number_of_examples} example(s) for the selected difference and expert score')

    # `example_num` is 1-based. Without this check, 0 or a negative value would
    # silently select a row counted from the end of the matches.
    if example_num < 1:
        print(f"Error: `example_num` should be between 1 and {number_of_examples}")
        return
    if example_num > number_of_examples:
        print(f"Error: `example_num` should be {number_of_examples} or less")
        return

    example_row = filtered_to_diff_df.iloc[example_num - 1]
    transcript = example_row['Transcript']
    response = example_row[f'Response_{q_num}']
    q_score = example_row[score_col]
    avg_score = example_row[expert_col]
    actual_diff = abs(q_score - avg_score)
    print('Video ID:', example_row['Video ID'])
    display_wrapped("Transcript: " + transcript)
    print(f'{model_name} Response_{q_num} where Experts_Avg Q{q_num}={avg_score} and Q{q_num}={q_score} have a difference of {actual_diff:.1f}:')
    display_wrapped(response)

# Parameters of the example to display
question_num = 9
expert_score = 4
difference = 4            # 0 for high agreement, 2 for medium, and 4 for low
example_num = 1     # 1-based position among the examples that pass both filters
filter_and_display_examples(zs_model_responses_df, question_num, expert_score, example_num, difference, model_name)

Video IDs where the agreement on ZS is better than the agreement on GS prompting

In [None]:
def get_video_ids_for_condition(zs_model_responses_df, gs_model_responses_df, q_num, diff_limit=1.5):
    """
    Find Video IDs where the absolute difference for a specific question number is less than `diff_limit`
    in zs_model_responses_df but greater than `diff_limit` in gs_model_responses_df.

    The two frames are paired on 'Video ID' rather than on their row labels, so
    the result does not depend on both frames listing the videos in the same
    order. Videos missing from the GS frame never satisfy the condition. If
    the GS frame lists a video more than once, its first row is used.

    Args:
        zs_model_responses_df: Frame with 'Video ID', `Q{q_num}` and `Experts_Avg Q{q_num}`.
        gs_model_responses_df: Frame with the same three columns.
        q_num: Question number used to build the column names.
        diff_limit: Threshold applied strictly on both sides.

    Returns:
        List of Video IDs, in the row order of zs_model_responses_df.
    """
    score_col = f'Q{q_num}'
    expert_col = f'Experts_Avg Q{q_num}'

    def abs_diff_by_video(responses_df, diff_name):
        # |model score - expert average| next to its Video ID, with a fresh row index
        abs_diff = (responses_df[score_col] - responses_df[expert_col]).abs()
        return pd.DataFrame({
            'Video ID': responses_df['Video ID'].to_numpy(),
            diff_name: abs_diff.to_numpy(),
        })

    zs_diffs = abs_diff_by_video(zs_model_responses_df, 'zs_diff')
    gs_diffs = abs_diff_by_video(gs_model_responses_df, 'gs_diff').drop_duplicates('Video ID')

    # A left merge keeps every ZS row in its original order; unmatched videos
    # get NaN for gs_diff, which compares as False below.
    paired = zs_diffs.merge(gs_diffs, on='Video ID', how='left')
    meets_condition = (paired['zs_diff'] < diff_limit) & (paired['gs_diff'] > diff_limit)

    return paired.loc[meets_condition, 'Video ID'].tolist()

# Example: videos where ZS is within the limit for question 5 but GS is not
question_num = 5
difference_limit = 1.5
video_ids = get_video_ids_for_condition(
    zs_model_responses_df,
    gs_model_responses_df,
    question_num,
    difference_limit,
)
print('Video IDs meeting the condition:', video_ids)

Display Expert, ZS, and GS Responses for Sample Video


In [None]:
sample_video_id = 'OO5oDaG45kE'

# Look the video up by its ID in each frame separately. The ZS and GS frames
# are built independently, so a row label taken from one is not guaranteed to
# point at the same video in the other.
zs_row = zs_model_responses_df.loc[zs_model_responses_df['Video ID'] == sample_video_id].iloc[0]
gs_row = gs_model_responses_df.loc[gs_model_responses_df['Video ID'] == sample_video_id].iloc[0]

print('Expert score:', zs_row[f'Experts_Avg Q{question_num}'], end='\n')

zs_response = zs_row[f'Response_{question_num}']
display_wrapped('ZS response: ' + zs_response)

print("########################################################################")

gs_response = gs_row[f'Response_{question_num}']
display_wrapped('GS response: ' + gs_response)