In [None]:
%load_ext autoreload
%autoreload 2

import json
import re
import pickle
import spacy
import glob
import random
from pypremise import Premise, data_loaders
from data.constants import DATASETS
import matplotlib.font_manager as fm
import statsmodels.formula.api as smf
from contextlib import redirect_stdout
from data.constants import DATASETS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
import os
import seaborn as sns

In [None]:
# Register the Computer Modern Unicode font files with matplotlib so they can
# be selected by family name, and keep a handle on the bold face for titles.
for _cmu_path in (
    "/usr/share/fonts/truetype/cmu/cmunrm.ttf",  # regular
    "/usr/share/fonts/truetype/cmu/cmunbx.ttf",  # bold
):
    fm.fontManager.addfont(_cmu_path)
bold_font = fm.FontProperties(fname="/usr/share/fonts/truetype/cmu/cmunbx.ttf")

In [None]:
# Read the family name straight from the regular font file so the rcParams
# below reference exactly the name matplotlib reports for it.
_regular_face = fm.FontProperties(fname="/usr/share/fonts/truetype/cmu/cmunrm.ttf")
cmu_serif = _regular_face.get_name()
print("Font name:", cmu_serif)  # should be "CMU Serif"

In [None]:
# Notebook-wide matplotlib style: Computer Modern text and math, paper-sized fonts.
_paper_style = {
    "text.usetex": False,      # keep the external LaTeX renderer off
    "mathtext.fontset": "cm",  # Computer Modern glyphs for math text
    "font.family": cmu_serif,
    "font.size": 16,           # base font size
    "axes.titlesize": 18,      # axes titles
    "axes.labelsize": 16,      # axis labels
    "xtick.labelsize": 14,     # x tick labels
    "ytick.labelsize": 14,     # y tick labels
    "legend.fontsize": 16,     # legend entries
}
plt.rcParams.update(_paper_style)

In [None]:
# Row order used for the model axis of the plots below: models are listed in
# pairs (two Nemotron, two Gemma, two Qwen) followed by gemini-2.5-flash.
# The heatmap's hard-coded separators at rows 2, 4 and 6 rely on this pairing.
model_order = [
    "Llama-3.1-Nemotron-Nano-8B-v1", "Llama-3_3-Nemotron-Super-49B-v1",
     "gemma-3-4b-it", "gemma-3-27b-it",
    "Qwen3-4B-Instruct-2507", "Qwen3-30B-A3B-Instruct-2507",
    "gemini-2.5-flash"
]

In [None]:
def plot_role_no_role_heatmaps(
    df,
    count_col="count_in_role_gen",
    pattern_group_col="pattern_group",
    model_col="model",
    setting_col="setting",
    turn_col="turn",
    role_value=1,
    no_role_value=0,
    normalize=None,    # None | "row" | "global"
    sort_by_role_slope=False,
    model_order=None,  # custom row ordering (original model names)
    figsize=(12, 8),
    vmax=None,
):
    """
    Plot a 2x2 grid of model-by-turn-bin heatmaps of summed pattern counts.

    Grid rows are the role (top) and no-role (bottom) pattern groups; grid
    columns are the "persona-directed" (left) and "goal-oriented" (right)
    settings. All four panels use the "viridis" colormap with a shared
    ``vmin=0`` / ``vmax`` so colors are comparable across panels.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format counts; must contain the columns named by ``count_col``,
        ``pattern_group_col``, ``model_col``, ``setting_col`` and ``turn_col``.
        The frame is copied, not modified.
    count_col, pattern_group_col, model_col, setting_col, turn_col : str
        Column names to read from ``df``.
    role_value, no_role_value
        Values of ``pattern_group_col`` selecting the role / no-role groups.
    normalize : None | "row" | "global"
        ``"row"`` divides every model row by that model's maximum taken across
        both settings (separately for role and no-role matrices); ``"global"``
        divides everything by the single largest cell; any other value leaves
        the raw sums.
    sort_by_role_slope : bool
        If True, rows are ordered by ascending linear slope of the
        role / persona-directed counts over turn bins (this overrides
        ``model_order``).
    model_order : list[str] | None
        Row order given as original model names; they are translated through
        the same short-name mapping that is applied to ``df``. If None (and
        ``sort_by_role_slope`` is False) models are sorted alphabetically.
    figsize : tuple
        Figure size passed to ``plt.subplots``.
    vmax : float | None
        Upper color limit. Defaults to 1.0 when normalizing, otherwise to the
        99th percentile of all plotted cells.

    Returns
    -------
    dict
        The four raw matrices (``role_persona``, ``no_role_persona``,
        ``role_goal``, ``no_role_goal``) and their plotted counterparts under
        the same keys suffixed with ``_normed``.
    """

    # --- Step 1. Bin turns into 10 fixed bins (0-9) ---
    df = df.copy()
    # Adjust the divisor if the spacing between recorded turns changes.
    df["turn_bin"] = (df[turn_col] // 11).clip(upper=9)

    # --- Shorten model names for display ---
    model_name_map = {
        "Llama-3.1-Nemotron-Nano-8B-v1": "Nemotron-8B",
        "Llama-3_3-Nemotron-Super-49B-v1": "Nemotron-49B",
        "gemma-3-4b-it": "Gemma3-4B",
        "gemma-3-27b-it": "Gemma3-27B",
        "Qwen3-4B-Instruct-2507": "Qwen3-4B",
        "Qwen3-30B-A3B-Instruct-2507": "Qwen3-30B",
        "gemini-2.5-flash": "gemini-2.5-flash"
    }
    df[model_col] = df[model_col].replace(model_name_map)

    # --- Step 2. Aggregate counts per (model, setting, turn bin, pattern group) ---
    agg = (
        df.groupby([model_col, setting_col, "turn_bin", pattern_group_col], as_index=False)[count_col]
        .sum()
        .rename(columns={count_col: "count"})
    )

    turns = list(range(10))  # always 10 turn bins

    def make_matrix(setting, pg_value, row_order):
        """Return the model x turn-bin count matrix for one setting / pattern group."""
        sub = agg[(agg[setting_col] == setting) & (agg[pattern_group_col] == pg_value)]
        mat = sub.pivot_table(index=model_col, columns="turn_bin", values="count", aggfunc="sum")
        # reindex's fill_value only fills rows/columns that were absent entirely;
        # fillna additionally zeroes cells that are missing inside existing rows,
        # so downstream max()/percentile() never see NaN.
        return mat.reindex(index=row_order, columns=turns, fill_value=0).fillna(0)

    # --- Step 3. Decide the row (model) order ---
    if sort_by_role_slope:
        # Slope ordering takes precedence over any custom model_order.
        models = sorted(df[model_col].unique())
        mat_role_persona = make_matrix("persona-directed", role_value, models)

        x = np.arange(mat_role_persona.shape[1])
        slopes = {}
        for idx, row in mat_role_persona.iterrows():
            y = row.values.astype(float)
            mask = y > 0  # fit only over bins that actually have counts
            if mask.sum() >= 2:
                slopes[idx] = np.polyfit(x[mask], y[mask], 1)[0]
            else:
                slopes[idx] = 0
        final_model_order = sorted(models, key=lambda m: slopes[m])
    elif model_order is not None:
        # Translate the caller's original names into the shortened display names.
        final_model_order = [model_name_map.get(m, m) for m in model_order]
    else:
        final_model_order = sorted(df[model_col].unique())

    # --- Step 4. Build the four matrices ---
    role_persona = make_matrix("persona-directed", role_value, final_model_order)
    no_role_persona = make_matrix("persona-directed", no_role_value, final_model_order)
    role_goal = make_matrix("goal-oriented", role_value, final_model_order)
    no_role_goal = make_matrix("goal-oriented", no_role_value, final_model_order)

    # --- Step 5. Optional normalization ---
    if normalize == "row":
        # Per-model maximum across both settings, so the two columns of the
        # grid stay comparable; zero maxima become NaN to avoid 0/0 and are
        # mapped back to 0 after the division.
        role_maxes = pd.concat([role_persona, role_goal], axis=1).max(axis=1).replace(0, np.nan)
        no_role_maxes = pd.concat([no_role_persona, no_role_goal], axis=1).max(axis=1).replace(0, np.nan)

        mats = [
            role_persona.div(role_maxes, axis=0).fillna(0),
            no_role_persona.div(no_role_maxes, axis=0).fillna(0),
            role_goal.div(role_maxes, axis=0).fillna(0),
            no_role_goal.div(no_role_maxes, axis=0).fillna(0),
        ]
    elif normalize == "global":
        raw = [role_persona, no_role_persona, role_goal, no_role_goal]
        gmax = max(m.values.max() for m in raw)
        gmax = gmax if gmax > 0 else 1
        mats = [m / gmax for m in raw]
    else:  # no normalization
        mats = [role_persona, no_role_persona, role_goal, no_role_goal]

    role_persona_n, no_role_persona_n, role_goal_n, no_role_goal_n = mats

    # --- Step 6. Shared upper color limit ---
    if vmax is None:
        if normalize in ("row", "global"):
            vmax = 1.0
        else:
            vmax = np.percentile(np.concatenate([m.values.flatten() for m in mats]), 99)

    # --- Step 7. Plot ---
    fig, axes = plt.subplots(2, 2, figsize=figsize, sharex=True, sharey=True)

    panels = [
        (axes[0, 0], role_persona_n, "Role patterns — persona-directed"),
        (axes[1, 0], no_role_persona_n, "No-role patterns — persona-directed"),
        (axes[0, 1], role_goal_n, "Role patterns — goal-oriented"),
        (axes[1, 1], no_role_goal_n, "No-role patterns — goal-oriented"),
    ]
    for panel_ax, mat, title in panels:
        sns.heatmap(mat, ax=panel_ax, cmap="viridis", cbar=True, annot=True,
                    annot_kws={"fontsize": 12}, vmin=0, vmax=vmax)
        panel_ax.set_title(title, fontproperties=bold_font, fontsize=14)

    # Label the bins with the actual turn values when there is exactly one per
    # bin; otherwise fall back to the bin indices so the label count always
    # matches the tick count.
    turn_labels = np.unique(df[turn_col])
    if len(turn_labels) != len(turns):
        turn_labels = turns

    # Separators drawn after every second row.
    # NOTE(review): assumes rows are ordered as same-family pairs, as in the
    # notebook's model_order list - confirm when passing a different ordering.
    model_family_breaks = [2, 4, 6]
    for i, ax in enumerate(axes.flat):
        ax.set_ylabel("")
        ax.set_xticks(np.arange(len(turns)) + 0.5)
        ax.set_xticklabels(turn_labels, rotation=0)
        if i % 2 == 0:
            # Left-column panels: long unlabeled ticks extend the separators
            # out across the model names.
            sec_y = ax.secondary_yaxis(location='left')
            sec_y.set_yticks(model_family_breaks)
            sec_y.set_yticklabels([])
            sec_y.tick_params(axis='y', length=80, width=1, color='black')
        for y_pos in model_family_breaks:
            ax.axhline(y_pos, color='black', linestyle='--', linewidth=1, clip_on=False)

    # Only the bottom row carries an x-axis label.
    axes[1, 0].set_xlabel("Conversation round")
    axes[1, 1].set_xlabel("Conversation round")
    axes[0, 0].set_xlabel("")
    axes[0, 1].set_xlabel("")

    plt.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()

    return {
        "role_persona": role_persona,
        "no_role_persona": no_role_persona,
        "role_goal": role_goal,
        "no_role_goal": no_role_goal,
        "role_persona_normed": role_persona_n,
        "no_role_persona_normed": no_role_persona_n,
        "role_goal_normed": role_goal_n,
        "no_role_goal_normed": no_role_goal_n,
    }

In [None]:
# Load the pattern-count table that every analysis cell below starts from.
master_df = pd.read_csv("./results/all_pattern_counts.csv")

In [None]:
# Draw the 2x2 heatmap grid; as the last expression of the cell, the returned
# dict of matrices is also shown as the cell output.
plot_role_no_role_heatmaps(
    master_df,
    normalize="row",          # try: None, "row", "global"
    sort_by_role_slope=False,  # set True if you want strongest decays at top
    figsize=(12, 8),
    model_order=model_order
)

In [None]:
# Sum both count columns for every (model, turn, setting, pattern_group) combination.
agg_df = (
    master_df
    .groupby(["model", "turn", "setting", "pattern_group"])
    .agg({"count_in_role_gen": "sum", "count_in_no_role_gen": "sum"})
    .reset_index()
)

In [None]:
# Display order of the models and the short names used on plot axes.
# NOTE(review): the same two definitions are repeated at the top of the next
# cell, so this cell looks redundant - confirm before removing.
model_order = [
    "Llama-3.1-Nemotron-Nano-8B-v1", "Llama-3_3-Nemotron-Super-49B-v1",
    "gemma-3-4b-it", "gemma-3-27b-it",
    "Qwen3-4B-Instruct-2507", "Qwen3-30B-A3B-Instruct-2507",
    "gemini-2.5-flash"
]

# Maps each original model name to its shortened display name.
model_name_map = {
    "Llama-3.1-Nemotron-Nano-8B-v1": "Nemotron-8B",
    "Llama-3_3-Nemotron-Super-49B-v1": "Nemotron-49B",
    "gemma-3-4b-it": "Gemma3-4B",
    "gemma-3-27b-it": "Gemma3-27B",
    "Qwen3-4B-Instruct-2507": "Qwen3-4B",
    "Qwen3-30B-A3B-Instruct-2507": "Qwen3-30B",
    "gemini-2.5-flash": "gemini-2.5-flash"
}


In [None]:
# Per-model line plots: one row of subplots per model, one column per setting.
# Each panel shows count_in_role_gen over turns for every pattern group,
# scaled by that panel's own maximum.
df = agg_df.copy()

model_order = [
    "Llama-3.1-Nemotron-Nano-8B-v1", "Llama-3_3-Nemotron-Super-49B-v1",
    "gemma-3-4b-it", "gemma-3-27b-it",
    "Qwen3-4B-Instruct-2507", "Qwen3-30B-A3B-Instruct-2507",
    "gemini-2.5-flash"
]

model_name_map = {
    "Llama-3.1-Nemotron-Nano-8B-v1": "Nemotron-8B",
    "Llama-3_3-Nemotron-Super-49B-v1": "Nemotron-49B",
    "gemma-3-4b-it": "Gemma3-4B",
    "gemma-3-27b-it": "Gemma3-27B",
    "Qwen3-4B-Instruct-2507": "Qwen3-4B",
    "Qwen3-30B-A3B-Instruct-2507": "Qwen3-30B",
    "gemini-2.5-flash": "gemini-2.5-flash"
}

# Keep only the known models and switch to their short display names.
df_filtered = df[df['model'].isin(model_order)].copy()
df_filtered['model'] = df_filtered['model'].map(model_name_map)

# Short names of the models actually present, in the order given by model_order.
# The set is built once instead of recomputing unique() per candidate.
present_models = set(df['model'].unique())
unique_models = [model_name_map[m] for m in model_order if m in present_models]

# Settings become the subplot columns.
settings = df_filtered['setting'].unique()

# Line color per setting.
colors = {'goal-oriented': 'blue', 'persona-directed': 'orange'}

# squeeze=False keeps axs two-dimensional for any number of models/settings,
# so axs[i, j] below is always valid (no special-casing of single rows/columns).
fig, axs = plt.subplots(
    nrows=len(unique_models),
    ncols=len(settings),
    figsize=(12, 4 * len(unique_models)),
    sharey=True,
    squeeze=False,
)

# Iterate through models (rows) and settings (columns)
for i, model in enumerate(unique_models):
    for j, setting in enumerate(settings):
        ax = axs[i, j]

        # Data for the current subplot
        df_plot_filtered = df_filtered[(df_filtered['model'] == model) & (df_filtered['setting'] == setting)].copy()

        # Normalize 'count_in_role_gen' by the largest value within this subplot
        max_val = df_plot_filtered['count_in_role_gen'].max()
        if max_val > 0:
            df_plot_filtered['normalized_count'] = df_plot_filtered['count_in_role_gen'] / max_val
        else:
            df_plot_filtered['normalized_count'] = 0

        # One line per pattern group; group 0 is dashed, every other group solid
        for pattern_group in df_plot_filtered['pattern_group'].unique():
            df_pattern = df_plot_filtered[df_plot_filtered['pattern_group'] == pattern_group]
            linestyle = '--' if pattern_group == 0 else '-'
            label = f'Pattern Group {pattern_group}'
            color = colors[setting]

            ax.plot(df_pattern['turn'], df_pattern['normalized_count'],
                    linestyle=linestyle, color=color, label=label)

        # Labels and title
        ax.set_title(f'Model: {model}\nSetting: {setting}')
        ax.set_xlabel('Turn')
        ax.set_ylabel('Normalized Count in Role Gen')
        ax.legend()
        ax.grid(True)
        ax.set_ylim(0, 1.1)

# Adjust layout (uncomment the savefig line to export the figure)
plt.tight_layout()
#plt.savefig('normalized_model_performance_subplot.png')

In [None]:
# Per-dataset line plots: one row of subplots per dataset, one column per
# setting. Each panel shows count_in_role_gen over turns for every pattern
# group, scaled by that panel's own maximum.
df_filtered = (
    master_df
    .groupby(["dataset", "turn", "setting", "pattern_group"])
    .agg({"count_in_role_gen": "sum", "count_in_no_role_gen": "sum"})
    .reset_index()
)

# Datasets become the subplot rows (sorted, unique).
unique_datasets = np.unique(df_filtered.dataset)

# Settings become the subplot columns.
settings = df_filtered['setting'].unique()

# Line color per setting.
colors = {'goal-oriented': 'blue', 'persona-directed': 'orange'}

# squeeze=False keeps axs two-dimensional for any number of datasets/settings,
# so axs[i, j] below is always valid (no special-casing of single rows/columns).
fig, axs = plt.subplots(
    nrows=len(unique_datasets),
    ncols=len(settings),
    figsize=(12, 4 * len(unique_datasets)),
    sharey=True,
    squeeze=False,
)

# Iterate through datasets (rows) and settings (columns)
for i, dataset in enumerate(unique_datasets):
    for j, setting in enumerate(settings):
        ax = axs[i, j]

        # Data for the current subplot
        df_plot_filtered = df_filtered[(df_filtered['dataset'] == dataset) & (df_filtered['setting'] == setting)].copy()

        # Normalize 'count_in_role_gen' by the largest value within this subplot
        max_val = df_plot_filtered['count_in_role_gen'].max()
        if max_val > 0:
            df_plot_filtered['normalized_count'] = df_plot_filtered['count_in_role_gen'] / max_val
        else:
            df_plot_filtered['normalized_count'] = 0

        # One line per pattern group; group 0 is dashed, every other group solid
        for pattern_group in df_plot_filtered['pattern_group'].unique():
            df_pattern = df_plot_filtered[df_plot_filtered['pattern_group'] == pattern_group]
            linestyle = '--' if pattern_group == 0 else '-'
            label = f'Pattern Group {pattern_group}'
            color = colors[setting]

            ax.plot(df_pattern['turn'], df_pattern['normalized_count'],
                    linestyle=linestyle, color=color, label=label)

        # Labels and title (rows are datasets here, so the title says so)
        ax.set_title(f'Dataset: {dataset}\nSetting: {setting}')
        ax.set_xlabel('Turn')
        ax.set_ylabel('Normalized Count in Role Gen')
        ax.legend()
        ax.grid(True)
        ax.set_ylim(0, 1.1)

# Adjust layout (uncomment the savefig line to export the figure)
plt.tight_layout()
#plt.savefig('normalized_model_performance_subplot.png')

In [None]:
# Per-role line plots: one row of subplots per role, one column per setting.
# Each panel shows count_in_role_gen over turns for every pattern group,
# scaled by that panel's own maximum.
df_filtered = (
    master_df
    .groupby(["role", "turn", "setting", "pattern_group"])
    .agg({"count_in_role_gen": "sum", "count_in_no_role_gen": "sum"})
    .reset_index()
)

# Roles become the subplot rows (sorted, unique).
unique_roles = np.unique(df_filtered.role)

# Settings become the subplot columns.
settings = df_filtered['setting'].unique()

# Line color per setting.
colors = {'goal-oriented': 'blue', 'persona-directed': 'orange'}

# squeeze=False keeps axs two-dimensional for any number of roles/settings,
# so axs[i, j] below is always valid (no special-casing of single rows/columns).
fig, axs = plt.subplots(
    nrows=len(unique_roles),
    ncols=len(settings),
    figsize=(12, 4 * len(unique_roles)),
    sharey=True,
    squeeze=False,
)

# Iterate through roles (rows) and settings (columns)
for i, role in enumerate(unique_roles):
    for j, setting in enumerate(settings):
        ax = axs[i, j]

        # Data for the current subplot
        df_plot_filtered = df_filtered[(df_filtered['role'] == role) & (df_filtered['setting'] == setting)].copy()

        # Normalize 'count_in_role_gen' by the largest value within this subplot
        max_val = df_plot_filtered['count_in_role_gen'].max()
        if max_val > 0:
            df_plot_filtered['normalized_count'] = df_plot_filtered['count_in_role_gen'] / max_val
        else:
            df_plot_filtered['normalized_count'] = 0

        # One line per pattern group; group 0 is dashed, every other group solid
        for pattern_group in df_plot_filtered['pattern_group'].unique():
            df_pattern = df_plot_filtered[df_plot_filtered['pattern_group'] == pattern_group]
            linestyle = '--' if pattern_group == 0 else '-'
            label = f'Pattern Group {pattern_group}'
            color = colors[setting]

            ax.plot(df_pattern['turn'], df_pattern['normalized_count'],
                    linestyle=linestyle, color=color, label=label)

        # Labels and title (rows are roles here, so the title says so)
        ax.set_title(f'Role: {role}\nSetting: {setting}')
        ax.set_xlabel('Turn')
        ax.set_ylabel('Normalized Count in Role Gen')
        ax.legend()
        ax.grid(True)
        ax.set_ylim(0, 1.1)

# Adjust layout (uncomment the savefig line to export the figure)
plt.tight_layout()
#plt.savefig('normalized_model_performance_subplot.png')

In [None]:
df = master_df.copy()

# Mean count_in_role_gen over all rows sharing a (setting, turn, pattern_group).
df_aggregated = (
    df.groupby(['setting', 'turn', 'pattern_group'], as_index=False)
    .agg(avg_count_in_role_gen=('count_in_role_gen', 'mean'))
)

# Settings present in the data and the line color assigned to each.
settings = df_aggregated['setting'].unique()
colors = {'goal-oriented': 'blue', 'persona-directed': 'orange'}

# Two stacked panels (2 rows x 1 column) sharing the turn axis.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8, 6), sharex=True)

# Top panel: persona-directed; bottom panel: goal-oriented.
for ax, setting in zip(axs, ["persona-directed", "goal-oriented"]):
    panel_rows = df_aggregated[df_aggregated['setting'] == setting].copy()

    # Scale by the panel's own maximum so both panels span 0..1.
    peak = panel_rows['avg_count_in_role_gen'].max()
    panel_rows['normalized_count'] = (
        panel_rows['avg_count_in_role_gen'] / peak if peak > 0 else 0
    )

    # One line per pattern group: group 0 dashed, all others solid.
    for pattern_group in panel_rows['pattern_group'].unique():
        group_rows = panel_rows[panel_rows['pattern_group'] == pattern_group]
        ax.plot(
            group_rows['turn'],
            group_rows['normalized_count'],
            linestyle='--' if pattern_group == 0 else '-',
            color=colors[setting],
            label="Persona patterns" if pattern_group==1 else "Baseline patterns",
        )

    # Panel decoration
    ax.set_title(f'Pattern counts in {setting} dialogues', fontproperties=bold_font, fontsize=14)
    ax.set_xlabel('Turn')
    ax.set_ylabel('')
    ax.legend()
    ax.grid(True)
    ax.set_ylim(0, 1.1)

# A single y-axis label shared by both panels.
fig.text(0.007, 0.5, 'Normalized Pattern Count', va='center', rotation='vertical')

# Tighten the layout; the figure is written to disk in the next cell.
plt.tight_layout()

In [None]:
# Write whatever figure is currently bound to `fig` to the paper's media
# folder as a PDF; run this right after the cell that built the intended figure.
fig.savefig('../persistent-personas-paper/media/pypremise.pdf', dpi=300, bbox_inches="tight")

In [None]:
# Display the raw pattern-count table for inspection.
master_df

In [None]:
# Total role-generation count per (model, turn, role, setting, pattern_group);
# this is the input table for the mixed-effects regressions below.
df = (
    master_df
    .groupby(["model", "turn", "role", "setting", "pattern_group"])
    .agg({"count_in_role_gen": "sum"})
    .reset_index()
)

In [None]:
# Grouping key for the mixed model: model name and role concatenated with no
# separator, giving one group per (model, role) pair.
df["modelRole"] = df["model"] + df["role"]

In [None]:
# Rows from persona-directed dialogues only.
persona_df = df[df.setting == "persona-directed"]

In [None]:
# Rows from goal-oriented dialogues only.
goal_df = df[df.setting == "goal-oriented"]

In [None]:
# Sanity check: per-turn sums of the numeric columns in the persona-directed subset.
persona_df.groupby("turn").sum(numeric_only=True)

In [None]:
# Fit one linear mixed model (count_in_role_gen ~ turn, grouped by modelRole)
# for each setting and pattern group, and display its coefficient table.
for all_df in [persona_df, goal_df]:
    setting_name = all_df["setting"].iloc[0]
    for pattern_group in [0, 1]:
        print(f"Regression for pattern group {pattern_group} and {setting_name}")
        # Use a dedicated name for the subset so the notebook-level `df`
        # (read by neighbouring cells) is not overwritten by this loop.
        group_df = all_df[all_df.pattern_group == pattern_group]
        md = smf.mixedlm("count_in_role_gen ~ turn", group_df, groups=group_df["modelRole"])
        # Optimizers are passed as a list to fit(); see the statsmodels docs
        # for how multiple methods are applied.
        mdf = md.fit(method=["powell", "lbfgs"])
        coefs = mdf.summary().tables[1]
        # The coefficient column is cast to float so it displays and sorts numerically.
        coefs["Coef."] = coefs["Coef."].astype("float")
        display(coefs)

### Difference between last and first rounds

In [None]:
# Role names are the keys of roles.json; the last key is dropped.
# NOTE(review): presumably the last entry is a baseline/no-role placeholder - confirm.
# The context manager closes the file handle, which the bare open() call never did.
with open("./data/roles.json", "r", encoding="utf-8") as roles_file:
    roles = list(json.load(roles_file).keys())[:-1]

In [None]:
# Dataset-name prefixes used in result file names: "" is labelled
# "Persona-directed" and "inst-" is labelled "Goal-oriented" in the loop below.
prefixes = ["", "inst-"]

In [None]:
# Column accumulators filled in lockstep by the loop in the next cell; each
# list receives one entry per (model, dataset, prefix, role) combination.
model_list = []
role_list = []
count_first_list = []
count_last_list = []
dataset_list = []
dialogue_type_list = []

In [None]:
# Count the rows of the first-round and last-round result CSVs for every
# (model, dataset, prefix, role) combination.

# Empty the accumulators first so re-running this cell cannot append
# duplicate rows to the table built from them.
for _accumulator in (model_list, role_list, count_first_list,
                     count_last_list, dataset_list, dialogue_type_list):
    _accumulator.clear()

for model in model_order:
    for dataset in DATASETS:
        for prefix in prefixes:
            for role in roles:
                # Built outside the f-strings: reusing the same quote character
                # inside an f-string expression only parses on Python 3.12+.
                role_slug = role.replace(" ", "_")
                # NOTE(review): the first-round path does not include `prefix`,
                # so both dialogue types read the same first-round file -
                # confirm this is intended.
                first_path = f"./results/premise/premise_{model}_{dataset}_{role_slug}.csv"
                last_path = f"./results/premise/premise_{model}_{prefix}{dataset}_{role_slug}_last_round.csv"
                count_first_list.append(len(pd.read_csv(first_path)))
                count_last_list.append(len(pd.read_csv(last_path)))
                model_list.append(model)
                role_list.append(role)
                dataset_list.append(dataset)
                dialogue_type_list.append("Persona-directed" if prefix == "" else "Goal-oriented")
In [None]:
# Assemble the per-combination row counts into a single table.
df = pd.DataFrame(
    {
        "model": model_list,
        "dataset": dataset_list,
        "role": role_list,
        "dialogue_type": dialogue_type_list,
        "count_first": count_first_list,
        "count_last": count_last_list,
    }
)

In [None]:
# Inspect the table ordered by last-round count (ascending).
df.sort_values("count_last")

In [None]:
# Maps each original model name to its shortened display name; the insertion
# order of the values is reused as the x-axis order of the bar chart below.
model_name_map = {
        "Llama-3.1-Nemotron-Nano-8B-v1": "Nemotron-8B",
        "Llama-3_3-Nemotron-Super-49B-v1": "Nemotron-49B",
        "gemma-3-4b-it": "Gemma3-4B",
        "gemma-3-27b-it": "Gemma3-27B",
        "Qwen3-4B-Instruct-2507": "Qwen3-4B",
        "Qwen3-30B-A3B-Instruct-2507": "Qwen3-30B",
        "gemini-2.5-flash": "gemini-2.5-flash"
    }

In [None]:
# Switch to the short display names. replace() leaves values that are not keys
# untouched, so re-running this cell is harmless (a per-element dict lookup
# would raise KeyError on names that were already shortened).
df["model"] = df["model"].replace(model_name_map)

In [None]:
# Average relative drop from first to last round: one minus the mean of the
# per-row count_last / count_first ratios.
1-(df["count_last"]/df["count_first"]).mean()

In [None]:
# Long format: one row per (combination, round position) for seaborn.
position_labels = {"count_first": "First round", "count_last": "Last round"}
df_melted = df.melt(
    id_vars=["model", "dataset", "role", "dialogue_type"],
    value_vars=["count_first", "count_last"],
    var_name="Position",
    value_name="Count",
)
df_melted["Position"] = df_melted["Position"].map(position_labels)

# Grouped bars: rows are datasets, columns are dialogue types, and the bars
# within each model compare first and last round.
g = sns.catplot(
    data=df_melted,
    kind="bar",
    x="model",
    y="Count",
    hue="Position",
    col="dialogue_type",
    row="dataset",
    order=list(model_name_map.values()),
    sharey=False,
    height=4,
    aspect=2,
)

# Dashed separators after every second model on each panel.
for ax in g.axes.flat:
    for boundary in [2, 4, 6]:
        ax.axvline(boundary - 0.5, color="gray", linestyle="--")

g.tick_params(axis='x', labelrotation=45)

# Legend entries of the first panel (kept available for a custom figure legend).
handles, labels = g.axes[0, 0].get_legend_handles_labels()

# Move the legend seaborn created to the top centre of the figure.
sns.move_legend(
    g, "upper center",
    bbox_to_anchor=(.45, 1.03), ncol=2, title=None, frameon=True,
)
plt.show()

In [None]:
# Per-row last/first ratios in ascending order (strongest drops first).
(df["count_last"] / df["count_first"]).sort_values()

In [None]:
def bootstrap_difference(data, num_bootstraps=10000, alpha=0.05, random_state=None):
    """
    Bootstrap a percentile confidence interval for the mean of the per-row
    ratio ``count_last / count_first``.

    Rows are resampled with replacement ``num_bootstraps`` times; for each
    resample the mean ratio is recorded, and the interval is read off the
    ``alpha / 2`` and ``1 - alpha / 2`` percentiles of those means.

    Args:
        data (pd.DataFrame): Table with ``count_first`` and ``count_last``
            columns. Rows whose ``count_first`` is 0 produce infinite or NaN
            ratios, which are not filtered out here.
        num_bootstraps (int): Number of bootstrap resamples.
        alpha (float): Significance level; the interval covers ``1 - alpha``.
        random_state (None | int | np.random.RandomState | np.random.Generator):
            Seed or generator for reproducible resampling. ``None`` (default)
            uses NumPy's global random state.

    Returns:
        tuple: ``(lower_ci, upper_ci, conclusion)`` - the interval bounds and
        a sentence stating whether the interval excludes a ratio of one.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    n = len(data)
    if n == 0:
        raise ValueError("bootstrap_difference requires at least one row of data.")

    # The ratio does not change between resamples, so compute it once and
    # resample the resulting Series (draws the same rows as resampling `data`).
    ratios = data["count_last"] / data["count_first"]

    # One generator shared by all iterations; passing a raw seed to every
    # sample() call would return the identical resample each time.
    if random_state is None or isinstance(
        random_state, (np.random.RandomState, np.random.Generator)
    ):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    frac_means = [
        ratios.sample(n, replace=True, random_state=rng).mean()
        for _ in range(num_bootstraps)
    ]

    # Percentile confidence interval of the bootstrapped mean ratios.
    lower_ci = np.percentile(frac_means, 100 * (alpha / 2))
    upper_ci = np.percentile(frac_means, 100 * (1 - alpha / 2))

    # A ratio of one means "no change" between first and last round.
    if upper_ci < 1:
        conclusion = "The difference is statistically significant. count_first is significantly larger than count_last."
    elif lower_ci > 1:
        conclusion = "The difference is statistically significant. count_last is significantly larger than count_first."
    else:
        conclusion = "The difference is not statistically significant. The confidence interval includes one."

    return lower_ci, upper_ci, conclusion

# Bootstrap the last/first ratio over the whole table.
ci_lower, ci_upper, result_message = bootstrap_difference(df)

# Report the interval, the verdict and the observed mean ratio.
print(
    "--- Bootstrapping Analysis ---",
    f"95% Confidence Interval for (count_last / count_first): [{ci_lower:.2f}, {ci_upper:.2f}]",
    result_message,
    f"\nMean of 'count_last'/'count_first': {(df['count_last']/df['count_first']).mean():.2f}",
    sep="\n",
)

In [None]:
# Observed mean of the per-row count_last / count_first ratios.
(df["count_last"]/df["count_first"]).mean()

In [None]:
# Turn ratio bounds into "fraction lost" bounds (1 - ratio). The first element
# comes from the lower ratio bound and is therefore the larger reduction.
# NOTE(review): uses whichever ci_lower/ci_upper were computed last - make sure
# the ratio bootstrap cell, not the difference one, ran most recently.
1 - ci_lower, 1-ci_upper

In [None]:
def bootstrap_difference(data, num_bootstraps=10000, alpha=0.05, random_state=None):
    """
    Bootstrap a percentile confidence interval for the mean of the per-row
    difference ``count_first - count_last``.

    Rows are resampled with replacement ``num_bootstraps`` times; for each
    resample the mean difference is recorded, and the interval is read off the
    ``alpha / 2`` and ``1 - alpha / 2`` percentiles of those means.

    Args:
        data (pd.DataFrame): Table with ``count_first`` and ``count_last``
            columns.
        num_bootstraps (int): Number of bootstrap resamples.
        alpha (float): Significance level; the interval covers ``1 - alpha``.
        random_state (None | int | np.random.RandomState | np.random.Generator):
            Seed or generator for reproducible resampling. ``None`` (default)
            uses NumPy's global random state.

    Returns:
        tuple: ``(lower_ci, upper_ci, conclusion)`` - the interval bounds and
        a sentence stating whether the interval excludes zero.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    n = len(data)
    if n == 0:
        raise ValueError("bootstrap_difference requires at least one row of data.")

    # The difference does not change between resamples, so compute it once and
    # resample the resulting Series (draws the same rows as resampling `data`).
    diffs = data["count_first"] - data["count_last"]

    # One generator shared by all iterations; passing a raw seed to every
    # sample() call would return the identical resample each time.
    if random_state is None or isinstance(
        random_state, (np.random.RandomState, np.random.Generator)
    ):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    diff_means = [
        diffs.sample(n, replace=True, random_state=rng).mean()
        for _ in range(num_bootstraps)
    ]

    # Percentile confidence interval of the bootstrapped mean differences.
    lower_ci = np.percentile(diff_means, 100 * (alpha / 2))
    upper_ci = np.percentile(diff_means, 100 * (1 - alpha / 2))

    # A difference of zero means "no change" between first and last round.
    if lower_ci > 0:
        conclusion = "The difference is statistically significant. count_first is significantly larger than count_last."
    elif upper_ci < 0:
        conclusion = "The difference is statistically significant. count_last is significantly larger than count_first."
    else:
        conclusion = "The difference is not statistically significant. The confidence interval includes zero."

    return lower_ci, upper_ci, conclusion

# Bootstrap the first-minus-last difference over the whole table.
ci_lower, ci_upper, result_message = bootstrap_difference(df)

# Report the interval, the verdict and the two observed means.
print(
    "--- Bootstrapping Analysis ---",
    f"95% Confidence Interval for (count_first - count_last): [{ci_lower:.2f}, {ci_upper:.2f}]",
    result_message,
    f"\nMean of 'count_first': {df['count_first'].mean():.2f}",
    f"Mean of 'count_last': {df['count_last'].mean():.2f}",
    sep="\n",
)

In [None]:
# Split the table by dialogue type.
goal_oriented_df = df[df['dialogue_type'] == 'Goal-oriented']
persona_directed_df = df[df['dialogue_type'] == 'Persona-directed']

# Run the same bootstrap and report on each subset: persona-directed first,
# then goal-oriented (so the ci_* names end up holding the goal-oriented result).
for subset in (persona_directed_df, goal_oriented_df):
    ci_lower, ci_upper, result_message = bootstrap_difference(subset)
    print(
        "--- Bootstrapping Analysis ---",
        f"95% Confidence Interval for (count_first - count_last): [{ci_lower:.2f}, {ci_upper:.2f}]",
        result_message,
        f"\nMean of 'count_first': {subset['count_first'].mean():.2f}",
        f"Mean of 'count_last': {subset['count_last'].mean():.2f}",
        sep="\n",
    )