In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scikit_posthocs as sp
import scipy.stats as sps
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

## Reusable functions

In [None]:
def create_boxplot(df, col_value, col_category, title):
    """Draw one box per category showing the distribution of a numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the values and the category labels.
    col_value : str
        Name of the column whose values are plotted.
    col_category : str
        Name of the column whose distinct values define the boxes.
    title : str
        Title placed above the axes.

    Returns
    -------
    matplotlib.axes.Axes
        The axes that were drawn on (its figure is ``ax.figure``). The figure
        is created with ``plt.subplots()``, so it is also pyplot's current
        figure and a following ``pdf.savefig()`` without arguments saves it.
    """
    plot_data = []
    groups = []

    # Categories keep their order of first appearance. A category without any
    # non-null value (this includes a NaN category key, which never compares
    # equal to itself) is skipped instead of being drawn as a blank, labelled
    # slot -- the same rule the Kruskal-Wallis cells apply to their groups.
    for group in df[col_category].unique():
        group_data = df.loc[df[col_category] == group, col_value].dropna()
        if len(group_data) > 0:
            plot_data.append(group_data)
            groups.append(group)

    fig, ax = plt.subplots()
    # Outliers are hidden (showfliers=False); patch_artist=True is required
    # so that the boxes can be filled with a colour below.
    bp = ax.boxplot(plot_data, patch_artist=True, showfliers=False)

    # One viridis colour per box, spread evenly over the colormap.
    colors = plt.cm.viridis(np.linspace(0, 1, len(plot_data)))
    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)

    ax.set_title(title)
    ax.set_xticklabels(groups, rotation=45, ha='right')
    ax.set_ylabel('Value')

    ax.grid(True, linestyle='--', alpha=0.7)

    return ax


In [None]:
def create_heatmap(df, col_value, col_category, title, alpha=0.05):
    """Plot a heatmap of pairwise Dunn-test p-values between categories.

    The values of ``col_value`` are split by ``col_category``, compared
    pairwise with ``scikit_posthocs.posthoc_dunn`` (Bonferroni-adjusted), and
    the resulting p-value matrix is drawn as an annotated heatmap. Cells with
    a p-value below ``alpha`` are outlined in black.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both the values and the category labels.
    col_value : str
        Name of the column whose values are compared.
    col_category : str
        Name of the column whose distinct values define the groups.
    title : str
        Title placed above the axes.
    alpha : float, default 0.05
        Significance level. It is both the upper end of the colour scale and
        the threshold below which a cell is outlined.

    Returns
    -------
    matplotlib.axes.Axes
        The axes that were drawn on (its figure is ``ax.figure``). The figure
        is created with ``plt.subplots()``, so it is also pyplot's current
        figure and a following ``pdf.savefig()`` without arguments saves it.
    """
    plot_data = []
    groups = []

    # Categories keep their order of first appearance. Groups without any
    # non-null value are skipped: they contribute no observations to the
    # test, so keeping their label would leave more tick labels than there
    # are rows/columns in the p-value matrix.
    # NOTE(review): this relies on posthoc_dunn sizing its result by the
    # groups that actually have observations -- confirm against the installed
    # scikit_posthocs version.
    for group in df[col_category].unique():
        group_data = df.loc[df[col_category] == group, col_value].dropna()
        if len(group_data) > 0:
            plot_data.append(group_data)
            groups.append(group)

    # Perform Dunn test
    posthoc_results = sp.posthoc_dunn(plot_data, p_adjust='bonferroni')

    fig, ax = plt.subplots()
    sns.heatmap(
        posthoc_results,
        annot=True,
        cmap='coolwarm_r',
        vmin=0,
        vmax=alpha,  # everything at or above alpha shares the end colour
        ax=ax,
        xticklabels=groups,
        yticklabels=groups,
        cbar=False,
        fmt='.2f',
        annot_kws={"fontsize": 7}
    )

    # Outline every cell whose p-value is below alpha for emphasis.
    # Heatmap cell (row, col) occupies the unit square anchored at x=col, y=row.
    significant_cells = np.argwhere(posthoc_results.to_numpy() < alpha)
    for row, col in significant_cells:
        ax.add_patch(plt.Rectangle((col, row), 1, 1, fill=False, edgecolor='black', lw=1.5))

    ax.set_title(title)

    return ax

## Standardized Project Gutenberg Corpus

### Prepare the data

In [None]:
# Load the results for further analysis
spgc_metadata_sampled = pd.read_csv("results/spgc_metadata_sampled_after.csv")

# Reverse sign of gamma (the file stores the first pool order as "_0")
for gamma_col in ["gamma_0", "gamma_3", "gamma_9", "gamma_27"]:
    spgc_metadata_sampled[gamma_col] = -spgc_metadata_sampled[gamma_col]

# Rename columns _0 to _1
# NOTE(review): this replaces "_0" anywhere in every column name, not only in
# the fitted-parameter columns -- confirm no other column contains "_0".
spgc_metadata_sampled = spgc_metadata_sampled.rename(
    columns=lambda name: name.replace("_0", "_1")
)

# Skip Chinese language, because embeddings from NLTK are not well-suited for SPGC corpus.
# .copy() makes the filtered frame independent of the loaded one, so the
# column assignment below does not raise SettingWithCopyWarning.
spgc_metadata_sampled = spgc_metadata_sampled[
    spgc_metadata_sampled["language"] != "['zh']"
].copy()

# Strip brackets and quotes from language codes, e.g. "['en']" -> "en"
spgc_metadata_sampled["language"] = spgc_metadata_sampled["language"].str.replace(r"[\[\]']", "", regex=True)

# Explore missing values in the fitted parameters
# (every column whose name starts with gamma, delta or beta)
selected_columns = spgc_metadata_sampled.filter(regex="^(gamma|delta|beta)").columns
missing_values = spgc_metadata_sampled[selected_columns].isnull().sum()
missing_percentage = (missing_values / len(spgc_metadata_sampled)) * 100

missing_data_summary = pd.DataFrame({
    "Parameter Name": missing_values.index,
    "Missing Count": missing_values.values,
    "Missing Percentage": missing_percentage.values
})

missing_data_summary

### Distribution of fitted parameters

In [None]:
# Fitted parameters to plot: three families, each at pool orders 1, 3, 9, 27
all_params = [
    "gamma_1", "gamma_3", "gamma_9", "gamma_27",
    "delta_1", "delta_3", "delta_9", "delta_27",
    "beta_1", "beta_3", "beta_9", "beta_27",
]

# Mathtext symbol used in the figure title for each parameter family
greek_symbols = {
    "gamma": r"$\gamma$",
    "delta": r"$\delta$",
    "beta": r"$\beta$",
}

# Build the path with pathlib instead of a backslash literal: a backslash is
# only a directory separator on Windows, elsewhere it would become part of
# the file name. The output directory is created if it does not exist yet.
pdf_filename = Path("figures") / "spgc_distribution.pdf"
pdf_filename.parent.mkdir(parents=True, exist_ok=True)

# One PDF page per parameter
with PdfPages(pdf_filename) as pdf:
    for param in all_params:
        # Titles with greek letters
        family = param.split("_")[0]
        if family in greek_symbols:
            title = "Fitted {} (order {}) for all languages".format(
                greek_symbols[family], param.split("_")[1]
            )
        else:
            title = f"Fitted {param} for all languages"

        create_boxplot(
            df = spgc_metadata_sampled,
            col_value = param,
            col_category = "language",
            title = title
        )
        # Without an argument, savefig() stores pyplot's current figure,
        # i.e. the one create_boxplot has just created.
        pdf.savefig()

### Analysis of variance

In [None]:
# Create a dataframe to store the Kruskal-Wallis test results
# (rows: parameter family, columns: pool order). The float dtype keeps the
# p-values numeric; cells start out as NaN.
kw_results = pd.DataFrame(
    index=['gamma', 'delta', 'beta'],
    columns=['1', '3', '9', '27'],
    dtype=float
)

# Define parameter groups
param_groups = {
    'gamma': ['gamma_1', 'gamma_3', 'gamma_9', 'gamma_27'],
    'delta': ['delta_1', 'delta_3', 'delta_9', 'delta_27'],
    'beta': ['beta_1', 'beta_3', 'beta_9', 'beta_27']
}

# Perform Kruskal-Wallis test for each parameter
for group_name, group_params in param_groups.items():
    for param in group_params:
        pool_order = param.split('_')[1]  # Extract pool order (1, 3, 9, 27)

        # Collect the non-null values of this parameter for each language;
        # languages without any value are left out of the test.
        groups = []
        for lang in spgc_metadata_sampled['language'].unique():
            data = spgc_metadata_sampled.loc[spgc_metadata_sampled['language'] == lang, param].dropna()
            if len(data) > 0:
                groups.append(data)

        # Perform Kruskal-Wallis test
        if len(groups) > 1:  # Need at least 2 groups for the test
            kw_results.loc[group_name, pool_order] = sps.kruskal(*groups).pvalue
        else:
            kw_results.loc[group_name, pool_order] = float('nan')

# Format p-values with scientific notation; missing results are shown as "NaN".
# Mapping column by column avoids DataFrame.applymap, which newer pandas
# versions deprecate, while still working on older ones.
kw_results_formatted = kw_results.apply(
    lambda column: column.map(lambda x: f"{x:.2e}" if pd.notnull(x) else "NaN")
)

# Display the results
kw_results_formatted

In [None]:
# Build the path with pathlib instead of a backslash literal: a backslash is
# only a directory separator on Windows, elsewhere it would become part of
# the file name. The output directory is created if it does not exist yet.
pdf_filename = Path("figures") / "spgc_heatmaps.pdf"
pdf_filename.parent.mkdir(parents=True, exist_ok=True)

# Mathtext symbol used in the figure title for each parameter family
greek_symbols = {
    "gamma": r"$\gamma$",
    "delta": r"$\delta$",
    "beta": r"$\beta$",
}

# One PDF page per parameter
with PdfPages(pdf_filename) as pdf:
    for param in all_params:
        # Titles with greek letters; every family shares one template, so the
        # wording cannot drift between gamma, delta and beta.
        family = param.split("_")[0]
        if family in greek_symbols:
            title = "p-value of Dunn test for {} (order {})".format(
                greek_symbols[family], param.split("_")[1]
            )
        else:
            title = f"p-value of Dunn test for {param}"

        create_heatmap(
            df = spgc_metadata_sampled,
            col_value = param,
            col_category = "language",
            title = title
        )
        # Without an argument, savefig() stores pyplot's current figure,
        # i.e. the one create_heatmap has just created.
        pdf.savefig()

## Human vs LLM Corpus

### Prepare the data

In [None]:
# Read the fitted-parameter table of the Human vs LLM corpus
df_human_vs_llm_sampled = pd.read_csv("results/human_vs_llm_sampled_after.csv")

# Flip the sign of every gamma column in one step
gamma_columns = ["gamma_1", "gamma_3", "gamma_9", "gamma_27"]
df_human_vs_llm_sampled[gamma_columns] = -df_human_vs_llm_sampled[gamma_columns]

# Count missing values per fitted parameter
# (all columns whose name starts with gamma, delta or beta)
selected_columns = df_human_vs_llm_sampled.filter(regex="^(gamma|delta|beta)").columns
missing_values = df_human_vs_llm_sampled[selected_columns].isna().sum()
missing_percentage = missing_values.div(len(df_human_vs_llm_sampled)).mul(100)

# One summary row per parameter: name, absolute count, share in percent
missing_data_summary = pd.DataFrame(
    {
        "Parameter Name": missing_values.index,
        "Missing Count": missing_values.to_numpy(),
        "Missing Percentage": missing_percentage.to_numpy(),
    }
)

missing_data_summary

### Distribution of fitted parameters

In [None]:
# Build the path with pathlib instead of a backslash literal: a backslash is
# only a directory separator on Windows, elsewhere it would become part of
# the file name. The output directory is created if it does not exist yet.
pdf_filename = Path("figures") / "human_vs_llm_distribution.pdf"
pdf_filename.parent.mkdir(parents=True, exist_ok=True)

# Mathtext symbol used in the figure title for each parameter family
greek_symbols = {
    "gamma": r"$\gamma$",
    "delta": r"$\delta$",
    "beta": r"$\beta$",
}

# One PDF page per parameter
with PdfPages(pdf_filename) as pdf:
    for param in all_params:
        # Titles with greek letters
        family = param.split("_")[0]
        if family in greek_symbols:
            title = "Fitted {} (order {}) for all sources".format(
                greek_symbols[family], param.split("_")[1]
            )
        else:
            title = f"Fitted {param} for all sources"

        create_boxplot(
            df = df_human_vs_llm_sampled,
            col_value = param,
            col_category = "source",
            title = title
        )
        # Without an argument, savefig() stores pyplot's current figure,
        # i.e. the one create_boxplot has just created.
        pdf.savefig()

### Analysis of variance

In [None]:
# Create a dataframe to store the Kruskal-Wallis test results
# (rows: parameter family, columns: pool order). The float dtype keeps the
# p-values numeric; cells start out as NaN.
kw_results = pd.DataFrame(
    index=['gamma', 'delta', 'beta'],
    columns=['1', '3', '9', '27'],
    dtype=float
)

# Define parameter groups
param_groups = {
    'gamma': ['gamma_1', 'gamma_3', 'gamma_9', 'gamma_27'],
    'delta': ['delta_1', 'delta_3', 'delta_9', 'delta_27'],
    'beta': ['beta_1', 'beta_3', 'beta_9', 'beta_27']
}

# Perform Kruskal-Wallis test for each parameter
for group_name, group_params in param_groups.items():
    for param in group_params:
        pool_order = param.split('_')[1]  # Extract pool order (1, 3, 9, 27)

        # Collect the non-null values of this parameter for each source;
        # sources without any value are left out of the test.
        groups = []
        for source in df_human_vs_llm_sampled['source'].unique():
            data = df_human_vs_llm_sampled.loc[df_human_vs_llm_sampled['source'] == source, param].dropna()
            if len(data) > 0:
                groups.append(data)

        # Perform Kruskal-Wallis test
        if len(groups) > 1:  # Need at least 2 groups for the test
            kw_results.loc[group_name, pool_order] = sps.kruskal(*groups).pvalue
        else:
            kw_results.loc[group_name, pool_order] = float('nan')

# Format p-values with scientific notation; missing results are shown as "NaN".
# Mapping column by column avoids DataFrame.applymap, which newer pandas
# versions deprecate, while still working on older ones.
kw_results_formatted = kw_results.apply(
    lambda column: column.map(lambda x: f"{x:.2e}" if pd.notnull(x) else "NaN")
)

# Display the results
kw_results_formatted

In [None]:
# Build the path with pathlib instead of a backslash literal: a backslash is
# only a directory separator on Windows, elsewhere it would become part of
# the file name. The output directory is created if it does not exist yet.
pdf_filename = Path("figures") / "human_vs_llm_heatmaps.pdf"
pdf_filename.parent.mkdir(parents=True, exist_ok=True)

# Mathtext symbol used in the figure title for each parameter family
greek_symbols = {
    "gamma": r"$\gamma$",
    "delta": r"$\delta$",
    "beta": r"$\beta$",
}

# One PDF page per parameter
with PdfPages(pdf_filename) as pdf:
    for param in all_params:
        # Titles with greek letters; every family shares one template, so the
        # wording cannot drift between gamma, delta and beta.
        family = param.split("_")[0]
        if family in greek_symbols:
            title = "p-value of Dunn test for {} (order {})".format(
                greek_symbols[family], param.split("_")[1]
            )
        else:
            title = f"p-value of Dunn test for {param}"

        create_heatmap(
            df = df_human_vs_llm_sampled,
            col_value = param,
            col_category = "source",
            title = title
        )
        # Without an argument, savefig() stores pyplot's current figure,
        # i.e. the one create_heatmap has just created.
        pdf.savefig()