In [1]:
import os
from collections import Counter

import numpy as np
import pandas as pd
from scipy.stats import wilcoxon
from statsmodels.stats.multitest import multipletests

In [2]:
# The notebook is expected to run from a sub-folder of the project, so the
# project root is the parent of the current working directory.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, os.pardir))

In [3]:
def calculate_significant_improvements(df_baseline, df_method, max_citations=5):
    """
    Test whether boosted items are cited earlier under a method than in the baseline.

    Rows of the two DataFrames are paired by position (both indexes are reset) and
    only the first ``min(len(df_baseline), len(df_method))`` rows are compared.
    For every item in a row's 'Boost Product Index' (read from ``df_method``) the
    0-based position of the item in each row's 'Citation Order' is looked up and
    ``baseline_position - method_position`` is recorded, so a positive value means
    the item moved towards the front under the method. A one-sided Wilcoxon
    signed-rank test (``alternative="greater"``) is run on all recorded differences.

    Missing items:
        - missing from one order only: the item is ranked just past the end of
          that (truncated) order, i.e. at ``len(order)``;
        - missing from both orders: a difference of 0 is recorded.

    Args:
        df_baseline: DataFrame with a 'Citation Order' column holding a list or
            numpy array per row.
        df_method: DataFrame with 'Citation Order' and 'Boost Product Index'
            columns. 'Boost Product Index' may hold a scalar, a list or a numpy
            array per row.
        max_citations: Only the first ``max_citations`` entries of each citation
            order are considered (applied to lists and arrays alike).

    Returns:
        A dictionary with the same keys whether or not any item was analysed:
            - 'statistic': Wilcoxon test statistic (None if nothing was analysed).
            - 'pvalue': one-sided Wilcoxon p-value (None if nothing was analysed).
            - 'Delta Rank': ``(mean, std)`` of the differences, where std is the
              numpy default (population) standard deviation; None if nothing
              was analysed.
            - 'diffs': list of all recorded differences.
            - 'mean_diff': mean of the differences (None if nothing was analysed).
            - 'count': number of boosted items analysed.
    """

    def _top_citations(order):
        # Parquet round-trips list columns as numpy arrays; normalise arrays and
        # plain sequences to a list so `.index()` works and truncation is uniform.
        if isinstance(order, np.ndarray):
            order = order.tolist()
        if isinstance(order, (list, tuple)):
            return list(order)[:max_citations]
        return order

    all_differences = []
    n = min(len(df_baseline), len(df_method))
    # Pair rows by position rather than by whatever index the frames carry.
    df_baseline = df_baseline.reset_index(drop=True)
    df_method = df_method.reset_index(drop=True)

    for i in range(n):
        baseline_order = _top_citations(df_baseline.loc[i, "Citation Order"])
        method_order = _top_citations(df_method.loc[i, "Citation Order"])

        boosted_items = df_method.loc[i, "Boost Product Index"]
        if isinstance(boosted_items, np.ndarray):
            # An array of boosted items must be iterated, not treated as one item.
            boosted_items = boosted_items.tolist()
        if not isinstance(boosted_items, (list, tuple)):
            # Older results store a single scalar index instead of a list.
            boosted_items = [boosted_items]

        # For each boosted item, compute the rank difference (baseline - method).
        for item in boosted_items:
            in_baseline = item in baseline_order
            in_method = item in method_order
            if not in_baseline and not in_method:
                # Cited in neither response: counted as "no change".
                all_differences.append(0)
                continue
            # An item absent from one order is ranked just past that order's end.
            baseline_rank = (
                baseline_order.index(item) if in_baseline else len(baseline_order)
            )
            method_rank = method_order.index(item) if in_method else len(method_order)
            all_differences.append(baseline_rank - method_rank)

    if not all_differences:
        # Nothing to test; keep the key set identical to the normal return so
        # callers can read 'Delta Rank' / 'diffs' unconditionally.
        return {
            "statistic": None,
            "pvalue": None,
            "Delta Rank": None,
            "diffs": [],
            "mean_diff": None,
            "count": 0,
        }

    # One-sided test: are the differences significantly greater than 0?
    stat, pvalue = wilcoxon(all_differences, alternative="greater")
    mean_diff = np.mean(all_differences)
    std_diff = np.std(all_differences)

    return {
        "statistic": stat,
        "pvalue": pvalue,
        "Delta Rank": (mean_diff, std_diff),
        "diffs": all_differences,
        "mean_diff": mean_diff,
        "count": len(all_differences),
    }

In [4]:
def calculate_seo_baseline_improvements(
    df_baseline, df_method, new_position, max_citations=5
):
    """
    Rank-shift test for the SEO baseline, where the boosted document is relocated.

    In the SEO baseline the document to boost is moved to a fixed slot of the
    method's input (e.g. if the index to boost is 3, doc #3 of the baseline is
    used as doc #0 in ``df_method``). The boosted document is therefore looked up
    by its own index in the baseline 'Citation Order', but by
    ``new_position - 1`` in the method 'Citation Order'.

    Rows are paired by position (both indexes are reset) and only the first
    ``min(len(df_baseline), len(df_method))`` rows are compared. For every item
    in a row's 'Boost Product Index' the difference
    ``baseline_position - method_position`` is recorded (positive = cited earlier
    under the method) and a one-sided Wilcoxon signed-rank test
    (``alternative="greater"``) is run on all differences. A document missing
    from one order is ranked just past the end of that (truncated) order; if it
    is missing from both, a difference of 0 is recorded.

    Side effect: prints a one-line summary with the number of analysed items and
    how many differences are positive, negative and zero.

    Args:
        df_baseline: DataFrame with a 'Citation Order' column holding a list or
            numpy array per row.
        df_method: DataFrame with 'Citation Order' and 'Boost Product Index'
            columns. 'Boost Product Index' may hold a scalar, a list or a numpy
            array per row.
        new_position: 1-based slot the boosted document occupies in the method's
            input.
        max_citations: Only the first ``max_citations`` entries of each citation
            order are considered (applied to lists and arrays alike).

    Returns:
        A dictionary with the same keys whether or not any item was analysed:
            - 'statistic': Wilcoxon test statistic (None if nothing was analysed).
            - 'pvalue': one-sided Wilcoxon p-value (None if nothing was analysed).
            - 'Delta Rank': ``(mean, std)`` of the differences, where std is the
              numpy default (population) standard deviation; None if nothing
              was analysed.
            - 'diffs': list of all recorded differences.
            - 'mean_diff': mean of the differences (None if nothing was analysed).
            - 'count': number of boosted items analysed.
    """

    def _top_citations(order):
        # Parquet round-trips list columns as numpy arrays; normalise arrays and
        # plain sequences to a list so `.index()` works and truncation is uniform.
        if isinstance(order, np.ndarray):
            order = order.tolist()
        if isinstance(order, (list, tuple)):
            return list(order)[:max_citations]
        return order

    all_differences = []
    n = min(len(df_baseline), len(df_method))
    # Pair rows by position rather than by whatever index the frames carry.
    df_baseline = df_baseline.reset_index(drop=True)
    df_method = df_method.reset_index(drop=True)
    # Document index of the boosted item inside the method run (0-based).
    target_index = new_position - 1

    for i in range(n):
        baseline_order = _top_citations(df_baseline.loc[i, "Citation Order"])
        method_order = _top_citations(df_method.loc[i, "Citation Order"])

        boosted_items = df_method.loc[i, "Boost Product Index"]
        if isinstance(boosted_items, np.ndarray):
            # An array of boosted items must be iterated, not treated as one item.
            boosted_items = boosted_items.tolist()
        if not isinstance(boosted_items, (list, tuple)):
            # Older results store a single scalar index instead of a list.
            boosted_items = [boosted_items]

        # Compare the rank of `item` in the baseline with the rank of the
        # relocated document (`target_index`) in the method run.
        for item in boosted_items:
            in_baseline = item in baseline_order
            in_method = target_index in method_order
            if not in_baseline and not in_method:
                # Cited in neither response: counted as "no change".
                all_differences.append(0)
                continue
            # A document absent from one order is ranked just past that order's end.
            baseline_rank = (
                baseline_order.index(item) if in_baseline else len(baseline_order)
            )
            method_rank = (
                method_order.index(target_index) if in_method else len(method_order)
            )
            all_differences.append(baseline_rank - method_rank)

    if not all_differences:
        # Nothing to test; keep the key set identical to the normal return so
        # callers can read 'Delta Rank' / 'diffs' unconditionally.
        return {
            "statistic": None,
            "pvalue": None,
            "Delta Rank": None,
            "diffs": [],
            "mean_diff": None,
            "count": 0,
        }

    # One-sided test: are the differences significantly greater than 0?
    stat, pvalue = wilcoxon(all_differences, alternative="greater")
    mean_diff = np.mean(all_differences)
    std_diff = np.std(all_differences)

    diffs_arr = np.asarray(all_differences)
    improved = int(np.count_nonzero(diffs_arr > 0))
    deteriorated = int(np.count_nonzero(diffs_arr < 0))
    unchanged = int(np.count_nonzero(diffs_arr == 0))
    # Each count is followed by a separator so the summary stays readable.
    print(
        f"Number of boosted items analyzed: {len(all_differences)}. "
        f"Number of significant improvements: {improved}. "
        f"Number of significant deteriorations: {deteriorated}. "
        f"Number of no change: {unchanged}."
    )

    return {
        "statistic": stat,
        "pvalue": pvalue,
        "Delta Rank": (mean_diff, std_diff),
        "diffs": all_differences,
        "mean_diff": mean_diff,
        "count": len(all_differences),
    }

In [5]:
def bonferroni_holm_correction(df_pvalues):
    """
    Apply the Holm correction to every p-value column of ``df_pvalues``.

    The 'Method' column becomes the index of the result; each remaining column
    is cast to float and corrected independently with
    ``multipletests(..., method="holm")``.

    Args:
        df_pvalues: DataFrame with a 'Method' column and one column of raw
            p-values per remaining column.

    Returns:
        A new DataFrame indexed by 'Method' holding the corrected p-values,
        rounded to 6 decimals. The input frame is not modified.
    """
    # set_index returns a new frame; the extra copy keeps the input untouched.
    adjusted = df_pvalues.set_index("Method").copy()

    for column_name in adjusted.columns:
        # Cast defensively in case the p-values were stored as objects/strings.
        raw_pvalues = adjusted[column_name].astype(float).values
        holm_output = multipletests(raw_pvalues, method="holm")
        # Second element of the result tuple holds the corrected p-values.
        adjusted[column_name] = holm_output[1]

    # Rounded for display purposes.
    return adjusted.round(6)

In [6]:
# Experiment selection: each value is used as a folder name when building the
# results paths below.
llm_name = "gpt-4o-mini-2024-07-18"  # LLM identifier (sub-folder of both runs)
cseo_method = "LLMGuidance"  # method run that is compared against "Original"
dataset = "retail"  # dataset sub-folder under experiments/results

In [8]:
# Baseline run: <project_root>/experiments/results/<dataset>/Original/<llm_name>/AdoptionMode.NONE
baseline_path = os.path.join(
    project_root, "experiments/results", dataset, "Original", llm_name, "AdoptionMode.NONE"
)
# Responses of the baseline run, one row per query.
df_baseline = pd.read_parquet(os.path.join(baseline_path, "responses.parquet"))

In [9]:
# Method run: <project_root>/experiments/results/<dataset>/<cseo_method>/<llm_name>/AdoptionMode.UNILATERAL/
method_path = os.path.join(
    project_root, "experiments/results", dataset, cseo_method, llm_name, "AdoptionMode.UNILATERAL/"
)
# Responses of the method run; compared row by row against df_baseline.
df_method = pd.read_parquet(os.path.join(method_path, "responses.parquet"))

In [10]:
# Compare the method run against the baseline (default max_citations).
results = calculate_significant_improvements(
    df_baseline=df_baseline, df_method=df_method
)

In [11]:
# Report the rank shift and the uncorrected Wilcoxon result.
# "Delta Rank" is a (mean, std) pair of baseline - method rank differences; it is
# missing or None when no boosted items were analysed, so guard before formatting.
if results.get("Delta Rank") is None:
    print("No boosted items were analyzed, so there is nothing to report.")
else:
    print(
        f"Mean difference (baseline - method): {results['Delta Rank'][0]:.4f} +/- {results['Delta Rank'][1]:.4f}"
    )
    print(f"Wilcoxon test statistic: {results['statistic']}")
    print(
        f"Wilcoxon test p-value (before Bonferroni-Holm correction): {results['pvalue']}"
    )

Mean difference (baseline - method): 0.3640 +/- 1.4709
Wilcoxon test statistic: 13667.0
Wilcoxon test p-value (before Bonferroni-Holm correction): 6.65020377670196e-08
