In [None]:
import re
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scikit_posthocs as sp
import scipy.stats as sps
import seaborn as sns
from gensim.models import KeyedVectors
from gensim.utils import tokenize
from scipy.optimize import curve_fit

import LRDEstimator
import utils

## Standardized Project Gutenberg Corpus

### Prepare the data

In [None]:
# Read the sampled SPGC metadata table from CSV into a DataFrame.
# The pipeline below reads its "id" and "language" columns.
spgc_metadata_sampled = pd.read_csv("data/spgc_metadata_sampled.csv")

In [None]:
# CoCo is computed on pooled embeddings of order 1, 3, 9 and 27.
# Two decay models are then fitted to each CoCo curve:
#   * power law             -> exponent gamma, scaling parameter alpha_pl
#   * stretched exponential -> delta and beta, shift parameter alpha_se
# An error metric is stored for every fit (error_pl / error_se).
# One result column per (parameter, pool order) pair is created up front and
# filled with NaN, so books whose fit fails simply keep NaN.
new_columns = [
    f"{parameter}_{order}"
    for parameter in (
        # Power law parameters
        "gamma", "alpha_pl", "error_pl",
        # Stretched exponential parameters
        "delta", "beta", "alpha_se", "error_se",
    )
    for order in (1, 3, 9, 27)
]
spgc_metadata_sampled[new_columns] = np.nan

### Define reusable functions

In [None]:
def load_word2vec_embeddings(language_code: str, base_path: str = "embeddings") -> KeyedVectors:
    """
    Read the binary word2vec file for one language.

    The file is expected at ``<base_path>/word2vec_<language_code>.bin`` and is
    returned as a gensim ``KeyedVectors`` object.
    """
    embeddings_file = "{}/word2vec_{}.bin".format(base_path, language_code)
    keyed_vectors = KeyedVectors.load_word2vec_format(embeddings_file, binary=True)
    return keyed_vectors

In [None]:
def load_tokens(file_name: str, base_path: str = "data/SGPC") -> list:
    """
    Read ``<base_path>/<file_name>_tokens.txt`` (UTF-8) and split it on newlines.

    Every line becomes one list entry; a file that ends with a newline therefore
    yields a trailing empty string.

    NOTE(review): the default directory is spelled "SGPC", not "SPGC" - confirm
    that this matches the folder on disk.
    """
    token_file = "{}/{}_tokens.txt".format(base_path, file_name)
    with open(token_file, mode="r", encoding="UTF-8") as handle:
        return handle.read().split("\n")


In [None]:
def compute_coco_values_for_book(vectors: np.ndarray, 
                                 file_name: str,
                                 language: str,
                                 pool_order: int,
                                 coco_results: list,
                                 lag_growth_factor: float = 1.1):
    """
    Compute CoCo values for one book on a geometrically growing grid of lags.

    Lags start at 1 and are multiplied by ``lag_growth_factor`` (rounded up)
    until they reach half the number of vectors. One dict per lag is appended
    to ``coco_results`` with the keys ``language``, ``book_id``, ``pool_order``,
    ``lag`` and ``coco_value``.

    Parameters:
        vectors: array with one row per token embedding.
        file_name: book identifier, stored as ``book_id``.
        language: language label copied into every record.
        pool_order: pooling order passed on to ``LRDEstimator.calculate_coco``.
        coco_results: list that is extended in place.
        lag_growth_factor: multiplicative step between lags; must be > 1.

    Returns:
        None. Results are delivered through ``coco_results``.

    Raises:
        ValueError: if ``lag_growth_factor`` is not greater than 1. Such a value
            can never advance the lag, so the loop would not terminate.
    """
    if not lag_growth_factor > 1:
        raise ValueError(
            f"lag_growth_factor must be greater than 1, got {lag_growth_factor!r}"
        )

    lrd = LRDEstimator.LRDEstimator(vectors)
    max_lag = int(vectors.shape[0] / 2) # TBD: maybe this should be smaller for pooled embeddings?
    current_lag = 1
    
    while current_lag < max_lag:
        # Plain CoCo value (no permutation test).
        coco_value = lrd.calculate_coco(lag=current_lag, pool_order=pool_order)
        
        coco_results.append({
            "language": language,
            "book_id": file_name,
            "pool_order": pool_order,
            "lag": int(current_lag),
            "coco_value": coco_value,
        })
        
        # Grow the lag by lag_growth_factor, rounding up. The max() guarantees
        # progress even if the factor is so close to 1 that rounding absorbs it.
        current_lag = max(
            current_lag + 1,
            int(np.ceil(current_lag * lag_growth_factor)),
        )

In [None]:
def _fit_curve(model_func, lags, abs_coco, bounds):
    """
    Fit ``model_func`` to (lags, abs_coco) with bounded least squares.

    Returns ``(params, error)`` where ``error`` is ``utils.calculate_wssr`` of
    the data against the fitted curve, or ``None`` when no fit is available.
    """
    try:
        params, _ = curve_fit(
            model_func,
            lags,
            abs_coco,
            bounds=bounds,
            maxfev=5000
        )
    except (RuntimeError, ValueError):
        # RuntimeError: the least-squares minimisation did not converge.
        # ValueError: curve_fit rejects NaN/inf input data. Without catching
        # it, one bad CoCo value would abort the whole multi-hour run.
        return None
    fitted_values = model_func(lags, *params)
    return params, utils.calculate_wssr(abs_coco, fitted_values)


def fit_and_store_curves(spgc_df, coco_results, file_name, pool_order):
    """
    Fit a power law and a stretched exponential to one book's CoCo curve.

    The records in ``coco_results`` whose ``book_id`` and ``pool_order`` match
    are selected, both models are fitted to the absolute CoCo values as a
    function of lag, and the results are written into the rows of ``spgc_df``
    whose ``id`` equals ``file_name``:

      * power law: ``alpha_pl_<order>``, ``gamma_<order>``, ``error_pl_<order>``
      * stretched exponential: ``delta_<order>``, ``beta_<order>``,
        ``alpha_se_<order>``, ``error_se_<order>``

    Parameters:
        spgc_df: metadata DataFrame with an ``id`` column; modified in place.
        coco_results: list of CoCo record dicts.
        file_name: book identifier to select.
        pool_order: pooling order to select; also the column-name suffix.

    Returns:
        None. If there are no matching records, or a fit fails, the
        corresponding columns are left untouched (NaN when pre-initialised).
    """
    # Filter the relevant CoCo results
    book_data = [
        r for r in coco_results
        if (r["book_id"] == file_name) and (r["pool_order"] == pool_order)
    ]
    if not book_data:
        return  # No data to fit

    lags = np.array([d["lag"] for d in book_data])
    abs_coco = np.abs(np.array([d["coco_value"] for d in book_data]))

    # Row selector for this book, computed once and reused for every write.
    book_rows = spgc_df["id"] == file_name

    # Power law: first parameter unbounded, second (gamma) constrained to <= 0.
    power_law_fit = _fit_curve(
        utils.power_law,
        lags,
        abs_coco,
        ([-np.inf, -np.inf], [np.inf, 0]),
    )
    if power_law_fit is not None:
        (alpha_pl, gamma), error_pl = power_law_fit
        spgc_df.loc[book_rows, f"alpha_pl_{pool_order}"] = alpha_pl
        spgc_df.loc[book_rows, f"gamma_{pool_order}"] = gamma
        spgc_df.loc[book_rows, f"error_pl_{pool_order}"] = error_pl

    # Stretched exponential: delta >= 0, beta in [0, 1], third parameter unbounded.
    stretched_fit = _fit_curve(
        utils.stretched_exponential,
        lags,
        abs_coco,
        ([0, 0, -np.inf], [np.inf, 1, np.inf]),
    )
    if stretched_fit is not None:
        (delta, beta, alpha_se), error_se = stretched_fit
        spgc_df.loc[book_rows, f"delta_{pool_order}"] = delta
        spgc_df.loc[book_rows, f"beta_{pool_order}"] = beta
        spgc_df.loc[book_rows, f"alpha_se_{pool_order}"] = alpha_se
        spgc_df.loc[book_rows, f"error_se_{pool_order}"] = error_se

In [None]:
def process_language(spgc_df, language, pool_orders, coco_results):
    """
    Run the CoCo and curve-fitting steps for every book of one language.

    The embedding model is loaded once and shared by all books of the language.
    For each book with at least two in-vocabulary tokens, CoCo records are
    appended to ``coco_results`` for every pool order, and the fitted parameters
    are then written into ``spgc_df``. Nothing is returned.
    """
    print(f"Processing language: {language}")
    # The first run of two lowercase letters in the language field is the code
    # (raises IndexError when the field contains none).
    language_code = re.findall(r"[a-z]{2}", language)[0]

    # Rows of the metadata table that belong to this language
    books = spgc_df[spgc_df["language"] == language]

    # One model load per language
    embeddings = load_word2vec_embeddings(language_code)

    for _, book in books.iterrows():
        book_id = book["id"]

        # Embed the book token by token, dropping out-of-vocabulary tokens
        in_vocabulary = [
            embeddings[token]
            for token in load_tokens(book_id)
            if token in embeddings
        ]
        vectors = np.asarray(in_vocabulary)

        # Fewer than two vectors: nothing to correlate
        if len(vectors) < 2:
            continue

        # First pass: CoCo values for every pool order
        for order in pool_orders:
            compute_coco_values_for_book(
                vectors=vectors,
                file_name=book_id,
                language=language,
                pool_order=order,
                coco_results=coco_results,
            )

        # Second pass: fit both curves for every pool order and store the results
        for order in pool_orders:
            fit_and_store_curves(spgc_df, coco_results, book_id, order)

### Run calculations

In [None]:
def main_coco_pipeline(spgc_metadata_sampled):
    """
    Run the full CoCo pipeline over every language in the metadata table.

    Steps:
      1) Determine the unique languages,
      2) Iterate over each language,
      3) Compute CoCo for each pool order,
      4) Fit the curves, and
      5) Store the results.

    Parameters:
        spgc_metadata_sampled: DataFrame with at least the columns ``language``
            and ``id``. It is updated in place with the fitted parameters.

    Returns:
        A tuple ``(spgc_metadata_sampled, coco_results_records)``: the same
        DataFrame object that was passed in, and the list of per-lag CoCo
        record dicts collected across all languages.
    """
    languages = spgc_metadata_sampled["language"].unique()
    # Pool orders must match the suffixes of the result columns
    # (gamma_1, gamma_3, gamma_9, gamma_27, ...). The previous value 0 wrote to
    # stray "*_0" columns and left every "*_1" column empty.
    pool_orders = [1, 3, 9, 27]
    
    # We'll collect results in this list of dictionaries
    coco_results_records = []
    
    for language in languages:
        process_language(
            spgc_df=spgc_metadata_sampled,
            language=language,
            pool_orders=pool_orders,
            coco_results=coco_results_records
        )
    
    return spgc_metadata_sampled, coco_results_records

In [None]:
# Reload the sampled metadata so that re-running this cell always starts from a
# clean frame.
spgc_metadata_sampled = pd.read_csv("data/spgc_metadata_sampled.csv")
# Reloading drops the NaN-initialised result columns created in the preparation
# cell. Add them back so the saved CSV always has the full, consistently ordered
# set of result columns, even for parameters whose fit failed for every book.
spgc_metadata_sampled[new_columns] = np.nan
spgc_metadata_sampled, coco_results = main_coco_pipeline(spgc_metadata_sampled)

# It takes ~2 hours to complete

# Make sure the output folder exists, so the long computation is not lost to a
# missing directory at write time.
Path("results").mkdir(parents=True, exist_ok=True)

# Save the results to a CSV file
spgc_metadata_sampled.to_csv("results/spgc_metadata_sampled_after.csv", index=False)
coco_results_df = pd.DataFrame(coco_results)
coco_results_df.to_csv("results/coco_results.csv", index=False)

## Human vs LLM Corpus

### Prepare the data

In [None]:
# Load the sampled human-vs-LLM texts
df_human_vs_llm_sampled = pd.read_csv("data/human_vs_llm_sampled.csv")

# One NaN-initialised result column per (fit parameter, pool order) pair
new_columns = [
    f"{parameter}_{order}"
    for parameter in (
        # Power law parameters
        "gamma", "alpha_pl", "error_pl",
        # Stretched exponential parameters
        "delta", "beta", "alpha_se", "error_se",
    )
    for order in (1, 3, 9, 27)
]
df_human_vs_llm_sampled[new_columns] = np.nan

In [None]:
# English word2vec embeddings, loaded through the shared helper
# (resolves to "embeddings/word2vec_en.bin", binary format).
model = load_word2vec_embeddings("en")

### Run calculations

In [None]:
# Pooling orders to evaluate; they match the suffixes of the result columns
# (gamma_1, gamma_3, gamma_9, gamma_27, ...).
pool_orders = [1, 3, 9, 27]
# Accumulates one dict per (text, pool order, lag) with the measured CoCo value.
coco_results = []

In [None]:
## Runtime approx. 3h
# For every text: embed its tokens, measure CoCo on a geometric grid of lags for
# each pooling order, fit both decay models and write the fitted parameters back
# into df_human_vs_llm_sampled.
for i, row in df_human_vs_llm_sampled.iterrows():
    # Progress message every 100 rows. It uses the index label, so it assumes
    # the default 0..n-1 index produced by read_csv.
    if (i+1) % 100 == 0:
        print(f"Processing row {i+1}/{len(df_human_vs_llm_sampled)}")
    text = row["text"]
    tokens = list(tokenize(text, lowercase=True))
    # Tokens missing from the embedding vocabulary are dropped
    vectors = np.asarray([model[w] for w in tokens if w in model])
    
    # Skip if not enough vectors
    if len(vectors) < 2:
        continue
    
    # Lags are probed up to half the sequence length (same for every pool order)
    max_lag = int(vectors.shape[0] / 2)
    
    for p_order in pool_orders:
        # Compute CoCo values for this row and pool order.
        # The records are kept in a per-row list so the fit below uses exactly
        # this row's data. Re-selecting them from coco_results by comparing
        # "text" rescanned the whole list for every row, and merged the records
        # of rows that share identical text.
        lrd = LRDEstimator.LRDEstimator(vectors)
        book_data = []
        current_lag = 1
        while current_lag < max_lag:
            coco_value = lrd.calculate_coco(lag=current_lag, pool_order=p_order)
            book_data.append({
                "text": text,
                "source": row["source"],
                "pool_order": p_order,
                "lag": int(current_lag),
                "coco_value": coco_value,
            })
            # Increase lag by ~10%
            current_lag = int(np.ceil(current_lag * 1.1))
        coco_results.extend(book_data)
        
        if not book_data:
            continue  # Text too short to produce any lag
        
        lags = np.array([d["lag"] for d in book_data])
        c_coco = np.array([d["coco_value"] for d in book_data])
        abs_coco = np.abs(c_coco)
        
        # Fit power law
        try:
            popt_pl, _ = curve_fit(
                utils.power_law,
                lags,
                abs_coco,
                bounds=([-np.inf, -np.inf], [np.inf, 0]),
                maxfev=5000
            )
            alpha_col = f"alpha_pl_{p_order}"
            gamma_col = f"gamma_{p_order}"
            df_human_vs_llm_sampled.loc[i, alpha_col] = popt_pl[0]
            df_human_vs_llm_sampled.loc[i, gamma_col] = popt_pl[1]

            # Calculate the fitted values
            fitted_values = utils.power_law(lags, *popt_pl)
            # Calculate the error
            error = utils.calculate_wssr(abs_coco, fitted_values)
            error_col = f"error_pl_{p_order}"
            df_human_vs_llm_sampled.loc[i, error_col] = error

        except (RuntimeError, ValueError):
            # RuntimeError: the optimiser did not converge.
            # ValueError: curve_fit rejects NaN/inf data. Either way the
            # columns stay NaN instead of aborting the multi-hour run.
            pass
        
        # Fit stretched exponential
        try:
            popt_se, _ = curve_fit(
                utils.stretched_exponential,
                lags,
                abs_coco,
                bounds=([0, 0, -np.inf], [np.inf, 1, np.inf]),
                maxfev=5000
            )
            delta_col = f"delta_{p_order}"
            beta_col = f"beta_{p_order}"
            alpha_col = f"alpha_se_{p_order}"
            df_human_vs_llm_sampled.loc[i, delta_col] = popt_se[0]
            df_human_vs_llm_sampled.loc[i, beta_col] = popt_se[1]
            df_human_vs_llm_sampled.loc[i, alpha_col] = popt_se[2]

            # Calculate the fitted values
            fitted_values = utils.stretched_exponential(lags, *popt_se)
            # Calculate the error
            error = utils.calculate_wssr(abs_coco, fitted_values)
            error_col = f"error_se_{p_order}"
            df_human_vs_llm_sampled.loc[i, error_col] = error
            
        except (RuntimeError, ValueError):
            pass  # If fitting fails (see above), leave as NaN


In [None]:
# Make sure the output folder exists, so the long computation is not lost to a
# missing directory at write time.
Path("results").mkdir(parents=True, exist_ok=True)

# Save the results to a CSV file
df_human_vs_llm_sampled.to_csv("results/human_vs_llm_sampled_after.csv", index=False)
coco_results_df = pd.DataFrame(coco_results)
coco_results_df.to_csv("results/human_vs_llm_coco_results.csv", index=False)