In [None]:
from gensim.models import KeyedVectors
from gensim.utils import tokenize
import numpy as np
import pandas as pd
import LRDEstimator
from scipy.optimize import curve_fit
import utils
import matplotlib.pyplot as plt
import os

In [None]:
# Load the sampled corpus and keep only the rows whose source is GPT-3.5.
corpus_raw = pd.read_csv("data/human_vs_llm_sampled.csv")
corpus_gpt35 = corpus_raw[corpus_raw["source"] == "GPT-3.5"]

# Among those rows, keep the one(s) whose text_length equals the maximum.
df_human_vs_llm_sampled = corpus_gpt35[
    corpus_gpt35["text_length"] == corpus_gpt35["text_length"].max()
]
if df_human_vs_llm_sampled.empty:
    # Fail here with a clear message instead of producing an empty `vectors`
    # array that only breaks later inside the curve fits.
    raise ValueError(
        "No GPT-3.5 rows found in data/human_vs_llm_sampled.csv; cannot select a text to analyse."
    )

# Exactly one document is analysed. If several rows tie for the maximum length,
# the last one is used (the same document the former per-row loop ended up with,
# without tokenizing and discarding the others).
longest_text = df_human_vs_llm_sampled["text"].iloc[-1]
tokens = list(tokenize(str(longest_text), lowercase=True))

# Pre-trained word2vec embeddings stored in the binary word2vec format.
model_path = "embeddings/word2vec_en.bin"
model = KeyedVectors.load_word2vec_format(model_path, binary=True)

# Embed the tokens in document order; tokens missing from the model are skipped.
# NOTE(review): presumably yields an (n_tokens, embedding_dim) array — confirm.
vectors = np.asarray([model[w] for w in tokens if w in model])

In [None]:
# Build the estimator over the embedded token sequence; the cells below query it
# through calculate_corr(lag) and calculate_coco(lag, pool_order=...).
# NOTE(review): assumes `vectors` is a 2-D (n_tokens, embedding_dim) array —
# confirm against LRDEstimator's expected input.
lrd = LRDEstimator.LRDEstimator(vectors)

In [None]:
def _geometric_lags(upper, growth):
    """Return the increasing integer lags 1, ... that are strictly below ``upper``.

    Each lag is followed by ``int(np.ceil(lag * growth))``. The step is forced to
    be at least 1 so that a growth factor <= 1 cannot stall the loop; for any
    growth factor > 1 the sequence is identical to the plain ceil rule.

    Parameters
    ----------
    upper : int
        Exclusive upper bound on the generated lags.
    growth : float
        Multiplicative factor applied to the current lag at every step.

    Returns
    -------
    list of int
        The lag grid; empty when ``upper <= 1``.
    """
    grid = []
    lag = 1
    while lag < upper:
        grid.append(lag)
        lag = max(lag + 1, int(np.ceil(lag * growth)))
    return grid


# Lags are evaluated up to (but excluding) half the number of rows in `vectors`.
max_lag = int(vectors.shape[0] / 2)
lag_growth_factor = 1.1

# Every estimator is evaluated on the same lag grid; each result keeps its own
# list copy so the lists can be modified independently later on.
lag_grid = _geometric_lags(max_lag, lag_growth_factor)

# Results for the standard correlation
lags_corr = list(lag_grid)
corr_standard = [lrd.calculate_corr(lag) for lag in lag_grid]

# Results for coco with pool_order=0
lags_coco0 = list(lag_grid)
corr_coco0 = [lrd.calculate_coco(lag, pool_order=0) for lag in lag_grid]

# Results for coco with pool_order=3
lags_coco3 = list(lag_grid)
corr_coco3 = [lrd.calculate_coco(lag, pool_order=3) for lag in lag_grid]

# Results for coco with pool_order=9
lags_coco9 = list(lag_grid)
corr_coco9 = [lrd.calculate_coco(lag, pool_order=9) for lag in lag_grid]

# Results for coco with pool_order=27
lags_coco27 = list(lag_grid)
corr_coco27 = [lrd.calculate_coco(lag, pool_order=27) for lag in lag_grid]

In [None]:
def _fit_decay_models(lags, corr, min_lag=None, max_lag=None):
    """Fit a power law and a stretched exponential to ``|corr|`` as a function of lag.

    Parameters
    ----------
    lags, corr : sequence
        Lags and the corresponding correlation values (same length).
    min_lag, max_lag : number, optional
        Inclusive fit window. When both are ``None`` every point is used;
        otherwise only points with ``min_lag <= lag <= max_lag`` enter the fit
        (a missing bound is treated as unbounded on that side).

    Returns
    -------
    popt_pl : ndarray
        Optimal parameters for ``utils.power_law``; the second parameter is
        constrained to be <= 0 by the bounds.
    popt_se : ndarray
        Optimal parameters for ``utils.stretched_exponential``; the first is
        constrained to be >= 0 and the second to lie in [0, 1].
    lags_fit, corr_fit : ndarray
        The (possibly windowed) data that was actually fitted; ``corr_fit`` is
        returned before taking the absolute value.
    """
    lags_fit = np.asarray(lags)
    corr_fit = np.asarray(corr)

    if min_lag is not None or max_lag is not None:
        lower = -np.inf if min_lag is None else min_lag
        upper = np.inf if max_lag is None else max_lag
        window = (lags_fit >= lower) & (lags_fit <= upper)
        lags_fit = lags_fit[window]
        corr_fit = corr_fit[window]

    # Both models are fitted to the magnitude of the correlation.
    magnitude = np.abs(corr_fit)

    popt_pl, _ = curve_fit(
        utils.power_law,
        lags_fit,
        magnitude,
        bounds=([-np.inf, -np.inf], [np.inf, 0]),
        maxfev=5000
    )

    popt_se, _ = curve_fit(
        utils.stretched_exponential,
        lags_fit,
        magnitude,
        bounds=([0, 0, -np.inf], [np.inf, 1, np.inf]),
        maxfev=5000
    )

    return popt_pl, popt_se, lags_fit, corr_fit


# Upper end of the fit window for the coco estimators. Kept in its own constant
# so the global `max_lag` used for lag generation is not overwritten here.
FIT_MAX_LAG = 1000

# Fit power law and stretched exponential for standard correlation (all lags)
popt_pl_standard, popt_se_standard = _fit_decay_models(lags_corr, corr_standard)[:2]

# Fit power law and stretched exponential for coco0.
# For every coco variant the fit window starts at pool_order + 1.
popt_pl_coco0, popt_se_coco0, lags_coco0_fit, corr_coco0_fit = _fit_decay_models(
    lags_coco0, corr_coco0, min_lag=0 + 1, max_lag=FIT_MAX_LAG
)

# Fit power law and stretched exponential for coco3
popt_pl_coco3, popt_se_coco3, lags_coco3_fit, corr_coco3_fit = _fit_decay_models(
    lags_coco3, corr_coco3, min_lag=3 + 1, max_lag=FIT_MAX_LAG
)

# Fit power law and stretched exponential for coco9
popt_pl_coco9, popt_se_coco9, lags_coco9_fit, corr_coco9_fit = _fit_decay_models(
    lags_coco9, corr_coco9, min_lag=9 + 1, max_lag=FIT_MAX_LAG
)

# Fit power law and stretched exponential for coco27
popt_pl_coco27, popt_se_coco27, lags_coco27_fit, corr_coco27_fit = _fit_decay_models(
    lags_coco27, corr_coco27, min_lag=27 + 1, max_lag=FIT_MAX_LAG
)

In [None]:
def plot_fits(lags, corr, popt_pl, popt_se, title_prefix, save=True, output_dir=None):
    """Plot |corr| against lag on log-log axes together with both fitted curves.

    Parameters
    ----------
    lags : sequence
        Lag values used for the x axis and for evaluating both fitted models.
    corr : sequence
        Correlation values; their absolute value is plotted.
    popt_pl : sequence
        Parameters passed to ``utils.power_law``; the first two appear in the legend.
    popt_se : sequence
        Parameters passed to ``utils.stretched_exponential``; the first three
        appear in the legend.
    title_prefix : str
        Used only to derive the output filename (spaces become underscores and
        ``=``, ``(``, ``)`` are removed). No title is drawn on the figure.
    save : bool, default True
        When true, the figure is written as ``<cleaned title_prefix>.pdf``.
    output_dir : str, optional
        Directory to save into. When omitted, the module-level ``save_dir`` is
        used, so that global must be defined before calling with ``save=True``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()`` and optionally saved.
    """
    lag_values = np.array(lags)

    fig, ax = plt.subplots(figsize=(12, 6))

    # Data points
    ax.loglog(lags, np.abs(corr), 'o', label='Data points')

    # Power law fit
    ax.loglog(lags, utils.power_law(lag_values, *popt_pl), 'r-',
              label=f'Power law: y = {popt_pl[0]:.4f}x^({popt_pl[1]:.4f})')

    # Stretched exponential fit
    ax.loglog(lags, utils.stretched_exponential(lag_values, *popt_se), 'g-',
              label=f'Stretched exp: y = exp(-{popt_se[0]:.4f}(x^{popt_se[1]:.4f}) + {popt_se[2]:.4f})')

    ax.set_xlabel('n')
    ax.set_ylabel('')
    ax.grid(True, which="both", ls="--")
    ax.legend()
    # Fixed y range for every figure produced by this function.
    ax.set_ylim(1e-6, 1e0)
    fig.tight_layout()

    if save:
        target_dir = save_dir if output_dir is None else output_dir
        # Create the directory here so saving does not depend on another cell
        # having created it first.
        os.makedirs(target_dir, exist_ok=True)
        # Clean filename by removing spaces and special characters
        filename = title_prefix.replace(" ", "_").replace("=", "").replace("(", "").replace(")", "")
        filepath = os.path.join(target_dir, f"{filename}.pdf")
        fig.savefig(filepath, dpi=300, bbox_inches='tight')

    plt.show()

In [None]:
# Directory that receives the PDF figures; plot_fits reads this global.
save_dir = "figures"
# exist_ok=True replaces the check-then-create pattern and is safe on re-runs.
os.makedirs(save_dir, exist_ok=True)

# One entry per figure: (output name, lags, correlations, power-law params,
# stretched-exponential params). Order matches the order the figures are shown.
# NOTE(review): the pool_order=0 run is saved as 'Coco_1_HLLMTC' while the other
# names carry the pool_order value — confirm this naming is intentional.
figure_specs = [
    # Standard correlation
    ('Corr_HLLMTC', lags_corr, corr_standard, popt_pl_standard, popt_se_standard),
    # COCO with pool_order=0
    ('Coco_1_HLLMTC', lags_coco0, corr_coco0, popt_pl_coco0, popt_se_coco0),
    # COCO with pool_order=3
    ('Coco_3_HLLMTC', lags_coco3, corr_coco3, popt_pl_coco3, popt_se_coco3),
    # COCO with pool_order=9
    ('Coco_9_HLLMTC', lags_coco9, corr_coco9, popt_pl_coco9, popt_se_coco9),
    # COCO with pool_order=27
    ('Coco_27_HLLMTC', lags_coco27, corr_coco27, popt_pl_coco27, popt_se_coco27),
]

for figure_name, lag_values, corr_values, pl_params, se_params in figure_specs:
    plot_fits(lag_values, corr_values, pl_params, se_params, figure_name, save=True)