In [None]:
# External modules
import numpy as np
import gensim.downloader as api
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import os.path

# Internal modules
from utils import power_law, stretched_exponential

Load the pre-trained *word2vec* embedding space. Below we load the embeddings for an English vocabulary of about 3 million words.

In [None]:
# Module-level embedding lookup: `load_and_embed` and the per-text loop further down read it as a global.
word2vec = api.load("word2vec-google-news-300")

In [None]:
def load_and_embed(file_path: str) -> np.ndarray:
    """
    Read a token file (one token per line) and embed every known token.

    Tokens that are not present in the module-level `word2vec` lookup are
    skipped. The share of tokens that were embedded is printed as "Coverage".

    Parameters
    ----------
    file_path : str
        Path to a UTF-8 text file with tokens separated by newlines.

    Returns
    -------
    np.ndarray
        Embeddings of the known tokens, in file order.

    Raises
    ------
    ValueError
        If `file_path` does not exist.
    """
    if not os.path.exists(file_path):
        raise ValueError("File does not exists or incorrect path provided.")

    # The context manager closes the handle even if reading fails
    with open(file_path, mode="r", encoding="UTF-8") as text_file:
        tokens = text_file.read().split("\n")

    vectors = np.asarray([word2vec[w] for w in tokens if w in word2vec])
    print(f"Coverage: {len(vectors) / len(tokens):.4f}.")
    return vectors

The strategy in `LRDEstimator` is to defer heavy computations until they are needed. An example of a heavy computation is building prefix sums for large arrays. Once such a result has been computed, it is cached inside the class and reused.

In [None]:
class LRDEstimator:
    """
    Lag-dependence estimators for a sequence of embedding vectors.

    Two estimators are provided:

    * `calculate_corr` - Pearson correlation between the per-vector
      "polarity" (sum of components) at positions t and t + lag.
    * `calculate_coco` - E[U.V] - E[U].E[V] for unit-normalized vectors
      U = X[t], V = X[t + lag], optionally computed on pooled vectors.

    Heavy intermediates (normalized embeddings and their prefix sums) are
    computed on first use and cached on the instance.
    """

    def __init__(self, vectors: np.ndarray):
        """
        Parameters
        ----------
        vectors : np.ndarray
            Array of shape (N, d): N vectors of dimension d, in sequence order.

        Raises
        ------
        ValueError
            If `vectors` is not 2-dimensional (e.g. an empty 1-D array).
        """
        vectors = np.asarray(vectors)
        if vectors.ndim != 2:
            raise ValueError(
                f"vectors must be a 2-D array of shape (N, d), got shape {vectors.shape}"
            )
        self.vectors = vectors
        self.N, self.d = vectors.shape

        # Cached data
        self.X_unpooled = None          # Normalized unpooled embeddings, shape (N, d)
        self.S_unpooled = None          # Prefix sums of X_unpooled, shape (N+1, d)
        self.current_pool_order = None  # Which pool_order is cached
        self.X_pooled = None            # Normalized pooled embeddings, shape (N, d)
        self.S_pooled = None            # Prefix sums of X_pooled, shape (N+1, d)

        self.polarities = None                  # For correlation-based method
        self._polarities_standardized = None    # `standardize` flag the cached polarities were built with

    @staticmethod
    def _normalize_rows(rows: np.ndarray) -> np.ndarray:
        """Scale every row to unit Euclidean norm; all-zero rows are left as zeros."""
        norms = np.linalg.norm(rows, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # Avoid division by zero
        return rows / norms

    @staticmethod
    def _prefix_sums(rows: np.ndarray) -> np.ndarray:
        """Return S of shape (len(rows) + 1, d) with S[i] = rows[0] + ... + rows[i-1]."""
        sums = np.zeros((rows.shape[0] + 1, rows.shape[1]))
        # Accumulate in float64: differences of long float32 running sums lose precision
        sums[1:] = np.cumsum(rows, axis=0, dtype=np.float64)
        return sums

    def _validate_lag(self, lag: int) -> int:
        """Return the number of (t, t + lag) pairs, raising ValueError for an unusable lag."""
        if lag < 0:
            raise ValueError(f"lag={lag} must be non-negative")
        length = self.N - lag
        if length <= 0:
            raise ValueError(f"lag={lag} is too large for sequence of length {self.N}")
        return length

    def calculate_polarities(self, standardize: bool = False):
        """
        Compute and cache one scalar "polarity" per vector: the sum of its components.

        With `standardize=True` every embedding dimension is first standardized
        across the sequence (zero mean, unit variance over axis 0). Standardizing
        each vector over its own components instead would make every polarity
        exactly zero, because the components of a centered vector sum to zero.
        """
        if standardize:
            means = np.mean(self.vectors, axis=0, keepdims=True)
            stds = np.std(self.vectors, axis=0, keepdims=True)
            stds[stds == 0] = 1.0  # Avoid division by zero - constant dimensions are only centered
            vectors_standardized = (self.vectors - means) / stds
            self.polarities = np.sum(vectors_standardized, axis=1)
        else:
            self.polarities = np.sum(self.vectors, axis=1)
        self._polarities_standardized = bool(standardize)

    def calculate_corr(self, lag: int, standardize: bool = False):
        """
        Pearson correlation between polarities at positions t and t + lag.

        Raises
        ------
        ValueError
            If `lag` is negative or not smaller than the sequence length.
        """
        length = self._validate_lag(lag)

        # Recompute when nothing is cached or the cache was built with the other flag
        if self.polarities is None or self._polarities_standardized != bool(standardize):
            self.calculate_polarities(standardize)

        # Explicit end index: `[:-lag]` would be empty for lag == 0
        return np.corrcoef(self.polarities[:length], self.polarities[lag:])[0, 1]

    def compute_unpooled(self):
        """
        Compute and cache normalized unpooled embeddings and their prefix sums if not already cached.
        """
        if self.X_unpooled is None:
            self.X_unpooled = self._normalize_rows(self.vectors)
            self.S_unpooled = self._prefix_sums(self.X_unpooled)

    def compute_pooled(self, pool_order: int):
        """
        Compute and cache normalized pooled embeddings and their prefix sums if not already cached.

        Only one pool order is cached at a time; asking for a different one
        replaces the cached arrays.
        """
        # If we already have computed them then do nothing
        if self.current_pool_order == pool_order and self.X_pooled is not None:
            return

        self.X_pooled = self._normalize_rows(self.pool_embeddings(pool_order))
        self.S_pooled = self._prefix_sums(self.X_pooled)

        # Update pool order cache
        self.current_pool_order = pool_order

    def pool_embeddings(self, pool_order: int) -> np.ndarray:
        """
        Return pooled embeddings of shape (N, d).

        Row i is the sum of vectors[i], ..., vectors[min(i + pool_order, N - 1)],
        i.e. up to `pool_order + 1` consecutive vectors, truncated at the end
        of the sequence.
        """
        # Prefix sums: P[i] = F[0] + F[1] + ... + F[i-1]
        P = self._prefix_sums(self.vectors)

        starts = np.arange(self.N)
        ends = np.minimum(starts + pool_order, self.N - 1) + 1
        return P[ends] - P[starts]

    def calculate_coco(self, lag: int, pool_order: int = 0) -> float:
        """
        Compute E[U.V] - E[U].E[V] with U = X[0..N-lag-1] and V = X[lag..N-1].

        X holds the unit-normalized embeddings (pool_order == 0) or the
        unit-normalized pooled embeddings (pool_order != 0).

        Raises
        ------
        ValueError
            If `lag` is negative or not smaller than the sequence length.
        """
        length = self._validate_lag(lag)

        if pool_order == 0:
            self.compute_unpooled()
            X, S = self.X_unpooled, self.S_unpooled
        else:
            self.compute_pooled(pool_order)
            X, S = self.X_pooled, self.S_pooled

        E_U = (S[length] - S[0]) / length   # mean of X[0..N-lag-1]
        E_V = (S[self.N] - S[lag]) / length  # mean of X[lag..N-1]

        dot_products = np.sum(X[:length] * X[lag:], axis=1)
        E_UV = np.mean(dot_products, dtype=np.float64)
        return E_UV - np.dot(E_U, E_V)

In [None]:
def fit_and_plot_lrd(n, c, plot_title, **kwargs):
    """
    Fit `power_law` and `stretched_exponential` to the points (n, c) and show
    the data together with both fitted curves on log-log axes.

    Parameters
    ----------
    n : array-like
        x values (the callers in this notebook pass lags).
    c : array-like
        y values, same length as `n`.
    plot_title : str
        Title of the figure.
    **kwargs
        Forwarded to the scatter call (e.g. `s=1` for the marker size).
    """
    lags = np.array(n)
    values = np.array(c)

    # Second stretched-exponential parameter is restricted to [0, 1], the first to >= 0
    se_bounds = ([0, 0, -np.inf], [np.inf, 1, np.inf])
    power_law_params = curve_fit(power_law, lags, values, maxfev=5000)[0]
    stretched_exp_params = curve_fit(
        stretched_exponential, lags, values, bounds=se_bounds, maxfev=5000
    )[0]

    # 200 log-spaced x positions spanning the data range, for smooth curves
    grid = np.logspace(np.log10(lags.min()), np.log10(lags.max()), 200)

    fig, ax = plt.subplots()
    ax.scatter(lags, values, **kwargs)
    ax.plot(grid, power_law(grid, *power_law_params), label="Power law")
    ax.plot(grid, stretched_exponential(grid, *stretched_exp_params), label="Stretched exp.")

    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.legend()
    ax.set_title(label=plot_title)
    fig.tight_layout()
    plt.show()

In [None]:
# Only the first corpus is analysed here; the full set would be
# ["PG" + str(i) for i in range(1, 10)].
files = ["PG1"]

for file in files:
    file_path = f"data/{file}_tokens.txt"
    vectors = load_and_embed(file_path)
    lrd = LRDEstimator(vectors)

    # Lags from 1 up to (but excluding) half of the sequence length
    max_lag = vectors.shape[0] // 2
    n = range(1, max_lag)

    # One estimate per lag for each of the three estimators
    c_coco = [lrd.calculate_coco(lag) for lag in n]
    c_corr = [lrd.calculate_corr(lag) for lag in n]
    c_coco_8 = [lrd.calculate_coco(lag, pool_order=8) for lag in n]

    # Absolute values, because the fits are drawn on log-log axes
    fit_and_plot_lrd(n, np.abs(c_coco), f"|Coco| of {file}", s=1)
    fit_and_plot_lrd(n, np.abs(c_coco_8), f"|Coco| of order 8 of {file}", s=1)
    fit_and_plot_lrd(n, np.abs(c_corr), f"|Corr| of {file}", s=1)

# Human vs LLM Text Corpus

Source: https://www.kaggle.com/datasets/starblasters8/human-vs-llm-text-corpus

In [None]:
import pandas as pd
from gensim.utils import tokenize

In [None]:
# Forward slash: works on every OS and avoids the invalid "\H" escape sequence in the literal
df_human_llm = pd.read_csv("data/Human_vs_LLM_Text_Corpus.csv")

In [None]:
# Preview the first 10 rows of the corpus
df_human_llm.head(10)

In [None]:
# Number of rows per value of the "source" column
df_human_llm.value_counts("source")

In [None]:
# Keep only the two sources that are compared
df_human_vs_gpt = df_human_llm[df_human_llm["source"].isin(["Human", "GPT-3.5"])]

# Random sample of 20 texts per group; the fixed seed makes a fresh run draw the same texts
df_human_vs_gpt_sample = df_human_vs_gpt.groupby("source")[["text", "source", "word_count"]].apply(
    lambda x: x.sample(20, random_state=42)
)

texts_human_vs_gpt = np.asarray(df_human_vs_gpt_sample["text"])

In [None]:
# Lists for storing the results; the per-text loop appends one entry per text to each of them
coverages = []  # share of a text's tokens that were found in word2vec
beta_0, beta_3, beta_8 = [], [], []  # popt_se[1] of the stretched-exponential fit for pool orders 0, 3 and 8 (99.0 when the fit raised)

In [None]:
# Value stored when no fit could be produced. A successful fit is confined to
# [0, 1] by SE_BOUNDS, so the sentinel cannot be confused with a real estimate.
FIT_FAILED = 99.0
SE_BOUNDS = ([0, 0, -np.inf], [np.inf, 1, np.inf])


def _fit_beta(lrd, lags, pool_order):
    """
    Fit `stretched_exponential` to |CoCo(lag)| and return its second fitted parameter.

    Parameters
    ----------
    lrd : LRDEstimator
        Estimator built from the embedded text.
    lags : np.ndarray
        Lags at which CoCo is evaluated. An ndarray is passed (as in
        `fit_and_plot_lrd`) so the model function receives an array, not a `range`.
    pool_order : int
        Pool order forwarded to `calculate_coco`.

    Returns
    -------
    float
        The fitted parameter, or FIT_FAILED when `curve_fit` raised.
    """
    c_coco = [lrd.calculate_coco(lag, pool_order=pool_order) for lag in lags]
    try:
        popt_se, _ = curve_fit(stretched_exponential, lags, np.abs(c_coco), maxfev=5000, bounds=SE_BOUNDS)
    except (RuntimeError, ValueError):
        # RuntimeError: the optimizer did not converge; ValueError: empty or non-finite data
        return FIT_FAILED
    return popt_se[1]


# Start from empty lists so that re-running this cell does not append duplicate results
coverages = []
beta_0, beta_3, beta_8 = [], [], []

for text in texts_human_vs_gpt:
    # Tokenize and embed text
    tokens = list(tokenize(text, lowercase=True))
    vectors = np.asarray([word2vec[w] for w in tokens if w in word2vec])
    coverages.append(len(vectors) / len(tokens) if tokens else np.nan)

    if vectors.ndim != 2:
        # No token has an embedding, so there is nothing to estimate for this text
        beta_0.append(FIT_FAILED)
        beta_3.append(FIT_FAILED)
        beta_8.append(FIT_FAILED)
        continue

    # Initialize LRD class
    lrd = LRDEstimator(vectors)

    # Same lags for every pool order: 1 up to (but excluding) half of (N - 1)
    n = np.arange(1, (vectors.shape[0] - 1) // 2)

    # Fit the stretched exponential on unpooled, 3-pooled and 8-pooled embeddings
    beta_0.append(_fit_beta(lrd, n, pool_order=0))
    beta_3.append(_fit_beta(lrd, n, pool_order=3))
    beta_8.append(_fit_beta(lrd, n, pool_order=8))
    
    

In [None]:
# Only the last expression of a cell is displayed, so all pool orders are reported together.
# Failed fits were stored as 99.0, while fitted values are bounded to [0, 1]; values above 1
# are therefore excluded instead of being averaged into the mean.
{
    "beta_0": np.mean([b for b in beta_0 if b <= 1.0]),
    "beta_3": np.mean([b for b in beta_3 if b <= 1.0]),
    "beta_8": np.mean([b for b in beta_8 if b <= 1.0]),
}