In [None]:
# External modules
import numpy as np
import gensim.downloader as api
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import os.path

# Internal modules
from utils import power_law, stretched_exponential

Load a pre-trained *word2vec* embedding space. Below we load the embedding space for an English vocabulary of about 3 million words.

In [None]:
# Pre-trained embedding lookup obtained through gensim's downloader API (`api`).
# load_and_embed reads this module-level name to test token membership and to
# map tokens to vectors, so this cell must run before that function is called.
word2vec = api.load("word2vec-google-news-300")

In [None]:
def load_and_embed(file_path: str) -> np.ndarray:
    """Read a token file and return the embeddings of its in-vocabulary tokens.

    The file is read as UTF-8 text and split on newline characters, so each
    line is treated as one token. Tokens present in the module-level
    ``word2vec`` lookup are mapped to their vectors, in file order; all other
    tokens are skipped. The fraction of kept tokens is printed as "Coverage".

    Note that every piece produced by the split counts in the coverage
    denominator, including empty strings from blank lines or a trailing
    newline.

    Args:
        file_path: Path to the text file holding one token per line.

    Returns:
        Array built from the list of vectors of the tokens found in
        ``word2vec`` (one row per kept token, assuming each lookup yields a
        1-D vector of equal length). If no token is found the array is empty.

    Raises:
        ValueError: If ``file_path`` does not exist.
    """
    if not os.path.exists(file_path):
        raise ValueError("File does not exists or incorrect path provided.")

    # Context manager guarantees the handle is closed even if reading fails;
    # previously the file object was left open for the garbage collector.
    with open(file_path, mode="r", encoding="UTF-8") as text_file:
        tokens = text_file.read().split("\n")

    vectors = np.asarray([word2vec[w] for w in tokens if w in word2vec])
    print(f"Coverage: {len(vectors) / len(tokens):.4f}.")
    return vectors

In [None]:
class LRDEstimator:
    """Lagged correlation estimator over a sequence of embedding vectors.

    The input rows are scaled to unit Euclidean norm and their prefix sums are
    precomputed, so that the mean of any leading or trailing slice of the
    normalized sequence can be obtained in O(d) per query.

    Attributes set by the constructor:
        vectors: the array passed in, stored as-is.
        N, d:    number of rows and number of columns of ``vectors``.
        norms:   per-row Euclidean norms, shape (N, 1).
        X:       row-normalized vectors, shape (N, d).
        S:       float64 prefix sums of ``X`` with a leading zero row,
                 shape (N + 1, d); ``S[k]`` is the sum of ``X[:k]``.
    """

    def __init__(self, vectors: np.ndarray):
        """Normalize ``vectors`` row-wise and precompute prefix sums.

        Args:
            vectors: 2-D array with one vector per row. A row of zero norm
                would yield non-finite entries after the division below;
                no such check is performed here.

        Raises:
            ValueError: If ``vectors.shape`` does not unpack into two values.
        """
        self.vectors = vectors
        self.N, self.d = vectors.shape

        # Precompute norms and normalize vectors
        self.norms = np.linalg.norm(self.vectors, axis=1, keepdims=True)  # shape (N, 1)
        self.X = self.vectors / self.norms  # shape (N, d)

        # Calculate partial sums. Accumulate directly in float64: with float32
        # input, np.cumsum would otherwise sum in float32 and the rounding
        # error of a long running sum would leak into every lagged mean.
        self.S = np.zeros((self.N + 1, self.d))
        np.cumsum(self.X, axis=0, dtype=np.float64, out=self.S[1:])

    def calculate_coco(self, lag: int):
        """Return E[U.V] - E[U].E[V] for pairs of normalized vectors ``lag`` apart.

        U ranges over ``X[0 .. N-lag-1]`` and V over ``X[lag .. N-1]``, i.e. the
        pairs ``(X[t], X[t + lag])``. E[U.V] is the mean of the row-wise dot
        products and E[U], E[V] are the component-wise means of each slice.

        Args:
            lag: Offset between paired positions; must satisfy
                ``0 <= lag < N`` so that at least one pair exists.

        Returns:
            The scalar estimate as a float64 value.

        Raises:
            ValueError: If ``lag`` is outside ``[0, N)``. Without this check
                ``lag == N`` divides by zero and ``lag > N`` silently indexes
                the prefix sums from the end.
        """
        if not 0 <= lag < self.N:
            raise ValueError(
                f"lag must satisfy 0 <= lag < {self.N}, got {lag}."
            )

        length = self.N - lag

        # Slice means from prefix sums (S[0] is the zero row).
        E_U = self.S[length] / length                   # mean of X[0..N-lag-1]
        E_V = (self.S[self.N] - self.S[lag]) / length   # mean of X[lag..N-1]

        U = self.X[:length]
        V = self.X[lag:]
        dot_products = np.sum(U * V, axis=1)
        # Average in float64 regardless of the embedding dtype.
        E_UV = np.mean(dot_products, dtype=np.float64)

        return E_UV - np.dot(E_U, E_V)

In [None]:
def fit_and_plot_lrd(n, c, **kwargs):
    """Fit two decay models to the points (n, c) and plot data and fits on log-log axes.

    Both ``power_law`` and ``stretched_exponential`` (imported from ``utils``)
    are fitted with ``scipy.optimize.curve_fit`` using at most 5000 function
    evaluations. The stretched-exponential fit is bounded: its two parameters
    are restricted to [0, inf) and [0, 1] respectively. The fitted curves are
    evaluated on 200 log-spaced points spanning the range of ``n``.

    Args:
        n: Sequence of x-values (converted with ``np.array``).
        c: Sequence of y-values, same length as ``n``.
        **kwargs: Forwarded unchanged to ``plt.scatter`` for the data points.

    Returns:
        None. A new figure is created and shown.
    """
    # Convert inputs to numpy arrays
    n, c = np.array(n), np.array(c)

    # (legend label, model function, model-specific curve_fit options)
    model_specs = (
        ("Power law", power_law, {}),
        ("Stretched exp.", stretched_exponential, {"bounds": ([0, 0], [np.inf, 1])}),
    )

    # Fit every model before any figure is created, so a failed fit leaves
    # no empty figure behind.
    fitted_models = []
    for label, model, fit_options in model_specs:
        params, _ = curve_fit(model, n, c, maxfev=5000, **fit_options)
        fitted_models.append((label, model, params))

    # Log-spaced abscissa covering the observed range of n
    lower, upper = np.log10(n.min()), np.log10(n.max())
    x_fit = np.logspace(lower, upper, 200)

    # Draw the raw points, then one curve per fitted model
    plt.figure()
    plt.scatter(n, c, **kwargs)
    for label, model, params in fitted_models:
        plt.plot(x_fit, model(x_fit, *params), label=label)

    # Log-log axes, legend and layout
    for set_scale in (plt.xscale, plt.yscale):
        set_scale("log")
    plt.legend()
    plt.tight_layout()
    plt.show()

In [None]:
# Token files to analyse: PG1 ... PG9, each expected at data/<name>_tokens.txt.
files = [f"PG{i}" for i in range(1, 10)]

for file in files:
    # Embed the text and build the estimator for this file
    file_path = f"data/{file}_tokens.txt"
    vectors = load_and_embed(file_path)
    lrd = LRDEstimator(vectors)

    # Evaluate lags 1 .. max_lag - 1, with max_lag set to half the number of
    # embedded tokens
    max_lag = vectors.shape[0] // 2
    n = range(1, max_lag)
    c = [lrd.calculate_coco(lag) for lag in n]

    # s=1 is forwarded to the scatter call (marker size)
    fit_and_plot_lrd(n, c, s=1)