# Lexical Richness
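This notebook computes four lexical-richness measures — type-token ratio (TTR), hapax legomena count, MTLD (Measure of Textual Lexical Diversity), and the standard deviation of sentence length — over the dataset, grouped by class, author, and topic.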

In [1]:
import os
import re
import random
from pathlib import Path
from collections import Counter, defaultdict
import math
import pandas as pd
import nltk

# Matches runs of word characters delimited by word boundaries.
WORD_RE = re.compile(r"\b\w+\b", flags=re.UNICODE)

def tokenize(text):
    """Split ``text`` into lowercase word tokens.

    Anything that is not a ``str`` (e.g. ``None`` or a float) produces an
    empty list instead of raising.
    """
    if isinstance(text, str):
        return [match.lower() for match in WORD_RE.findall(text)]
    return []

## Load Data

In [2]:
# Directories searched, in order: the working directory, its parent, and its grandparent.
candidates = [Path.cwd(), Path.cwd().parent, Path.cwd().parent.parent]

def find_path(rel):
    """Return the resolved location of ``rel`` under the first candidate
    directory where it exists, or ``None`` when no candidate contains it.
    """
    joined = (base.joinpath(rel) for base in candidates)
    hit = next((path for path in joined if path.exists()), None)
    return None if hit is None else hit.resolve()

# Locate data/data.parquet relative to the candidate directories and load it;
# fall back to an empty frame so the later cells can detect "no data".
merged_path = find_path(Path('data') / 'data.parquet')
if merged_path is None:
    df = pd.DataFrame()
    print("No data available to analyze")
else:
    df = pd.read_parquet(merged_path)
    print(f"Read merged data.parquet ({merged_path}) with {len(df)} rows")

Read merged data.parquet (W:\Programming\PKOG\preprecogclean\data\data.parquet) with 1508 rows


## Preprocessing & Grouping

In [3]:
if df.empty:
    print("No dataframe to normalize")
else:
    def extract_author(fc):
        """Pick the author label out of a feature-cache dict.

        Tries the keys 'author', 'persona_mimicked', then 'persona' and
        returns the first truthy value; non-dict input yields ``None``.
        """
        if not isinstance(fc, dict):
            return None
        return fc.get('author') or fc.get('persona_mimicked') or fc.get('persona')

    # Add the three grouping keys used by the per-group metrics below.
    df['author_key'] = df['feature_cache'].apply(extract_author)
    df['topic_key'] = df['topic'].astype(str)
    df['class_key'] = df['class']
    print("Normalized grouping columns: author_key, topic_key, class_key")

Normalized grouping columns: author_key, topic_key, class_key


## Type-Token Ratio
TTR = unique types / total tokens. Simple vocabulary variety measure.

In [4]:
def type_token_ratio(tokens):
    """Return distinct tokens divided by total tokens; 0.0 for empty input."""
    total = len(tokens)
    return len(set(tokens)) / total if total else 0.0

## Hapax Legomena
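Number of words that appear exactly once within a window of up to 5000 tokens: the whole text when it is shorter than that, otherwise a randomly positioned contiguous 5000-token window.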

In [5]:
def hapax_in_sample(tokens, sample_size=5000):
    """Count the tokens that occur exactly once in a sample of ``tokens``.

    When there are more than ``sample_size`` tokens, a contiguous window of
    that length is taken at a random offset (via the global ``random``
    state); otherwise the whole list is used. Empty input returns 0.
    """
    total = len(tokens)
    if total == 0:
        return 0
    window = tokens
    if total > sample_size:
        # A contiguous slice keeps local word order, unlike sampling single tokens.
        offset = random.randint(0, total - sample_size)
        window = tokens[offset:offset + sample_size]
    counts = Counter(window)
    return sum(1 for occurrences in counts.values() if occurrences == 1)

## MTLD
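Mean length of sequential token runs ("factors") whose running TTR stays above the 0.72 threshold; a factor ends as soon as the TTR drops to 0.72 or below, and any leftover run counts as a partial factor. The score is the average of a forward pass and a backward pass over the tokens. More robust to text length than raw TTR.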

In [6]:
def mtld_calc(tokens, ttr_threshold=0.72):
    """Compute MTLD as the mean of a forward and a backward pass.

    Each pass walks the tokens while tracking the running type-token ratio.
    Whenever the ratio drops to ``ttr_threshold`` or below, a full factor is
    counted and the running state is reset. A trailing incomplete run adds a
    fractional factor. The pass result is token count divided by factor
    count, or ``inf`` when no factor (full or partial) accumulated. Empty
    input returns 0.0.
    """
    if not tokens:
        return 0.0

    def mtld_single_pass(token_list):
        """Run one directional pass and return its length-per-factor value."""
        factors = 0
        run_length = 0
        seen = set()
        for word in token_list:
            run_length += 1
            seen.add(word)
            if len(seen) / run_length <= ttr_threshold:
                # Threshold reached: close this factor and start a new run.
                factors += 1
                run_length = 0
                seen = set()
        if run_length > 0:
            # Leftover tokens contribute in proportion to how far their TTR
            # has fallen from 1 towards the threshold.
            leftover_ttr = len(seen) / run_length
            span = 1 - ttr_threshold
            partial = (1 - leftover_ttr) / span if span != 0 else 0
            factors += partial
        if factors == 0:
            return float('inf')
        return len(token_list) / factors

    forward = mtld_single_pass(tokens)
    backward = mtld_single_pass(list(reversed(tokens)))
    return (forward + backward) / 2

## Sentence Length Std Dev
Captures rhythmic variety in sentence construction.

In [7]:
def sentence_length_std(text):
    """Return the population standard deviation of sentence lengths in ``text``.

    Sentences come from ``nltk.sent_tokenize``; each sentence's length is its
    number of word tokens as produced by ``tokenize``.

    Args:
        text: The text to analyse. Non-string or blank input returns 0.0
            without touching NLTK, mirroring how ``tokenize`` treats
            non-string input.

    Returns:
        The standard deviation (dividing by the number of sentences) as a
        float, or 0.0 when there are no sentences.

    Raises:
        LookupError: If the sentence tokenizer data is still unavailable
            after the download attempt (e.g. no network access).
    """
    if not isinstance(text, str) or not text.strip():
        return 0.0
    try:
        sents = nltk.sent_tokenize(text)
    except LookupError:
        # Newer NLTK releases load the tokenizer from 'punkt_tab', older ones
        # from 'punkt'. Fetch both, only when tokenizing actually fails,
        # instead of probing the data directory on every call.
        for resource in ('punkt_tab', 'punkt'):
            nltk.download(resource, quiet=True)
        sents = nltk.sent_tokenize(text)
    if not sents:
        return 0.0
    lengths = [len(tokenize(s)) for s in sents]
    mean = sum(lengths) / len(lengths)
    var = sum((l - mean) ** 2 for l in lengths) / len(lengths)
    return math.sqrt(var)

## Compute metrics per group

In [8]:
if df.empty:
    print("No data to analyze")
else:
    # Seed once so the random window used by hapax_in_sample is reproducible.
    # The class -> author -> topic order below fixes the sequence of draws.
    random.seed(42)

    def metrics_for_series(series_texts):
        """Compute all lexical-richness metrics for one group of texts.

        The texts are joined with single spaces and analysed as one document.
        Entries that are not strings (e.g. missing values) are skipped rather
        than making the join raise ``TypeError``.

        Returns:
            dict with keys 'n_tokens', 'n_types', 'TTR', 'Hapax_5k', 'MTLD'
            and 'SentLenStd'.
        """
        all_text = " ".join(t for t in series_texts if isinstance(t, str))
        tokens = tokenize(all_text)
        return {
            'n_tokens': len(tokens),
            'n_types': len(set(tokens)),
            'TTR': type_token_ratio(tokens),
            'Hapax_5k': hapax_in_sample(tokens, sample_size=5000),
            'MTLD': mtld_calc(tokens),
            'SentLenStd': sentence_length_std(all_text)
        }

    def group_metrics(frame, key, label=str):
        """Return one metrics record per group of ``frame`` grouped by ``key``.

        Args:
            frame: DataFrame with a 'text' column and the grouping column.
            key: Name of the column to group by.
            label: Callable turning a group value into the 'group' label.

        Returns:
            List of dicts as produced by ``metrics_for_series``, each with an
            added 'group' entry.
        """
        records = []
        # NOTE: groupby's default drops rows whose key is missing.
        for value, grp in frame.groupby(key):
            record = metrics_for_series(grp['text'].tolist())
            record['group'] = label(value)
            records.append(record)
        return records

    class_metrics = group_metrics(df, 'class_key', label=lambda cls: f'class_{cls}')
    df_class_metrics = pd.DataFrame(class_metrics).set_index('group')
    print("\n### By Class")
    display(df_class_metrics)

    author_metrics = group_metrics(df, 'author_key')
    df_author_metrics = pd.DataFrame(author_metrics).set_index('group')
    print("\n### By Author")
    display(df_author_metrics.sort_values('n_tokens', ascending=False).head(50))

    topic_metrics = group_metrics(df, 'topic_key')
    df_topic_metrics = pd.DataFrame(topic_metrics).set_index('group')
    print("\n### By Topic")
    display(df_topic_metrics.sort_values('n_tokens', ascending=False).head(50))


### By Class


Unnamed: 0_level_0,n_tokens,n_types,TTR,Hapax_5k,MTLD,SentLenStd
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
class_1,82503,9229,0.111863,986,71.240906,15.7874
class_2,80624,5016,0.062215,737,117.875977,7.659899
class_3,82120,7315,0.089077,999,75.140273,12.096795



### By Author


Unnamed: 0_level_0,n_tokens,n_types,TTR,Hapax_5k,MTLD,SentLenStd
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
gemini-2.5-flash,73048,6600,0.090352,896,109.221087,11.317308
gemini-2.5-flash-lite,56711,5436,0.095854,944,91.919715,8.883763
gemini-3-flash-preview,32985,4319,0.130938,737,67.053886,9.004546
Russell,20837,2052,0.098479,507,55.727864,14.086077
Bacon,20583,3954,0.1921,1012,74.282385,17.797279
James,20568,3652,0.177557,993,86.468636,13.604208
Emerson,20515,4375,0.213259,1060,76.299866,15.928087



### By Topic


Unnamed: 0_level_0,n_tokens,n_types,TTR,Hapax_5k,MTLD,SentLenStd
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
General Philosophy,71054,7092,0.099811,953,83.60389,13.586838
Ethics & Conduct,62400,7337,0.11758,642,88.477273,11.610807
Mind & Knowledge,42710,4254,0.099602,628,73.345592,10.540558
Truth & Reality,33776,3977,0.117746,772,77.142748,10.757413
Religion & Spirit,20653,3674,0.177892,950,92.773029,12.086096
Society & Politics,14654,3265,0.222806,950,87.777986,14.681663
