# Per-Row Feature Enrichment
Computes all 26 features per row (lexical, syntactic, punctuation, function-word PCA), stores them in each row's `feature_cache`, and writes the enriched frame to `data.parquet` in the current working directory.

In [1]:
import re
import math
import random
from collections import Counter
from pathlib import Path
import pandas as pd
import numpy as np
import nltk
import spacy
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer

# Matches maximal runs of Unicode word characters delimited by word boundaries.
WORD_RE = re.compile(r"\b\w+\b", flags=re.UNICODE)


def tokenize(text):
    """Split ``text`` into lower-cased word tokens.

    Args:
        text: The string to tokenize. Any non-string value (e.g. ``None`` or
            a float NaN) is treated as having no tokens.

    Returns:
        list[str]: Every ``WORD_RE`` match in order of appearance, lower-cased.
    """
    if isinstance(text, str):
        return [match.group(0).lower() for match in WORD_RE.finditer(text)]
    return []

## Load Data

In [2]:
# Directories searched for input files, nearest first: the working directory,
# its parent, and its grandparent.
candidates = [
    Path.cwd(),
    Path.cwd().parent,
    Path.cwd().parent.parent,
]


def find_path(rel):
    """Locate ``rel`` under the first candidate directory that contains it.

    Args:
        rel: Relative path (``Path`` or ``str``) to look for.

    Returns:
        The resolved path of the first existing match, or ``None`` when no
        candidate directory contains ``rel``.
    """
    probes = (root / rel for root in candidates)
    return next((probe.resolve() for probe in probes if probe.exists()), None)

# Read data/data.parquet from the first candidate directory that has it;
# otherwise continue with an empty frame so later cells can detect "no data".
merged_path = find_path(Path('data') / 'data.parquet')
if merged_path is None:
    df = pd.DataFrame()
    print("No data available to enrich")
else:
    df = pd.read_parquet(merged_path)
    print(f"Read merged data.parquet ({merged_path}) with {len(df)} rows")

Read merged data.parquet (W:\Programming\PKOG\preprecogclean\data\data.parquet) with 1508 rows


## Lexical metrics per row
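Type–token ratio (TTR), hapax count (words occurring once in a sample of up to 5,000 tokens), MTLD, and the standard deviation of sentence length.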

In [3]:
def type_token_ratio(tokens):
    """Return the share of distinct tokens among all tokens.

    Args:
        tokens: Sequence of hashable tokens.

    Returns:
        float: ``len(set(tokens)) / len(tokens)``, or ``0.0`` when ``tokens``
        is empty.
    """
    total = len(tokens)
    return len(set(tokens)) / total if total else 0.0


def hapax_in_sample(tokens, sample_size=5000, seed=None):
    """Count tokens that occur exactly once in a window of at most ``sample_size`` tokens.

    Args:
        tokens: Sequence of hashable tokens.
        sample_size: Maximum window length. Shorter inputs are used whole.
        seed: Seed for the ``random.Random`` instance that picks the window
            start when ``tokens`` is longer than ``sample_size``.

    Returns:
        int: Number of distinct tokens with frequency 1 inside the window;
        ``0`` for empty input.
    """
    total = len(tokens)
    if total == 0:
        return 0

    window = tokens
    if total > sample_size:
        # One contiguous slice at a seeded random offset (both ends inclusive).
        offset = random.Random(seed).randint(0, total - sample_size)
        window = tokens[offset:offset + sample_size]

    return sum(count == 1 for count in Counter(window).values())


def mtld_calc(tokens, ttr_threshold=0.72):
    """Compute MTLD as the mean of a forward and a backward pass over ``tokens``.

    Each pass walks the sequence while tracking the running type-token ratio.
    Whenever the ratio is at or below ``ttr_threshold`` a full factor is
    counted and the running state is reset. Any leftover run contributes a
    partial factor of ``(1 - ttr) / (1 - ttr_threshold)``. The pass score is
    ``len(sequence) / factors``.

    Args:
        tokens: Sequence of hashable tokens.
        ttr_threshold: TTR value at or below which a factor is completed.

    Returns:
        float: ``0.0`` for empty input, otherwise the average of the two pass
        scores. A pass that accumulates zero factors (for example, when every
        token is distinct) scores ``float('inf')``, so the result can be
        infinite.
    """
    if not tokens:
        return 0.0

    def _one_direction(sequence):
        factors = 0
        run_length = 0
        seen = set()
        for word in sequence:
            run_length += 1
            seen.add(word)
            if len(seen) / run_length <= ttr_threshold:
                factors += 1
                run_length = 0
                seen = set()

        if run_length > 0:
            # Partial credit for the unfinished run at the end of the pass.
            headroom = 1 - ttr_threshold
            if headroom != 0:
                factors += (1 - len(seen) / run_length) / headroom

        if factors == 0:
            return float('inf')
        return len(sequence) / factors

    forward = _one_direction(tokens)
    backward = _one_direction(list(reversed(tokens)))
    return (forward + backward) / 2


def sentence_length_std(text):
    """Return the population standard deviation of sentence lengths in ``text``.

    Sentences come from ``nltk.sent_tokenize`` and each sentence's length is
    its number of ``tokenize`` word tokens.

    Args:
        text: Text to analyse. Non-string or blank values yield ``0.0``,
            matching how ``tokenize`` treats non-string input.

    Returns:
        float: ``sqrt(mean((len_i - mean_len) ** 2))`` over all sentences, or
        ``0.0`` when there are no sentences.
    """
    if not isinstance(text, str) or not text.strip():
        return 0.0

    try:
        sents = nltk.sent_tokenize(text)
    except LookupError:
        # The tokenizer data is missing. Which resource NLTK needs depends on
        # its version ('punkt' in older releases, 'punkt_tab' in newer ones),
        # so fetch both and retry once. Doing this only on failure also avoids
        # a resource lookup on every call.
        for resource in ('punkt', 'punkt_tab'):
            nltk.download(resource, quiet=True)
        sents = nltk.sent_tokenize(text)

    if not sents:
        return 0.0

    lengths = [len(tokenize(s)) for s in sents]
    mean = sum(lengths) / len(lengths)
    var = sum((l - mean) ** 2 for l in lengths) / len(lengths)
    return math.sqrt(var)

## Syntactic metrics per row
Adjective-to-noun ratio, average dependency-tree depth per sentence, Flesch–Kincaid (FK) grade level, and discourse-marker density per 100 words.

In [4]:
# Load the small English spaCy pipeline. If spacy.load raises OSError, download
# the model package once and load it again.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Lower-cased, single-word discourse markers. analyze_syntax counts every token
# whose lower-cased text is a member of this set.
DISCOURSE_MARKERS = {
    "however", "therefore", "consequently", "furthermore", "moreover", "nevertheless",
    "thus", "hence", "accordingly", "subsequently", "conversely", "meanwhile",
    "nonetheless", "notwithstanding", "additionally", "alternatively", "undoubtedly",
    "specifically", "similarly", "finally", "indeed"
}

def get_tree_depth(token):
    """Return the number of nodes on the longest path from ``token`` down to a leaf.

    Args:
        token: A node exposing an iterable ``children`` attribute whose items
            are nodes of the same kind.

    Returns:
        int: ``1`` for a node without children, otherwise one more than the
        depth of its deepest child.
    """
    deepest = 0
    # Explicit stack of (node, level) pairs instead of recursion.
    pending = [(token, 1)]
    while pending:
        node, level = pending.pop()
        if level > deepest:
            deepest = level
        pending.extend((child, level + 1) for child in node.children)
    return deepest


def calculate_average_depth(doc):
    """Average ``get_tree_depth`` of the root token over the sentences of ``doc``.

    Args:
        doc: Object exposing ``sents``, an iterable of sentences that each
            have a ``root`` token.

    Returns:
        The mean root depth across sentences, or ``0`` when ``doc`` has no
        sentences.
    """
    per_sentence = [get_tree_depth(sentence.root) for sentence in doc.sents]
    if not per_sentence:
        return 0
    return sum(per_sentence) / len(per_sentence)


def syllable_count(word):
    """Estimate the syllables in ``word`` with a vowel-run heuristic.

    Counts maximal runs of the letters ``aeiouy`` in the lower-cased word,
    subtracts one when the word ends in ``e``, and never returns less than 1.

    Args:
        word: The word to examine.

    Returns:
        int: Estimated syllable count, at least ``1``.
    """
    lowered = word.lower()
    vowel_runs = sum(1 for _ in re.finditer(r'[aeiouy]+', lowered))
    if lowered.endswith('e'):
        vowel_runs -= 1
    return vowel_runs if vowel_runs > 1 else 1


def flesch_kincaid(doc):
    """Compute the Flesch-Kincaid grade level of ``doc``.

    Words are the tokens whose ``is_punct`` flag is false. Sentences are the
    items of ``doc.sents``. Syllables are estimated with ``syllable_count``
    on each word's ``text``.

    Args:
        doc: Iterable of tokens (with ``is_punct`` and ``text``) that also
            exposes ``sents``.

    Returns:
        ``0.39 * words/sentences + 11.8 * syllables/words - 15.59``, or ``0``
        when there are no words or no sentences.
    """
    spoken = [tok.text for tok in doc if not tok.is_punct]
    word_total = len(spoken)
    sentence_total = sum(1 for _ in doc.sents)
    if word_total == 0 or sentence_total == 0:
        return 0

    syllable_total = sum(map(syllable_count, spoken))
    return 0.39 * (word_total / sentence_total) + 11.8 * (syllable_total / word_total) - 15.59


def analyze_syntax(text):
    """Parse ``text`` with the module-level ``nlp`` pipeline and derive syntactic features.

    Args:
        text: Text to analyse. Any falsy value skips parsing.

    Returns:
        dict with four keys:
            ``adj_noun_ratio``: tokens tagged ``ADJ`` divided by tokens tagged
                ``NOUN`` (``0`` when there are no nouns).
            ``tree_depth``: result of ``calculate_average_depth``.
            ``fk_grade``: result of ``flesch_kincaid``.
            ``discourse_density_per_100_words``: tokens whose lower-cased form
                is in ``DISCOURSE_MARKERS`` per 100 non-punctuation tokens.
        All four values are ``0`` for falsy ``text``.
    """
    if not text:
        return dict.fromkeys(
            ("adj_noun_ratio", "tree_depth", "fk_grade", "discourse_density_per_100_words"),
            0,
        )

    doc = nlp(text)

    pos_tally = Counter(tok.pos_ for tok in doc)
    noun_total = pos_tally["NOUN"]
    ratio = pos_tally["ADJ"] / noun_total if noun_total > 0 else 0

    marker_total = sum(1 for tok in doc if tok.lower_ in DISCOURSE_MARKERS)
    word_total = sum(1 for tok in doc if not tok.is_punct)
    density = (marker_total / word_total) * 100 if word_total > 0 else 0

    return {
        "adj_noun_ratio": ratio,
        "tree_depth": calculate_average_depth(doc),
        "fk_grade": flesch_kincaid(doc),
        "discourse_density_per_100_words": density,
    }

## Punctuation densities and function-word PCA

In [5]:
# Tracked punctuation characters mapped to the suffix used in the punct_<name>
# feature columns. Insertion order defines the order of the derived lists.
PUNCT_NAME_MAP = dict([
    (';', 'semicolon'),
    (':', 'colon'),
    ('!', 'exclamation'),
    ('?', 'question'),
    ('-', 'hyphen'),
    ('\u2014', 'emdash'),
    ('*', 'asterisk'),
    ("'", 'apos'),
    ('\u2019', 'apos_curly'),
    ('(', 'paren_open'),
    ('"', 'quote'),
])
PUNCT_SYMBOLS = [*PUNCT_NAME_MAP]
PUNCT_NAMES = [*PUNCT_NAME_MAP.values()]

# Term counts restricted to the vectorizer's 100 retained features. Stop words
# are deliberately kept (stop_words=None) because they are the function words
# this cell is after.
vectorizer = CountVectorizer(stop_words=None, max_features=100)


def punct_density(text):
    """Return occurrences per 1,000 characters for each tracked punctuation mark.

    Args:
        text: Text to scan. Non-string values (e.g. ``None`` or a float NaN
            from a missing cell) and empty strings yield all-zero densities.

    Returns:
        dict: One entry per name in ``PUNCT_NAMES`` with value
        ``count(symbol) / len(text) * 1000``.
    """
    if not isinstance(text, str) or not text:
        return {name: 0 for name in PUNCT_NAMES}
    counts = Counter(text)
    n_chars = len(text)
    return {name: (counts[sym] / n_chars) * 1000 for sym, name in PUNCT_NAME_MAP.items()}


if df.empty:
    print("No data to process for PCA/punctuation")
else:
    # Import the stop-word set explicitly instead of relying on the
    # spacy.lang.en.stop_words submodule having been loaded as a side effect.
    from spacy.lang.en.stop_words import STOP_WORDS as std_stops

    dtm = vectorizer.fit_transform(df['text'].fillna('').astype(str))
    all_words = vectorizer.get_feature_names_out()

    # Keep the retained features that are spaCy stop words. With fewer than 20
    # such features, fall back to the first (up to) 50 features of any kind.
    function_indices = [i for i, w in enumerate(all_words) if w in std_stops]
    if len(function_indices) < 20:
        function_indices = list(range(min(50, len(all_words))))
    else:
        # NOTE(review): indices follow the vectorizer's feature order, so this
        # keeps the first 50 stop words in that order rather than the 50 most
        # frequent ones - confirm this is intended.
        function_indices = function_indices[:50]

    if len(function_indices) == 0:
        freq_matrix = dtm.toarray()
    else:
        function_word_dtm = dtm[:, function_indices]
        # Row-normalise to relative frequencies; rows with no function words
        # keep a divisor of 1 so they stay all-zero instead of dividing by 0.
        row_sums = np.array(function_word_dtm.sum(axis=1)).ravel()
        row_sums[row_sums == 0] = 1
        freq_matrix = function_word_dtm.toarray() / row_sums[:, None]

    pca = PCA(n_components=2)
    try:
        components = pca.fit_transform(freq_matrix)
    except Exception as exc:
        # Best-effort: keep the notebook running with neutral coordinates, but
        # report the failure instead of hiding it.
        print(f"Function-word PCA failed ({exc!r}); using zeros for both dimensions")
        components = np.zeros((len(df), 2))

    df['function_word_pca_dim1'] = components[:, 0]
    df['function_word_pca_dim2'] = components[:, 1]

    # Build all punctuation columns from one list of per-row records rather
    # than constructing a pd.Series for every row.
    punct_df = pd.DataFrame([punct_density(t) for t in df['text']], index=df.index)
    for name in PUNCT_NAMES:
        df[f"punct_{name}"] = punct_df[name]

## Assemble feature cache and save

In [6]:
def _row_features(text, seed):
    """Compute the lexical and syntactic features for one row.

    Args:
        text: The row's text (a string, possibly empty).
        seed: Seed forwarded to ``hapax_in_sample`` for its sampling window.

    Returns:
        dict: Lexical keys (``n_tokens``, ``n_types``, ``ttr``, ``hapax_5k``,
        ``mtld``, ``sent_len_std``) followed by the four ``analyze_syntax``
        keys, in that insertion order.
    """
    tokens = tokenize(text)
    features = {
        'n_tokens': len(tokens),
        'n_types': len(set(tokens)),
        'ttr': type_token_ratio(tokens),
        'hapax_5k': hapax_in_sample(tokens, sample_size=5000, seed=seed),
        'mtld': mtld_calc(tokens),
        'sent_len_std': sentence_length_std(text),
    }
    syntactic = analyze_syntax(text)
    for key in ('adj_noun_ratio', 'tree_depth', 'fk_grade', 'discourse_density_per_100_words'):
        features[key] = syntactic.get(key, 0)
    return features


# Make sure every row has a dict-valued feature_cache to merge into.
if 'feature_cache' not in df.columns:
    df['feature_cache'] = [{} for _ in range(len(df))]
else:
    df['feature_cache'] = df['feature_cache'].apply(lambda x: x if isinstance(x, dict) else {})

new_features_list = []
for position, (idx, row) in enumerate(df.iterrows()):
    # Missing text arrives as None or a float NaN; NaN is truthy, so test the
    # type instead of relying on `or ''`.
    raw_text = row.get('text', '')
    text = raw_text if isinstance(raw_text, str) else ''

    # Integer index labels seed the sampler directly. For any other label use
    # the row position: hash() of a str is salted per process and would make
    # the seed differ between runs.
    seed = int(idx) if isinstance(idx, (int, np.integer)) else position

    fc = dict(row['feature_cache']) if isinstance(row['feature_cache'], dict) else {}

    # Values already present in the cache win; only missing keys are filled.
    for k, v in _row_features(text, seed).items():
        fc.setdefault(k, v)

    if 'function_word_pca' not in fc:
        fc['function_word_pca'] = {
            'dim1': float(row.get('function_word_pca_dim1', 0)),
            'dim2': float(row.get('function_word_pca_dim2', 0))
        }

    for name in PUNCT_NAMES:
        col = f'punct_{name}'
        if col not in fc:
            fc[col] = float(row.get(col, 0))

    new_features_list.append(fc)

df['feature_cache'] = new_features_list

# The helper columns now live inside feature_cache; drop the flat copies.
drop_cols = ['function_word_pca_dim1', 'function_word_pca_dim2'] + [f'punct_{name}' for name in PUNCT_NAMES]
df = df.drop(columns=drop_cols, errors='ignore')

print("Feature cache updated for all rows")
print("Lexical: n_tokens, n_types, ttr, hapax_5k, mtld, sent_len_std")
print("Syntactic: adj_noun_ratio, tree_depth, fk_grade, discourse_density_per_100_words")
print("Stylometric: function_word_pca (dim1, dim2)")
print("Punctuation: 11 punct_* features")

Feature cache updated for all rows
Lexical: n_tokens, n_types, ttr, hapax_5k, mtld, sent_len_std
Syntactic: adj_noun_ratio, tree_depth, fk_grade, discourse_density_per_100_words
Stylometric: function_word_pca (dim1, dim2)
Punctuation: 11 punct_* features


In [7]:
import json

# The enriched frame is written to the current working directory, which is not
# necessarily the directory the input parquet was read from.
out_path = Path.cwd().joinpath('data.parquet')
df.to_parquet(out_path, index=False)
print(f"Wrote enriched parquet to: {out_path}")

# Show one row's cache as a sanity check. df.sample(1) raises on an empty
# frame, which the load cell explicitly allows, so guard it.
if df.empty:
    print('\nNo rows available to sample a feature_cache from')
else:
    sample_fc = df.sample(1).iloc[0]['feature_cache']
    print('\nSample feature_cache:')
    # default=str renders any value json cannot serialise natively.
    print(json.dumps(sample_fc, indent=2, default=str))

Wrote enriched parquet to: w:\Programming\PKOG\preprecogclean\data_analysis\data.parquet

Sample feature_cache:
{
  "author": "gemini-3-flash-preview",
  "avg_sent_length": 19.25,
  "book_title": null,
  "persona_mimicked": "Imposter_James",
  "word_count": 154,
  "n_tokens": 154,
  "n_types": 94,
  "ttr": 0.6103896103896104,
  "hapax_5k": 76,
  "mtld": 58.48877236381374,
  "sent_len_std": 7.578753195612059,
  "adj_noun_ratio": 0.4482758620689655,
  "tree_depth": 6.5,
  "fk_grade": 9.157759740259742,
  "discourse_density_per_100_words": 0.0,
  "function_word_pca": {
    "dim1": -0.12540206561297113,
    "dim2": -0.01780129978267521
  },
  "punct_semicolon": 2.4242424242424243,
  "punct_colon": 0.0,
  "punct_exclamation": 0.0,
  "punct_question": 0.0,
  "punct_hyphen": 0.0,
  "punct_emdash": 0.0,
  "punct_asterisk": 0.0,
  "punct_apos": 0.0,
  "punct_apos_curly": 0.0,
  "punct_paren_open": 0.0,
  "punct_quote": 7.2727272727272725
}
