# Word Extraction (Single-Threaded) from Cleaned Articles

This notebook builds Dutch word frequencies from the cleaned article texts stored by the text extraction pipeline. It reads from `output/articles_text_export.sqlite` (table `articles`) and writes a word database to `output/words_database.sqlite`.

Key points:
- Single-threaded batch processing (no multi-threading)
- Uses the spaCy Dutch model (`nl_core_news_sm`) for tokenization, lemmatization, and part-of-speech (POS) tagging
- Batched SQLite upserts that maintain both total and per-year word frequencies
- Exports multiple word lists for downstream use

In [1]:
# 1) Import Required Libraries
import os
import re
import sqlite3
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
from collections import Counter, defaultdict

import pandas as pd
from tqdm import tqdm

# Minimal display for VS Code: cap printed DataFrames at 20 rows and
# truncate cell contents after 120 characters so outputs stay compact.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_colwidth', 120)

# Marker so a reader can see that the import cell ran to completion.
print("✅ Imports ready")

✅ Imports ready


In [2]:
# 2) Configure Paths and Parameters
# Each setting is read from a WORDS_* environment variable; the literal second
# argument is the fallback used when that variable is unset.

# Source articles, destination word database, and export folder
INPUT_DB = os.environ.get('WORDS_INPUT_DB', 'output/articles_text_export.sqlite')
INPUT_TABLE = os.environ.get('WORDS_INPUT_TABLE', 'articles')
OUTPUT_DB = os.environ.get('WORDS_OUTPUT_DB', 'output/words_database.sqlite')
OUT_EXPORT_DIR = os.environ.get('WORDS_EXPORT_DIR', 'output/exports')

# Processing parameters (numeric values must parse as integers)
BATCH_SIZE = int(os.environ.get('WORDS_BATCH_SIZE', '2000'))
MIN_TEXT_LENGTH = int(os.environ.get('WORDS_MIN_TEXT_LENGTH', '20'))
COMMON_WORD_THRESHOLD = int(os.environ.get('WORDS_COMMON_WORD_THRESHOLD', '10'))
# Only the exact string '1' enables stop-word exclusion; anything else disables it.
EXCLUDE_STOPWORDS = os.environ.get('WORDS_EXCLUDE_STOPWORDS', '1') == '1'

# Echo the effective configuration so the run is self-documenting.
print('\n'.join([
    'Configuration:',
    f'  INPUT_DB={INPUT_DB}',
    f'  INPUT_TABLE={INPUT_TABLE}',
    f'  OUTPUT_DB={OUTPUT_DB}',
    f'  BATCH_SIZE={BATCH_SIZE}',
    f'  MIN_TEXT_LENGTH={MIN_TEXT_LENGTH}',
    f'  EXCLUDE_STOPWORDS={EXCLUDE_STOPWORDS}',
    f'  COMMON_WORD_THRESHOLD={COMMON_WORD_THRESHOLD}',
    f'  OUT_EXPORT_DIR={OUT_EXPORT_DIR}',
]))

Configuration:
  INPUT_DB=output/articles_text_export.sqlite
  INPUT_TABLE=articles
  OUTPUT_DB=output/words_database.sqlite
  BATCH_SIZE=2000
  MIN_TEXT_LENGTH=20
  EXCLUDE_STOPWORDS=True
  COMMON_WORD_THRESHOLD=10
  OUT_EXPORT_DIR=output/exports


In [3]:
# 3) Install and Load spaCy Dutch Model
# spaCy is optional at import time: when it is missing, `spacy` is bound to
# None so that later cells can test for it instead of crashing.
try:
    import spacy  # type: ignore
except ImportError:
    spacy = None
    print('spaCy not installed. Install with: pip install spacy; python -m spacy download nl_core_news_sm')
else:
    print('spaCy available')

# `nlp` stays None unless the Dutch pipeline loads successfully.
nlp = None
if spacy is None:
    print('⚠️ spaCy unavailable; tokenization will not run until installed')
else:
    try:
        nlp = spacy.load('nl_core_news_sm')
        print('✅ Loaded spaCy model: nl_core_news_sm')
    except Exception as e:
        # Any load failure is reported with a hint on how to install the model.
        print(f'⚠️ Could not load nl_core_news_sm: {e}')
        print('Install it via: python -m spacy download nl_core_news_sm')

spaCy available
✅ Loaded spaCy model: nl_core_news_sm
✅ Loaded spaCy model: nl_core_news_sm


In [4]:
# 4) Connect to Articles SQLite Database (read-only) and validate

def connect_readonly(db_path: str) -> sqlite3.Connection:
    """Open the SQLite database at ``db_path`` in read-only mode.

    Args:
        db_path: Filesystem path (relative or absolute) of an existing database.

    Returns:
        A connection on which write statements fail.

    Raises:
        sqlite3.OperationalError: If the file cannot be opened.
    """
    # as_uri() needs an absolute path and percent-encodes characters such as
    # '?', '#', '%' and spaces. Left raw, SQLite would read them as URI syntax
    # (for example, everything after '#' is dropped, including "?mode=ro").
    uri = f"{Path(db_path).resolve().as_uri()}?mode=ro"
    conn = sqlite3.connect(uri, uri=True)
    conn.text_factory = str  # sqlite3's default, stated explicitly: TEXT values are returned as str
    return conn

# Columns the pipeline expects in the articles table.
required_cols = {
    'article_id',
    'published_time',
    'published_timestamp',
    'content',
    'text_length',
}


def validate_articles_table(conn: sqlite3.Connection, table: str) -> None:
    """Verify that ``table`` has every column in ``required_cols`` and report usable rows.

    Prints the number of rows whose ``content`` is not NULL and whose
    ``text_length`` is at least ``MIN_TEXT_LENGTH``.

    Raises:
        RuntimeError: If one or more required columns are absent.
    """
    cursor = conn.cursor()
    # PRAGMA table_info yields one row per column; position 1 is the column name.
    present = {info[1] for info in cursor.execute(f"PRAGMA table_info({table})").fetchall()}
    absent = required_cols - present
    if absent:
        raise RuntimeError(f"Missing required columns in {table}: {sorted(absent)}")

    # Quick row count of usable rows
    count_sql = f"SELECT COUNT(*) FROM {table} WHERE content IS NOT NULL AND text_length >= ?"
    (total,) = cursor.execute(count_sql, (MIN_TEXT_LENGTH,)).fetchone()
    print(f"✅ Articles table '{table}' OK, usable rows (>=MIN_TEXT_LENGTH): {total:,}")

# Try connecting. `art_conn` ends up as an open read-only connection, or None
# when the file is missing or the table fails validation.
art_conn = None
if os.path.exists(INPUT_DB):
    try:
        art_conn = connect_readonly(INPUT_DB)
        validate_articles_table(art_conn, INPUT_TABLE)
    except Exception as e:
        print(f"❌ Failed to open/validate articles DB: {e}")
        # Validation can fail after the open succeeded; close the handle
        # instead of leaking it.
        if art_conn is not None:
            art_conn.close()
        art_conn = None
else:
    print(f"❌ Input DB not found: {INPUT_DB}")

✅ Articles table 'articles' OK, usable rows (>=MIN_TEXT_LENGTH): 295,097


In [5]:
# 5) Helper: Stream Articles in Batches

def stream_article_batches(conn: sqlite3.Connection, table: str, batch_size: int, min_text_len: int):
    """Generate lists of (article_id, published_time, published_timestamp, content) tuples.

    Rows are paged by ``rowid`` (keyset pagination): each query resumes after
    the last rowid already returned. Only rows with non-NULL ``content`` and
    ``text_length >= min_text_len`` are selected. A summary line is printed
    once the table is exhausted.
    """
    page_sql = f"""
            SELECT rowid, article_id, published_time, published_timestamp, content
            FROM {table}
            WHERE rowid > ? AND content IS NOT NULL AND text_length >= ?
            ORDER BY rowid
            LIMIT ?
            """
    cursor = conn.cursor()
    resume_after = 0   # highest rowid handed out so far
    total_yielded = 0
    while True:
        page = cursor.execute(page_sql, (resume_after, min_text_len, batch_size)).fetchall()
        if not page:
            break
        resume_after = page[-1][0]
        total_yielded += len(page)
        # Drop the rowid; callers only need the article fields.
        yield [(aid, ptime, pts, body) for _rowid, aid, ptime, pts, body in page]
    print(f"📦 Finished streaming articles. Total yielded: {total_yielded:,}")

In [6]:
# 6) Tokenization and Filtering Functions (spaCy)

def should_include_token(token, exclude_stop: bool = EXCLUDE_STOPWORDS) -> bool:
    """Decide whether a token counts as a word for the frequency tables.

    A token is kept only when it is non-blank, purely alphabetic, 2 to 25
    characters long, not tagged PUNCT/SPACE/X, not an all-caps string longer
    than 3 characters, and (when ``exclude_stop`` is true) not a stop word.
    """
    text = token.text
    if not (text and text.strip()):
        return False
    if not token.is_alpha:
        return False
    if not (2 <= len(text) <= 25):
        return False
    if token.pos_ in {'PUNCT', 'SPACE', 'X'}:
        return False
    # Exclude long ALLCAPS acronyms
    if len(text) > 3 and text.isupper():
        return False
    return not (exclude_stop and token.is_stop)


def extract_words_from_text(text: str, nlp_model) -> List[Dict[str, object]]:
    """Run ``nlp_model`` over ``text`` and describe every token that passes the filter.

    Returns an empty list when ``text`` or ``nlp_model`` is falsy. Each record
    carries the lower-cased surface form and lemma, the POS tag, the
    ``is_alpha`` / ``is_stop`` flags, and the length of the original token text.
    """
    if not text or not nlp_model:
        return []

    def _describe(tok) -> Dict[str, object]:
        surface = tok.text
        lowered = surface.lower()
        return {
            'word': lowered,
            # Fall back to the surface form when the model gives no lemma.
            'lemma': tok.lemma_.lower() if tok.lemma_ else lowered,
            'pos': tok.pos_,
            'is_alpha': tok.is_alpha,
            'is_stop': tok.is_stop,
            'length': len(surface),
        }

    return [_describe(tok) for tok in nlp_model(text) if should_include_token(tok)]

print("✅ Tokenization helpers ready")

✅ Tokenization helpers ready


In [7]:
# 7) POS Category Mapping

def get_pos_category(pos_tag: str) -> str:
    """Translate a POS tag into the readable category stored in the database.

    Tags without an explicit mapping are reported as ``'other'``.
    """
    # The three conjunction tags collapse into a single category.
    if pos_tag in ('CONJ', 'CCONJ', 'SCONJ'):
        return 'conjunction'
    one_to_one = dict(
        NOUN='noun',
        PROPN='proper_noun',
        VERB='verb',
        ADJ='adjective',
        ADV='adverb',
        PRON='pronoun',
        DET='determiner',
        ADP='preposition',
        NUM='number',
        PART='particle',
        INTJ='interjection',
        AUX='auxiliary',
    )
    return one_to_one.get(pos_tag, 'other')

print('✅ POS category mapping ready')

✅ POS category mapping ready


In [8]:
# 8) Word Database Schema (SQLite)

def setup_word_db(db_path: str) -> sqlite3.Connection:
    """Create (or open) the word database and make sure its schema exists.

    The parent directory is created if needed. Tables ``words``,
    ``word_frequencies`` and ``processing_log`` plus their indexes are created
    with IF NOT EXISTS, so calling this on an existing database is safe.

    Args:
        db_path: Path of the SQLite file to create or open.

    Returns:
        An open connection; the caller is responsible for closing it.

    Raises:
        sqlite3.Error: If the schema cannot be created (the connection is
            closed before the error propagates).
    """
    Path(db_path).parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()

        # Performance PRAGMAs for faster bulk upserts. page_size comes first:
        # it only affects a database that has not been written yet and cannot
        # be changed once WAL mode is active.
        tuning_pragmas = (
            "PRAGMA page_size = 4096;",                # default; only affects new DBs
            "PRAGMA journal_mode = WAL;",              # better concurrency and throughput
            "PRAGMA synchronous = NORMAL;",            # safer than OFF, still faster
            "PRAGMA temp_store = MEMORY;",
            "PRAGMA cache_size = -200000;",            # ~200MB page cache
            "PRAGMA mmap_size = 3000000000;",          # ~3GB, if supported
            "PRAGMA wal_autocheckpoint = 1000;",
            "PRAGMA busy_timeout = 5000;",
        )
        for pragma in tuning_pragmas:
            # Tuning is best-effort: a rejected PRAGMA must not stop the others
            # from being applied, but it should not go unnoticed either.
            try:
                cur.execute(pragma)
            except sqlite3.Error as exc:
                print(f"⚠️ Could not apply '{pragma}': {exc}")

        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS words (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                word TEXT NOT NULL,
                lemma TEXT NOT NULL,
                pos_tag TEXT NOT NULL,
                pos_category TEXT NOT NULL,
                total_frequency INTEGER DEFAULT 0,
                first_seen TEXT,
                last_seen TEXT,
                created_at TEXT DEFAULT (datetime('now')),
                UNIQUE(word, lemma, pos_tag)
            )
            """
        )
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS word_frequencies (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                word_id INTEGER NOT NULL,
                year INTEGER NOT NULL,
                frequency INTEGER DEFAULT 0,
                UNIQUE(word_id, year)
            )
            """
        )
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS processing_log (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                articles_processed INTEGER,
                words_extracted INTEGER,
                processing_date TEXT DEFAULT (datetime('now')),
                notes TEXT
            )
            """
        )
        cur.execute("CREATE INDEX IF NOT EXISTS idx_words_lookup ON words(word, lemma, pos_tag)")
        cur.execute("CREATE INDEX IF NOT EXISTS idx_words_category ON words(pos_category)")
        cur.execute("CREATE INDEX IF NOT EXISTS idx_yearly_word ON word_frequencies(word_id)")
        cur.execute("CREATE INDEX IF NOT EXISTS idx_yearly_year ON word_frequencies(year)")
        conn.commit()
    except Exception:
        # Do not hand back (or leak) a half-initialised connection.
        conn.close()
        raise
    print(f"✅ Word DB ready at {db_path}")
    return conn

In [9]:
# 9) Upsert Helpers
from itertools import groupby

def ensure_words_exist(conn: sqlite3.Connection, items: List[Tuple[str, str, str, str, int]]) -> Dict[Tuple[str,str,str], int]:
    """
    Ensure every (word, lemma, pos_tag) in ``items`` has a row in ``words``.

    items: list of (word, lemma, pos_tag, pos_category, year). The same key may
    appear with several years; the earliest and latest year per key are used to
    maintain ``first_seen`` (``<year>-01-01``) and ``last_seen``
    (``<year>-12-31``). Existing rows have ``first_seen`` moved earlier and
    ``last_seen`` moved later when needed; NULL values are filled in.

    Returns a mapping (word, lemma, pos_tag) -> id. Uses a TEMP table + join to
    fetch IDs in one query for speed. Does not commit; the caller owns the
    transaction.
    """
    # Collapse to one entry per key while tracking the year range seen for it.
    # The first pos_category seen for a key wins.
    spans: Dict[Tuple[str,str,str], Tuple[str,int,int]] = {}
    for w, l, p, c, y in items:
        key = (w, l, p)
        known = spans.get(key)
        if known is None:
            spans[key] = (c, y, y)
        else:
            cat, lo, hi = known
            spans[key] = (cat, min(lo, y), max(hi, y))
    if not spans:
        return {}

    cur = conn.cursor()

    # Insert new words (IGNORE on duplicates)
    cur.executemany(
        """
        INSERT OR IGNORE INTO words (word, lemma, pos_tag, pos_category, first_seen, last_seen)
        VALUES (?, ?, ?, ?, ?, ?)
        """,
        [(w, l, p, c, f"{lo}-01-01", f"{hi}-12-31") for (w, l, p), (c, lo, hi) in spans.items()]
    )

    # Widen the seen-range of rows that already existed. Articles are not
    # guaranteed to arrive in chronological order, so both ends can move.
    cur.executemany(
        """
        UPDATE words SET
            first_seen = CASE WHEN first_seen IS NULL OR first_seen > ? THEN ? ELSE first_seen END,
            last_seen = CASE WHEN last_seen IS NULL OR last_seen < ? THEN ? ELSE last_seen END
        WHERE word = ? AND lemma = ? AND pos_tag = ?
        """,
        [
            (f"{lo}-01-01", f"{lo}-01-01", f"{hi}-12-31", f"{hi}-12-31", w, l, p)
            for (w, l, p), (_c, lo, hi) in spans.items()
        ]
    )

    # Use a TEMP table to fetch IDs for all keys at once
    cur.execute("CREATE TEMP TABLE IF NOT EXISTS tmp_keys (word TEXT, lemma TEXT, pos_tag TEXT)")
    cur.execute("DELETE FROM tmp_keys")
    cur.executemany("INSERT INTO tmp_keys (word, lemma, pos_tag) VALUES (?, ?, ?)", list(spans))

    cur.execute(
        """
        SELECT w.word, w.lemma, w.pos_tag, w.id
        FROM words w
        JOIN tmp_keys k ON k.word = w.word AND k.lemma = w.lemma AND k.pos_tag = w.pos_tag
        """
    )
    return {(w, l, p): wid for w, l, p, wid in cur.fetchall()}


def bump_word_counts(conn: sqlite3.Connection, counts_by_id: Dict[int, int]) -> None:
    """Add each count in ``counts_by_id`` (word id -> increment) to ``words.total_frequency``.

    Nothing is executed for an empty mapping. No commit is issued here.
    """
    if counts_by_id:
        increments = [(delta, word_id) for word_id, delta in counts_by_id.items()]
        conn.cursor().executemany(
            "UPDATE words SET total_frequency = total_frequency + ? WHERE id = ?",
            increments,
        )


def bump_yearly_counts(conn: sqlite3.Connection, yearly_items: List[Tuple[int, int, int]]) -> None:
    """Upsert per-year frequencies from ``yearly_items``, a list of (word_id, year, count).

    Duplicate (word_id, year) pairs are summed first so that each pair results
    in exactly one UPSERT. No commit is issued here.
    """
    if not yearly_items:
        return

    # Pre-aggregate within the batch to minimize upserts.
    totals: Dict[Tuple[int,int], int] = {}
    for word_id, year, count in yearly_items:
        pair = (word_id, year)
        totals[pair] = totals.get(pair, 0) + count

    upsert_rows = [(word_id, year, total) for (word_id, year), total in totals.items()]
    conn.cursor().executemany(
        """
        INSERT INTO word_frequencies (word_id, year, frequency)
        VALUES (?, ?, ?)
        ON CONFLICT(word_id, year) DO UPDATE SET
            frequency = frequency + excluded.frequency
        """,
        upsert_rows
    )

print('✅ Upsert helpers ready')

✅ Upsert helpers ready


In [10]:
# 10) Processing Pipeline (Single-Threaded, Batch)

def parse_year(published_time: Optional[str], published_ts: Optional[float], default_year: int = 2015) -> int:
    """Work out the publication year of an article.

    ``published_time`` (a date/time string) is tried first, then
    ``published_ts`` (interpreted as seconds since the Unix epoch). A source is
    skipped when it is missing, cannot be parsed, or parses to a missing value
    (NaT), which would otherwise yield ``nan`` instead of a year.

    Args:
        published_time: Date/time string, or None/empty when unavailable.
        published_ts: Epoch timestamp in seconds, or None when unavailable.
        default_year: Year returned when neither source yields a usable date.

    Returns:
        The year as a plain ``int``.
    """
    sources = []
    if published_time:
        sources.append((published_time, {}))
    if published_ts is not None:
        sources.append((published_ts, {'unit': 's'}))

    for raw, kwargs in sources:
        try:
            stamp = pd.to_datetime(raw, **kwargs)
            # 'NaT' strings and NaN timestamps parse "successfully" to NaT,
            # whose .year is nan; treat that as unusable.
            if pd.isna(stamp):
                continue
            return int(stamp.year)
        except Exception:
            # Deliberately best-effort: fall through to the next source.
            continue
    return default_year


def process_articles_to_words(
    articles_conn: sqlite3.Connection,
    words_db_path: str,
    batch_size: int = BATCH_SIZE,
    min_text_len: int = MIN_TEXT_LENGTH,
    nlp_model = nlp,
) -> Dict[str, int]:
    """Tokenize every usable article and accumulate word frequencies in the word DB.

    Articles are streamed from ``INPUT_TABLE`` in batches. Each batch is
    tokenized, aggregated in memory, and written in a single transaction
    (word rows, total counts, per-year counts). A ``processing_log`` row is
    added every 10 batches and once at the end.

    Counts are *added* to whatever the word database already contains, so
    running this twice against the same output file double-counts.

    Args:
        articles_conn: Open connection to the articles database.
        words_db_path: Path of the word database to create or update.
        batch_size: Number of articles fetched per batch.
        min_text_len: Minimum ``text_length`` for an article to be read.
        nlp_model: Loaded spaCy pipeline; when None nothing is processed.

    Returns:
        Counters with keys ``batches``, ``articles_processed``,
        ``words_extracted``, ``skipped`` and ``errors``; an empty dict when
        ``nlp_model`` is None.
    """
    if nlp_model is None:
        print('❌ spaCy model not loaded')
        return {}

    words_conn = setup_word_db(words_db_path)
    stats = {
        'batches': 0,
        'articles_processed': 0,
        'words_extracted': 0,
        'skipped': 0,
        'errors': 0,
    }

    # The connection is owned by this function: close it on every exit path,
    # including exceptions.
    try:
        # Prepare a cursor for occasional optimize
        wcur = words_conn.cursor()

        batch_iter = stream_article_batches(articles_conn, INPUT_TABLE, batch_size, min_text_len)
        for batch in tqdm(batch_iter, desc='Processing article batches'):
            stats['batches'] += 1

            # Aggregate counts: key = (word, lemma, pos, pos_category, year)
            key_counter: Counter = Counter()

            # Bind once to avoid repeated global lookups in the inner loop
            get_cat = get_pos_category
            for (_article_id, published_time, published_ts, content) in batch:
                try:
                    year = parse_year(published_time, published_ts)
                    tokens = extract_words_from_text(content, nlp_model)
                    if not tokens:
                        stats['skipped'] += 1
                        continue
                    for t in tokens:
                        key_counter[(t['word'], t['lemma'], t['pos'], get_cat(str(t['pos'])), year)] += 1
                    stats['articles_processed'] += 1
                    stats['words_extracted'] += len(tokens)
                except Exception:
                    # Best-effort: one bad article must not abort a multi-hour
                    # run; it is only counted. Tokens already added to
                    # key_counter for that article are kept.
                    stats['errors'] += 1
                    continue

            if not key_counter:
                continue

            # Prepare items for upsert
            items: List[Tuple[str,str,str,str,int]] = list(key_counter)

            with words_conn:  # single transaction per batch
                ids_map = ensure_words_exist(words_conn, items)
                # Aggregate by word_id
                counts_by_id: Dict[int, int] = defaultdict(int)
                yearly_items: List[Tuple[int,int,int]] = []
                for (w,l,p,c,y), cnt in key_counter.items():
                    wid = ids_map.get((w,l,p))
                    if wid is None:
                        # Defensive fallback when the bulk lookup did not
                        # return this key: insert and fetch it individually.
                        words_conn.execute(
                            "INSERT OR IGNORE INTO words (word, lemma, pos_tag, pos_category) VALUES (?,?,?,?)",
                            (w,l,p,c)
                        )
                        wid = words_conn.execute(
                            "SELECT id FROM words WHERE word=? AND lemma=? AND pos_tag=?",
                            (w,l,p)
                        ).fetchone()[0]
                    counts_by_id[wid] += cnt
                    yearly_items.append((wid, y, cnt))

                bump_word_counts(words_conn, counts_by_id)
                bump_yearly_counts(words_conn, yearly_items)

            # Periodic housekeeping (best-effort; failure only costs query-plan quality)
            if stats['batches'] % 25 == 0:
                try:
                    wcur.execute("PRAGMA optimize;")
                except sqlite3.Error:
                    pass

            # Periodic log row
            if stats['batches'] % 10 == 0:
                with words_conn:
                    words_conn.execute(
                        "INSERT INTO processing_log (articles_processed, words_extracted, notes) VALUES (?, ?, ?)",
                        (stats['articles_processed'], stats['words_extracted'], f"After batch {stats['batches']}")
                    )

        # Final optimize (best-effort)
        try:
            wcur.execute("PRAGMA optimize;")
        except sqlite3.Error:
            pass

        # Final log row
        with words_conn:
            words_conn.execute(
                "INSERT INTO processing_log (articles_processed, words_extracted, notes) VALUES (?, ?, ?)",
                (stats['articles_processed'], stats['words_extracted'], 'Final summary')
            )
    finally:
        words_conn.close()

    print('✅ Processing complete')
    print(stats)
    return stats

In [11]:
# 11) Progress Logging and Basic Stats

def print_basic_stats(db_path: str):
    """Print headline numbers for the word database at ``db_path``.

    Reports the number of rows in ``words``, the sum of ``total_frequency``
    (0 when the table is empty), and the number of rows in
    ``word_frequencies``. Prints a notice and returns when the file does not
    exist.
    """
    if not os.path.exists(db_path):
        print(f"No DB at {db_path}")
        return
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        words_count = cur.execute("SELECT COUNT(*) FROM words").fetchone()[0]
        # SUM() over zero rows is NULL, which cannot be formatted with ':,'.
        total_freq = cur.execute("SELECT COALESCE(SUM(total_frequency), 0) FROM words").fetchone()[0]
        yearly_rows = cur.execute("SELECT COUNT(*) FROM word_frequencies").fetchone()[0]
    finally:
        # Release the handle even when a table is missing and a query raises.
        conn.close()
    print(f"DB stats - unique words: {words_count:,}, total freq: {total_freq:,}, yearly rows: {yearly_rows:,}")

print('✅ Stats helper ready')

✅ Stats helper ready


In [12]:
# 12) Analysis Queries (Top Words, POS Breakdown, Yearly Trends)

def top_words(db_path: str, limit: int = 20):
    """Print the ``limit`` most frequent rows of ``words`` as a DataFrame.

    Shown columns: word, lemma, pos_category, total_frequency (highest first).
    """
    columns = ['word','lemma','pos_category','total_frequency']
    query = """
        SELECT word, lemma, pos_category, total_frequency
        FROM words
        ORDER BY total_frequency DESC
        LIMIT ?
        """
    conn = sqlite3.connect(db_path)
    records = conn.cursor().execute(query, (limit,)).fetchall()
    conn.close()
    print(pd.DataFrame(records, columns=columns))


def pos_breakdown(db_path: str):
    """Print, per ``pos_category``, the number of word rows and their mean total frequency.

    Categories are listed from most to fewest rows.
    """
    query = "SELECT pos_category, COUNT(*), AVG(total_frequency) FROM words GROUP BY pos_category ORDER BY 2 DESC"
    conn = sqlite3.connect(db_path)
    records = conn.cursor().execute(query).fetchall()
    conn.close()
    summary = pd.DataFrame(records, columns=['pos_category','unique_words','avg_total_freq'])
    print(summary)


def yearly_trends(db_path: str):
    """Print, per year, the number of ``word_frequencies`` rows and the summed frequency.

    Years are listed in ascending order.
    """
    query = "SELECT year, COUNT(*), SUM(frequency) FROM word_frequencies GROUP BY year ORDER BY year"
    conn = sqlite3.connect(db_path)
    records = conn.cursor().execute(query).fetchall()
    conn.close()
    per_year = pd.DataFrame(records, columns=['year','unique_words','total_instances'])
    print(per_year)

print('✅ Analysis helpers ready')

✅ Analysis helpers ready


In [13]:
# 13) Export Word Lists (TXT/CSV)
import csv

def export_word_lists(db_path: str = OUTPUT_DB, out_dir: str = OUT_EXPORT_DIR, common_threshold: int = COMMON_WORD_THRESHOLD):
    """Write word lists derived from the ``words`` table into ``out_dir``.

    Files produced (UTF-8):
      - all_words.txt: every distinct word, alphabetically, one per line
      - common_words.txt: word<TAB>frequency for total_frequency >= common_threshold
      - noun/verb/adjective/adverb_words.txt: word<TAB>frequency per category
      - words_full_data.csv: all columns of interest, most frequent first
      - game_words.txt: nouns/verbs/adjectives of 4-8 characters with
        total_frequency >= max(5, common_threshold // 2)

    Args:
        db_path: Word database to read.
        out_dir: Destination directory (created if missing).
        common_threshold: Minimum total frequency for the "common" list.
    """
    export_dir = Path(out_dir)
    export_dir.mkdir(parents=True, exist_ok=True)

    def _write_word_list(filename: str, words) -> None:
        # One word per line.
        with open(export_dir / filename, 'w', encoding='utf-8') as f:
            for w in words:
                f.write(w + '\n')

    def _write_word_freqs(filename: str, rows) -> None:
        # Tab-separated "word<TAB>frequency" lines.
        with open(export_dir / filename, 'w', encoding='utf-8') as f:
            for w, freq in rows:
                f.write(f"{w}\t{freq}\n")

    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()

        # All words
        cur.execute("SELECT DISTINCT word FROM words ORDER BY word")
        words = [r[0] for r in cur.fetchall()]
        _write_word_list('all_words.txt', words)
        print(f"Exported all_words.txt ({len(words):,} words)")

        # Common words
        cur.execute("SELECT word, total_frequency FROM words WHERE total_frequency >= ? ORDER BY total_frequency DESC", (common_threshold,))
        rows = cur.fetchall()
        _write_word_freqs('common_words.txt', rows)
        print(f"Exported common_words.txt ({len(rows):,} rows)")

        # POS specific
        for pos in ['noun','verb','adjective','adverb']:
            cur.execute("SELECT word, total_frequency FROM words WHERE pos_category=? ORDER BY total_frequency DESC", (pos,))
            rows = cur.fetchall()
            _write_word_freqs(f"{pos}_words.txt", rows)
            print(f"Exported {pos}_words.txt ({len(rows):,} rows)")

        # Full CSV
        cur.execute("SELECT word, lemma, pos_tag, pos_category, total_frequency, first_seen, last_seen FROM words ORDER BY total_frequency DESC")
        rows = cur.fetchall()
        with open(export_dir / 'words_full_data.csv', 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['word','lemma','pos_tag','pos_category','total_frequency','first_seen','last_seen'])
            writer.writerows(rows)
        print("Exported words_full_data.csv")

        # Game words
        cur.execute(
            """
            SELECT word FROM words
            WHERE LENGTH(word) BETWEEN 4 AND 8
              AND total_frequency >= ?
              AND pos_category IN ('noun','verb','adjective')
            ORDER BY total_frequency DESC
            """,
            (max(5, common_threshold//2),)
        )
        game = [r[0] for r in cur.fetchall()]
        _write_word_list('game_words.txt', game)
        print(f"Exported game_words.txt ({len(game):,} words)")
    finally:
        # Release the database handle even if a query or file write fails.
        conn.close()

print('✅ Export helpers ready')

✅ Export helpers ready


## 14) Extraction

In [None]:
# Driver cell: (re)open the articles DB if needed, run the full extraction,
# then print summary statistics and write the export files.
if art_conn is None:
    # The earlier connection attempt failed or was skipped; try once more.
    try:
        art_conn = connect_readonly(INPUT_DB)
        validate_articles_table(art_conn, INPUT_TABLE)
    except Exception as e:
        print(f"❌ Cannot open input DB for extraction: {e}")
        art_conn = None

if art_conn and nlp is not None:
    # Report how many rows qualify before starting the long-running extraction
    cur = art_conn.cursor()
    cur.execute(f"SELECT COUNT(*) FROM {INPUT_TABLE} WHERE content IS NOT NULL AND text_length >= ?", (MIN_TEXT_LENGTH,))
    available = cur.fetchone()[0]
    print(f"Available rows >= MIN_TEXT_LENGTH: {available:,}")

    print("\n🚀 Running extraction...")
    # batch_size is floored at 100 regardless of the configured BATCH_SIZE.
    stats = process_articles_to_words(art_conn, OUTPUT_DB, batch_size=max(100, BATCH_SIZE), min_text_len=MIN_TEXT_LENGTH, nlp_model=nlp)
    print_basic_stats(OUTPUT_DB)

    print("\n🔎 Top 10 words:")
    top_words(OUTPUT_DB, limit=10)

    print("\n📊 POS breakdown:")
    pos_breakdown(OUTPUT_DB)

    print("\n📈 Yearly trends:")
    yearly_trends(OUTPUT_DB)

    print("\n💾 Exports:")
    export_word_lists(OUTPUT_DB, OUT_EXPORT_DIR, COMMON_WORD_THRESHOLD)
else:
    # Either the input database or the spaCy model is unavailable.
    print("⚠️ Skipping extraction (missing articles connection or spaCy model)")

Available rows >= MIN_TEXT_LENGTH: 295,097

🚀 Running smoke test...
✅ Word DB ready at output/words_database.sqlite


Processing article batches: 591it [6:38:37, 40.47s/it]


📦 Finished streaming articles. Total yielded: 295,097
✅ Processing complete
{'batches': 591, 'articles_processed': 295097, 'words_extracted': 53549626, 'skipped': 0, 'errors': 0}
DB stats - unique words: 841,764, total freq: 53,549,626, yearly rows: 2,408,652

🔎 Top 10 words:
        word      lemma pos_category  total_frequency
0       jaar       jaar         noun           330465
1       zegt     zeggen         verb           267452
2     mensen       mens         noun           231235
3       twee       twee       number           219437
4       gaat       gaan         verb           188659
5  nederland  nederland  proper_noun           149891
6       gaan       gaan         verb           146466
7       goed       goed    adjective           127884
8        uur        uur         noun           126045
9     nieuwe      nieuw    adjective           124807

📊 POS breakdown:
    pos_category  unique_words  avg_total_freq
0           noun        351506       59.013300
1    proper_noun 