In [1]:
import pandas as pd
import numpy as np
import re
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import normalize
import html
from bs4 import BeautifulSoup
import os
from pathlib import Path
import json
from scipy import sparse
import joblib

df = pd.read_csv("C:\\datasources\\ArticleScrubbed.csv", sep='|', usecols=['ArticleId','Title','PublicationDate','Publication','Links','Description','url'])

#deduplication and NaN cleanup
df.drop_duplicates('Title')

#substituting multiple spaces with single space
df['Description']= df['Description'].apply(lambda x: re.sub(r'\s+',' ', str(x)))

df.tail(6)

Unnamed: 0,ArticleId,Title,PublicationDate,Publication,Links,Description,url
13845,14049,Trump Sues Wall Street Journal for Article on ...,2025-08-18,<i>New York Times</i>,https://www.nytimes.com/2025/07/18/business/me...,<p>President Trump on Friday accused Rupert Mu...,trump-sues-wall-street-journal-article-note-ep...
13846,14050,The FBI's Jeffrey Epstein Prison Video Had Nea...,2025-07-15,<i>Wired</i>,https://www.wired.com/story/the-fbis-jeffrey-e...,<p><strong>Newly uncovered metadata reveals th...,-fbis-jeffrey-epstein-prison-video-had-nearly-...
13847,14051,When Your Power Meter Becomes a Tool of Mass S...,2025-07-21,When Your Power Meter Becomes a Tool of Mass S...,https://www.eff.org/deeplinks/2025/07/when-you...,"<p>In California, the law explicitly protects ...",when-your-power-meter-becomes-tool-mass-survei...
13848,14052,Uri Berliner: What Congress Should Ask NPR’s C...,2025-02-04,<i>The Free Press</i>,https://www.thefp.com/p/uri-berliner-will-cong...,<p>It’s been about 10 months since I wrote an ...,uri-berliner-what-congress-should-ask-nprs-chief
13849,14053,CIA Contradicts Obama Officials’ Sworn Denials...,2025-07-08,<i>Real Clear Investigations</i>,https://www.realclearinvestigations.com/articl...,<p>New evidence suggests that some of the high...,cia-contradicts-obama-officials-sworn-denials-...
13850,14054,Obama admin assessed Russia played no role in ...,2025-07-18,<i>New York Post</i>,https://nypost.com/2025/07/18/us-news/obama-ad...,<p>The Obama administration knew before and af...,obama-admin-assessed-russia-played-no-role-201...


In [2]:
#make some unique wtk urls
makeurl = df['url'].astype(str)
df['url'] = "https://www.wanttoknow.info/a-" + makeurl

#remove markup from publications
df['Publication'] = df['Publication'].str.replace('<i>', '')
df['Publication'] = df['Publication'].str.replace('</i>', '')
df['Publication'] = df['Publication'].str.replace('<em>', '')
df['Publication'] = df['Publication'].str.replace('<', '')
df['Publication'] = df['Publication'].str.replace('>', '')

#begin cleanup of media sources
df['Publication'], df['pub2'] = df['Publication'].str.split('/', 1).str
df['Publication'], df['pubdetail'] = df['Publication'].str.split('(', 1).str
df['pubdetail'] = df['pubdetail'].astype(str).apply(lambda u: u.strip(')'))
df['Publication'] = df['Publication'].str.strip()
df['Publication'] = df['Publication'].astype(str).apply(lambda u: u.strip('"'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('\'',''))

#drop extra columns
df.drop(columns=['pub2', 'pubdetail'], inplace=True)

#one-off translations. Move sources with more than two variations to mediacondense
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('Seattle times', 'Seattle Times'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('Scientific American Blog', 'Scientific American'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('The Sacramento Bee', 'Sacramento Bee'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('Mother Jones Magazine', 'Mother Jones'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('Sydney Mountain Herald', 'Sydney Morning Herald'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('The Nation magazine', 'The Nation'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('Chicago Sun-Times News Group', 'Chicago Sun-Times'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('Wired magazine', 'Wired'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('The New Yorker magazine', 'The New Yorker'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('Fortune magazine', 'Fortune'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('The Atlantic Monthly', 'The Atlantic'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('Tikkun Magazine - March', 'Tikkun Magazine'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('The Daily Mail', 'Daily Mail'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('U.S. Right to Know', 'US Right to Know'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('Wired Magazine', 'Wired'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('Minneapolis Star-Tribune', 'Star Tribune'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('CNBC News', 'CNBC'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('Minneapolis Star Tribune', 'Star Tribune'))
df['Publication'] = df['Publication'].astype(str).apply(lambda r: r.replace('The New Yorker', 'New Yorker'))

#mediacondense is a list of dictionaries for when there are more than 2 variations of a media source
mediacondense = []

abc = ['ABC News Australia', 'ABC News blog', 'ABC6', 'abcnews.com', 'WCPO - Cincinnatis ABC Affiliate', 'ABC News', 'ABC News blogs', 'ABC News Blog', 'ABC News Good Morning America', 'ABC New', 'ABC News Nightline', 'ABC15', 'ABC News 20', 'abc4.com', 'ABC Action News', 'ABCs Arizona Affiliate','WXYZ - Detroits ABC News Affiliate']
mcat1 = "ABC"
m1 = dict.fromkeys(abc, mcat1)
mediacondense.append(m1)

ode = ['Ode Magazine, June 2005 Issue', 'Ode Magazine, July 2005 Issue', 'Ode magazine']
mcat2 = "Ode Magazine"
m2 = dict.fromkeys(ode, mcat2)
mediacondense.append(m2)

nbc = ['NBC Milwaukee Affiliate', 'NBC Philadelphia', 'NBC Today', 'NBC New York', 'NBC Right Now', 'NBC Chicago', 'NBC Miami', 'NBC Washington', 'NBC Los Angeles', 'NBC Oklahoma City', 'NBC News']
mcat3 = "NBC"
m3 = dict.fromkeys(nbc, mcat3)
mediacondense.append(m3)

vfr = ['Vanity Fair August 2006 Issue', 'Vanity Fair September 2005 Issue', 'Vanity Fair magazine']
mcat4 = "Vanity Fair"
m4 = dict.fromkeys(vfr, mcat4)
mediacondense.append(m4)

nyt = ['The New York Times', 'New York Times Blog', 'New York Times blog']
mcat5 = "New York Times"
m5 = dict.fromkeys(nyt, mcat5)
mediacondense.append(m5)

unw = ['U.S. News and World Report', 'U.S. News &amp; World Report blog', 'U.S. News & World Report', 'U.S. News & World Report blog', 'US News & World Report magazine', 'US News & World Report']
mcat6 = "US News and World Report"
m6 = dict.fromkeys(unw, mcat6)
mediacondense.append(m6)

nwk = ['Newsweek magazine', 'Newsweek blog', 'Newsweek Magazine', 'Newsweek magazine blog']
mcat7 = "Newsweek"
m7 = dict.fromkeys(nwk, mcat7)
mediacondense.append(m7)

bbc = ['BBC Radio', 'BBC News blog', 'BBC Blogs', 'BBC News']
mcat8 = "BBC"
m8 = dict.fromkeys(bbc, mcat8)
mediacondense.append(m8)

fop = ['Foreign Policy Magazine May', 'Foreign Policy Journal']
mcat9 = "Foreign Policy"
m9 = dict.fromkeys(fop, mcat9)
mediacondense.append(m9)

wap = ['Washington Post blog', 'washingtonpost.com', 'Washingon Post', 'Washginton Post', 'The Washington Post']
mcat10 = "Washington Post"
m10 = dict.fromkeys(wap, mcat10)
mediacondense.append(m10)

tlg = ['The Telegraph blogs', 'Daily Telegraph', 'Telegraph']
mcat11 = "The Telegraph"
m11 = dict.fromkeys(tlg, mcat11)
mediacondense.append(m11)

nsa = ['U.S. National Security Agency Website', 'National Security Agency  Website', 'NSA Technical Journal, Vol. XI', 'National Security Agency  Website, NSA Technical Journal, Vol. XI']
mcat12 = "NSA Website"
m12 = dict.fromkeys(nsa, mcat12)
mediacondense.append(m12)

msn = ['MSN Money', 'MSN of Australia', 'MSN Canada', 'MSN']
mcat13 = "MSN News"
m13 = dict.fromkeys(msn, mcat13)
mediacondense.append(m13)

tim = ['Time magazine', 'Time Magazine', 'TIME Magazine', 'Time Magazine blog']
mcat14 = "Time"
m14 = dict.fromkeys(tim, mcat14)
mediacondense.append(m14)

psc = ['Popular Science - March 2007 Issue', 'Popular Science Magazine', 'Popular Science magazine']
mcat15 = "Popular Science"
m15 = dict.fromkeys(psc, mcat15)
mediacondense.append(m15)

cnn = ['CNN blog', 'CNN Money', 'CNN International', 'CNN World', 'CNN The Situation Room', 'CNN Lou Dobbs Tonight', 'CNN Video Clip', 'CNN Larry King Live', 'CNN News']
mcat16 = "CNN"
m16 = dict.fromkeys(cnn, mcat16)
mediacondense.append(m16)

cbs = ['CBS Las Vegas Affiliate', 'CBS Cleveland', 'CBS4-TV', 'CBS Philly', 'CBS News', 'KCBS', 'CBS Atlanta', 'CBS Affiliate KUTV', 'CBS News Chicago, Associated Press', 'CBS News, Sacramento Affiliate', 'WCBS News - New York CBS Affiliate', 'CBS News 60 Minutes', 'CBS News 60 Minutes Overtime', 'CBS Los Angeles', 'CBS 60 Minutes', 'CBS News blog', 'CBS News, Stockton Affiliate']
mcat17 = "CBS"
m17 = dict.fromkeys(cbs, mcat17)
mediacondense.append(m17)

yho = ['Yahoo! News', 'Yahoo!', 'Yahoo! Finance', 'Yahoo! News Australia', 'Yahoo News', 'Yahoo Finance']
mcat18 = "Yahoo"
m18 = dict.fromkeys(yho, mcat18)
mediacondense.append(m18)

wsj = ['The Wall Street Journal', 'Wall Street Journal blog', 'Full Page Ad in Wall Street Journal', 'Wall Street Journal Article by Former FBI Director Louis Freeh', 'Wall Street Journal Blog']
mcat19 = "Wall Street Journal"
m19 = dict.fromkeys(wsj, mcat19)
mediacondense.append(m19)

fox = ['Fox News Chicago', 'WJBK Fox 2', 'Fox News video clip', 'FOX News', 'Fox 19', 'Fox News Affiliate']
mcat20 = "Fox"
m20 = dict.fromkeys(fox, mcat20)
mediacondense.append(m20)

icp = ['The Intercept With Glenn Greenwald', 'The Intercept with Glenn Greenwald']
mcat21 = "The Intercept"
m21 = dict.fromkeys(icp, mcat21)
mediacondense.append(m21)

lat = ['Los Angeles Times blog', 'The Los Angeles Times', 'LA Times']
mcat22 = "Los Angeles Times"
m22 = dict.fromkeys(lat, mcat22)
mediacondense.append(m22)

pbs = ['PBS Nova Program', 'PBS Frontline', 'PBS, CBS, Fox compilation', 'PBS News', 'PBS Bill Moyers Journal', 'PBS Newshour', 'PBS Blog']
mcat23 = "PBS"
m23 = dict.fromkeys(pbs, mcat23)
mediacondense.append(m23)

ecn = ['The Economist blog', 'The Economist Magazine', 'The Economist magazine']
mcat24 = "The Economist"
m24 = dict.fromkeys(ecn, mcat24)
mediacondense.append(m24)

npr = ['NPR All Things Considered', 'National Public Radio', 'NPR News', 'NPR blog', 'NPR Blog', 'Minnesota Public Radio']
mcat25 = "NPR"
m25 = dict.fromkeys(npr, mcat25)
mediacondense.append(m25)

sfc = ['The San Francisco Chronicle', 'San Francisco Chronicle SFs leading newspaper)']
mcat26 = "San Francisco Chronicle"
m26 = dict.fromkeys(sfc, mcat26)
mediacondense.append(m26)

cbc = ['Canadian Broadcasting Corporation', 'CBC News', 'CBC News [Canadas Public Broadcasting System]']
mcat27 = "CBC"
m27 = dict.fromkeys(cbc, mcat27)
mediacondense.append(m27)
    
frb = ['Forbes Magazine', 'Forbes blog', 'Forbes India Magazine', 'Forbes magazine', 'Forbes.com', 'Forbes.com blog', 'Forbes India Magazine']
mcat28 = "Forbes"
m28 = dict.fromkeys(frb, mcat28)
mediacondense.append(m28)
    
rst = ['Rolling Stone blog', 'Rolling Stone magazine']
mcat29 = "Rolling Stone"
m29 = dict.fromkeys(rst, mcat29)
mediacondense.append(m29)
    
grd = ['A Guardian blog', 'The Guardian blog', 'Guardian', 'The Guardian']
mcat30 = "The Guardian"
m30 = dict.fromkeys(grd, mcat30)
mediacondense.append(m30)
    
ngc = ['NationalGeographic.com', 'National Geographic October 2004 Issue', 'National Geographic News', 'NationalGeographic.com blog']
mcat31 = "National Geographic"
m31 = dict.fromkeys(ngc, mcat31)
mediacondense.append(m31)
    
mbc = ['MSNBC News', 'MSNBC Today', 'MSNBC: Keith Olbermann blog', 'MSNBC The Rachel Maddow Show']
mcat32 = "MSNBC"
m32 = dict.fromkeys(mbc, mcat32)
mediacondense.append(m32)
    
rut = ['Reuters News Agency', 'Reuters News', 'Reuters Health', 'Reuters blog']
mcat33 = "Reuters"
m33 = dict.fromkeys(rut, mcat33)
mediacondense.append(m33)

blb = ['Bloomberg News Service', 'Businessweek', 'BloombergBusinessWeek', 'BloombergBusinessweek', 'BusinessWeek magazine', 'BusinessWeek', 'BusinessWeek Magazine', 'Bloomberg Businessweek', 'Bloomberg News']
mcat34 = "Bloomberg"
m34 = dict.fromkeys(blb, mcat34)
mediacondense.append(m34)

sdt = ['Times of London', 'The Times', 'Sunday Times']
mcat35 = "London Times"
m35 = dict.fromkeys(sdt, mcat35)
mediacondense.append(m35)

ind = ['Independent', 'The The The The The The The The Independent', 'The The The The The The The Independent']
mcat36 = "The Independent"
m36 = dict.fromkeys(ind, mcat36)
mediacondense.append(m36)

#the list of dictionaries is then turned into a new little dataframe

key = []
val = []
for i in mediacondense:
    for k,v in i.items():
        key.append(k)
        val.append(v)

mediakeys = pd.DataFrame({'asis': key, 'clean': val})

#replacement values are mapped from new dataframe to complete main dataframe 
#mediakeys.set_index('asis', inplace=True)
#mediakeys.to_csv("C:\\datasources\\mediakeys.csv", sep='|', index=True, encoding='utf-8')

# Creating a dictionary from the 'asis' and 'clean' columns
replace_dict = pd.Series(mediakeys['clean'].values, index=mediakeys['asis']).to_dict()

# Using map to update the 'Publication' column
df['Publication'] = df['Publication'].map(replace_dict).fillna(df['Publication'])

# Merging df with mediakeys on Publication column and asis column
df = df.merge(mediakeys, how='left', left_on='Publication', right_on='asis')

# Updating the Publication column with clean values
df['Publication'] = df['clean'].combine_first(df['Publication'])

# Dropping unnecessary columns
df.drop(columns=['asis', 'clean'], inplace=True)

df.tail(6)

  df['Publication'], df['pub2'] = df['Publication'].str.split('/', 1).str
  df['Publication'], df['pubdetail'] = df['Publication'].str.split('(', 1).str


Unnamed: 0,ArticleId,Title,PublicationDate,Publication,Links,Description,url
13845,14049,Trump Sues Wall Street Journal for Article on ...,2025-08-18,New York Times,https://www.nytimes.com/2025/07/18/business/me...,<p>President Trump on Friday accused Rupert Mu...,https://www.wanttoknow.info/a-trump-sues-wall-...
13846,14050,The FBI's Jeffrey Epstein Prison Video Had Nea...,2025-07-15,Wired,https://www.wired.com/story/the-fbis-jeffrey-e...,<p><strong>Newly uncovered metadata reveals th...,https://www.wanttoknow.info/a--fbis-jeffrey-ep...
13847,14051,When Your Power Meter Becomes a Tool of Mass S...,2025-07-21,When Your Power Meter Becomes a Tool of Mass S...,https://www.eff.org/deeplinks/2025/07/when-you...,"<p>In California, the law explicitly protects ...",https://www.wanttoknow.info/a-when-your-power-...
13848,14052,Uri Berliner: What Congress Should Ask NPR’s C...,2025-02-04,The Free Press,https://www.thefp.com/p/uri-berliner-will-cong...,<p>It’s been about 10 months since I wrote an ...,https://www.wanttoknow.info/a-uri-berliner-wha...
13849,14053,CIA Contradicts Obama Officials’ Sworn Denials...,2025-07-08,Real Clear Investigations,https://www.realclearinvestigations.com/articl...,<p>New evidence suggests that some of the high...,https://www.wanttoknow.info/a-cia-contradicts-...
13850,14054,Obama admin assessed Russia played no role in ...,2025-07-18,New York Post,https://nypost.com/2025/07/18/us-news/obama-ad...,<p>The Obama administration knew before and af...,https://www.wanttoknow.info/a-obama-admin-asse...


In [3]:
# --- Rename columns for storage---
df = df.rename(columns={
    'ArticleId': 'ID',
    'PublicationDate': 'Date',
    'Links': 'Source',
    'Description': 'Summary',
    'url': 'WTKurl'
})

# --- Ensure "Note:" begins on a new line in Summary ---
note_pat = re.compile(r'(?i)(?<!^)(?<!\n)Note:', flags=re.MULTILINE)
df['Summary'] = df['Summary'].apply(lambda s: note_pat.sub('\nNote:', str(s)))

# If needed: df['Title'] = df['Title'].fillna(''); df['Summary'] = df['Summary'].fillna('')

# Combine Title + Summary into a single text field for vectorization
df['__text_for_tfidf__'] = (df['Title'].fillna('') + ' ' + df['Summary'].fillna('')).astype(str)

# Build TF-IDF vectors

vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',        # remove if you want every token
    ngram_range=(1, 2),          # unigrams + bigrams (adjust as you like)
    max_df=0.9,                  # ignore extremely common terms
    min_df=2,                    # ignore very rare terms
    token_pattern=r'(?u)\b\w+\b' # keeps alphanumerics (HTML tags will contribute tokens)
)

X = vectorizer.fit_transform(df['__text_for_tfidf__'])  # sparse CSR matrix (n_rows x n_features)

# 3) Prepare output directory
out_dir = Path(r"C:/datasources/ai")
out_dir.mkdir(parents=True, exist_ok=True)

# 4) Save the sparse matrix of vectors
#    We’ll save a single matrix and an ordered list of IDs (row i ↔ df.ID[i]).
vectors_path = out_dir / "tfidf_vectors_by_row.npz"
sparse.save_npz(vectors_path, X)

# 5) Save the ordered IDs (to align rows back to IDs)
ids_path = out_dir / "ids_by_row.json"
ids_list = df['ID'].astype(str).tolist()
with open(ids_path, "w", encoding="utf-8") as f:
    json.dump(ids_list, f, ensure_ascii=False, indent=2)

# 6) Save the vocabulary (token -> column index) for quick lookup
vocab_path = out_dir / "tfidf_vocabulary.json"
with open(vocab_path, "w", encoding="utf-8") as f:
    json.dump(vectorizer.vocabulary_, f, ensure_ascii=False, indent=2)

# 7) Save feature names (index -> token) as JSON to avoid pickle issues
try:
    feature_names = vectorizer.get_feature_names_out()  # sklearn >= 1.0
except AttributeError:
    feature_names = vectorizer.get_feature_names()      # older sklearn

feature_names_path = out_dir / "tfidf_feature_names.json"
with open(feature_names_path, "w", encoding="utf-8") as f:
    json.dump(list(feature_names), f, ensure_ascii=False, indent=2)

# (Optional) keep vocabulary too (already in your script)
vocab_path = out_dir / "tfidf_vocabulary.json"
with open(vocab_path, "w", encoding="utf-8") as f:
    json.dump(vectorizer.vocabulary_, f, ensure_ascii=False, indent=2)

# 8) Save the full fitted vectorizer (recommended)
joblib.dump(vectorizer, out_dir / "tfidf_vectorizer.joblib")

# 9) Update manifest paths (swap .npy -> .json)
manifest = {
    "vectors_npz": str(vectors_path),
    "ids_by_row_json": str(ids_path),
    "vocabulary_json": str(vocab_path),
    "feature_names_json": str(feature_names_path),  # <— updated
    "vectorizer_joblib": str(out_dir / "tfidf_vectorizer.joblib"),
    "schema": {
        "row_alignment": "row i in vectors corresponds to ids_by_row_json[i]",
        "vector_type": "TF-IDF (CSR sparse matrix)",
        "token_indexing": "vocabulary_json maps token -> column index; feature_names_json[i] = token at column i"
    }
}
with open(out_dir / "tfidf_manifest.json", "w", encoding="utf-8") as f:
    json.dump(manifest, f, ensure_ascii=False, indent=2)

# 10) Save the whole dataframe as JSON (records) for auditing or future use
df_json_path = out_dir / "articles_df.json"
df.to_json(df_json_path, orient="records", force_ascii=False, indent=2)

# (Optional) clean up the temporary column
df.drop(columns=['__text_for_tfidf__'], inplace=True)



In [4]:
# CLUSTER DOCS FOR SEARCH ROUTING
# ------------------------------
OUT_DIR = Path(r"C:/datasources/ai")
OUT_DIR.mkdir(parents=True, exist_ok=True)

INITIAL_K = 24          # start with "a couple dozen"
MAX_CLUSTER_SIZE = 1000 # hard cap per cluster
RANDOM_STATE = 42
BATCH_SIZE = 4096
MAX_ITER = 200
N_INIT = 20             # more robust starts for k-means++

# ------------------------------
# LOAD ARTIFACTS
# ------------------------------
df = pd.read_json(OUT_DIR / "articles_df.json", orient="records")
X = sparse.load_npz(OUT_DIR / "tfidf_vectors_by_row.npz").tocsr()
with open(OUT_DIR / "ids_by_row.json", "r", encoding="utf-8") as f:
    ids_by_row = json.load(f)

# L2-normalize rows so Euclidean distance ~ cosine distance
X = normalize(X, norm="l2", copy=False)

# ------------------------------
# HELPERS
# ------------------------------
def run_kmeans(X_sub, k):
    km = MiniBatchKMeans(
        n_clusters=k,
        random_state=RANDOM_STATE,
        batch_size=BATCH_SIZE,
        max_iter=MAX_ITER,
        n_init=N_INIT,
        init="k-means++",
        verbose=0,
    )
    labels = km.fit_predict(X_sub)
    centers = km.cluster_centers_  # dense
    # Normalize centers for cosine-friendly nearest-centroid search
    centers = centers / (np.linalg.norm(centers, axis=1, keepdims=True) + 1e-12)
    return labels, centers

def split_count(size, cap):
    """How many subclusters are needed to ensure each ≤ cap?"""
    return max(2, math.ceil(size / cap))

# ------------------------------
# RECURSIVE SPLITTING
# ------------------------------
# We track clusters as a list of index arrays (global row indices into X / ids_by_row).
# Start: one pass with K=INITIAL_K, then split any big ones.
n = X.shape[0]
initial_labels, initial_centers = run_kmeans(X, INITIAL_K)

clusters = []
for c in range(INITIAL_K):
    idx = np.where(initial_labels == c)[0]
    clusters.append(idx)

final_clusters = []      # list of np.array of indices (global)
final_centroids = []     # parallel list of centroid vectors (dense, L2-normalized)

queue = list(range(len(clusters)))
while queue:
    i = queue.pop()
    idx = clusters[i]
    if idx.size <= MAX_CLUSTER_SIZE:
        # compute centroid for this leaf cluster
        # centroid = normalized mean of member vectors
        X_sub = X[idx]
        # (handle possible all-zero corner-case)
        centroid = X_sub.mean(axis=0).A1
        norm = np.linalg.norm(centroid)
        if norm > 0:
            centroid = centroid / norm
        final_clusters.append(idx)
        final_centroids.append(centroid)
        continue

    # Need to split this cluster further
    X_sub = X[idx]
    k_sub = split_count(idx.size, MAX_CLUSTER_SIZE)
    sub_labels, sub_centers = run_kmeans(X_sub, k_sub)

    # Create new clusters to process
    for sc in range(k_sub):
        sub_idx = idx[np.where(sub_labels == sc)[0]]
        clusters.append(sub_idx)
        queue.append(len(clusters) - 1)  # schedule for possible further split
    # (We do not keep the parent once it's split)

# ------------------------------
# SAVE RESULTS
# ------------------------------
# Map: final cluster id -> list of global row indices (into X & ids_by_row)
num_final = len(final_clusters)

# 1) Save per-cluster shard: IDs and VECTORS
#    - Vectors: sparse rows saved into tfidf_cluster_{k}.npz
#    - IDs: json list aligned to rows in that shard file
shard_paths = []
idlist_paths = []
for k, idx in enumerate(final_clusters, start=1):
    shard_X = X[idx]
    vectors_path = OUT_DIR / f"tfidf_cluster_{k}.npz"
    ids_path = OUT_DIR / f"ids_cluster_{k}.json"
    sparse.save_npz(vectors_path, shard_X)
    with open(ids_path, "w", encoding="utf-8") as f:
        json.dump([str(ids_by_row[i]) for i in idx], f, ensure_ascii=False, indent=2)
    shard_paths.append(str(vectors_path))
    idlist_paths.append(str(ids_path))

# 2) Save centroids matrix (dense, L2-normalized rows)
centroids = np.vstack(final_centroids) if final_centroids else np.zeros((0, X.shape[1]), dtype=np.float32)
np.save(OUT_DIR / "centroids.npy", centroids.astype(np.float32))

# 3) Save cluster membership quick lookups
# cluster_index.json: cluster_id (1-based) -> {"size": int, "ids_path": str, "vectors_path": str}
cluster_index = {
    str(k): {
        "size": int(final_clusters[k-1].size),
        "ids_path": idlist_paths[k-1],
        "vectors_path": shard_paths[k-1],
    }
    for k in range(1, num_final + 1)
}
with open(OUT_DIR / "cluster_index.json", "w", encoding="utf-8") as f:
    json.dump(cluster_index, f, ensure_ascii=False, indent=2)

# 4) Save assignment for each row ID → cluster_id
row_to_cluster = np.empty(n, dtype=np.int32)
for cid, idx in enumerate(final_clusters, start=1):
    row_to_cluster[idx] = cid
id_to_cluster = {str(ids_by_row[i]): int(row_to_cluster[i]) for i in range(n)}
with open(OUT_DIR / "id_to_cluster.json", "w", encoding="utf-8") as f:
    json.dump(id_to_cluster, f, ensure_ascii=False, indent=2)

# 5) Optional: cluster sizes summary
sizes = [int(arr.size) for arr in final_clusters]
sizes_df = pd.DataFrame({"cluster_id": list(range(1, num_final+1)), "size": sizes})
sizes_df.to_csv(OUT_DIR / "cluster_sizes_recursive.csv", index=False)

# 6) Manifest
manifest = {
    "shards": cluster_index,
    "centroids_npy": str(OUT_DIR / "centroids.npy"),
    "id_to_cluster": str(OUT_DIR / "id_to_cluster.json"),
    "cluster_sizes_csv": str(OUT_DIR / "cluster_sizes_recursive.csv"),
    "notes": {
        "normalization": "Rows and centroids L2-normalized for cosine similarity via dot product.",
        "routing": "Use centroids.npy for nearest-centroid routing of queries.",
        "cap": f"All clusters <= {MAX_CLUSTER_SIZE} items via recursive splitting."
    }
}
with open(OUT_DIR / "recursive_kmeans_manifest.json", "w", encoding="utf-8") as f:
    json.dump(manifest, f, ensure_ascii=False, indent=2)

print(f"Final clusters: {num_final} | min={min(sizes)} max={max(sizes)}")


Final clusters: 30 | min=47 max=999


In [5]:
# --- Query routing + in-shard search (TF-IDF) ---

OUT_DIR = Path(r"C:/datasources/ai")

# Load vectorizer and centroids
vectorizer = joblib.load(OUT_DIR / "tfidf_vectorizer.joblib")
centroids = np.load(OUT_DIR / "centroids.npy", allow_pickle=False)  # (num_clusters_final x dim), L2-normalized

# Build a quick ID -> full JSON record map
articles_df = pd.read_json(OUT_DIR / "articles_df.json", orient="records")
record_by_id = {str(rec["ID"]): rec for rec in articles_df.to_dict(orient="records")}

def vectorize_query(text: str):
    q = vectorizer.transform([text])          # CSR (1 x d)
    q = normalize(q, norm="l2", copy=False)   # cosine-friendly
    return q

def top_centroids_for_query(q_vec, topn=3):
    # q_vec: CSR 1xD; centroids: dense (C x D), already L2-normalized
    q_dense = q_vec.toarray().astype(np.float32)  # fine for a single query
    sims = (q_dense @ centroids.T).ravel()        # cosine similarity via dot product
    order = np.argsort(-sims)[:topn]
    return order, sims[order]

def search_shards(q_vec, cluster_ids, topk=20):
    """
    Given a query vector and a list of 1-based cluster_ids,
    return topk results across those shards as dicts:
    {"id": str, "score": float, "cluster_id": int}
    """
    results = []
    for cluster_id in cluster_ids:
        ids_path = OUT_DIR / f"ids_cluster_{cluster_id}.json"
        vecs_path = OUT_DIR / f"tfidf_cluster_{cluster_id}.npz"

        with open(ids_path, "r", encoding="utf-8") as f:
            shard_ids = json.load(f)  # list[str], aligned to rows in vecs
        Xc = sparse.load_npz(vecs_path).tocsr()     # rows already L2-normalized

        # Cosine similarity = dot product (since everything is L2-normalized)
        scores = (q_vec @ Xc.T)                     # 1 x Nshard
        scores = np.asarray(scores.todense()).ravel()

        # Keep per-shard top candidates
        local_top = np.argsort(-scores)[:topk]
        for i in local_top:
            if scores[i] > 0:
                results.append({
                    "id": str(shard_ids[i]),
                    "score": float(scores[i]),
                    "cluster_id": int(cluster_id),
                })

    # Global topk across the selected shards
    results.sort(key=lambda r: -r["score"])
    return results[:topk]

# ---- Example usage ----
query = "CIA torture report Senate Intelligence Committee 2014"
q_vec = vectorize_query(query)

# choose 1–3 nearest centroids
centroid_order, centroid_scores = top_centroids_for_query(q_vec, topn=3)
# centroids are 0-based in the array; shard files are 1-based
candidate_cluster_ids = [int(i) + 1 for i in centroid_order]

hits = search_shards(q_vec, candidate_cluster_ids, topk=30)

# Print line + full JSON for each hit
for h in hits[:10]:
    _id = h["id"]
    cluster_id = h["cluster_id"]
    score = h["score"]
    print(f"[cluster {cluster_id}] {_id}  score={score:.4f}")

    rec = record_by_id.get(_id)
    if rec is not None:
        # Pretty-print the full JSON record
        print(json.dumps(rec, ensure_ascii=False, indent=2))
    else:
        print(f"(No full record found for ID={_id})")


[cluster 18] 13322  score=0.4246
{
  "ID": 13322,
  "Title": "More Than 10 Years Later, The Senate Torture Report Is Still Secret",
  "Date": "2024-06-27",
  "Publication": "The Intercept",
  "Source": "https://theintercept.com/2024/06/27/senate-torture-report-cia-lawsuit/",
  "Summary": "<p><strong>The Senate Select Committee on Intelligence submitted its 6,700-page “torture report” about the CIA to the White House in April 2014. More than 10 years later, the full report remains secret after a federal appellate court dismissed a lawsuit I filed in the hopes of forcing its release</strong>. The document “includes comprehensive and excruciating detail” about the CIA’s “program of indefinite secret detention and the use of brutal interrogation techniques,” the late Sen. Dianne Feinstein, who chaired the Senate intelligence committee at the time, wrote in a <a href=https://www.intelligence.senate.gov/sites/default/files/publications/CRPT-113srpt288.pdf\" target=\"_blank\">2014 summary</a>