In [4]:
import wikipediaapi, random

# Identify this script honestly, in the "<client>/<version> (<contact>)" form that the
# Wikimedia User-Agent policy asks for. The policy warns against copying a browser's
# user agent: bot-like traffic sent under a browser UA is assumed to be malicious.
USER_AGENT = "WikipediaGraphResearch/1.0 (contact: egor@example.com)"

# Client for the English-language Wikipedia.
wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent=USER_AGENT
)


# Seed categories to sample articles from. Each entry is a full category page
# title, i.e. the topic name prefixed with "Category:".
categories = [
    f"Category:{topic}"
    for topic in (
        "Information technology",
        "Computer science",
        "Software engineering",
        "Computer security",
        "Artificial intelligence",
        "Data management",
        "Networking",
    )
]

In [5]:
# Fixed seed so that "Restart Kernel -> Run All" draws the same articles every time.
SAMPLE_SEED = 42
# Upper bound on the number of articles drawn from each category.
ARTICLES_PER_CATEGORY = 15

# Private generator: seeding it does not disturb the global `random` state.
rng = random.Random(SAMPLE_SEED)
pages = set()

for cat_name in categories:
    cat = wiki.page(cat_name)
    # Keep only members with ns == 0 (articles). Sorting by title makes the draw
    # independent of the order in which the members happen to be returned.
    members = sorted(
        (p for p in cat.categorymembers.values() if p.ns == 0),
        key=lambda p: p.title,
    )
    # Draw up to ARTICLES_PER_CATEGORY random articles (fewer if the category is small).
    sample_size = min(ARTICLES_PER_CATEGORY, len(members))
    pages.update(p.title for p in rng.sample(members, sample_size))

# At most len(categories) * ARTICLES_PER_CATEGORY titles (roughly 100); titles that
# appear in several categories are stored once because `pages` is a set.
print(len(pages))

90


In [6]:
# Inspect the collected article titles (bare last expression => rendered as cell output).
pages

{'ANSI 834 Enrollment Implementation Format',
 'Agentic AI',
 'Agnostic (data)',
 'Artificial consciousness',
 'Autognostics',
 'Automated medical scribe',
 'Autonomic networking',
 'Behavior tree',
 'Belgrade IT sector',
 'BlueBorne (security vulnerability)',
 'Boolean',
 'CA Gen',
 'CAPTCHA',
 'Catalytic computing',
 'Chinese Information Processing Society of China',
 'Cleo Communications',
 'Cloud Data Management Interface',
 'Commission on Enhancing National Cybersecurity',
 'Component-based software engineering',
 'Computational gastronomy',
 'Computer science',
 'Computer science in sport',
 'Connectionist expert system',
 'Content repository',
 'Control-flow integrity',
 'Cryptographic module',
 'Cyber Ireland',
 'Cybersecurity engineering',
 'Cybersex trafficking',
 'Data conditioning',
 'Data engineering',
 'Data integration',
 'Data library',
 'Datasphere',
 'Defensive computing',
 'Distributed transaction',
 'E-research',
 'E-services',
 'Feedback neural network',
 'Filter a

In [2]:
import json

# Read the node file through a context manager so the handle is closed as soon as
# parsing finishes (a bare open(...).read() leaves that to the garbage collector),
# and decode it explicitly as UTF-8 instead of relying on the platform default.
with open("../data/wiki_dedup_embds.json", "r", encoding="utf-8") as f:
    nodes = json.load(f)

In [10]:
import numpy as np

In [11]:
# Freeze the key order once so that row i of `embeddings` belongs to titles[i].
# NOTE(review): this cell uses nodes[title] itself as the vector, whereas the later
# load cell reads data[t]["embedding"] from the same file -- confirm which schema applies.
titles = [*nodes]
embeddings = np.array([nodes[name] for name in titles])

In [1]:
import json
import numpy as np
from sklearn.neighbors import NearestNeighbors

# --- Read the {title: record} mapping from disk ---
with open("../data/wiki_dedup_embds.json", "r", encoding="utf-8") as src:
    data = json.loads(src.read())

# Titles in the mapping's iteration order; row i of `embeddings` belongs to titles[i].
titles = [*data]

# Each record carries an "embedding" vector and a "categories" list.
embeddings = np.array([record["embedding"] for record in data.values()])
categories_map = {name: record["categories"] for name, record in data.items()}

print(f"Loaded {len(titles)} nodes with embeddings & categories.")

# --- Cosine-distance neighbour index over all embeddings (10 neighbours by default) ---
nn = NearestNeighbors(n_neighbors=10, metric="cosine").fit(embeddings)


def nearest(title: str, k: int = 10):
    if title not in titles:
        print(f"Title '{title}' not found.")
        return
    
    idx = titles.index(title)
    vec = embeddings[idx].reshape(1, -1)

    distances, indices = nn.kneighbors(vec, n_neighbors=k)

    print(f"\nClosest to: {title}")
    print(f"Categories: {categories_map[title]}\n")

    for d, i in zip(distances[0], indices[0]):
        cand = titles[i]
        cand_categories = categories_map[cand]

        # Intersection score
        inter = set(categories_map[title]) & set(cand_categories)
        inter_ratio = len(inter) / (len(set(categories_map[title]) | set(cand_categories)) + 1e-9)

        print(
            f"  {cand:50s}  dist={d:.3f} |  {cand_categories} "
            f"cat_intersection={len(inter)} | jaccard={inter_ratio:.3f}"
        )


Loaded 14550 nodes with embeddings & categories.


In [2]:
# Example query: print the neighbours of the "Machine learning" article (default k=10).
nearest("Machine learning")


Closest to: Machine learning
Categories: ['Category:Cybernetics', 'Category:Learning', 'Category:Machine learning']

  Machine learning                                    dist=0.000 |  ['Category:Cybernetics', 'Category:Learning', 'Category:Machine learning'] cat_intersection=3 | jaccard=1.000
  Data augmentation                                   dist=0.347 |  ['Category:Machine learning'] cat_intersection=1 | jaccard=0.333
  Q-learning                                          dist=0.426 |  ['Category:Machine learning algorithms', 'Category:Reinforcement learning'] cat_intersection=0 | jaccard=0.000
  Learning rate                                       dist=0.437 |  ['Category:Machine learning', 'Category:Optimization algorithms and methods'] cat_intersection=1 | jaccard=0.250
  Inductive programming                               dist=0.452 |  ['Category:Machine learning', 'Category:Programming paradigms'] cat_intersection=1 | jaccard=0.250
  AutoML                                    