In [8]:
import re
import ast
import json
import typing as tp
import pandas as pd
from textwrap import shorten

In [22]:
PROMPT_TEMPLATE = open("../prompts/filter_labels.txt", "r").read()

categories = [
    "Machine learning",
    "Biology",
    "Ancient Rome",
    "Neural networks",
    "Psychology",
]

indexed = "\n".join(f"{i}: {c}" for i,c in enumerate(categories))

prompt = PROMPT_TEMPLATE.replace("{{title}}", "Backpropagation") \
                        .replace("{{summary}}", "Backpropagation is an algorithm ...") \
                        .replace("{{indexed_category_list}}", indexed)

print(prompt)

You are a classification assistant. Your task is to choose which categories from a provided list are semantically relevant to a given Wikipedia article.

You will be given:
1) The article title
2) A short text/summary describing the article
3) A list of candidate categories, indexed from 0 to N-1

Your goal:
- Select ALL categories that apply to the article's meaning or topic.
- You MUST output ONLY a JSON list of integers (category indices).
- Do NOT output category names or explanations.
- If no categories fit, return an empty list `[]`.

Guidelines:
- Choose categories representing the main topic, not formatting/maintenance tags.
- Prefer domain-level, thematic, or conceptual categories over meta/technical/admin ones.
- Ignore categories describing Wikipedia housekeeping, templates, flags, stubs, etc.

Return ONLY:
A JSON list of selected indices.
Example:
[2, 5, 6]


ARTICLE TITLE:
Backpropagation

ARTICLE SUMMARY:
{{text}}

CANDIDATE CATEGORIES (indexed):
{{indexed_categories}}




In [23]:
PROMPT_TEMPLATE = (
    PROMPT_TEMPLATE
    .replace("{{title}}", "{title}")
    .replace("{{text}}", "{text}")
    .replace("{{indexed_categories}}", "{indexed_categories}")
)

In [9]:
graph = json.loads(open("../data/wiki_graph_dedup.json", "r").read())

In [10]:
def build_prompt(node: tp.Dict[str, tp.Any], max_text_len: int = 2000) -> str:
    title = node.get("title", "").strip()
    text = node.get("text", "").strip()
    text = shorten(text, width=max_text_len, placeholder=" ...")

    categories = node.get("categories", [])
    indexed = "\n".join(f"{i}: {c}" for i, c in enumerate(categories))

    prompt = PROMPT_TEMPLATE.format(
        title=title,
        text=text,
        indexed_categories=indexed
    )
    return prompt

In [26]:
nodes = graph["nodes"]

prompts = {}
for title, node in nodes.items():
    prompts[title] = build_prompt(node)

In [27]:
df = pd.DataFrame(prompts.items(), columns=["title", "prompt"])

In [31]:
print(df.iloc[123]['prompt'])

You are a classification assistant. Your task is to choose which categories from a provided list are semantically relevant to a given Wikipedia article.

You will be given:
1) The article title
2) A short text/summary describing the article
3) A list of candidate categories, indexed from 0 to N-1

Your goal:
- Select ALL categories that apply to the article's meaning or topic.
- You MUST output ONLY a JSON list of integers (category indices).
- Do NOT output category names or explanations.
- If no categories fit, return an empty list `[]`.

Guidelines:
- Choose categories representing the main topic, not formatting/maintenance tags.
- Prefer domain-level, thematic, or conceptual categories over meta/technical/admin ones.
- Ignore categories describing Wikipedia housekeeping, templates, flags, stubs, etc.

Return ONLY:
A JSON list of selected indices.
Example:
[2, 5, 6]


ARTICLE TITLE:
2018 Google data breach

ARTICLE SUMMARY:
The 2018 Google data breach was a major data privacy scanda

In [32]:
df.to_csv("../data/filtering_prompts.csv")

---

### Processing the LLM-as-a-judge result

In [11]:
path = "../data/filtered_categories.csv"
df = pd.read_csv(path)
df.head(3)

Unnamed: 0.2,Unnamed: 0.1,prompt,Unnamed: 0,title,deepseek_answer
0,0,You are a classification assistant. Your task ...,0,Data binding,"<think>\nAlright, so I've got this task to fig..."
1,1,You are a classification assistant. Your task ...,1,Intelligent agent,"<think>\nAlright, let's tackle this classifica..."
2,2,You are a classification assistant. Your task ...,2,Self-management (computer science),"<think>\nAlright, let's tackle this classifica..."


In [12]:
# Parse <think> section
def extract_list(answer):
    answer = answer.split("</think>")[-1].strip()
    try:
        match = re.search(r"\[.*?\]", answer, re.DOTALL)
        if match:
            return ast.literal_eval(match.group())
        else:
            return None
    except Exception:
        return None

df["valid_idx"] = df["deepseek_answer"].apply(extract_list)

In [13]:
graph = json.loads(open("../data/wiki_graph_dedup.json", "r").read()) # read the graph

In [14]:
NOISE_KEYWORDS = [
    "births", "deaths", "living people", "films", "television", 
    "musicians", "sports", "movies", "establishments", "schools",
    "universities", "museums", "politicians", "countries", 
    "historical", "events", "wikipedia", "organizations", "politics",
    "latin-language", "ancient greek", "french-language", "japanese-language",
    "german-language", "chinese-language", "spanish-language", "russian-language",
    "pages including recorded pronunciations", "pages with plain ipa",
    "pages with french ipa", "pages using the phonos extension",
    "american inventions", "elections using electoral votes", "united states supreme court cases",
    "disambiguation", "cities in", "unincorporated communities", "nations at", "journals",
    "academy", "academia", "schools", "universities", "sports", "olympics", "countries",
    "english-language", "french-language", "german-language", "spanish-language", "latin-language",
    "russian-language", "japanese-language", "chinese-language"
]

def is_noise_category(category: str) -> bool:
    cat_lower: str = category.lower()
    return any(keyword in cat_lower for keyword in NOISE_KEYWORDS)

In [15]:
for _, row in df.iterrows():
    title = row['title']
    valid_idx = row['valid_idx']  # valid categories that are meaningful

    # Remove noisy categories
    all_categories = graph['nodes'][title]['categories']
    filtered_categories = [all_categories[i] for i in valid_idx if i < len(all_categories)]
    graph['nodes'][title]['categories'] = filtered_categories

In [None]:
titles_to_remove = []

for title, node in graph['nodes'].items():
    categories = node.get('categories', [])
    if any(is_noise_category(cat) for cat in categories):
        titles_to_remove.append(title)

for title in titles_to_remove:
    del graph['nodes'][title]

print(f"[INFO] Removed {len(titles_to_remove)} noisy nodes. Remaining nodes: {len(graph['nodes'])}")

[INFO] Removed 6616 noisy nodes. Remaining nodes: 7934


In [17]:
remaining_nodes = set(graph['nodes'].keys())

graph['edges'] = [
    (src, dst) for src, dst in graph['edges']
    if src in remaining_nodes and dst in remaining_nodes
]

print(f"[INFO] Remaining edges after cleanup: {len(graph['edges'])}")

[INFO] Remaining edges after cleanup: 7859


In [46]:
# Save graph with filtered categories
with open("../data/wiki_graph_filtered_v2.json", "w", encoding="utf-8") as f:
    json.dump(graph, f, ensure_ascii=False, indent=2)