# Wikipedia Category Sampling Pipeline

This notebook samples Wikipedia articles by category to create domain-specific datasets. It extracts text from the articles in a given category (descending into subcategories when more text is needed) and writes a training split of the requested word count plus a development split of up to 20% of that size.

## Features
- Category-based article collection
- Layer-by-layer subcategory traversal
- Word-limit-based sampling
- Train/dev split generation

In [None]:
import wikipediaapi
import random 

# Endpoint of the English Wikipedia MediaWiki API.
API_URL = "https://en.wikipedia.org/w/api.php"

# Client used for category lookups; no extract format is passed, so the
# library default applies.
wikipedia = wikipediaapi.Wikipedia(language='en', user_agent='Academic Research Project')

# Client used for fetching page text. Despite the name, it explicitly
# requests wiki-format extracts rather than HTML.
wiki_html = wikipediaapi.Wikipedia(
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
    user_agent='Academic Research Project',
)

def get_category_articles(category):
    """Return the members of a Wikipedia category.

    Args:
        category: Category name without the ``Category:`` prefix.

    Returns:
        The values view of the category page's ``categorymembers`` mapping
        (one page object per member).
    """
    page_title = f"Category:{category}"
    category_page = wikipedia.page(page_title)
    members_by_title = category_page.categorymembers
    return members_by_title.values()

def get_children_articles(children):
    """Return the members of every category named in ``children`` as one flat list.

    Args:
        children: Iterable of category names (without the ``Category:`` prefix).

    Returns:
        A list holding the members of the first category, then the second,
        and so on, in the order ``get_category_articles`` yields them.
    """
    # Flatten: one lookup per child category, members kept in lookup order.
    return [
        member
        for child_name in children
        for member in get_category_articles(child_name)
    ]

def get_page_text(title):
    """Fetch the text of the Wikipedia page called ``title``.

    Returns:
        The page's ``text`` if the page exists, otherwise ``None``.
    """
    wiki_page = wiki_html.page(title)
    return wiki_page.text if wiki_page.exists() else None
    

## 1. Wikipedia API Setup and Utility Functions

In [None]:
def get_texts(articles, no_words):
    """Collect up to ``no_words`` words of page text from one layer of category members.

    Members whose title starts with ``"Category:"`` are not fetched as text;
    their names are set aside and expanded into the next layer instead.
    Iteration stops as soon as the word budget is reached, so members after
    that point are neither fetched nor scanned for subcategories.

    Args:
        articles: Iterable of page objects exposing a ``title`` attribute.
        no_words: Maximum number of whitespace-separated words to return.

    Returns:
        A ``(text, next_layer)`` tuple. ``text`` is the collected words joined
        by single spaces and truncated to ``no_words``. ``next_layer`` is the
        list returned by ``get_children_articles`` for the subcategories seen
        before the word budget was reached.
    """
    prefix = "Category:"
    # Accumulate words in a list: repeated string concatenation followed by
    # re-splitting the whole corpus is quadratic for million-word targets.
    words = []
    children = []
    pages_used = 0
    for counter, article in enumerate(articles, start=1):
        if counter % 50 == 0:
            print(f"Processed {counter} articles, current text length: {len(words)} words.")
        if article.title.startswith(prefix):
            # Strip only the leading prefix; str.replace would also remove
            # any later occurrence inside the category name.
            children.append(article.title[len(prefix):])
            continue
        page_text = get_page_text(article.title)
        if page_text:
            words.extend(page_text.split())
            pages_used += 1
        if len(words) >= no_words:
            break
    # Report the pages that actually contributed text, not the layer size.
    print(f"Collected {len(words)} words from {pages_used} articles.")
    # Clamp so a negative budget yields an empty text instead of slicing
    # words off the end.
    return " ".join(words[:max(no_words, 0)]), get_children_articles(children)

def create_dataset_from_category(category, name, no_words):
    """Sample text from a Wikipedia category tree and write train/dev files.

    Starting from the members of ``category``, pages are collected layer by
    layer (each new layer is the membership of the subcategories found in the
    previous one) until ``int(no_words * 1.2)`` words are gathered or no
    unvisited pages remain. The first ``no_words`` words are written to
    ``../datasets/wiki_categories/{name}.train`` and the following words (up
    to the 1.2x total) to ``../datasets/wiki_categories/{name}_dev.train``.

    Args:
        category: Category name without the ``Category:`` prefix.
        name: Base name of the two output files.
        no_words: Number of words wanted in the training split.
    """
    import os  # local import so this cell does not depend on other cells' imports

    total_words = int(no_words * 1.2)
    articles = list(get_category_articles(category))
    random.seed(42)  # fixed seed so the per-layer shuffles are reproducible
    words = []
    seen_titles = set()
    while len(words) < total_words and articles:
        # Categories overlap and can form cycles: drop every page or
        # subcategory already visited so text is never duplicated across the
        # train/dev boundary and the traversal is guaranteed to terminate.
        fresh = []
        for article in articles:
            if article.title not in seen_titles:
                seen_titles.add(article.title)
                fresh.append(article)
        articles = fresh
        if not articles:
            break
        random.shuffle(articles)
        # Preview every 20th title of the shuffled layer.
        for article in articles[::20]:
            print(f"Processing article: {article.title}")
        print(f"Current layer has {len(articles)} articles, total words collected: {len(words)}")
        article_texts, children_articles = get_texts(articles, total_words - len(words))
        words.extend(article_texts.split())
        articles = children_articles
        print(f" Finished layer, moving further. Total words collected: {len(words)}")

    if len(words) < total_words:
        print(f"Warning: only {len(words)} of {total_words} requested words were collected.")

    out_dir = "../datasets/wiki_categories"
    # Create the target directory up front so a long download is not lost to
    # a FileNotFoundError at write time.
    os.makedirs(out_dir, exist_ok=True)
    train_words = words[:no_words]
    dev_words = words[no_words:total_words]
    # Wikipedia text is full of non-ASCII characters; write UTF-8 explicitly
    # instead of relying on the platform default encoding.
    with open(f"{out_dir}/{name}.train", 'w', encoding="utf-8") as f:
        print(f"Total words: {len(train_words)}")
        f.write(" ".join(train_words))
    with open(f"{out_dir}/{name}_dev.train", 'w', encoding="utf-8") as f:
        print(f"Total dev words: {len(dev_words)}")
        f.write(" ".join(dev_words))
    
    


In [None]:
# Build the physics dataset from the "Quantum mechanics" category.
# NOTE(review): the output name says "subfields of physics" while the category
# sampled is "Quantum mechanics" — confirm this pairing is intended.
create_dataset_from_category(
    category="Quantum mechanics",
    name="wiki_subfields_of_physics",
    no_words=1_000_000,
)

## 2. Category-Specific Dataset Generation

Examples of creating 1M-word training sets (each accompanied by a development set of up to 200k words) from different Wikipedia categories.

In [None]:
# Build the history dataset from the "History" category.
create_dataset_from_category(
    category="History",
    name="wiki_history",
    no_words=1_000_000,
)

In [None]:
# Build the culture dataset from the "Culture" category.
create_dataset_from_category(
    category="Culture",
    name="wiki_culture",
    no_words=1_000_000,
)

In [None]:
# Build the society dataset from the "Society" category.
create_dataset_from_category(
    category="Society",
    name="wiki_society",
    no_words=1_000_000,
)

In [None]:
# Build the linguistics dataset from the "Linguistics" category.
create_dataset_from_category(
    category="Linguistics",
    name="wiki_linguistics",
    no_words=1_000_000,
)