In [None]:
import pandas as _hex_pandas
import datetime as _hex_datetime
import json as _hex_json

In [None]:
# JSON literal "false" decoded to the Python boolean False.
# NOTE(review): presumably flags whether this run was triggered by a schedule — confirm.
hex_scheduled = _hex_json.loads("false")

In [None]:
# JSON-decoded string; presumably the e-mail address of the user running the notebook — confirm.
hex_user_email = _hex_json.loads("\"example-user@example.com\"")

In [None]:
# JSON-decoded empty object, i.e. an empty dict.
hex_user_attributes = _hex_json.loads("{}")

In [None]:
# JSON-decoded string "logic"; presumably identifies the view the notebook runs in — confirm.
hex_run_context = _hex_json.loads("\"logic\"")

In [None]:
# JSON-decoded string "UTC"; presumably the timezone configured for the project — confirm.
hex_timezone = _hex_json.loads("\"UTC\"")

In [None]:
# JSON-decoded string holding a UUID-formatted identifier (presumably the project's ID — confirm).
hex_project_id = _hex_json.loads("\"019bae67-af4d-7000-baed-c8d253b14659\"")

In [None]:
# JSON-decoded string; presumably the display name of the project — confirm.
hex_project_name = _hex_json.loads("\"Creating the dataset\"")

In [None]:
# JSON-decoded empty string (no status value is set).
hex_status = _hex_json.loads("\"\"")

In [None]:
# JSON-decoded empty array, i.e. an empty list.
hex_categories = _hex_json.loads("[]")

In [None]:
# JSON-decoded list of ten "#RRGGBB" colour strings (presumably the default chart palette — confirm).
hex_color_palette = _hex_json.loads("[\"#4C78A8\",\"#F58518\",\"#E45756\",\"#72B7B2\",\"#54A24B\",\"#EECA3B\",\"#B279A2\",\"#FF9DA6\",\"#9D755D\",\"#BAB0AC\"]")

## Wikipedia Scientific Articles Dataset

**Goal**: Extract and analyze Wikipedia articles about scientific subjects, filtering out non-scientific content (personalities, organizations, events).

**Process**:

1. **Extract article names & IDs** from Wikipedia list pages (glossary pages, science topic lists)
   - Input: `lists.txt` containing Wikipedia list page titles
   - Uses Wikipedia's query API with `generator=links` to get article titles + page IDs
   - Handles pagination via continuation tokens (500 results/request)
   - Deduplicates across lists → outputs `article_names.csv`

2. **Fetch article metadata** in batches
   - Batch requests: 50 page IDs per API call (~453 batches for ~22.6k articles)
   - Uses async with concurrency limiting (respects Wikipedia rate limits)
   - Fetches: content length, links, sections, references, categories (all/meta/science), languages, revision dates, summaries
   - **Why page IDs**: Titles with special characters (commas in chemical names) break batch API; IDs are cleaner

3. **Filter dataset** to remove non-scientific articles
   - Currently identifying keywords for: people, organizations, events, letter/list articles
   - Will use categories, content patterns, and metadata to classify articles

**Current status**: ~22.6k articles fetched with full metadata, ready for filtering.

## Getting the article names and IDs

API calls are expensive, so to save time I first fetched only the article names and IDs.



API calls are expensive, so to save time I first fetched only the article names and IDs.



In [None]:
# This is for reference, Claude. Do not execute this cell. Just read it
# Here is a code snippet that illustrates the other columns. Apply this context
# On the next cell
# NOTE: this cell is not runnable as written: `page` is not defined here and the
# trailing `return` is outside any function. It only illustrates how the
# category, revision and language columns are derived from one API page object.
 
# Categories
# Every category title has its "Category:" prefix removed; categories whose
# entry carries a 'hidden' key are additionally recorded as meta categories.
all_cats = []
meta_cats = []
if 'categories' in page:
    for cat in page['categories']:
        cat_name = cat['title'].replace('Category:', '')
        all_cats.append(cat_name)
        if 'hidden' in cat:
            meta_cats.append(cat_name)

# Revisions
# Timestamp of the first entry in page['revisions'], or None when there are none.
num_revs = len(page.get('revisions', []))
last_rev = page.get('revisions', [{}])[0].get('timestamp', None) if num_revs > 0 else None

# Languages
# Language codes taken from the page's 'langlinks' entries.
langs = [ll['lang'] for ll in page.get('langlinks', [])]

return {
    'all_categories': all_cats,
    'meta_categories': meta_cats,
    # Science categories are all categories that were not flagged as meta.
    'science_categories': [c for c in all_cats if c not in meta_cats],
    'last_revision_date': last_rev,
    'num_languages': len(langs),
    'languages': langs,
    'rating': None
}
        

In [None]:
import requests

def extract_articles_from_list(page_title):
    """Return the articles linked from a Wikipedia list page.

    Queries the MediaWiki API with ``generator=links`` restricted to the main
    namespace (``gplnamespace=0``) and keeps following the ``continue`` tokens
    returned by the API until the link list is exhausted.

    Args:
        page_title: Title of the list page whose outgoing links are collected.

    Returns:
        A list of ``(title, page_id)`` tuples, one per linked page that has a
        page ID (entries without a ``pageid`` are skipped). If any request
        fails -- network problem, HTTP error status, non-JSON body or an
        API-level ``error`` payload -- the failure is printed and ``[]`` is
        returned, so a partially fetched list is never mistaken for a
        complete one.
    """
    url = "https://en.wikipedia.org/w/api.php"
    headers = {'User-Agent': 'WikipediaBot/1.0 (Educational Project)'}

    # Parameters shared by every request; only the continuation tokens change.
    base_params = {
        'action': 'query',
        'generator': 'links',
        'titles': page_title,
        'gpllimit': 'max',
        'gplnamespace': 0,
        'format': 'json'
    }

    all_articles = []
    continue_params = {}

    try:
        while True:
            response = requests.get(
                url,
                params={**base_params, **continue_params},
                headers=headers,
                timeout=10,
            )
            # Surface HTTP-level failures instead of parsing an error page.
            response.raise_for_status()
            data = response.json()

            # The API reports bad requests as a JSON body with an 'error' key;
            # without this check such a response would look like "no links".
            if 'error' in data:
                error = data['error']
                raise RuntimeError(
                    f"API error {error.get('code')}: {error.get('info')}"
                )

            pages = data.get('query', {}).get('pages', {})
            all_articles.extend(
                (page.get('title'), page.get('pageid'))
                for page in pages.values()
                if page.get('pageid') is not None
            )

            # Check for continuation
            if 'continue' not in data:
                break
            continue_params = data['continue']

        return all_articles

    except Exception as e:
        # Best effort: report the problem and let the caller carry on.
        print(f"  Error extracting from {page_title}: {e}")
        return []

In [None]:
import csv

# Titles of the Wikipedia list pages to crawl: one per line, blank lines ignored.
with open('lists.txt', 'r', encoding='utf-8') as f:
    list_pages = [stripped for stripped in map(str.strip, f) if stripped]

# Gather (title, page_id) pairs from every list page.
articles = []
for page in list_pages:
    articles += extract_articles_from_list(page)

# Keep only the first occurrence of each title.
seen_titles = set()
unique_articles = []
for title, page_id in articles:
    if title in seen_titles:
        continue
    seen_titles.add(title)
    unique_articles.append((title, page_id))

# Alphabetical order by title.
unique_articles = sorted(unique_articles, key=lambda pair: pair[0])

# Write the header row followed by one row per unique article.
with open('article_names.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows([['title', 'page_id'], *unique_articles])

print(f"Saved {len(unique_articles)} unique articles to article_names.csv")

Saved 22633 unique articles to article_names.csv


## Creating the dataset



In [None]:
import aiohttp
import asyncio

# Properties whose values are lists that the API may split across several
# continuation responses; their chunks have to be concatenated.
_CONTINUED_LIST_PROPS = ('links', 'categories', 'langlinks')


def _merge_pages(merged, pages):
    """Fold one response's ``query.pages`` mapping into ``merged`` in place."""
    for key, page in pages.items():
        target = merged.setdefault(key, {})
        for field, value in page.items():
            if field in _CONTINUED_LIST_PROPS:
                target.setdefault(field, []).extend(value)
            elif field not in target:
                # Scalars (title, extract, ...) and revisions: first value wins.
                target[field] = value


def _page_to_record(page):
    """Flatten one merged API page object into the metadata dict for a row."""
    revisions = page.get('revisions') or []
    latest = revisions[0] if revisions else {}
    content = latest.get('slots', {}).get('main', {}).get('*', '')

    # Split categories into hidden (maintenance/meta) and visible ones.
    all_cats, meta_cats, science_cats = [], [], []
    for cat in page.get('categories', []):
        cat_name = cat['title'].replace('Category:', '')
        all_cats.append(cat_name)
        (meta_cats if 'hidden' in cat else science_cats).append(cat_name)

    langs = [ll['lang'] for ll in page.get('langlinks', [])]

    summary = page.get('extract', '')
    # Naive sentence split on '.', used only for an average word count.
    sentences = [s.strip() for s in summary.split('.') if s.strip()]
    avg_sentence_length = (
        sum(len(s.split()) for s in sentences) / len(sentences) if sentences else 0
    )

    return {
        'page_id': page.get('pageid', None),
        'title': page.get('title', ''),
        'url': page.get('fullurl', ''),
        'content_length': len(content),
        'num_links': len(page.get('links', [])),
        'num_sections': content.count('\n=='),
        'num_references': content.count('<ref'),
        'all_categories': all_cats,
        'meta_categories': meta_cats,
        'science_categories': science_cats,
        'num_categories': len(all_cats),
        'summary_length': len(summary),
        'avg_sentence_length': round(avg_sentence_length, 2),
        'last_revision_date': latest.get('timestamp', None),
        'num_languages': len(langs),
        'languages': langs
    }


async def fetch_article_data_batch(page_ids, session):
    """Fetch metadata for up to 50 articles identified by page ID.

    The request asks for info, latest revision content, links, categories,
    intro extracts and language links. The ``*limit`` parameters cap the
    number of results per *request* (not per page), so a single response
    rarely holds everything for a whole batch; this function therefore
    follows the API's ``continue`` tokens and merges the partial page objects
    before building the records. ``clprop=hidden`` is requested so that
    hidden categories are actually tagged with a ``hidden`` key.

    Args:
        page_ids: Iterable of page IDs (at most 50 for regular API clients).
        session: An open ``aiohttp.ClientSession`` (or compatible object).

    Returns:
        A list of dicts, one per existing page, with the keys ``page_id``,
        ``title``, ``url``, ``content_length``, ``num_links``,
        ``num_sections``, ``num_references``, ``all_categories``,
        ``meta_categories``, ``science_categories``, ``num_categories``,
        ``summary_length``, ``avg_sentence_length``, ``last_revision_date``,
        ``num_languages`` and ``languages``. Pages flagged ``missing`` or
        ``invalid`` are skipped. On any failure the error is printed and
        ``[]`` is returned for the whole batch.
    """
    url = "https://en.wikipedia.org/w/api.php"
    headers = {'User-Agent': 'WikipediaBot/1.0 (Educational Project)'}

    # Join page IDs with pipe separator (Wikipedia batch API format)
    ids_param = '|'.join(str(pid) for pid in page_ids)
    if not ids_param:
        return []

    params = {
        'action': 'query',
        'pageids': ids_param,
        'prop': 'info|revisions|links|categories|extracts|langlinks',
        'inprop': 'url',
        'rvprop': 'content|timestamp',
        'rvslots': 'main',
        'pllimit': 'max',
        'cllimit': 'max',
        'clprop': 'hidden',   # needed for the 'hidden' flag on categories
        'lllimit': 'max',
        'exlimit': 'max',     # extracts are capped per request; ask for the most
        'exintro': 'true',
        'explaintext': 'true',
        'format': 'json'
    }

    try:
        merged = {}
        continue_params = {}

        while True:
            async with session.get(
                url,
                params={**params, **continue_params},
                headers=headers,
                timeout=30,
            ) as response:
                response.raise_for_status()
                data = await response.json()

            if 'error' in data:
                error = data['error']
                raise RuntimeError(
                    f"API error {error.get('code')}: {error.get('info')}"
                )

            _merge_pages(merged, data.get('query', {}).get('pages', {}))

            # Keep going until the API stops handing out continuation tokens.
            if 'continue' not in data:
                break
            continue_params = data['continue']

        return [
            _page_to_record(page)
            for page in merged.values()
            # Skip if page doesn't exist
            if 'missing' not in page and 'invalid' not in page
        ]

    except Exception as e:
        print(f"Error fetching batch: {e}")
        return []


async def batch_fetch_metadata(page_ids, batch_size=50, max_concurrency=1, delay=0.5):
    """Fetch metadata for all articles using batched requests.

    ``page_ids`` is cut into consecutive slices of ``batch_size`` and each
    slice is passed to ``fetch_article_data_batch`` over one shared
    ``aiohttp.ClientSession``. A semaphore bounds how many batches are in
    flight, and every batch waits ``delay`` seconds while holding its
    semaphore slot before it is sent, which throttles the request rate.

    Args:
        page_ids: Sequence of page IDs to look up.
        batch_size: Number of page IDs per batch (default 50).
        max_concurrency: Maximum number of batches processed at the same
            time (default 1, i.e. strictly sequential).
        delay: Seconds to pause before each batch (default 0.5); ``0``
            disables the pause.

    Returns:
        A flat list with the records of every batch. Batches are appended in
        completion order, so the result is not guaranteed to follow the order
        of ``page_ids``.

    Raises:
        ValueError: If ``batch_size`` or ``max_concurrency`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be a positive integer")

    from tqdm.auto import tqdm

    all_results = []

    # Split into batches
    batches = [page_ids[i:i + batch_size] for i in range(0, len(page_ids), batch_size)]

    async with aiohttp.ClientSession() as session:
        semaphore = asyncio.Semaphore(max_concurrency)

        async def fetch_batch_with_limit(batch):
            async with semaphore:
                # Sleeping inside the semaphore spaces out consecutive requests.
                if delay > 0:
                    await asyncio.sleep(delay)
                return await fetch_article_data_batch(batch, session)

        tasks = [fetch_batch_with_limit(batch) for batch in batches]

        for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Fetching batches"):
            batch_results = await coro
            all_results.extend(batch_results)

    return all_results


In [None]:
import time
import pandas as pd

# Load page IDs from CSV
df = pd.read_csv('article_names.csv')
page_ids = df['page_id'].dropna().astype(int).tolist()

print(f"Fetching metadata for {len(page_ids)} articles...")

# Run async code
start_time = time.perf_counter()
article_data = await batch_fetch_metadata(page_ids)
end_time = time.perf_counter()

print(f"\nCompleted in {end_time - start_time:.1f} seconds")
print(f"Successfully fetched: {len(article_data)} articles")
print(f"Failed/missing: {len(page_ids) - len(article_data)} articles")

Fetching metadata for 22633 articles...


Fetching batches:   0%|          | 0/453 [00:00<?, ?it/s]


Completed in 478.8 seconds
Successfully fetched: 22633 articles
Failed/missing: 0 articles


In [None]:
import aiohttp
import asyncio
import json

# Test with first 3 page IDs
# `page_ids` must already exist in the kernel (it is built by the cell that
# reads article_names.csv); slicing keeps the debugging request small.
test_ids = page_ids[:3]
print(f"Testing with page IDs: {test_ids}")

async def test_api_response():
    """Send one raw metadata query for ``test_ids`` and return the decoded JSON.

    Debugging helper: it issues the same kind of ``action=query`` request as
    the batch fetcher, but returns the unprocessed response so its structure
    can be inspected. Hidden categories are tagged through ``clprop=hidden``;
    the earlier ``clshow='!hidden|hidden'`` was rejected by the API because
    the two values are mutually exclusive.

    Returns:
        The parsed JSON body of the API response (a dict).
    """
    url = "https://en.wikipedia.org/w/api.php"
    headers = {'User-Agent': 'WikipediaBot/1.0 (Educational Project)'}
    # Reads the module-level `test_ids` defined just before this function.
    ids_param = '|'.join(str(pid) for pid in test_ids)

    params = {
        'action': 'query',
        'pageids': ids_param,
        'prop': 'info|revisions|links|categories|extracts|langlinks',
        'inprop': 'url',
        'rvprop': 'content|timestamp',
        'rvslots': 'main',
        'pllimit': 'max',
        'cllimit': 'max',
        'clprop': 'hidden',  # adds a 'hidden' key to hidden categories
        'lllimit': 'max',
        'exintro': 'true',
        'explaintext': 'true',
        'format': 'json'
    }

    async with aiohttp.ClientSession() as session:
        async with session.get(url, params=params, headers=headers, timeout=30) as response:
            return await response.json()

raw_response = await test_api_response()
print(f"\nResponse keys: {raw_response.keys()}")

if 'error' in raw_response:
    print(f"\nERROR from Wikipedia API:")
    print(json.dumps(raw_response['error'], indent=2))
elif 'query' in raw_response:
    print(f"Query keys: {raw_response['query'].keys()}")
    if 'pages' in raw_response['query']:
        print(f"Number of pages: {len(raw_response['query']['pages'])}")
        print(f"\nFirst page:")
        first_page = list(raw_response['query']['pages'].values())[0]
        print(json.dumps(first_page, indent=2)[:1000])  # First 1000 chars

Testing with page IDs: [175149, 1215764, 37376135]

Response keys: dict_keys(['error', 'servedby'])

ERROR from Wikipedia API:
{
  "code": "show",
  "info": "Incorrect parameter - mutually exclusive values may not be supplied.",
  "*": "See https://en.wikipedia.org/w/api.php for API usage. Subscribe to the mediawiki-api-announce mailing list at &lt;https://lists.wikimedia.org/postorius/lists/mediawiki-api-announce.lists.wikimedia.org/&gt; for notice of API deprecations and breaking changes."
}


In [None]:
# Build one row per fetched article from the list of record dicts and persist it.
# NOTE: the list-valued columns (all_categories, meta_categories,
# science_categories, languages) are written to CSV as their string
# representation, so they come back as plain strings when the file is re-read.
df = pd.DataFrame(article_data)
df.to_csv('article_data.csv', index=False)

In [None]:
# Keyword sets for spotting non-scientific articles (people, letter articles,
# events, organizations).
# NOTE(review): presumably matched against category names in a later filtering step — confirm.
people_keywords = {'births', 'deaths', 'people', 'living'}
letter_keywords = {'letters', 'letter', 'alphabet'}
event_keywords = {'events', 'battles', 'wars', 'conflicts', 'disasters', 'treaties'}
# Both the American and British spellings of "organizations" are included.
org_keywords = {'organizations', 'organisations', 'companies', 'institutions', 'universities', 'agencies'}