In [None]:
import pandas as _hex_pandas
import datetime as _hex_datetime
import json as _hex_json

In [None]:
# Decoded from the JSON literal `false`, i.e. the Python value False.
hex_scheduled = _hex_json.loads('false')

In [None]:
# Decoded from a JSON string literal; the result is a plain Python str.
hex_user_email = _hex_json.loads('"example-user@example.com"')

In [None]:
# Decoded from an empty JSON object, i.e. an empty dict.
hex_user_attributes = _hex_json.loads('{}')

In [None]:
# Decoded from a JSON string literal; the value here is "logic".
hex_run_context = _hex_json.loads('"logic"')

In [None]:
# Decoded from a JSON string literal; the value here is "UTC".
hex_timezone = _hex_json.loads('"UTC"')

In [None]:
# Decoded from a JSON string literal holding the project identifier.
hex_project_id = _hex_json.loads('"019bae67-af4d-7000-baed-c8d253b14659"')

In [None]:
# Decoded from a JSON string literal holding the project name.
hex_project_name = _hex_json.loads('"Creating the dataset"')

In [None]:
# Decoded from an empty JSON string, i.e. the empty Python str.
hex_status = _hex_json.loads('""')

In [None]:
# Decoded from an empty JSON array, i.e. an empty list.
hex_categories = _hex_json.loads('[]')

In [None]:
# Decoded from a JSON array of ten hex colour strings; adjacent string
# literals are concatenated by Python into the same single JSON document.
hex_color_palette = _hex_json.loads(
    '["#4C78A8","#F58518","#E45756","#72B7B2","#54A24B",'
    '"#EECA3B","#B279A2","#FF9DA6","#9D755D","#BAB0AC"]'
)

## Getting the article names

API calls are expensive, so to save time I first collected only the article names.



In [None]:
import requests

def extract_articles_from_list(page_title):
    """Return the main-namespace article titles linked from a Wikipedia page.

    Calls the English Wikipedia ``action=parse`` endpoint with ``prop=links``
    for ``page_title`` and keeps only links whose namespace (``ns``) is 0.

    Args:
        page_title: Title of the page whose outgoing links should be listed.

    Returns:
        A list of linked article titles. An empty list is returned when the
        request fails, the server answers with an HTTP error status, the API
        reports an error, or the response carries no link data. Failures are
        printed rather than raised so that a batch over many pages keeps going.
    """
    url = "https://en.wikipedia.org/w/api.php"
    headers = {'User-Agent': 'WikipediaBot/1.0 (Educational Project)'}
    params = {
        'action': 'parse',
        'page': page_title,
        'prop': 'links',
        'format': 'json'
    }

    # Deliberately broad: this is a best-effort step and one bad page must not
    # abort the whole crawl. The error is reported, not swallowed.
    try:
        response = requests.get(url, params=params, headers=headers, timeout=10)
        # Treat 4xx/5xx answers (rate limiting, server errors) as failures
        # instead of trying to interpret their body as a normal payload.
        response.raise_for_status()
        data = response.json()

        # The API reports problems such as an unknown page title in an
        # 'error' object; say so instead of silently returning nothing.
        if 'error' in data:
            info = data['error'].get('info', data['error'])
            print(f"  Error extracting from {page_title}: {info}")
            return []

        links = data.get('parse', {}).get('links', [])
        # Namespace 0 = main articles. Entries without a '*' title are skipped
        # so a single malformed entry cannot discard the rest of the page.
        return [link['*'] for link in links
                if link.get('ns') == 0 and '*' in link]

    except Exception as e:
        print(f"  Error extracting from {page_title}: {e}")
        return []


In [None]:
# Accumulator for every article title found; a set drops titles that are
# linked from more than one list page.
articles = set()

# Each non-blank line of lists.txt is taken as the title of one list page
with open('lists.txt', 'r', encoding='utf-8') as f:
    list_pages = [title for title in (raw.strip() for raw in f) if title]

for page in list_pages:
    articles |= set(extract_articles_from_list(page))

# From here on `articles` is a sorted list rather than a set
articles = sorted(articles)

# One title per line, no trailing newline
with open('article_names.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(articles))

## Creating the dataset



### Simplified version



In [None]:
import aiohttp
import asyncio

def _merge_page_batch(merged, page):
    """Fold one API response's page object into the accumulated page dict."""
    for key, value in page.items():
        if key in ('links', 'categories'):
            # List-valued props arrive in batches; concatenate them.
            merged.setdefault(key, []).extend(value)
        else:
            # Everything else (title, fullurl, revisions, extract, ...) is
            # kept from the first response that carries it.
            merged.setdefault(key, value)


def _build_article_record(page):
    """Flatten one merged API page object into the dataset row dict."""
    revisions = page.get('revisions') or []
    content = ''
    if revisions:
        content = revisions[0].get('slots', {}).get('main', {}).get('*', '')

    summary = page.get('extract', '')

    # Rough approximation: sentences are whatever lies between full stops.
    sentences = [s.strip() for s in summary.split('.') if s.strip()]
    avg_sentence_length = (
        sum(len(s.split()) for s in sentences) / len(sentences) if sentences else 0
    )

    return {
        'title': page.get('title', ''),
        'url': page.get('fullurl', ''),
        'content_length': len(content),
        'num_links': len(page.get('links', [])),
        # Heading lines in wikitext start with '=='; this counts every
        # heading level, since '===' also begins with '=='.
        'num_sections': content.count('\n=='),
        # Counts opening <ref ...> tags; closing </ref> tags do not match.
        'num_references': content.count('<ref'),
        'num_categories': len(page.get('categories', [])),
        'summary_length': len(summary),
        'avg_sentence_length': round(avg_sentence_length, 2)
    }


async def fetch_article_data(article_title, session):
    """Fetch one article from the MediaWiki API and summarise it as a flat dict.

    Requests page info, the latest revision's wikitext, outgoing links,
    categories and the plain-text intro extract for ``article_title``. When the
    API answers with a ``continue`` block (more links or categories than fit in
    one response), the follow-up requests are issued and the batches merged, so
    ``num_links`` and ``num_categories`` are not capped at a single batch.

    Args:
        article_title: Title of the article to query.
        session: An open ``aiohttp.ClientSession`` (or compatible object) used
            for the HTTP requests.

    Returns:
        A dict with the keys ``title``, ``url``, ``content_length``,
        ``num_links``, ``num_sections``, ``num_references``,
        ``num_categories``, ``summary_length`` and ``avg_sentence_length``;
        or ``None`` when the page is missing/invalid, the response has no page
        data, or any error occurs (the error is printed, not raised).
    """
    api_url = "https://en.wikipedia.org/w/api.php"
    headers = {'User-Agent': 'WikipediaBot/1.0 (Educational Project)'}

    # NOTE(review): no 'redirects' parameter is sent, so a title that is a
    # redirect presumably yields the redirect stub's wikitext - confirm
    # whether that is intended for this dataset.
    params = {
        'action': 'query',
        'titles': article_title,
        'prop': 'info|revisions|links|categories|extracts',
        'inprop': 'url',
        'rvprop': 'content|timestamp',
        'rvslots': 'main',
        'pllimit': 'max',
        'cllimit': 'max',
        'exintro': 'true',
        'explaintext': 'true',
        'format': 'json'
    }

    # Deliberately broad: one failing article must not abort a crawl over
    # thousands of titles. The error is reported and the article skipped.
    try:
        # aiohttp expects a ClientTimeout object; 10 s total per request.
        request_timeout = aiohttp.ClientTimeout(total=10)

        merged_page = {}
        continuation = {}
        while True:
            async with session.get(
                api_url,
                params={**params, **continuation},
                headers=headers,
                timeout=request_timeout,
            ) as response:
                # Fail on 4xx/5xx instead of parsing an error body as data.
                response.raise_for_status()
                data = await response.json()

            pages = (data.get('query') or {}).get('pages') or {}
            page = next(iter(pages.values()), None)
            if page is None:
                if not merged_page:
                    return None
                # A follow-up response without page data: keep what we have.
                break

            # Skip if page doesn't exist
            if 'missing' in page or 'invalid' in page:
                return None

            _merge_page_batch(merged_page, page)

            # The API signals remaining batches through a 'continue' object
            # whose entries must be sent back verbatim with the next request.
            continuation = data.get('continue')
            if not continuation:
                break

        return _build_article_record(merged_page)

    except Exception as e:
        print(f"Error fetching {article_title}: {e}")
        return None


async def fetch_all_metadata(article_names, max_concurrent=15):
    """Fetch metadata for all articles with bounded concurrency.

    Runs ``fetch_article_data`` for every title over one shared
    ``aiohttp.ClientSession`` and shows a progress bar as requests complete.

    Args:
        article_names: Sequence of article titles to fetch.
        max_concurrent: Maximum number of requests in flight at once
            (default 15).

    Returns:
        A list of the truthy results of ``fetch_article_data``, in the same
        order as ``article_names``. Articles whose fetch returned ``None`` are
        omitted. Ordering by input position rather than by completion time
        keeps the output identical between runs.

    Raises:
        ValueError: If ``max_concurrent`` is smaller than 1, since a semaphore
            with no free slots would never let a request start.
    """
    # Imported here, as in the original cell, so tqdm is only needed when
    # this function is actually run.
    from tqdm.auto import tqdm

    if max_concurrent < 1:
        raise ValueError("max_concurrent must be at least 1")

    indexed_results = []

    async with aiohttp.ClientSession() as session:
        semaphore = asyncio.Semaphore(max_concurrent)

        async def fetch_with_limit(position, article):
            # Carry the input position along so order can be restored later.
            async with semaphore:
                return position, await fetch_article_data(article, session)

        tasks = [
            fetch_with_limit(position, article)
            for position, article in enumerate(article_names)
        ]

        # as_completed yields in completion order, which drives the progress bar
        for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Fetching"):
            position, result = await coro
            if result:
                indexed_results.append((position, result))

    indexed_results.sort(key=lambda pair: pair[0])
    return [result for _, result in indexed_results]

In [None]:
import time

# Load article names
with open('article_names.txt', 'r', encoding='utf-8') as f:
    article_names = [line.strip() for line in f if line.strip()]

print(f"Fetching metadata for {len(article_names)} articles...")

# Run async code
start_time = time.perf_counter()
article_data = await fetch_all_metadata(article_names)
end_time = time.perf_counter()

print(f"\nCompleted in {end_time - start_time:.1f} seconds")
print(f"Successfully fetched: {len(article_data)} articles")
print(f"Failed/missing: {len(article_names) - len(article_data)} articles")

Fetching metadata for 23633 articles...
