In [None]:
import pandas as _hex_pandas
import datetime as _hex_datetime
import json as _hex_json

In [None]:
# Decodes the JSON literal `false`, so this is the Python bool False.
# NOTE(review): presumably a Hex-injected flag for scheduled runs — confirm.
hex_scheduled = _hex_json.loads("false")

In [None]:
# Decodes a JSON string into a Python str holding an e-mail address.
hex_user_email = _hex_json.loads("\"example-user@example.com\"")

In [None]:
# Decodes an empty JSON object, so this is an empty dict.
hex_user_attributes = _hex_json.loads("{}")

In [None]:
# Decodes a JSON string into the Python str "logic".
hex_run_context = _hex_json.loads("\"logic\"")

In [None]:
# Decodes a JSON string into the Python str "UTC".
hex_timezone = _hex_json.loads("\"UTC\"")

In [None]:
# Decodes a JSON string into a Python str holding a UUID-formatted identifier.
hex_project_id = _hex_json.loads("\"019bae67-af4d-7000-baed-c8d253b14659\"")

In [None]:
# Decodes a JSON string into the Python str "Creating the dataset".
hex_project_name = _hex_json.loads("\"Creating the dataset\"")

In [None]:
# Decodes an empty JSON string, so this is the empty str "".
hex_status = _hex_json.loads("\"\"")

In [None]:
# Decodes an empty JSON array, so this is an empty list.
hex_categories = _hex_json.loads("[]")

In [None]:
# Decodes a JSON array into a list of ten "#RRGGBB" colour strings.
hex_color_palette = _hex_json.loads("[\"#4C78A8\",\"#F58518\",\"#E45756\",\"#72B7B2\",\"#54A24B\",\"#EECA3B\",\"#B279A2\",\"#FF9DA6\",\"#9D755D\",\"#BAB0AC\"]")

The list pages overlap heavily, so the first step is to extract just the article titles and page IDs and remove the duplicates.



In [None]:
# This is for reference, Claude. Do not execute this cell; just read it.
# The snippet below illustrates how the other columns are derived. Apply this
# context to the next cell.
# NOTE: `page` is not defined in this cell and the `return` sits outside any
# function, so this is a fragment that cannot run on its own.

# Categories: every category name with the "Category:" prefix removed;
# entries carrying a 'hidden' key are additionally collected as meta categories.
all_cats = []
meta_cats = []
if 'categories' in page:
    for cat in page['categories']:
        cat_name = cat['title'].replace('Category:', '')
        all_cats.append(cat_name)
        if 'hidden' in cat:
            meta_cats.append(cat_name)

# Revisions: count of returned revisions and the timestamp of the first one
# (None when no revision is present).
num_revs = len(page.get('revisions', []))
last_rev = page.get('revisions', [{}])[0].get('timestamp', None) if num_revs > 0 else None

# Languages: language code of every langlink entry.
langs = [ll['lang'] for ll in page.get('langlinks', [])]

# 'science_categories' is simply all categories minus the hidden/meta ones;
# 'rating' is left as None in this snippet.
return {
    'all_categories': all_cats,
    'meta_categories': meta_cats,
    'science_categories': [c for c in all_cats if c not in meta_cats],
    'last_revision_date': last_rev,
    'num_languages': len(langs),
    'languages': langs,
    'rating': None
}


In [None]:
import requests

def extract_articles_from_list(page_title):
    """Extract article links with page IDs from a Wikipedia list page.

    Uses the MediaWiki `generator=links` query restricted to namespace 0 and
    follows the API's `continue` object until every result page is read.

    Args:
        page_title: Title of the list page whose outgoing links are wanted.

    Returns:
        A list of ``(title, pageid)`` tuples. Linked pages without a
        ``pageid`` (e.g. links to pages that do not exist) are left out.
        If a request fails part-way through, the error is printed and the
        articles collected *before* the failure are still returned; the list
        is empty only when nothing could be fetched.
    """
    url = "https://en.wikipedia.org/w/api.php"
    headers = {'User-Agent': 'WikipediaBot/1.0 (Educational Project)'}
    base_params = {
        'action': 'query',
        'generator': 'links',
        'titles': page_title,
        'gpllimit': 'max',
        'gplnamespace': 0,
        'format': 'json'
    }

    all_articles = []
    continue_params = {}

    try:
        while True:
            # Continuation values must be sent back alongside the original query.
            response = requests.get(
                url,
                params={**base_params, **continue_params},
                headers=headers,
                timeout=10,
            )
            # Surface HTTP failures (429, 5xx, ...) as a clear error instead of
            # a confusing JSON decoding failure on an HTML error page.
            response.raise_for_status()
            data = response.json()

            # The API reports bad requests inside a normal 200 response.
            if 'error' in data:
                raise RuntimeError(f"API error: {data['error']}")

            pages = data.get('query', {}).get('pages', {})
            all_articles.extend(
                (page.get('title'), page.get('pageid'))
                for page in pages.values()
                if page.get('pageid') is not None
            )

            # Check for continuation
            if 'continue' not in data:
                break
            continue_params = data['continue']

    except Exception as e:
        # Best effort: report the problem but keep whatever was already read,
        # so one flaky request does not throw away earlier result pages.
        print(f"  Error extracting from {page_title}: {e}")
        if all_articles:
            print(f"  Keeping {len(all_articles)} articles fetched before the failure")

    return all_articles

In [None]:
import csv

# Read the list-page titles, one per line, ignoring blank lines.
list_pages = []
with open('lists.txt', 'r', encoding='utf-8') as f:
    for raw_line in f:
        stripped = raw_line.strip()
        if stripped:
            list_pages.append(stripped)

# Gather the (title, page_id) pairs linked from every list page.
articles = []
for page in list_pages:
    articles += extract_articles_from_list(page)

# Remove duplicates (by title), keeping the first occurrence of each title.
seen_titles = set()
unique_articles = []
for entry in articles:
    entry_title, entry_page_id = entry
    if entry_title in seen_titles:
        continue
    seen_titles.add(entry_title)
    unique_articles.append((entry_title, entry_page_id))

unique_articles = sorted(unique_articles, key=lambda pair: pair[0])

# Save to CSV: a header row followed by one row per unique article.
with open('article_names.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows([['title', 'page_id'], *unique_articles])

print(f"Saved {len(unique_articles)} unique articles to article_names.csv")

Saved 22633 unique articles to article_names.csv


In [None]:
import requests
import time

def validate_article_data(article):
    """Tell whether a fetched article record is complete enough to keep.

    A record passes only when all three hold:
      * 'all_categories' is present and non-empty (an empty list is treated
        as a failed fetch),
      * 'content_length' is not 0 (a missing key counts as 0),
      * 'title' is present and truthy.

    Returns:
        True when the record passes every check, otherwise False.
    """
    categories = article.get('all_categories')
    has_categories = bool(categories) and len(categories) != 0
    has_content = article.get('content_length', 0) != 0
    has_title = bool(article.get('title'))

    return has_categories and has_content and has_title


# Talk-page assessment category prefixes, checked in this order.
_RATING_CLASS_PREFIXES = ('fa-class', 'a-class', 'ga-class', 'b-class',
                          'c-class', 'start-class', 'stub-class')


class _HTTPStatusError(Exception):
    """Raised when the API answers with a status code other than 200."""


def _query_all_pages(url, params, headers, timeout=30):
    """Run one `action=query` request to completion and return the merged pages.

    The API signals that a response was truncated with a top-level
    ``continue`` object. This helper keeps re-sending the query together with
    those continuation values and merges every partial page record into one
    dict keyed like ``query.pages``: list-valued fields are concatenated,
    everything else is overwritten by the latest response.

    Raises:
        _HTTPStatusError: if any response has a status code other than 200.
    """
    merged = {}
    continuation = {}
    while True:
        response = requests.get(url, params={**params, **continuation},
                                headers=headers, timeout=timeout)
        if response.status_code != 200:
            raise _HTTPStatusError(f"HTTP {response.status_code}")
        data = response.json()

        for key, partial in data.get('query', {}).get('pages', {}).items():
            page = merged.setdefault(key, {})
            for field, value in partial.items():
                if isinstance(value, list) and isinstance(page.get(field), list):
                    page[field].extend(value)
                elif isinstance(value, list):
                    page[field] = list(value)
                else:
                    page[field] = value

        if 'continue' not in data:
            return merged
        continuation = data['continue']


def _rating_from_talk_page(talk_page):
    """Return the assessment class (e.g. 'GA') from a talk page's categories, or None."""
    for cat in talk_page.get('categories', []):
        cat_lower = cat['title'].replace('Category:', '').lower()
        if '-class' in cat_lower and 'articles' in cat_lower:
            for prefix in _RATING_CLASS_PREFIXES:
                if cat_lower.startswith(prefix):
                    return prefix.replace('-class', '').upper()
    return None


def _build_article_record(page, talk_page):
    """Turn one merged API page (plus its talk page, if any) into a flat record."""
    revisions = page.get('revisions', [])
    content = revisions[0].get('slots', {}).get('main', {}).get('*', '') if revisions else ''

    # Categories: strip the namespace prefix; entries flagged 'hidden' are meta.
    all_cats = []
    meta_cats = []
    for cat in page.get('categories', []):
        cat_name = cat['title'].replace('Category:', '')
        all_cats.append(cat_name)
        if 'hidden' in cat:
            meta_cats.append(cat_name)

    langs = [ll['lang'] for ll in page.get('langlinks', [])]

    summary = page.get('extract', '')
    sentences = [s.strip() for s in summary.split('.') if s.strip()]
    avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences) if sentences else 0

    return {
        'page_id': page.get('pageid', None),
        'title': page.get('title', ''),
        'url': page.get('fullurl', ''),
        'content_length': len(content),
        'num_links': len(page.get('links', [])),
        # Section headings start a line with '=='; references open with '<ref'.
        'num_sections': content.count('\n=='),
        'num_references': content.count('<ref'),
        'all_categories': all_cats,
        'meta_categories': meta_cats,
        'science_categories': [c for c in all_cats if c not in meta_cats],
        'num_categories': len(all_cats),
        'summary_length': len(summary),
        'avg_sentence_length': round(avg_sentence_length, 2),
        'last_revision_date': revisions[0].get('timestamp', None) if revisions else None,
        'num_languages': len(langs),
        'languages': langs,
        'rating': _rating_from_talk_page(talk_page) if talk_page else None
    }


def fetch_article_data_batch(page_ids, retry_count=0):
    """Fetch metadata for up to 50 articles in a single batched query (synchronous).

    Two queries are made per batch: one for the articles themselves and one
    for their talk pages (used to read the quality rating). Both queries are
    followed through the API's continuation mechanism, so categories, links,
    language links, extracts and talk-page categories are complete for every
    page in the batch rather than only for the pages that fit in the first
    response.

    Args:
        page_ids: Page IDs to fetch (ints or strings).
        retry_count: Number of retries already used; callers normally leave
            this at 0. A failed attempt is retried up to 3 times with
            exponential backoff (1s, 2s, 4s).

    Returns:
        A list with one entry per existing page, in API response order (which
        is not necessarily the order of ``page_ids``): a record dict when the
        page passes ``validate_article_data``, otherwise ``None``. Missing or
        invalid pages are skipped entirely. An empty list is returned when
        the batch yields no pages or still fails after the retries.
    """
    url = "https://en.wikipedia.org/w/api.php"
    # NOTE(review): a generic browser User-Agent may conflict with the
    # Wikimedia User-Agent policy — consider an identifying string.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

    if not page_ids:
        return []

    params = {
        'action': 'query',
        # Join page IDs with pipe separator (Wikipedia batch API format)
        'pageids': '|'.join(str(pid) for pid in page_ids),
        'prop': 'info|revisions|links|categories|extracts|langlinks',
        'inprop': 'url',
        'rvprop': 'content|timestamp',
        'rvslots': 'main',
        'pllimit': 'max',
        'cllimit': 'max',
        'clprop': 'hidden',
        'lllimit': 'max',
        'exlimit': 'max',
        'exintro': 'true',
        'explaintext': 'true',
        'format': 'json'
    }

    try:
        pages = _query_all_pages(url, params, headers)
        if not pages:
            return []

        # Rate limiting between main request and talk page request
        time.sleep(0.5)

        # Now fetch talk pages for ratings
        titles = [page['title'] for page in pages.values() if 'title' in page]
        talk_by_title = {}
        if titles:
            talk_params = {
                'action': 'query',
                'titles': '|'.join(f"Talk:{title}" for title in titles),
                'prop': 'categories',
                'cllimit': 'max',
                'format': 'json'
            }
            for talk_page in _query_all_pages(url, talk_params, headers).values():
                talk_title = talk_page.get('title', '')
                if talk_title.startswith('Talk:'):
                    talk_by_title[talk_title.replace('Talk:', '', 1)] = talk_page

        results = []
        for page in pages.values():
            # Skip if page doesn't exist
            if 'missing' in page or 'invalid' in page:
                continue

            article = _build_article_record(page, talk_by_title.get(page.get('title', '')))
            # Records that fail validation are reported as None so the caller
            # can retry them.
            results.append(article if validate_article_data(article) else None)

        return results

    except Exception as e:
        detail = str(e)
        label = detail if isinstance(e, _HTTPStatusError) else f"Exception: {detail}"
        if retry_count < 3:
            print(f"  Retry {retry_count + 1}/3 for batch ({label})")
            time.sleep(2 ** retry_count)  # Exponential backoff
            return fetch_article_data_batch(page_ids, retry_count + 1)
        print(f"  ERROR: {detail} - batch failed after 3 retries")
        return []


def batch_fetch_metadata(page_ids, batch_size=50):
    """Fetch metadata for all articles using batched synchronous requests.

    The IDs are split into batches of ``batch_size`` and fetched with
    ``fetch_article_data_batch``. Any requested ID that did not come back as a
    valid record is retried once on its own.

    Failed IDs are worked out by comparing the requested IDs with the
    ``page_id`` of each returned record, not by list position: the batch
    result skips missing pages and follows API response order, so its indexes
    do not line up with the batch. This also means a batch that fails
    completely (empty result) has all of its IDs retried instead of being
    dropped silently.

    Args:
        page_ids: Sequence of page IDs to fetch.
        batch_size: Number of IDs per batched request (default 50).

    Returns:
        A list of article record dicts: batch successes first, followed by
        records recovered during the individual retries.
    """
    from tqdm.auto import tqdm

    all_results = []
    failed_page_ids = []

    # Split into batches
    batches = [page_ids[i:i+batch_size] for i in range(0, len(page_ids), batch_size)]

    for batch in tqdm(batches, desc="Fetching batches"):
        batch_results = fetch_article_data_batch(batch)

        valid_results = [result for result in batch_results if result is not None]
        all_results.extend(valid_results)

        # Compare as strings so int and str page IDs are treated alike.
        fetched_ids = {str(result.get('page_id')) for result in valid_results}
        failed_page_ids.extend(pid for pid in batch if str(pid) not in fetched_ids)

        # Rate limiting between batches
        time.sleep(1.0)

    # Retry failed articles individually
    if failed_page_ids:
        print(f"\n\nRetrying {len(failed_page_ids)} failed articles individually...")
        for page_id in tqdm(failed_page_ids, desc="Retrying failed articles"):
            retry_results = fetch_article_data_batch([page_id])
            if retry_results and retry_results[0] is not None:
                all_results.append(retry_results[0])
            else:
                print(f"  PERMANENTLY FAILED: page_id {page_id}")
            time.sleep(0.5)

    return all_results


In [None]:
import time
import pandas as pd

# Load page IDs from CSV
# (rows without a page_id are dropped before the values are cast to int)
df = pd.read_csv('article_names.csv')
page_ids = df['page_id'].dropna().astype(int).tolist()

print(f"Fetching metadata for {len(page_ids)} articles...")

# Run synchronous code, timing the whole fetch from start to finish
start_time = time.perf_counter()
article_data = batch_fetch_metadata(page_ids)
end_time = time.perf_counter()

print(f"\nCompleted in {end_time - start_time:.1f} seconds")
print(f"Successfully fetched: {len(article_data)} articles")
# Every requested ID that did not come back as a record counts as failed/missing
print(f"Failed/missing: {len(page_ids) - len(article_data)} articles")


Fetching metadata for 22633 articles...


Fetching batches:   0%|          | 0/453 [00:00<?, ?it/s]

In [None]:
# Turn the fetched records into a DataFrame and persist it without the index.
# Note: this rebinds `article_data` from the fetched records to a DataFrame.
article_data = pd.DataFrame(article_data)
article_data.to_csv('article_data.csv', index=False)

In [None]:
# Lowercase keyword sets, one per kind of article to exclude.
# They are combined into `all_filter_keywords` further down and matched
# against lowercased category names.
people_keywords = {'births', 'deaths', 'people', 'living'}
letter_keywords = {'letters', 'letter', 'alphabet'}
event_keywords = {'events', 'battles', 'wars', 'conflicts', 'disasters', 'treaties', 'history'}
org_keywords = {'organizations', 'organisations', 'companies', 'institutions', 'universities', 'agencies'}
other_keywords = { 'book'}

In [None]:
# Load the saved article data to examine structure
import pandas as pd
# Re-read the saved CSV and print its shape, column names and the first two
# values of the categories column, to see how that column is stored on disk.
df_check = pd.read_csv('article_data.csv')
print(f"Shape: {df_check.shape}")
print(f"\nColumns: {df_check.columns.tolist()}")
print(f"\nSample of categories column:")
print(df_check['all_categories'].head(2))


In [None]:
import ast

def should_filter_article(categories_str, filter_keywords):
    """Check if any category contains any of the filter keywords.

    Matching is by substring against the lowercased category name, so the
    keyword 'agencies' matches 'Space agencies' as well as 'Subagencies'.

    Args:
        categories_str: The categories of one article, either as the string
            form of a Python list (as stored in the CSV, e.g.
            "['1950 births', 'Physicists']") or as an already-parsed
            list/tuple/set. Missing values (None/NaN) are accepted.
        filter_keywords: Iterable of lowercase keywords to look for.

    Returns:
        True if the article should be REMOVED, False otherwise. Missing,
        empty or unparseable input returns False, and entries that are not
        strings are ignored.
    """
    collection_types = (list, tuple, set, frozenset)

    if isinstance(categories_str, collection_types):
        categories = categories_str
    else:
        if pd.isna(categories_str) or categories_str == '[]':
            return False

        # Parse the string representation of list into actual list.
        # Only the errors literal_eval raises for malformed input are treated
        # as "unparseable"; anything else (e.g. KeyboardInterrupt) propagates.
        try:
            categories = ast.literal_eval(categories_str)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return False

        if not isinstance(categories, collection_types):
            return False

    # Check each category against all filter keywords
    for category in categories:
        if not isinstance(category, str):
            continue
        category_lower = category.lower()
        if any(keyword in category_lower for keyword in filter_keywords):
            return True

    return False

# Merge every keyword group into the single set used by the filter.
all_filter_keywords = set().union(
    people_keywords,
    letter_keywords,
    event_keywords,
    org_keywords,
    other_keywords,
)

# Report how many keywords there are and list them alphabetically.
print(f"Total filter keywords: {len(all_filter_keywords)}")
print(f"Keywords: {sorted(all_filter_keywords)}")


In [None]:
# Read the full article table written by the metadata fetch.
df = pd.read_csv('article_data.csv')
total_articles = len(df)

print(f"Original dataset: {total_articles} articles")

# Flag every article whose categories hit one of the filter keywords and
# store the flag as a 'should_filter' column.
removal_flags = df['all_categories'].apply(
    should_filter_article, args=(all_filter_keywords,)
)
df = df.assign(should_filter=removal_flags)

# Filtering statistics
filtered_out = df['should_filter'].sum()
print(f"Articles to filter out: {filtered_out} ({filtered_out/total_articles*100:.1f}%)")
print(f"Articles remaining: {total_articles - filtered_out}")

# Keep only the unflagged rows; the helper column is not part of the output.
df_clean = df.loc[~df['should_filter']].drop(columns=['should_filter'])

# Write the filtered table to disk.
df_clean.to_csv('article_data_filtered.csv', index=False)
print(f"\nSaved filtered dataset to 'article_data_filtered.csv'")


In [None]:
# Start from the dataset that already went through category filtering.
df_filtered = pd.read_csv('article_data_filtered.csv')
filtered_total = len(df_filtered)

print(f"After category filtering: {filtered_total:,} articles")

if 'rating' in df_filtered.columns:
    # Keep only the rows that carry a rating.
    has_rating = df_filtered['rating'].notna()
    df_with_ratings = df_filtered[has_rating].copy()
    rated_total = len(df_with_ratings)

    print(f"\nArticles WITH ratings: {rated_total:,} ({rated_total/filtered_total*100:.1f}%)")
    print(f"Articles WITHOUT ratings: {(~has_rating).sum():,}")

    # Per-rating counts, ordered by rating label.
    print(f"\nRating distribution:")
    rating_counts = df_with_ratings['rating'].value_counts().sort_index()
    for rating, count in rating_counts.items():
        print(f"  {rating}: {count:,}")

    # Persist the rated subset as the final dataset.
    df_with_ratings.to_csv('article_data_final.csv', index=False)
    print(f"\n✓ Saved final dataset: {rated_total:,} articles → 'article_data_final.csv'")
else:
    # Without a rating column there is nothing to filter on.
    print("\n⚠️  ERROR: 'rating' column not found in dataset")
    print("The metadata fetch may not have completed successfully.")


In [None]:
import pandas as pd

# Previous ML-ready dataset and the dataset produced by this notebook.
df_old = pd.read_csv('wikipedia_dataset_ml_ready.csv')
df_new = pd.read_csv('article_data_final.csv')

separator = "=" * 70

# Same summary (row count, columns, shape) for both datasets, old first.
dataset_summaries = (
    ("OLD DATASET (wikipedia_dataset_ml_ready.csv):", df_old),
    ("\n\nNEW DATASET (article_data_final.csv):", df_new),
)
for heading, frame in dataset_summaries:
    print(heading)
    print(separator)
    print(f"Articles: {len(frame):,}")
    print(f"Columns: {frame.columns.tolist()}")
    print(f"\nShape: {frame.shape}")

print("\n\nDIFFERENCE:")
print(separator)
print(f"Article count difference: {len(df_old) - len(df_new):,}")

# Rating distribution of each dataset, shown only where the column exists.
rating_reports = (
    (f"\nOld dataset rating distribution:", df_old),
    (f"\nNew dataset rating distribution:", df_new),
)
for heading, frame in rating_reports:
    if 'rating' in frame.columns:
        print(heading)
        print(frame['rating'].value_counts().sort_index())


In [None]:
import ast

# Load the filtered dataset
df_final = pd.read_csv('article_data_final.csv')

# Collect every article that still has a category containing 'agencies',
# remembering the first such category for each.
articles_with_agencies = []

for title, raw_categories in zip(df_final['title'], df_final['all_categories']):
    try:
        categories = ast.literal_eval(raw_categories)
        first_match = next(
            (cat for cat in categories if 'agencies' in cat.lower()),
            None,
        )
    except (ValueError, SyntaxError, TypeError, AttributeError):
        # Rows whose categories cannot be parsed (e.g. NaN) or that hold
        # non-string entries are skipped; other errors are allowed to surface.
        continue

    if first_match is not None:
        articles_with_agencies.append({
            'title': title,
            'category': first_match
        })

print(f"Articles with 'agencies' in categories: {len(articles_with_agencies)}")
print("\nExamples:")
for item in articles_with_agencies[:10]:
    print(f"  - {item['title']}")
    print(f"    Category: {item['category']}")


In [None]:
import ast

# Trace one specific article (page_id 1851775) through the three saved
# datasets, from the most filtered to the least filtered, to find out at
# which stage it was dropped and whether 'agencies' appears in its categories.

# Load the final dataset
df_final = pd.read_csv('article_data_final.csv')

# Find ANSMET article
ansmet = df_final[df_final['page_id'] == 1851775]

if len(ansmet) > 0:
    # Still present after both filters: list its categories and flag any
    # that contain 'agencies'.
    print(f"FOUND: {ansmet.iloc[0]['title']}")
    print(f"Page ID: {ansmet.iloc[0]['page_id']}")
    print(f"\nAll categories:")
    print("=" * 70)

    try:
        # The categories column holds the string form of a list.
        categories = ast.literal_eval(ansmet.iloc[0]['all_categories'])
        for i, cat in enumerate(categories, 1):
            print(f"{i}. {cat}")
            # Check if it contains 'agencies'
            if 'agencies' in cat.lower():
                print(f"   ^^^ CONTAINS 'agencies'!")
    except Exception as e:
        print(f"Error parsing categories: {e}")
else:
    print("Article with page_id 1851775 NOT FOUND in article_data_final.csv")

    # Check in the pre-rating-filter dataset
    df_filtered = pd.read_csv('article_data_filtered.csv')
    ansmet_filtered = df_filtered[df_filtered['page_id'] == 1851775]

    if len(ansmet_filtered) > 0:
        print("\nBut it WAS found in article_data_filtered.csv (before rating filter)")
        print("This means it was filtered out due to missing rating, not categories")
    else:
        print("\nAlso NOT found in article_data_filtered.csv")
        print("Checking original article_data.csv...")

        # Last resort: the unfiltered dataset written by the metadata fetch.
        df_original = pd.read_csv('article_data.csv')
        ansmet_original = df_original[df_original['page_id'] == 1851775]

        if len(ansmet_original) > 0:
            print("\nFOUND in article_data.csv (before category filtering)!")
            print(f"Title: {ansmet_original.iloc[0]['title']}")
            print(f"\nCategories:")
            try:
                categories = ast.literal_eval(ansmet_original.iloc[0]['all_categories'])
                for i, cat in enumerate(categories, 1):
                    print(f"{i}. {cat}")
                    if 'agencies' in cat.lower():
                        print(f"   ^^^ CONTAINS 'agencies' - SHOULD HAVE BEEN FILTERED!")
            except:
                # NOTE(review): bare except silently hides any parsing problem
                # here, unlike the branch above which prints the error.
                pass


In [None]:
# Look for should_remove_article function definition
# Scans the code cells of `notebook` and prints the source of the first cell
# that both mentions 'should_remove_article' and contains a 'def '.
# NOTE(review): `notebook` is not defined in any cell shown above; it is
# presumably a parsed .ipynb dict loaded elsewhere — confirm before running.
for i, cell in enumerate(notebook['cells']):
    if cell['cell_type'] == 'code':
        # A cell's source is stored as a list of strings; join them into one.
        source = ''.join(cell['source'])
        if 'should_remove_article' in source and 'def ' in source:
            print(f"Cell {i} - should_remove_article definition:")
            print("=" * 70)
            print(source)
            print("\n" + "=" * 70)
            break


In [None]:
# Recreate old word-based matching
def should_remove_OLD(categories_str, keywords):
    """Old approach: word-based matching.

    A category matches only when one of its whitespace-separated words,
    lowercased, is exactly equal to a keyword. Punctuation is not stripped,
    so 'Agencies,' does not match the keyword 'agencies'.

    Args:
        categories_str: String form of a Python list of category names, as
            stored in the CSV. Missing values (None/NaN) are accepted.
        keywords: Iterable of lowercase keywords (a set, list or tuple).

    Returns:
        True if any category contains a keyword as a whole word, otherwise
        False. Missing, empty or unparseable input returns False.
    """
    if pd.isna(categories_str) or categories_str == '[]':
        return False

    # Only the errors literal_eval raises for malformed input count as
    # "unparseable"; anything else (e.g. KeyboardInterrupt) propagates.
    try:
        categories = ast.literal_eval(categories_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return False

    for cat in categories:
        # Split category into words
        words = {word.lower() for word in cat.split()}
        # Any overlap with the keywords means the article is removed.
        if not words.isdisjoint(keywords):
            return True
    return False

# Run both matching strategies over the unfiltered dataset.
df_test = pd.read_csv('article_data.csv')
category_column = df_test['all_categories']
total_articles = len(df_test)

# OLD: whole-word matching
old_filter = category_column.apply(
    lambda cats: should_remove_OLD(cats, all_filter_keywords)
)

# NEW: substring matching (recomputed here so the comparison is self-contained)
new_filter = category_column.apply(
    lambda cats: should_filter_article(cats, all_filter_keywords)
)

old_removed = old_filter.sum()
new_removed = new_filter.sum()

print("Comparison:")
print(f"Old word-based:    {old_removed} filtered out, {total_articles - old_removed} kept")
print(f"New substring:     {new_removed} filtered out, {total_articles - new_removed} kept")
print(f"\nDifference: {abs(old_removed - new_removed)} articles")
