In [2]:
import requests
from datetime import datetime
import random
import re
from utils import *

In [3]:
def _parse_wiki_timestamp(value):
    """Parse an ISO-8601 string into a timezone-aware datetime.

    A trailing ``Z`` is rewritten to ``+00:00`` before parsing. Values that
    carry no offset are treated as UTC so they can be compared with the
    offset-carrying timestamps parsed from API responses.
    """
    from datetime import timezone

    parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed


def _api_get(session, url, params, timeout):
    """Issue one GET and return the decoded JSON, raising on HTTP or API-level errors."""
    response = session.get(url=url, params=params, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    if "error" in payload:
        raise RuntimeError(f"MediaWiki API error: {payload['error']}")
    return payload


def _first_revision_time(session, url, pageid, timeout):
    """Return the timestamp of the oldest revision of ``pageid``, or None if none is reported."""
    rev_params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "rvprop": "timestamp",
        "rvdir": "newer",  # oldest revision first, so index 0 is the creation edit
        "rvlimit": 1,
        "pageids": pageid
    }
    payload = _api_get(session, url, rev_params, timeout)
    page = payload.get("query", {}).get("pages", {}).get(str(pageid), {})
    revisions = page.get("revisions")
    if not revisions:
        # No revision data came back for this page id; the caller skips the page.
        return None
    return _parse_wiki_timestamp(revisions[0]["timestamp"])


def _total_pageviews(session, url, title, timeout):
    """Return the sum of the per-day view counts the API reports for ``title``."""
    view_params = {
        "action": "query",
        "format": "json",
        "prop": "pageviews",
        "titles": title
    }
    payload = _api_get(session, url, view_params, timeout)
    pages = payload.get("query", {}).get("pages", {})
    page = next(iter(pages.values()), {})
    pageviews = page.get("pageviews") or {}
    # Falsy entries (None or 0) contribute nothing to the total.
    return sum(count for count in pageviews.values() if count)


def fetch_articles(start_date, end_date, limit=5000, min_views=1000, timeout=30):
    """Collect Wikipedia article titles whose first revision falls inside a date window.

    Pages are listed with ``list=allpages`` (namespace 0, non-redirects),
    starting from a randomly chosen capital letter. For every listed page one
    request fetches the oldest revision timestamp; if it lies in the window a
    second request fetches page views. A title is kept when its summed views
    reach ``min_views`` and no other kept title shares the same "base title"
    (the title with a space-separated four-digit number and everything after
    it removed).

    Args:
        start_date: ISO-8601 lower bound (inclusive); a trailing ``Z`` is
            accepted and values without an offset are treated as UTC.
        end_date: ISO-8601 upper bound (inclusive), same format.
        limit: Maximum number of titles to return.
        min_views: Minimum summed page views required to keep a title. The
            day range of the counts is whatever the API returns by default,
            because no range parameter is sent.
        timeout: Seconds allowed for each HTTP request.

    Returns:
        ``(articles, view_counts)``: the kept titles in discovery order, and
        a dict mapping each kept title to its summed view count.

    Raises:
        requests.HTTPError: If a response has an error status code.
        RuntimeError: If a response body contains an ``error`` entry.

    Notes:
        The listing never wraps around: the crawl stops when the API reports
        no continuation, so fewer than ``limit`` titles may be returned. The
        start letter comes from ``random``; seed it for a repeatable crawl.
    """
    URL = "https://en.wikipedia.org/w/api.php"

    # Parse the bounds before any network traffic so bad input fails fast.
    window_start = _parse_wiki_timestamp(start_date)
    window_end = _parse_wiki_timestamp(end_date)

    articles = []
    view_counts = {}
    base_titles = set()

    params = {
        "action": "query",
        "format": "json",
        "list": "allpages",
        "aplimit": "max",
        "apfilterredir": "nonredirects",
        "apnamespace": 0,
        "apfrom": chr(random.randint(65, 90))  # Start from a random letter A-Z
    }

    # The context manager closes the session even when a request raises.
    with requests.Session() as session:
        while len(articles) < limit:
            data = _api_get(session, URL, params, timeout)

            for page in data.get("query", {}).get("allpages", []):
                title = page["title"]

                # Skip titles that start with numbers
                if re.match(r'^\d', title):
                    continue

                created = _first_revision_time(session, URL, page["pageid"], timeout)
                if created is None or not (window_start <= created <= window_end):
                    continue

                total_views = _total_pageviews(session, URL, title, timeout)
                if total_views < min_views:
                    continue

                # Remove year and subsequent text so near-duplicate titles collapse.
                base_title = re.sub(r'\s\d{4}.*', '', title)
                if base_title in base_titles:
                    continue

                articles.append(title)
                view_counts[title] = total_views
                base_titles.add(base_title)

                if len(articles) >= limit:
                    break

            if "continue" not in data:
                break
            # Send back every continuation value the API returned, not only
            # ``apcontinue``.
            params.update(data["continue"])

    return articles, view_counts

# Crawl two cohorts of articles by creation date. Note the windows are single
# calendar years: the "pre_2021" cohort is articles created during 2020 and
# the "post_2024" cohort is articles created during 2024.
pre_2021_articles, pre_2021_view_counts = fetch_articles(
    start_date="2020-01-01T00:00:00Z",
    end_date="2020-12-31T23:59:59Z",
    limit=10000,
)
post_2024_articles, post_2024_view_counts = fetch_articles(
    start_date="2024-01-01T00:00:00Z",
    end_date="2024-12-31T23:59:59Z",
    limit=10000,
)

In [10]:
# Sanity check: how many titles each crawl returned (2020 cohort, then 2024 cohort).
print(*map(len, (pre_2021_articles, post_2024_articles)))

10000 10000


In [12]:
def extract_texts(articles, extracts_size=5000, timeout=30):
    """Download short plain-text extracts for a sequence of article titles.

    One ``prop=extracts`` request (plain text, 10 sentences) is sent per
    title, in order, until ``extracts_size`` extracts have been kept or the
    titles run out.

    An extract is kept only when all of the following hold:
      * it is longer than 50 characters;
      * the returned page title does not contain "user" (case-insensitive);
      * the returned page title does not match ``.+?:.+`` (text, a colon,
        more text).
    NOTE(review): the two title filters look like a guard against
    non-article namespaces, but they also drop any main-namespace title
    that contains "user" or a colon. Confirm that this is intended.

    Args:
        articles: Iterable of page titles to look up.
        extracts_size: Maximum number of extracts to collect; ``0`` returns
            an empty dict without sending any request.
        timeout: Seconds allowed for each HTTP request.

    Returns:
        Dict mapping the page title as returned by the API to its extract.

    Raises:
        requests.HTTPError: If a response has an error status code.
    """
    URL = "https://en.wikipedia.org/w/api.php"
    colon_title = re.compile(r'.+?:.+')
    extracts = {}

    # The context manager closes the session even when a request raises.
    with requests.Session() as session:
        for title in articles:
            # Checked before the request so a size of 0 does no work at all.
            if len(extracts) >= extracts_size:
                break

            params = {
                "action": "query",
                "prop": "extracts",
                "exsentences": 10,
                "exlimit": 1,
                "titles": title,
                "explaintext": 1,
                "formatversion": 2,
                "format": "json"
            }
            response = session.get(url=URL, params=params, timeout=timeout)
            response.raise_for_status()
            data = response.json()

            # Only the first entry of ``pages`` is inspected.
            pages = data.get("query", {}).get("pages", [])
            page = next(iter(pages), None)
            if not page or "extract" not in page:
                continue

            page_title = page["title"]
            if len(page["extract"]) <= 50:
                continue
            if "user" in page_title.lower() or colon_title.match(page_title):
                continue

            extracts[page_title] = page["extract"]

    return extracts

# Fetch extracts for each cohort; extract_texts stops at its default cap
# (extracts_size=5000) even though up to 10000 titles were crawled.
pre_2021_texts = extract_texts(articles=pre_2021_articles)
post_2024_texts = extract_texts(articles=post_2024_articles)

5000
5000


In [None]:
# Report corpus sizes, then a bounded preview of the view counts. Each
# view-count dict can hold thousands of entries, so printing it whole would
# flood the cell output; show the entry count and the ten most-viewed titles.
print("Number of pre-2021 texts:", len(pre_2021_texts))
print("Number of post-2024 texts:", len(post_2024_texts))
for label, counts in (
    ("Pre-2021 article view counts:", pre_2021_view_counts),
    ("Post-2024 article view counts:", post_2024_view_counts),
):
    top = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:10]
    print(label, f"{len(counts)} articles; top {len(top)} by views:", dict(top))

In [16]:
# Persist both extract dictionaries as JSON under ../datasets.
# save_json comes from the star import of utils; its exact behavior is not visible here.
for texts, destination in (
    (pre_2021_texts, '../datasets/pre_2021_articles.json'),
    (post_2024_texts, '../datasets/post_2024_articles.json'),
):
    save_json(texts, destination)