In [None]:
import datetime
import json
import time
from urllib.parse import quote

import requests
import wikipediaapi
from inputimeout import inputimeout, TimeoutOccurred
from requests.exceptions import ReadTimeout

In [None]:
def get_last_complete_month_date_range(today=None):
    """
    Returns the start and end dates (YYYYMMDD) for the last complete month.

    Args:
        today: Optional ``datetime.date`` used as the reference day. Defaults
            to the current local date. Passing it explicitly makes the result
            reproducible (e.g. when re-running the notebook for a fixed month).

    Returns:
        A ``(start_date, end_date)`` tuple of strings: the first and last day
        of the month preceding the one that contains ``today``.
    """
    if today is None:
        today = datetime.date.today()
    # Stepping one day back from the 1st of the current month always lands on
    # the last day of the previous month, whatever its length (and it also
    # rolls the year back correctly in January).
    first_day_this_month = today.replace(day=1)
    last_day_last_month = first_day_this_month - datetime.timedelta(days=1)
    first_day_last_month = last_day_last_month.replace(day=1)
    return (
        first_day_last_month.strftime("%Y%m%d"),
        last_day_last_month.strftime("%Y%m%d"),
    )

In [None]:
def get_pageviews(title, start_date, end_date, retries=3):
    """
    For a given article title, retrieves the total daily pageviews over the specified date range.

    Args:
        title: Article title; spaces are converted to underscores and the
            result is percent-encoded before being placed in the URL path.
        start_date: First day of the range, formatted YYYYMMDD.
        end_date: Last day of the range, formatted YYYYMMDD.
        retries: How many additional attempts to make after a failed request.

    Returns:
        The sum of the ``views`` field over all returned daily items, or 0 if
        the data could not be retrieved (failures are printed, not raised).
    """
    # The title is a single URL path segment, so characters such as "/", "?",
    # "#" or "%" must be escaped; otherwise a title like "AC/DC" addresses a
    # different route and "What If?" loses everything after the "?".
    article_title = quote(title.replace(" ", "_"), safe="")
    url = (
        "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
        f"en.wikipedia/all-access/user/{article_title}/daily/{start_date}/{end_date}"
    )
    headers = {
        "User-Agent": "UniCourseWikipediaBot (mehmetaltintas@etu.edu.tr)"
    }

    while True:
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            data = response.json()
            return sum(item.get("views", 0) for item in data.get("items", []))
        # ValueError covers a malformed JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        except (requests.exceptions.RequestException, ValueError) as e:
            status = getattr(getattr(e, "response", None), "status_code", None)
            if status == 404:
                # A 404 will not change on retry, so answer immediately
                # instead of sleeping through every remaining attempt.
                print(f"No pageview data found for '{title}' (HTTP 404).")
                return 0
            if retries > 0:
                print(f"Request failed for '{title}' with error: {e}. Retrying ({retries} left)...")
                time.sleep(1)
                retries -= 1
            else:
                print(f"Failed to retrieve pageviews for '{title}' after retries. Error: {e}")
                return 0

In [None]:
def search_wikipedia(query, limit=500):
    """
    Uses the MediaWiki API to search for a query, returning up to `limit` results.
    Each result is a dict containing the page title and pageid.

    The API caps how many results one request may return, so larger limits are
    served by following the API's continuation token across several requests.

    Args:
        query: Search string passed as ``srsearch``.
        limit: Maximum number of results to return in total.

    Returns:
        A list of search-result dicts, at most ``limit`` long (shorter if the
        search has fewer hits).

    Raises:
        requests.exceptions.RequestException: On network or HTTP errors.
        RuntimeError: If the API responds with an error payload.
    """
    url = "https://en.wikipedia.org/w/api.php"
    headers = {
        "User-Agent": "UniCourseWikipediaBot (mehmetaltintas@etu.edu.tr)"
    }
    base_params = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": query,
    }
    max_per_request = 500  # per-request ceiling for ordinary (non-bot) clients

    results = []
    continuation = {}
    while len(results) < limit:
        params = dict(base_params)
        params["srlimit"] = min(limit - len(results), max_per_request)
        # The continuation dict (e.g. sroffset) tells the API where to resume.
        params.update(continuation)

        response = requests.get(url, params=params, headers=headers, timeout=10)
        response.raise_for_status()
        data = response.json()
        if "error" in data:
            info = data["error"].get("info", data["error"])
            raise RuntimeError(f"MediaWiki API error for query '{query}': {info}")

        hits = data.get("query", {}).get("search", [])
        results.extend(hits)

        continuation = data.get("continue") or {}
        # Stop when the API reports no further pages; the empty-batch check
        # guards against looping forever on a stuck continuation token.
        if not continuation or not hits:
            break

    return results[:limit]

In [None]:
def confirm_next_topic(topic):
    """
    Prompts the user to confirm proceeding to the next topic.
    Waits up to 5 minutes for input; if none is provided, auto-confirms.

    Args:
        topic: Name of the topic that has just been processed (shown in the prompt).

    Returns:
        True if the user answered "y" or "yes" (case-insensitive, surrounding
        whitespace ignored) or the prompt timed out; False for any other answer.
    """
    prompt = f"\nFinished processing topic '{topic}'. Proceed to the next topic? (y/n): "
    try:
        answer = inputimeout(prompt=prompt, timeout=300)
    except TimeoutOccurred:
        print("\nNo response received within 5 minutes. Auto-confirming.")
        answer = "y"
    # Normalise before comparing so a stray space or a spelled-out "yes" is
    # not misread as a refusal that aborts the whole run.
    return answer.strip().lower() in ("y", "yes")

In [None]:
def _resolve_page(wiki, title):
    """Looks up `title` via wikipediaapi and returns ``(page, exists)``."""
    page = wiki.page(title)
    try:
        exists = page.exists()
    except Exception as e:
        # Chain the original exception so its traceback is not lost.
        raise Exception(f"Error checking existence for '{title}': {e}") from e
    return page, exists


def _collect_articles(wiki, search_results, start_date, end_date):
    """Builds the article records and the failure records for one topic's search results."""
    articles = []
    failed_pages = []

    for result in search_results:
        title = result['title']
        pageid = result.get('pageid', None)
        try:
            page, exists = _resolve_page(wiki, title)
        except Exception as e:
            error_msg = f"Error retrieving page for '{title}': {e}"
            print(error_msg)
            failed_pages.append({"original_title": title, "error": error_msg})
            continue

        if not exists:
            error_msg = f"Page '{title}' does not exist according to wikipediaapi."
            print(error_msg)
            failed_pages.append({"original_title": title, "error": error_msg})
            continue

        canonical_title = page.title
        views = get_pageviews(canonical_title, start_date, end_date)
        articles.append({
            "title": canonical_title,
            "pageid": pageid,
            "views": views
        })
        # Short pause between articles to keep the request rate modest.
        time.sleep(0.1)

    return articles, failed_pages


def _write_json(filename, payload):
    """Writes `payload` to `filename` as indented UTF-8 JSON."""
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=4)


def main():
    """
    Collects the most-viewed Wikipedia articles for a fixed list of topics.

    For each topic: searches Wikipedia, checks every hit through wikipediaapi,
    fetches its pageviews for the last complete month, keeps the 500 most
    viewed articles and writes two JSON files to the working directory
    (``<Topic>_articles.json`` and ``<Topic>_failed_pages.json``). Between
    topics the user is asked whether to continue; the prompt is skipped after
    the final topic because there is nothing left to proceed to.
    """
    topics = [
        "Computer science",
        "Medicine",
        "Law",
        "Engineering",
        "Physics",
        "Biology",
        "Economics",
        "History",
        "Psychology",
        "Art"
    ]

    start_date, end_date = get_last_complete_month_date_range()
    print(f"Using pageviews date range: {start_date} to {end_date}")

    wiki = wikipediaapi.Wikipedia(language='en', user_agent="UniCourseWikipediaBot (mehmetaltintas@etu.edu.tr)")

    for position, topic in enumerate(topics, start=1):
        print(f"\nProcessing topic: {topic}")
        try:
            search_results = search_wikipedia(topic, limit=1000)
        except Exception as e:
            # A failed search is reported and treated as "no results" so the
            # remaining topics can still be processed.
            print(f"Error during search for topic '{topic}': {e}")
            search_results = []

        articles, failed_pages = _collect_articles(wiki, search_results, start_date, end_date)
        articles = sorted(articles, key=lambda x: x["views"], reverse=True)[:500]

        safe_topic = topic.replace(" ", "_")
        articles_filename = f"{safe_topic}_articles.json"
        failed_filename = f"{safe_topic}_failed_pages.json"

        _write_json(articles_filename, articles)
        print(f"Articles for topic '{topic}' saved to '{articles_filename}'.")

        _write_json(failed_filename, failed_pages)
        print(f"Failed pages for topic '{topic}' saved to '{failed_filename}'.")

        # Only ask when another topic actually follows; otherwise the run
        # would sit at a pointless prompt for up to five minutes.
        has_next_topic = position < len(topics)
        if has_next_topic and not confirm_next_topic(topic):
            print("User opted not to continue. Exiting.")
            break

In [None]:
# Run the collection: processes each topic in turn, writes its JSON output
# files to the current working directory, and may prompt for confirmation
# between topics.
main()