In [1]:
# src/scrape_apple_reviews.py

import time
import requests
import pandas as pd
from datetime import datetime

# Column order of the returned DataFrame; also applied when no reviews are found
# so that callers always get the same schema.
_REVIEW_COLUMNS = ["app_id", "country", "review_date", "rating", "title", "content"]


def _entry_label(entry: dict, key: str, default=None):
    """Return ``entry[key]["label"]``, or *default* if the field is missing or not a dict."""
    node = entry.get(key)
    if not isinstance(node, dict):
        return default
    return node.get("label", default)


def _parse_review_entry(entry, app_id, country):
    """Convert one feed entry into a row dict, or return None if it is not a review."""
    if not isinstance(entry, dict):
        return None

    # Entries without a rating are not reviews and are skipped.
    rating_str = _entry_label(entry, "im:rating")
    if rating_str is None:
        return None

    date_str = _entry_label(entry, "updated", "")
    try:
        # fromisoformat() on older Pythons rejects a trailing "Z", so map it
        # to an explicit UTC offset first.
        review_date = datetime.fromisoformat(date_str.replace("Z", "+00:00"))
    except (AttributeError, TypeError, ValueError):
        # Missing, non-string or malformed timestamp: keep the review anyway.
        review_date = None

    try:
        rating = int(rating_str)
    except (TypeError, ValueError):
        rating = None

    return {
        "app_id": str(app_id),
        "country": str(country),
        "review_date": review_date,
        "rating": rating,
        "title": _entry_label(entry, "title", ""),
        "content": _entry_label(entry, "content", ""),
    }


def fetch_apple_reviews(app_id: str, country: str = "us", max_pages: int = 10, sleep: float = 0.2) -> pd.DataFrame:
    """
    Scrape App Store reviews via iTunes RSS feed.

    Pages are requested in order (most recent first) until one of the
    following happens: ``max_pages`` is reached, a non-200 status is returned,
    the body is not valid JSON, a page repeats an earlier page, or a page
    yields no reviews. Whatever was collected up to that point is returned.

    Args:
        app_id: App Store id, interpolated into the feed URL.
        country: Storefront code used in the feed URL.
        max_pages: Maximum number of pages to request; values <= 0 request nothing.
        sleep: Seconds to wait between consecutive page requests.

    Returns:
        A DataFrame with the columns
        app_id, country, review_date, rating, title, content
        (always present, even when no reviews were found). ``review_date`` is
        a ``datetime`` or None when unparseable; ``rating`` is an ``int`` or
        None when unparseable.

    Raises:
        Exceptions raised by ``requests.get`` (connection errors, timeouts)
        are not caught here and propagate to the caller.
    """
    rows = []
    # First-entry ids of the pages processed so far. A list (not a set) so an
    # unexpected unhashable label cannot raise; it holds at most max_pages items.
    seen_first_ids = []

    for page in range(1, max_pages + 1):
        url = f"https://itunes.apple.com/{country}/rss/customerreviews/page={page}/id={app_id}/sortby=mostrecent/json"
        r = requests.get(url, timeout=30)

        if r.status_code != 200:
            print(f"{country.upper()} app {app_id} page {page}: status={r.status_code} -> stopping")
            break

        try:
            data = r.json()
        except ValueError:
            # A 200 response with a non-JSON body: stop, but keep earlier pages.
            print(f"{country.upper()} app {app_id} page {page}: invalid JSON -> stopping")
            break

        feed = data.get("feed", {}) if isinstance(data, dict) else {}
        entries = feed.get("entry", []) if isinstance(feed, dict) else []

        # Normalize entries: a single entry may arrive as a bare dict.
        if isinstance(entries, dict):
            entries = [entries]
        if not isinstance(entries, list):
            entries = []

        # Apple sometimes returns a page that repeats earlier pages.
        # We detect repetition by tracking the first entry id of every page.
        if entries and isinstance(entries[0], dict):
            first_id = _entry_label(entries[0], "id")
            # A missing id carries no information, so it never counts as a repeat.
            if first_id is not None:
                if first_id in seen_first_ids:
                    print(f"{country.upper()} app {app_id} page {page}: repeating page -> stopping")
                    break
                seen_first_ids.append(first_id)

        parsed = (_parse_review_entry(e, app_id, country) for e in entries)
        page_rows = [row for row in parsed if row is not None]
        rows.extend(page_rows)
        extracted = len(page_rows)

        print(f"{country.upper()} app {app_id} page {page}: extracted {extracted} reviews (total={len(rows)})")

        if extracted == 0:
            break

        # No need to wait after the last page.
        if page < max_pages:
            time.sleep(sleep)

    return pd.DataFrame(rows, columns=_REVIEW_COLUMNS)
