<a target="_blank" href="https://colab.research.google.com/github/masood/2024-pets-privacy-labels-policies/blob/main/app-store-crawl.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

# Crawl Apple App Store (Demo)

The sitemaps containing links to the Apple App Store are available at [this sitemap link](https://apps.apple.com/sitemaps_apps_index_app_1.xml) and are listed on the App Store's [robots.txt file](https://apps.apple.com/robots.txt). Each of the listed `XML` files (after unzipping) contains a list of app URLs that we then visit and parse.

However, for a quick demo, we provide examples of parsing individual app URLs, i.e., once the URLs have been gathered from the aforementioned sitemap.

ðŸ’¡ This demo can be run using the default CPU runtime

In [1]:
import json
import random
import re
import requests
from lxml import etree
from urllib.parse import urlparse
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Helper/Utility Functions

In [2]:
# The apple app store base url.
_apple_store_url = f"https://apps.apple.com"

# The apple app store api base url.
_apple_api_url = f"https://amp-api.apps.apple.com"

# Setup the retry class.
_retries = Retry(total=2, backoff_factor=2, status_forcelist=[404, 429])

# Setup some random user agent strings.
_user_agents = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:86.0) Gecko/20100101 Firefox/86.0"
]

In [3]:
def parse_app(app_html):
    """Function to parse an app HTML into JSON.

    :param app_html: The html string.
    :return: The resulting JSON string after parsing.
    """
    dom = etree.HTML(app_html)
    raw_json = dom.xpath('//script[@type="fastboot/shoebox" and @id="shoebox-media-api-cache-apps"]/text()')[0].strip()
    response_dict = json.loads(raw_json)
    if len(response_dict.keys()) == 1:
        response_dict = json.loads(response_dict[list(response_dict.keys())[0]])
        for key in response_dict:
            if isinstance(response_dict[key], list):
                response_dict = response_dict[key][0]
                break
    return response_dict

def get_token(app_text):
    """Function to get the authorization bearer token from the original web page.

    :param app_text: The text from the app web page string.
    :return: The bearer token.
    """
    tags = app_text.splitlines()
    for tag in tags:
        if re.match(r"<meta.+web-experience-app/config/environment", tag):
            token = re.search(r"token%22%3A%22(.+?)%22", tag).group(1)
            return f"bearer {token}"

def get_app_id_from_app_url(app_url):
    """Function to parse the app id from the app url

    Example app url: https://apps.apple.com/lu/app/fun-keyboard-emoji-themes/id1455957386

    :param app_url: The app's url string.
    :return: The app id (number portion only) parsed from the app url string. e.g. 1455957386 not id1455957386
    """

    # Parse the app url
    parsed_url = urlparse(app_url)

    # Remove the app_id from the app_url
    # Split the url path on the "/"
    # The rsplit function starts splitting from the right, with max split set to 1, which returns only two items.
    # Then app id is the last item
    # Do not return the "id" part of the url_id string, so only return 1455957386 not id1455957386
    return parsed_url.path.rsplit("/", 1)[-1][2:]

def get_app_country_code_from_app_url(app_url):
    """Function to parse the country from the app URL

    Example app url: https://apps.apple.com/lu/app/fun-keyboard-emoji-themes/id1455957386

    :param app_url: The app's URL string.
    :return: The country code string parsed from the app URL string.
    """

    # Parse the app URL string.
    parsed_url = urlparse(app_url)

    # Remove the country code from the app_url
    # Split the URL path on the "/"
    # The rsplit function starts splitting from the right
    # Then app name is the second from last item
    return parsed_url.path.rsplit("/")[-4]

def get_api_session():
    """Method to get a new api session."""

    # Setup the api session.
    session = requests.Session()
    session.mount(_apple_api_url, HTTPAdapter(max_retries=_retries))

    return session


def get_api(url, headers=None, params=None):
    """Method to get from the Apple app store API.

    :param url: The url to get.
    :param headers: The headers to use during the get request.
    :param params: The parameters to use during the get request.
    :return: The resulting response from the get request.
    """

    try:
        return get_api_session().get(url, headers=headers, params=params)
    except ConnectionError as connection_error:
        print('Failed to get from api.', connection_error, sep='---')


def get_app_session():
    """Method to get a new app session."""

    # Setup the app session.
    session = requests.Session()
    session.mount(_apple_store_url, HTTPAdapter(max_retries=_retries))

    return session


def get_app_url(url, headers=None, params=None):
    """Method to get an app url.

    :param url: The url to get.
    :param headers: The headers to use during the get request.
    :param params: The parameters to use during the get request.
    :return: The resulting response from the get request.
    """

    # Try to get the url.
    try:

        # Return the resulting response from the get request.
        return get_app_session().get(url, headers=headers, params=params)

    # Handle the error.
    except ConnectionError as connection_error:

        # Log the error.
        print('Failed to get app url.', connection_error, sep='---')

## Get App Metadata from Response

In [4]:
def get_app_metadata(app_data):
    """Method to metadata from response dictionary.

    :param app_data: Dictionary extracted from response.
    :return: Metadata parsed the dictionary.
    """
    metadata = {}
    # app_type VARCHAR(20),
    try:
        metadata['app_type'] = app_data['type']
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')
        metadata['app_type'] = ""

    # user_rating_value DECIMAL(2, 1),
    try:
        metadata['user_rating_value'] = app_data['attributes']['userRating']['value']
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')
        metadata['user_rating_value'] = 0

    # user_rating_count INT,
    try:
        metadata['user_rating_count'] = app_data['attributes']['userRating']['ratingCount']
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')
        metadata['user_rating_count'] = 0

    # user_rating_label VARCHAR(20),
    try:
        metadata['user_rating_label'] = app_data['attributes']['userRating']['ariaLabelForRatings']
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')
        metadata['user_rating_label'] = ""

    # artist_name VARCHAR(50),
    try:
        metadata['artist_name'] = app_data['attributes']['artistName']
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')
        metadata['artist_name'] = ""

    # app_store_position INT,
    try:
        metadata['app_store_position'] = app_data['attributes']['chartPositions']['appStore']['position']
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')
        metadata['app_store_position'] = 0

    # app_store_genre_name VARCHAR(20),
    try:
        metadata['app_store_genre_name'] = app_data['attributes']['chartPositions']['appStore']['genreName']
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')
        metadata['app_store_genre_name'] = ""

    # app_store_genre_code INT,
    try:
        metadata['app_store_genre_code'] = app_data['attributes']['chartPositions']['appStore']['genre']
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')
        metadata['app_store_genre_code'] = 0

    # app_store_chart VARCHAR(30),
    try:
        metadata['app_store_chart'] = app_data['attributes']['chartPositions']['appStore']['chart']
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')
        metadata['app_store_chart'] = ""

    # content_rating VARCHAR(10),
    try:
        metadata['content_rating'] = app_data['attributes']['contentRatingsBySystem']['appsApple']['name']
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')
        metadata['content_rating'] = ""

    # distribution_kind VARCHAR(20),
    try:
        metadata['distribution_kind'] = app_data['attributes']['distributionKind']
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')
        metadata['distribution_kind'] = ""

    # app_name VARCHAR(50) NOT NULL,
    try:
        metadata['app_name'] = app_data['attributes']['name']
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')
        metadata['app_name'] = ""

    # Pre define variables found in the data list for loop.
    app_version = ''
    version_release_date = ''
    release_date = ''
    privacy_policy_url = ''
    has_in_app_purchases = False
    seller = ''

    # Get the data list from the app data JSON.
    try:
        platform_attributes = {}
        for platform in app_data['attributes']['platformAttributes']:
            platform_attributes = app_data['attributes']['platformAttributes'][platform]
            if platform == 'ios':
                break
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')


    # app_version VARCHAR(50) NOT NULL,
    try:
        metadata['app_version'] = platform_attributes['versionHistory'][0]['versionDisplay']
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')
    # version_release_date DATE,
    try:
        metadata['version_release_date'] = platform_attributes['versionHistory'][0]['releaseDate']
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')
    # release_date DATE,
    try:
        metadata['release_date'] = platform_attributes['releaseDate']
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')
    # privacy_policy_url VARCHAR(200),
    try:
        metadata['privacy_policy_url'] = platform_attributes['privacyPolicyUrl']
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')
    # has_in_app_purchases BOOL,
    try:
        metadata['has_in_app_purchases'] = platform_attributes['hasInAppPurchases']
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')
    # seller VARCHAR(50),
    try:
        metadata['seller'] = platform_attributes['seller']
    except Exception as exception:
        print(f'Failed to access value from app_data JSON. {exception}')

    # Pre define variables found in the included list data.
    metadata['price_formatted'] = ''
    metadata['price'] = 0
    metadata['currency_code'] = ''
    metadata['app_flavor'] = ''
    metadata['app_size'] = 0


    if 'offers' in platform_attributes:
        for offer in platform_attributes['offers']:
            try:
                offer_type = offer['type']
            except Exception as exception:
                print(f'Failed to access value from app_data JSON. {exception}')
                offer_type = ""

            if offer_type == 'get' or offer_type == 'buy':
                # price_formatted VARCHAR(10),
                try:
                    metadata['price_formatted'] = offer['priceFormatted']
                except Exception as exception:
                    print(f'Failed to access value from app_data JSON. {exception}')
                # price INT,
                try:
                    metadata['price_'] = offer['price']
                except Exception as exception:
                    print(f'Failed to access value from app_data JSON. {exception}')
                # currency_code VARCHAR(10),
                try:
                    metadata['currency_code'] = offer['currencyCode']
                except Exception as exception:
                    print(f'Failed to access value from app_data JSON. {exception}')

                try:
                    assets = offer['assets']
                except Exception as exception:
                    print(f'Failed to access value from app_data JSON. {exception}')
                    assets = []

                for asset in assets:
                    # app_flavor VARCHAR(20),
                    try:
                        metadata['app_flavor'] = asset['flavor']
                    except Exception as exception:
                        print(f'Failed to access value from app_data JSON. {exception}')
                    # app_size INT,
                    try:
                        metadata['app_size'] = asset['size']
                    except Exception as exception:
                        print(f'Failed to access value from app_data JSON. {exception}')
                    break
    return metadata

## Get Privacy Label Details

In [5]:
def get_privacy_labels(app_url, app_id, country_code, token):
        # Prepare the request headers for requesting the privacy details.
        request_headers = {
            "Accept": "application/json",
            "Authorization": token,
            "Connection": "keep-alive",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Origin": _apple_store_url,
            "Referer": app_url,
            "User-Agent": random.choice(_user_agents),
        }

        # Prepare the apple api url and request parameters for requesting the privacy details.
        app_apple_api_url = f"https://amp-api.apps.apple.com/v1/catalog/{country_code}/apps/{app_id}"
        request_params = {'platform': 'web', 'fields': 'privacyDetails', 'l': 'en-us'}

        # Use the Apple api to request the privacy details.
        response = get_api(app_apple_api_url, headers=request_headers, params=request_params)

        # Load the response into JSON data.
        privacy_details_json = json.loads(response.text)

        privacy_details = []
        if len(privacy_details_json) > 0:
            # Get the privacy type list.
            try:
                privacy_types = privacy_details_json['data'][0]['attributes']['privacyDetails']['privacyTypes']
            except Exception as exception:
                print(f'Failed to access value from app_data JSON. {exception}')
                privacy_types = []

            for privacy_type in privacy_types:
                privacy_detail_entry = {}

                # privacy_type VARCHAR(50) NOT NULL,
                privacy_detail_entry['privacyType'] = privacy_type['identifier']

                # Get the data category list.
                try:
                    data_categories = privacy_type['dataCategories']
                    privacy_detail_entry['dataCategories'] = []
                except Exception as exception:
                    print(f'Failed to access value from app_data JSON. {exception}')
                    data_categories = []

                # For each data category insert the data category identifier and the related data types.
                for data_category in data_categories:
                    data_category_entry = {}
                    # data_category VARCHAR(100) NOT NULL,
                    data_category_entry['dataCategory'] = data_category['identifier']

                    # Get the data type list.
                    try:
                        data_category_entry['dataTypes'] = data_category['dataTypes']
                    except Exception as exception:
                        print(f'Failed to access value from app_data JSON. {exception}')
                        data_category_entry['dataTypes'] = []

                    privacy_detail_entry['dataCategories'].append(data_category_entry)

                # For each purpose insert the purpose and the related data categories.
                try:
                    purposes = privacy_type['purposes']
                    privacy_detail_entry['purposes'] = []
                except Exception as exception:
                    print(f'Failed to access value from app_data JSON. {exception}')
                    purposes = []

                for purpose in purposes:
                    purpose_entry = {}
                    # purpose VARCHAR(200) NOT NULL,
                    purpose_entry['purpose'] = purpose['identifier']

                    # Get the data category list.
                    try:
                        data_categories = purpose['dataCategories']
                        purpose_entry['dataCategories'] = []
                    except Exception as exception:
                        print(f'Failed to access value from app_data JSON. {exception}')
                        data_categories = []

                    # For each data category insert the data category identifier and the related data types.
                    for data_category in data_categories:
                        data_category_entry = {}
                        # data_category VARCHAR(100) NOT NULL,
                        data_category_entry['dataCategory'] = data_category['identifier']

                        # For each data type insert the data type.
                        try:
                            data_category_entry['dataTypes'] = data_category['dataTypes']
                        except Exception as exception:
                            print(f'Failed to access value from app_data JSON. {exception}')
                            data_category_entry['dataTypes'] = []

                        purpose_entry['dataCategories'].append(data_category_entry)

                    privacy_detail_entry['purposes'].append(purpose_entry)

                privacy_details.append(privacy_detail_entry)

        return privacy_details

# Process a Given App URL

In [6]:
def process_app_url(app_url):
    # Set the headers for retrieving the app page from the Apple app store.
    app_headers = {
        "Accept": "text/html",
        "Connection": "keep-alive",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "https://apps.apple.com",
        "Referer": "https://apps.apple.com",
        "User-Agent": random.choice(_user_agents),
    }

    # Get the app id from the app url
    app_id = get_app_id_from_app_url(app_url)

    # Get the country code from the app url, used when creating the api url.
    country_code = get_app_country_code_from_app_url(app_url)


    # Create the request params.
    request_params = {'l': 'en-us'}

    # Get the app page from the Apple app store.
    response = get_app_url(app_url, headers=app_headers, params=request_params)

    # Parse the response text into an app data JSON.
    app_data = parse_app(response.text)

    # Get the token
    token = get_token(response.text)

    parsed_data = {}

    parsed_data["appMetadata"] = get_app_metadata(app_data)

    # Get the privacy types if they exist.
    try:
        privacy_types = app_data['attributes']['privacy']['privacyTypes']
    except Exception as exception:
        print('Failed to access value from app_data JSON.', exception, sep='---')
        privacy_types = []

    # Only request privacy details if there are privacy details available for this app.
    if len(privacy_types) > 0:
        parsed_data["privacyLabel"] = get_privacy_labels(app_url, app_id, country_code, token)

    return parsed_data

# Sample Apps

In [7]:
app_urls = {
    "walmart": 'https://apps.apple.com/us/app/walmart-shopping-savings/id338137227',
    "instagram": 'https://apps.apple.com/us/app/instagram/id389801252',
    "creditkarma": 'https://apps.apple.com/us/app/intuit-credit-karma/id519817714'
}

In [8]:
app_data = process_app_url(app_urls["instagram"])

In [9]:
app_data['appMetadata']

{'app_type': 'apps',
 'user_rating_value': 4.7,
 'user_rating_count': 26437179,
 'user_rating_label': '4.7 stars',
 'artist_name': 'Instagram, Inc.',
 'app_store_position': 2,
 'app_store_genre_name': 'Photo & Video',
 'app_store_genre_code': 6008,
 'app_store_chart': 'top-free',
 'content_rating': '12+',
 'distribution_kind': 'APP_STORE',
 'app_name': 'Instagram',
 'app_version': '332.0.0',
 'version_release_date': '2024-05-20',
 'release_date': '2010-10-06',
 'privacy_policy_url': 'http://instagram.com/legal/privacy/',
 'has_in_app_purchases': True,
 'seller': 'Instagram, Inc.',
 'price_formatted': '$0.00',
 'price': 0,
 'currency_code': 'USD',
 'app_flavor': 'iosSoftware',
 'app_size': 288575488,
 'price_': 0}

In [10]:
app_data['privacyLabel']

[{'privacyType': 'DATA_USED_TO_TRACK_YOU',
  'dataCategories': [{'dataCategory': 'CONTACT_INFO',
    'dataTypes': ['Physical Address',
     'Email Address',
     'Name',
     'Phone Number']},
   {'dataCategory': 'IDENTIFIERS', 'dataTypes': ['User ID', 'Device ID']},
   {'dataCategory': 'OTHER', 'dataTypes': ['Other Data Types']}],
  'purposes': []},
 {'privacyType': 'DATA_LINKED_TO_YOU',
  'dataCategories': [],
  'purposes': [{'purpose': 'THIRD_PARTY_ADVERTISING',
    'dataCategories': [{'dataCategory': 'PURCHASES',
      'dataTypes': ['Purchase History']},
     {'dataCategory': 'FINANCIAL_INFO', 'dataTypes': ['Other Financial Info']},
     {'dataCategory': 'LOCATION',
      'dataTypes': ['Precise Location', 'Coarse Location']},
     {'dataCategory': 'CONTACT_INFO',
      'dataTypes': ['Physical Address',
       'Email Address',
       'Name',
       'Phone Number',
       'Other User Contact Info']},
     {'dataCategory': 'CONTACTS', 'dataTypes': ['Contacts']},
     {'dataCategory': 