In [12]:
import os
import json
import requests
import xmltodict

from collections import Counter

In [2]:
def fetch_reviews(app_id, country = 'us', sortBy = 'mostRecent', page = 1):
    """ Get max 500 user reviews for a given app.
    
    country = default to United States (us).
    sorty = 'mostRecent'(default) or 'mostHelpful'.
    page = page number. default = 1, max = 10.
    """
    
    url = 'https://itunes.apple.com/%s/rss/customerreviews/id=%s/sortBy=%s/page=%s/xml' % (country, 
                                                                                           str(app_id), 
                                                                                           sortBy,
                                                                                           str(page))
    r = requests.get(url)
    reviews_dict = xmltodict.parse(r.text)
    
    try:  # If there are no reviews on this page, break out of loop
        reviews_list = reviews_dict['feed']['entry']
    except:
        return []
    
    reviews = []
    for review in reviews_list:
        reviews.append({'title': review['title'],
                       'author': review['author']['name'],
                       'authorUrl': review['author']['uri'],
                       'rating': review['im:rating'],
                       'date': review['updated'],
                       'voteSum': review['im:voteSum'],
                       'voteCount': review['im:voteCount'],
                       'content': review['content'][0]['#text'].replace('\n', ' ')
                      
    return reviews

In [13]:
def fetch_apps(terms: list, limit: int):
    """Uses the iTunes store search API to get metadata for apps.
    
    terms: A list of terms to search (ie. ["Medical", "Personal", "Health"])
    limit: Number of apps to return (max 200)
    """
    
    url = "https://itunes.apple.com/search?media=software&entity=software&term=%s&limit=%s" % ("+".join(terms),
                                                                                              limit)
            
    r = requests.get(url)
    print(r.text)
    return r.text

In [14]:

#filtered_apps = [ app for app in apps['results'] 
#                 if app['primaryGenreName'] == 'Health & Fitness' ] # Filter out all non-Health&Fitness apps.


In [15]:
fetch_apps(["medical", "personal"], 2)




{
 "resultCount":2,
 "results": [
{"isGameCenterEnabled":false, 
"screenshotUrls":["https://is2-ssl.mzstatic.com/image/thumb/Purple128/v4/d3/a0/c1/d3a0c164-6f6e-95c2-a388-833c90bd2a54/source/392x696bb.jpg", "https://is1-ssl.mzstatic.com/image/thumb/Purple128/v4/1c/53/b4/1c53b4cb-67b0-62eb-864c-d1e841620ff7/source/392x696bb.jpg", "https://is2-ssl.mzstatic.com/image/thumb/Purple128/v4/e1/5b/01/e15b0157-8510-6b81-4c84-db0e82b3c25a/source/392x696bb.jpg", "https://is2-ssl.mzstatic.com/image/thumb/Purple118/v4/22/ae/76/22ae76da-4c49-da25-d1d3-53f5bba5c6a6/source/392x696bb.jpg", "https://is2-ssl.mzstatic.com/image/thumb/Purple118/v4/f5/b8/fa/f5b8fa07-a31c-77c9-172e-f1060b1f42c4/source/392x696bb.jpg", "https://is3-ssl.mzstatic.com/image/thumb/Purple118/v4/22/7a/af/227aaf21-d9e4-f16b-ebcb-06310dcc9c9b/source/392x696bb.jpg"], "ipadScreenshotUrls":[], "appletvScreenshotUrls":[], "artworkUrl60":"https://is1-ssl.mzstatic.com/image/thumb/Purple128/v4/6e/be/63/6ebe632c-bcc8-ccf5-8a77-f24aa0b02cf9/

'\n\n\n{\n "resultCount":2,\n "results": [\n{"isGameCenterEnabled":false, \n"screenshotUrls":["https://is2-ssl.mzstatic.com/image/thumb/Purple128/v4/d3/a0/c1/d3a0c164-6f6e-95c2-a388-833c90bd2a54/source/392x696bb.jpg", "https://is1-ssl.mzstatic.com/image/thumb/Purple128/v4/1c/53/b4/1c53b4cb-67b0-62eb-864c-d1e841620ff7/source/392x696bb.jpg", "https://is2-ssl.mzstatic.com/image/thumb/Purple128/v4/e1/5b/01/e15b0157-8510-6b81-4c84-db0e82b3c25a/source/392x696bb.jpg", "https://is2-ssl.mzstatic.com/image/thumb/Purple118/v4/22/ae/76/22ae76da-4c49-da25-d1d3-53f5bba5c6a6/source/392x696bb.jpg", "https://is2-ssl.mzstatic.com/image/thumb/Purple118/v4/f5/b8/fa/f5b8fa07-a31c-77c9-172e-f1060b1f42c4/source/392x696bb.jpg", "https://is3-ssl.mzstatic.com/image/thumb/Purple118/v4/22/7a/af/227aaf21-d9e4-f16b-ebcb-06310dcc9c9b/source/392x696bb.jpg"], "ipadScreenshotUrls":[], "appletvScreenshotUrls":[], "artworkUrl60":"https://is1-ssl.mzstatic.com/image/thumb/Purple128/v4/6e/be/63/6ebe632c-bcc8-ccf5-8a77-f24aa

In [7]:
app_metadata = []

for app in filtered_apps[40:42]:
    # Get app metadata
    app_data = {'name': app['trackName'],
                'id': app['trackId'],
                'url': app['trackViewUrl'],
                'price': app['price'],
                'avgUserRating': app['averageUserRating'],
                'userRatingCount': app['userRatingCount'],
                'currentVersionReleaseDate': app['currentVersionReleaseDate'],
                'description': app['description'].replace('\n', ' ')
                }
    
    # Get most recent and most helpful reviews (separately) for app (up to 500)
    recent_reviews = []
    helpful_reviews = []
    for i in range(1, 11): 
        recent_reviews += fetch_reviews(app_data['id'], page=i)
        helpful_reviews += fetch_reviews(app_data['id'], sortBy='mostHelpful', page=i)

    app_data['recent_reviews'] = recent_reviews
    app_data['helpful_reviews'] = helpful_reviews
    app_metadata.append(app_data)

In [9]:
with open('appstore_metadata_and_reviews.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(app_metadata, ensure_ascii=False))