In [1]:
import json
import requests
from collections import OrderedDict
from datetime import datetime

import numpy as np
import pandas as pd
import xmltodict

In [26]:
# Previously saved app metadata, one row per app; later cells read its 'id' and 'name' columns.
df_metadata = pd.read_csv("data/appstore_metadata_phr.csv")
# Previously saved reviews; the 'name' and 'date' columns are used later to find each app's
# newest stored review.
df_reviews = pd.read_csv("data/appstore_recent_reviews_phr.csv")

In [3]:
def fetch_metadata(app_id, timeout=30):
    """Fetch metadata for an app from the iTunes lookup endpoint.

    app_id = App Store id of the app (converted with str() when building the URL).
    timeout = seconds to wait for the HTTP response before giving up. default = 30.

    Returns the 'results' entry of the decoded JSON response (callers index it as a list).
    Raises requests.HTTPError if the server answers with an error status code.
    """
    
    url = "https://itunes.apple.com/lookup?id=%s" % (str(app_id))
    # Without a timeout a single stalled connection would hang the whole update loop.
    r = requests.get(url, timeout=timeout)
    # Report HTTP failures here rather than as an unrelated JSON or KeyError below.
    r.raise_for_status()
    return json.loads(r.text)['results']

In [4]:
def fetch_reviews(app_id, country = 'us', sortBy = 'mostRecent', page = 1):
    """ Fetch one page of user reviews for a given app from the iTunes customer-reviews feed.
    
    app_id = App Store id of the app.
    country = default to United States (us).
    sortBy = 'mostRecent'(default) or 'mostHelpful'.
    page = page number. default = 1, max = 10.
    
    Returns a list of dicts with the keys title, author, authorUrl, rating, date,
    voteSum, voteCount and content. Returns [] when the page has no entries.
    Entries that do not look like a review (missing fields, unparsable date) are skipped.
    """
    
    url = 'https://itunes.apple.com/%s/rss/customerreviews/id=%s/sortBy=%s/page=%s/xml' % (
        country, str(app_id), sortBy, str(page))
    # Without a timeout a single stalled connection would hang the caller's paging loop.
    r = requests.get(url, timeout=30)
    reviews_dict = xmltodict.parse(r.text)
    
    try:  # If there are no reviews on this page, return an empty list
        reviews_list = reviews_dict['feed']['entry']
    except (KeyError, TypeError):
        return []
    if not reviews_list:
        return []
    # xmltodict yields a single dict (not a list) when the feed holds exactly one <entry>;
    # normalise so the loop below always iterates over entries rather than over dict keys.
    if isinstance(reviews_list, dict):
        reviews_list = [reviews_list]
    
    reviews = []
    for review in reviews_list:
        try:
            # 'content' is normally a list of renderings of which the first is used;
            # a lone <content> element comes back as a dict instead.
            content = review['content']
            if isinstance(content, list):
                content = content[0]
            reviews.append({'title': review['title'],
                            'author': review['author']['name'],
                            'authorUrl': review['author']['uri'],
                            'rating': review['im:rating'],
                            # Drop the last 6 characters (presumably a UTC offset such as
                            # "-07:00") so the remainder matches the strptime format.
                            'date': str(datetime.strptime(review['updated'][:-6], "%Y-%m-%dT%H:%M:%S")),
                            'voteSum': review['im:voteSum'],
                            'voteCount': review['im:voteCount'],
                            'content': content['#text'].replace('\n', ' ')
                            })
        except (KeyError, TypeError, IndexError, ValueError, AttributeError):
            # Skip only the malformed entry; the valid reviews after it are still collected.
            continue
    return reviews

## Update metadata

In [20]:
# Re-query the store for every known app and rebuild its metadata row, with the
# columns in the order listed below.
updated_metadata_list = []
for _, app_row in df_metadata.iterrows():
    lookup = fetch_metadata(app_row['id'])[0]  # first element of the lookup results
    updated_metadata_list.append(OrderedDict([
        ('name', lookup['trackName']),
        ('id', lookup['trackId']),
        ('url', lookup['trackViewUrl']),
        ('price', lookup['price']),
        # Rating fields are read with .get(), so a missing key yields None.
        ('userRatingCount', lookup.get('userRatingCount')),
        ('currentVersionReleaseDate',
         str(datetime.strptime(lookup['currentVersionReleaseDate'], "%Y-%m-%dT%H:%M:%SZ"))),
        ('avgUserRating', lookup.get('averageUserRating')),
        # Flatten newlines so each description occupies a single line.
        ('description', lookup['description'].replace('\n', ' ')),
    ]))

In [22]:
# Turn the list of per-app metadata dicts into a table, one row per app.
df_updated_metadata = pd.DataFrame(updated_metadata_list)
# Sanity check: the refreshed table must have the same number of rows and columns
# as the metadata that was loaded from disk.
assert df_updated_metadata.shape == df_metadata.shape

In [25]:
# Overwrites the same CSV file that df_metadata was read from; index=False leaves the
# DataFrame index out of the file.
df_updated_metadata.to_csv("data/appstore_metadata_phr.csv", index=False)

## Fetch new reviews

In [31]:
MAX_REVIEW_PAGES = 10  # fetch_reviews documents page 10 as the last page

app_new_reviews = {}

# For every app, collect the reviews that are newer than the newest review already stored
# in df_reviews. fetch_reviews is called with its default sort order ('mostRecent'), so
# pages are scanned from the first page onwards until a stored date is reached.
for ix, app in df_metadata.iterrows():
    print(app['name'])
    reviews_for_app = df_reviews[df_reviews['name'] == app['name']]
    stored_dates = reviews_for_app['date'].dropna()
    if stored_dates.empty:
        # Nothing stored to compare against: fetch nothing for this app, but still register
        # it so that later lookups in app_new_reviews do not raise KeyError.
        app_new_reviews[app['name']] = []
        continue
    latest_review_date = max(stored_dates)
        
    new_reviews = []

    for page in range(1, MAX_REVIEW_PAGES + 1):
        fetched_reviews = fetch_reviews(app['id'], page=page)
        
        # Stop fetching once we have reached the newest review we have stored (by date).
        # Dates are compared as strings in "YYYY-MM-DD HH:MM:SS" form.
        reached_stored_review = False
        for review in fetched_reviews:
            if review['date'] <= latest_review_date:
                reached_stored_review = True
                break
            new_reviews.append(review)
        
        if reached_stored_review:
            break

    app_new_reviews[app['name']] = new_reviews

FollowMyHealth®
healow
Healthspek - Personal Health Record & Family Health Record - Complete Medical Record for iPhone
DrChrono OnPatient Portal
Teladoc
The Diary Health App
Emrify - Personal Health Record
Personal Records
UPMC Health Plan
OpenTreatment Personal Health Record (PHR)
Capzule
Health Tracker & Manager for iPhone - Personal Healthbook App for Tracking Blood Pressure BP, Glucose & Weight BMI
Sanford
PortalConnect
CareSync | Care Coordination
MTBC PHR
bant - Simplifying diabetes
Medical Wiz - Family Health Information
Healthjump
YourHealthRecord Mobile
MyQuadMed
Get Your Health Record
RaceAddict
MedXCom for Patients
WODTrackr
Healee
Medical Wiz Pro - Family Health Information
iBlueButton®
I'M HOME! ®
Dyrii Journal
BCM


In [32]:
# Fields copied as-is from each fetched review dict, in output column order.
review_fields = ('title', 'author', 'authorUrl', 'rating', 'date',
                 'voteSum', 'voteCount', 'content')

# Flatten the per-app review lists into one list of rows, each prefixed with the
# app-level columns taken from df_metadata.
new_reviews_list = []
for ix, app in df_metadata.iterrows():
    # .get() with an empty default: an app that has no entry in app_new_reviews
    # contributes no rows instead of raising KeyError.
    for new_review in app_new_reviews.get(app['name'], []):
        review = OrderedDict()
        review['name'] = app['name']
        review['id'] = app['id']
        review['userRatingCount'] = app['userRatingCount']
        review['avgUserRating'] = app['avgUserRating']
        for field in review_fields:
            review[field] = new_review.get(field)
        new_reviews_list.append(review)
    
print("Gathered %s new reviews" % (str(len(new_reviews_list))))

Gathered 0 new reviews


In [24]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df_new_reviews = pd.DataFrame(new_reviews_list)
if not df_new_reviews.empty:
    # Only concatenate when there is something to add; the stored reviews are otherwise
    # written back unchanged.
    df_reviews = pd.concat([df_reviews, df_new_reviews], ignore_index=True)
# Overwrites the reviews CSV that df_reviews was loaded from.
df_reviews.to_csv("data/appstore_recent_reviews_phr.csv", index=False)