In [72]:
import json
import requests
from collections import OrderedDict
from datetime import datetime

import numpy as np
import pandas as pd
import xmltodict

In [73]:
# Load the snapshot saved by the previous run: the per-app metadata table
# and the table of reviews collected so far.
METADATA_CSV = "data/appstore_metadata_phr.csv"
REVIEWS_CSV = "data/appstore_recent_reviews_phr.csv"

df_metadata = pd.read_csv(METADATA_CSV)
df_reviews = pd.read_csv(REVIEWS_CSV)

In [74]:
def fetch_metadata(app_id):
    """Look up an app in the iTunes Lookup API and return its result records.

    Parameters
    ----------
    app_id : int or str
        App Store id; it is sent as the ``id`` query parameter.

    Returns
    -------
    list
        The ``results`` array of the JSON response. Callers in this notebook
        take element ``[0]``; the list is returned as-is, so it may be empty.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    # A timeout keeps a stalled connection from hanging the whole notebook run;
    # requests waits indefinitely when none is given.
    r = requests.get("https://itunes.apple.com/lookup",
                     params={'id': str(app_id)},
                     timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
    r.raise_for_status()
    return r.json()['results']

In [75]:
def fetch_reviews(app_id, country = 'us', sortBy = 'mostRecent', page = 1):
    """Fetch one page of customer reviews for an app from the iTunes RSS feed.

    Parameters
    ----------
    app_id : int or str
        App Store id of the app.
    country : str
        Storefront code used in the feed URL. Defaults to United States ('us').
    sortBy : str
        'mostRecent' (default) or 'mostHelpful'.
    page : int
        Page number. Default 1, max 10.

    Returns
    -------
    list of dict
        One dict per review with keys 'title', 'author_name', 'author_uri',
        'rating', 'date', 'voteSum', 'voteCount' and 'text'. 'date' is
        formatted 'YYYY-MM-DD HH:MM:SS' with the feed's UTC offset dropped.
        Newlines in 'text' are replaced by spaces. Returns [] when the page
        has no entries. Entries that lack any expected field are skipped.
    """

    url = 'https://itunes.apple.com/%s/rss/customerreviews/id=%s/sortBy=%s/page=%s/xml' % (
        country, str(app_id), sortBy, str(page))
    # A timeout keeps a stalled connection from hanging the whole notebook run.
    r = requests.get(url, timeout=30)
    # Hand the parser the raw bytes: r.text depends on the charset requests
    # infers from the HTTP headers, which can mis-decode non-ASCII review text.
    reviews_dict = xmltodict.parse(r.content)

    try:  # If there are no reviews on this page, return an empty list
        reviews_list = reviews_dict['feed']['entry']
    except (KeyError, TypeError):
        return []

    if not reviews_list:
        return []
    # xmltodict represents a single <entry> as a mapping rather than a
    # one-element list; normalise so the loop below always sees entries.
    if isinstance(reviews_list, dict):
        reviews_list = [reviews_list]

    reviews = []
    for review in reviews_list:
        try:
            reviews.append({'title': review['title'],
                            'author_name': review['author']['name'],
                            'author_uri': review['author']['uri'],
                            'rating': review['im:rating'],
                            # [:-6] strips the trailing UTC offset, e.g. "-07:00"
                            'date': str(datetime.strptime(review['updated'][:-6], "%Y-%m-%dT%H:%M:%S")),
                            'voteSum': review['im:voteSum'],
                            'voteCount': review['im:voteCount'],
                            'text': review['content'][0]['#text'].replace('\n', ' ')
                            })
        except (KeyError, IndexError, TypeError, ValueError, AttributeError):
            # Skip only the malformed entry; one bad entry must not discard
            # the rest of the page.
            continue
    return reviews

## Update metadata

In [76]:
# Re-query the store for every app in df_metadata and rebuild its row from
# the fresh lookup result (first record of the response).
updated_metadata_list = []
for ix, app in df_metadata.iterrows():
    new_metadata = fetch_metadata(app['id'])[0]
    # Field order here fixes the column order of the rebuilt table.
    metadata = OrderedDict([
        ('name', new_metadata['trackName']),
        ('id', new_metadata['trackId']),
        ('url', new_metadata['trackViewUrl']),
        ('price', new_metadata['price']),
        # Rating fields are read with .get(), so they are None when absent.
        ('avgUserRating', new_metadata.get('averageUserRating')),
        ('userRatingCount', new_metadata.get('userRatingCount')),
        ('currentVersionReleaseDate',
         str(datetime.strptime(new_metadata['currentVersionReleaseDate'], "%Y-%m-%dT%H:%M:%SZ"))),
        # Flatten the description onto one line.
        ('description', new_metadata['description'].replace('\n', ' ')),
    ])
    updated_metadata_list.append(metadata)

In [77]:
# Assemble the refreshed rows into a table; it must have the same number of
# rows and columns as the table it is about to replace.
df_updated_metadata = pd.DataFrame(data=updated_metadata_list)
old_shape, new_shape = df_metadata.shape, df_updated_metadata.shape
assert new_shape == old_shape

In [78]:
# Overwrite the metadata CSV that was loaded at the top of the notebook with
# the refreshed values (row index is not written).
df_updated_metadata.to_csv(path_or_buf="data/appstore_metadata_phr.csv", index=False)

## Fetch new reviews

In [79]:
app_new_reviews = {}

# fetch_reviews documents 10 as the highest page number of the feed.
MAX_REVIEW_PAGES = 10
REVIEW_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"


def _to_review_timestamps(dates):
    """Parse stored review date strings into timestamps.

    The stored CSV holds dates like '2018-10-30T21:11:53-07:00', while
    fetch_reviews emits '2018-10-30 21:11:53'. Both are reduced to the first
    19 characters with 'T' replaced by a space, so they compare on the same
    scale. The UTC offset is discarded, mirroring what fetch_reviews does.
    Missing and unparseable values are dropped.
    """
    normalized = dates.dropna().astype(str).str.slice(0, 19).str.replace('T', ' ')
    return pd.to_datetime(normalized, format=REVIEW_DATE_FORMAT, errors='coerce').dropna()


# Collect the most recent reviews for each app that are newer than the newest
# review already stored for it (fetch_reviews defaults to sortBy='mostRecent').
for ix, app in df_metadata.iterrows():
    print(app['name'])
    reviews_for_app = df_reviews[df_reviews['name'] == app['name']]
    stored_dates = _to_review_timestamps(reviews_for_app['date'])

    new_reviews = []
    if stored_dates.empty:
        # No usable stored date to compare against: skip fetching, as before,
        # but still record the app so later cells can index by name safely.
        app_new_reviews[app['name']] = new_reviews
        continue
    latest_review_date = stored_dates.max()

    for i in range(1, MAX_REVIEW_PAGES + 1):
        fetched_reviews = fetch_reviews(app['id'], page=i)
        if not fetched_reviews:
            # An empty page means the feed is exhausted; skip the remaining requests.
            break

        # Find the first fetched review that is not newer than what we have stored.
        cutoff = None
        for j, review in enumerate(fetched_reviews):
            if pd.to_datetime(review['date']) <= latest_review_date:
                cutoff = j
                break

        if cutoff is None:
            new_reviews += fetched_reviews
        else:
            # Keep only the reviews ahead of the cutoff and stop paging.
            new_reviews += fetched_reviews[:cutoff]
            break

    app_new_reviews[app['name']] = new_reviews

Microsoft HealthVault
FitnessSyncer
MTBC PHR
Accolade, Inc.
Capzule
My Medical
Health Records : GenexEHR
Healthspek - Personal Health Record & Family Health Record - Complete Medical Record
Medfusion Plus
higi
Sharecare
Aetna Mobile
Kaiser Permanente
myCigna
Blue Shield of California
MyHumana
Anthem Anywhere
Capital BlueCross Virtual Care


In [80]:
# Flatten the per-app review lists into one list of rows, tagging every
# review with the name and id of the app it belongs to.
# The tuple order fixes the column order after 'name' and 'id'.
REVIEW_FIELDS = ('title', 'author_name', 'author_uri', 'voteSum',
                 'voteCount', 'rating', 'text', 'date')

new_reviews_list = []
for ix, app in df_metadata.iterrows():
    # .get(): an app the fetch cell skipped has no entry in app_new_reviews;
    # treat it as "no new reviews" instead of raising KeyError.
    for new_review in app_new_reviews.get(app['name'], []):
        review = OrderedDict([('name', app['name']), ('id', app['id'])])
        for field in REVIEW_FIELDS:
            review[field] = new_review.get(field)
        new_reviews_list.append(review)

print("Gathered %s new reviews" % (str(len(new_reviews_list))))

Gathered 16 new reviews


In [83]:
# Add the newly gathered reviews below the stored ones and renumber the index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
df_reviews_updated = pd.concat([df_reviews, pd.DataFrame(new_reviews_list)], ignore_index=True)
df_reviews_updated.head()

Unnamed: 0,name,id,title,author_name,author_uri,voteSum,voteCount,rating,text,date
0,Health Records : GenexEHR,1089533617,,,,,,,,
1,ncy,https://itunes.apple.com/us/reviews/id327374344,5,From a wonderful company like MS I always expe...,2018-10-30T21:11:53-07:00,,,,,
2,Microsoft HealthVault,546835834,Itâ€™s a shame for Microsoft to publish such a...,omg.itsjj,https://itunes.apple.com/us/reviews/id100859113,0.0,0.0,2.0,The design is way too outdated and desktop-lik...,2018-10-29T22:44:26-07:00
3,Microsoft HealthVault,546835834,Lab Corp blood results,liver transplant patient,https://itunes.apple.com/us/reviews/id782157250,0.0,0.0,1.0,Lab Corp had my weekly blood work results on t...,2018-08-08T05:03:25-07:00
4,Microsoft HealthVault,546835834,What happened,Bistline,https://itunes.apple.com/us/reviews/id335994415,0.0,0.0,3.0,This app used to be my favorite. It would sync...,2018-06-27T14:07:45-07:00


In [None]:
df_reviews_updated.ro