In [1]:
import os
import json
import requests
import xmltodict
import re

from collections import Counter

In [2]:

def fetch_reviews(app_id, country = 'us', sortBy = 'mostRecent', page =1, timeout = 30):
    """Fetch one page of customer reviews for an app from the iTunes RSS feed.

    Parameters
    ----------
    app_id : int or str
        iTunes id of the app; it is converted to a string and placed in the URL.
    country : str
        Storefront code placed in the feed URL.
    sortBy : str
        Sort order placed in the feed URL (e.g. 'mostRecent' or 'mostHelpful').
    page : int
        Page number placed in the feed URL.
    timeout : float
        Seconds to wait for the server; passed to ``requests.get`` so a stalled
        connection raises instead of hanging the notebook forever.

    Returns
    -------
    list of dict
        One dict per review with the keys ``title``, ``author_name``,
        ``author_uri``, ``voteSum``, ``voteCount``, ``rating``, ``text`` and
        ``date``. Returns an empty list when the feed contains no entries.
        Entries that lack any of the expected fields are skipped.
    """
    URL = 'https://itunes.apple.com/%s/rss/customerreviews/id=%s/sortBy=%s/page=%s/xml'
    url = URL % (country, str(app_id), sortBy, page)

    r = requests.get(url, timeout=timeout)
    xml_dict = xmltodict.parse(r.text)

    try:
        entries = xml_dict['feed']['entry']
    except (KeyError, TypeError):
        # No <feed> root, an empty <feed/>, or a feed without any <entry>
        return []

    # xmltodict yields a single dict (not a list) when there is exactly one
    # <entry>; wrap it so the lone review is parsed instead of being dropped.
    if isinstance(entries, dict):
        entries = [entries]
    elif not isinstance(entries, list):
        return []

    reviews = []
    for review in entries:
        try:
            reviews.append(
                {
                    'title': review['title'],
                    'author_name': review['author']['name'],
                    'author_uri': review['author']['uri'],
                    'voteSum': review['im:voteSum'],
                    'voteCount': review['im:voteCount'],
                    'rating': review['im:rating'],
                    'text': review['content'][0]['#text'],
                    'date': review['updated'],
                }
            )
        except (KeyError, IndexError, TypeError):
            # Best effort: an entry without the review fields is skipped
            continue
    return reviews

In [3]:
# Read the CSV listing the PHR/EHR apps into a list of raw text lines
# (each line keeps its trailing newline).

with open('data/appstore_app_id.csv') as f:
    data = list(f)
    

In [4]:
# Get an app name and its iTunes id
# Each CSV row is split on commas: field 0 is the app name, field 1 holds the link.
data_clean = []
for row in data:
    fields = row.split(',')
    if len(fields) < 2:
        # Blank or single-column row: there is no link field to read
        continue
    name, link_field = fields[0], fields[1]
    if len(link_field) <= 1:
        # Filter out rows whose link field is empty
        continue
    # The link is the text between the first pair of doubled quotes ("")
    data_clean.append((name, link_field.split('""')[1]))

app_ids = []
for name, link in data_clean:
    # The last path segment of the link, minus any query string, carries the id;
    # keep only its digits.
    last_segment = link.split('/')[-1].split('?')[0]
    app_ids.append({'name': name, 'id': ''.join(re.findall(r'\d+', last_segment))})

In [5]:
# Display the parsed name/id records to check the extraction by eye
app_ids

[{'name': 'MS HealthVault', 'id': '546835834'},
 {'name': 'FitnessSyncer', 'id': '1159207899'},
 {'name': 'MTBC PHR', 'id': '499832131'},
 {'name': 'Accolade', 'id': '1203452254'},
 {'name': 'Capzule', 'id': '386321118'},
 {'name': 'MyMedical', 'id': '347860026'},
 {'name': 'GenexEHR', 'id': '1089533617'},
 {'name': 'HealthSpek', 'id': '576488481'},
 {'name': 'MedFusionPlus', 'id': '922524241'},
 {'name': 'higi', 'id': '599485135'},
 {'name': 'Sharecare', 'id': '964313779'},
 {'name': 'Aetna', 'id': '380845816'},
 {'name': 'Kaiser Permanente', 'id': '493390354'},
 {'name': 'Cigna', 'id': '569266174'},
 {'name': 'Blue Shield', 'id': '728293729'},
 {'name': 'Humana', 'id': '779622024'},
 {'name': 'Anthem', 'id': '589443627'},
 {'name': 'Capital Blue cross virtual care', 'id': '1375554671'}]

In [6]:
# Number of apps for which a name/id record was built
len(app_ids)

18

In [7]:
# Lookup metadata (including reviews) for an app with the specific iTunes ID
URL = 'https://itunes.apple.com/lookup?id=%s'
# Review pages 1..MAX_REVIEW_PAGES are requested for each sort order
MAX_REVIEW_PAGES = 10

app_metadata = []
for j, line in enumerate(app_ids, start=1):
    # Progress: running index of the app being processed
    print(j)
    app_id = line['id']

    # Get app metadata
    url = URL % app_id
    r = requests.get(url, timeout=30)
    results = json.loads(r.text).get("results", [])
    if not results:
        # The lookup returned no record for this id; skip it instead of
        # aborting the whole run with an IndexError.
        print('No lookup result for %s (id %s); skipping' % (line['name'], app_id))
        continue
    app = results[0]

    app_data = {'name': app['trackName'],
                'id': app['trackId'],
                'url': app['trackViewUrl'],
                'price': app['price'],
                # Rating fields may be absent from the lookup record; default to ''
                'avgUserRating': app.get('averageUserRating',''),
                'userRatingCount': app.get('userRatingCount',''),
                'currentVersionReleaseDate': app['currentVersionReleaseDate'],
                # Flatten the description onto a single line
                'description': app['description'].replace('\n', ' ')
                }
    print(app_data['name'])

    # Get most recent and most helpful reviews (separately) for app (up to 500)
    recent_reviews = []
    helpful_reviews = []
    for i in range(1, MAX_REVIEW_PAGES + 1):
        recent_reviews += fetch_reviews(app_data['id'], page=i)
        helpful_reviews += fetch_reviews(app_data['id'], sortBy='mostHelpful', page=i)

    app_data['recent_reviews'] = recent_reviews
    app_data['helpful_reviews'] = helpful_reviews
    app_metadata.append(app_data)

1
Microsoft HealthVault
2
FitnessSyncer
3
MTBC PHR
4
Accolade, Inc.
5
Capzule
6
My Medical
7
Health Records : GenexEHR
8
Healthspek - Personal Health Record & Family Health Record - Complete Medical Record
9
Medfusion Plus
10
higi
11
Sharecare
12
Aetna Mobile
13
Kaiser Permanente
14
myCigna
15
Blue Shield of California
16
MyHumana
17
Anthem Anywhere
18
Capital BlueCross Virtual Care


In [8]:
# Number of apps whose metadata and reviews were collected
len(app_metadata)

18

In [9]:
# Save the collected metadata and reviews as one UTF-8 JSON document,
# keeping non-ASCII characters unescaped.
with open('data/phr_apps_metadata_and_reviews.json', 'w', encoding='utf-8') as f:
    json.dump(app_metadata, f, ensure_ascii=False)