In [1]:
import os
import json
import requests
import xmltodict
import re

from collections import Counter

In [2]:

def fetch_reviews(app_id, country = 'us', sortBy = 'mostRecent', page =1, timeout = 30):
    '''Fetch one page of customer reviews for an app from the iTunes RSS feed.

    Parameters
    ----------
    app_id : int or str
        iTunes id of the app; converted with ``str()`` before being put in the URL.
    country : str
        Storefront code used as the first path segment of the feed URL.
    sortBy : str
        Sort order segment of the feed URL (the callers in this notebook use
        'mostRecent' and 'mostHelpful').
    page : int
        Page number segment of the feed URL.
    timeout : float
        Seconds passed to ``requests.get`` so a stalled connection cannot hang
        the notebook forever.

    Returns
    -------
    list of dict
        One dict per feed entry that carries all review fields, with keys
        'title', 'author_name', 'author_uri', 'voteSum', 'voteCount',
        'rating', 'text' and 'date'. Entries missing any of those fields are
        skipped. An empty list is returned when the feed has no entries.

    Errors raised by ``requests.get`` or ``xmltodict.parse`` are not caught.
    '''
    url = 'https://itunes.apple.com/%s/rss/customerreviews/id=%s/sortBy=%s/page=%s/xml' % (
        country, str(app_id), sortBy, page)

    r = requests.get(url, timeout=timeout)
    xml_dict = xmltodict.parse(r.text)

    try:
        reviews_list = xml_dict['feed']['entry']
    except (KeyError, TypeError):
        # No 'feed'/'entry' key, or an empty <feed/> parsed to None.
        return []

    # xmltodict gives a dict (not a list) when an element occurs exactly once;
    # wrap it so the loop below sees entries rather than the dict's keys.
    if isinstance(reviews_list, dict):
        reviews_list = [reviews_list]

    reviews = []
    for review in reviews_list:
        try:
            reviews.append({
                'title': review['title'],
                'author_name': review['author']['name'],
                'author_uri': review['author']['uri'],
                'voteSum': review['im:voteSum'],
                'voteCount': review['im:voteCount'],
                'rating': review['im:rating'],
                'text': review['content'][0]['#text'],
                'date': review['updated'],
            })
        except (KeyError, TypeError, IndexError):
            # Best effort: an entry without the full set of review fields is skipped.
            continue
    return reviews

In [9]:
# Read the raw lines of the CSV that lists the apps (one "name,link" row per line)

with open('data/data_appstore_dna.csv') as csv_file:
    data = list(csv_file)
    

In [11]:
# Discard the first line read from the CSV (its value is echoed as this cell's output).
# NOTE(review): this mutates `data` in place, so re-running the cell drops another row.
data.pop(0)

',\n'

In [18]:
# NOTE(review): `data_clean` is first assigned in the following cell (In [19]); on a
# fresh kernel run top-to-bottom this cell raises NameError.
data_clean

[['23andMe Health',
  'https://itunes.apple.com/us/app/23andme-dna-testing/id952516687\n'],
 ['DNA Passport (Helix)',
  'https://itunes.apple.com/us/app/dnapassport-explore-your-dna/id1306945262']]

In [19]:
# Derive each app's name and numeric iTunes id from the raw CSV lines
data_clean = []
j = 0
# Split every line into [App name, link], echoing the line and a running count
for j, s in enumerate(data, start=1):
    print(s)
    print(j)
    fields = s.split(',')
    data_clean.append([fields[0], fields[1]])
# Keep only rows whose link is non-empty and cut the link at its newline
data_clean = [(name, link.split('\n')[0]) for name, link in data_clean if len(link) > 1]
app_ids = []
for name, link in data_clean:
    # The id is every digit found in the last URL path segment, query string excluded
    last_segment = link.split('/')[-1].split('?')[0]
    app_ids.append({'name': name, 'id': ''.join(re.findall(r'\d+', last_segment))})

23andMe Health,https://itunes.apple.com/us/app/23andme-dna-testing/id952516687

1
DNA Passport (Helix),https://itunes.apple.com/us/app/dnapassport-explore-your-dna/id1306945262
2


In [20]:
# Number of apps for which a name/id pair was built
len(app_ids)

2

In [21]:
# Inspect the extracted name/id pairs
app_ids

[{'name': '23andMe Health', 'id': '952516687'},
 {'name': 'DNA Passport (Helix)', 'id': '1306945262'}]

In [23]:
# Lookup metadata (including reviews) for an app with the specific iTunes ID
URL = 'https://itunes.apple.com/lookup?id=%s'
REVIEW_PAGES = 10       # review pages fetched per sort order
REQUEST_TIMEOUT = 30    # seconds; keeps a stalled lookup from hanging the notebook

app_metadata = []
for j, line in enumerate(app_ids, start=1):
    # Get app metadata
    print(j)
    app_id = line['id']

    url = URL % app_id
    r = requests.get(url, timeout=REQUEST_TIMEOUT)
    results = json.loads(r.text).get("results") or []
    if not results:
        # An empty or unknown id yields no results; skip it instead of aborting
        # the whole run with an IndexError.
        print('No lookup result for id %r (%s); skipping' % (app_id, line['name']))
        continue
    app = results[0]

    app_data = {'name': app['trackName'],
                'id': app['trackId'],
                'url': app['trackViewUrl'],
                'price': app['price'],
                # Rating fields are optional in the lookup response; default to ''
                'avgUserRating': app.get('averageUserRating',''),
                'userRatingCount': app.get('userRatingCount',''),
                'currentVersionReleaseDate': app['currentVersionReleaseDate'],
                # Flatten the description onto a single line
                'description': app['description'].replace('\n', ' ')
                }
    print(app_data['name'])

    # Get most recent and most helpful reviews (separately) for app (up to 500)
    recent_reviews = []
    helpful_reviews = []
    for i in range(1, REVIEW_PAGES + 1):
        recent_reviews += fetch_reviews(app_data['id'], page=i)
        helpful_reviews += fetch_reviews(app_data['id'], sortBy='mostHelpful', page=i)

    app_data['recent_reviews'] = recent_reviews
    app_data['helpful_reviews'] = helpful_reviews
    app_metadata.append(app_data)

1
23andMe - DNA Testing
2
DNAPassport - Explore Your DNA


In [24]:
# Number of apps for which metadata and reviews were collected
len(app_metadata)

2

In [25]:
# Persist the collected metadata and reviews as UTF-8 JSON, keeping non-ASCII text readable
with open('data/phr_apps_metadata_and_reviews_dna.json', 'w', encoding='utf-8') as json_out:
    json.dump(app_metadata, json_out, ensure_ascii=False)

## Convert the JSON to CSV: separate files for app metadata, recent reviews, and helpful reviews

In [26]:
# From json to csv
import os, sys, string, json, csv
import pandas as pd


In [27]:
def app_metadata_csv(json_file_path, csv_output_path):
    '''Write a csv file with app metadata without reviews.

    Reads a JSON array of app records from ``json_file_path`` (UTF-8) and
    writes one row per app to ``csv_output_path`` (UTF-8). The columns are the
    keys of the first record, excluding 'recent_reviews' and
    'helpful_reviews'; every value is written as ``str(value)``.

    An empty JSON array produces an empty output file. A KeyError is raised
    if a later record lacks one of the header keys.
    '''
    review_keys = ('recent_reviews', 'helpful_reviews')

    # Load json data; the context manager closes the file even on error
    with open(json_file_path, encoding='utf-8') as json_file:
        jsn = json.load(json_file)

    # newline='' leaves line-ending handling to the csv module, as its docs require
    with open(csv_output_path, 'w', newline='', encoding='utf-8') as csv_data:
        if not jsn:
            return
        csvwriter = csv.writer(csv_data)

        # Write headers to the output csv file
        headers = [head for head in jsn[0].keys() if head not in review_keys]
        csvwriter.writerow(headers)

        # One row per app, columns in header order
        for app in jsn:
            csvwriter.writerow([str(app[head]) for head in headers])

In [28]:
def recent_reviews_csv(json_file_path, csv_output_path):
    '''Write a csv file with app recent reviews where each row is one review.

    Reads a JSON array of app records from ``json_file_path`` (UTF-8) and
    writes to ``csv_output_path`` (UTF-8). The header is the app's 'name' and
    'id' keys (in the order they occur in the first record) followed by the
    keys of the first review found in any app's 'recent_reviews' list.

    Each review becomes one row prefixed with its app's name and id. An app
    with no recent reviews still gets one row, with ' ' in every review
    column. An empty JSON array produces an empty output file.
    '''
    app_fields = ['name', 'id']

    # Load json data; the context manager closes the file even on error
    with open(json_file_path, encoding='utf-8') as json_file:
        jsn = json.load(json_file)

    # newline='' leaves line-ending handling to the csv module, as its docs require
    with open(csv_output_path, 'w', newline='', encoding='utf-8') as csv_data:
        if not jsn:
            return
        csvwriter = csv.writer(csv_data)

        # Write headers to the output csv file
        headers = [head for head in jsn[0].keys() if head in app_fields]
        # Take the review columns from the first app that actually has reviews,
        # so a review-less first app does not raise IndexError.
        sample_review = next(
            (app['recent_reviews'][0] for app in jsn if app['recent_reviews']), {})
        headers.extend(sample_review.keys())
        csvwriter.writerow(headers)

        # Write recent reviews with one review per row
        for app in jsn:
            # `None` stands in for "no review" so the app still yields one row
            for review in app['recent_reviews'] or [None]:
                row = []
                for head in headers:
                    if head in app_fields:
                        row.append(str(app[head]))
                    elif review is None:
                        row.append(' ')
                    else:
                        row.append(str(review[head]))
                csvwriter.writerow(row)

In [29]:
def helpful_reviews_csv(json_file_path, csv_output_path):
    '''Write a csv file with app helpful reviews where each row is one review.

    Reads a JSON array of app records from ``json_file_path`` (UTF-8) and
    writes to ``csv_output_path`` (UTF-8). The header is the app's 'name' and
    'id' keys (in the order they occur in the first record) followed by the
    keys of the first review found in any app's 'helpful_reviews' list.

    Each review becomes one row prefixed with its app's name and id. An app
    with no helpful reviews still gets one row, with ' ' in every review
    column. An empty JSON array produces an empty output file.
    '''
    app_fields = ['name', 'id']

    # Load json data; the context manager closes the file even on error
    with open(json_file_path, encoding='utf-8') as json_file:
        jsn = json.load(json_file)

    # newline='' leaves line-ending handling to the csv module, as its docs require
    with open(csv_output_path, 'w', newline='', encoding='utf-8') as csv_data:
        if not jsn:
            return
        csvwriter = csv.writer(csv_data)

        # Write headers to the output csv file
        headers = [head for head in jsn[0].keys() if head in app_fields]
        # Take the review columns from the first app that actually has reviews,
        # so a review-less first app does not raise IndexError.
        sample_review = next(
            (app['helpful_reviews'][0] for app in jsn if app['helpful_reviews']), {})
        headers.extend(sample_review.keys())
        csvwriter.writerow(headers)

        # Write helpful reviews with one review per row
        for app in jsn:
            # `None` stands in for "no review" so the app still yields one row
            for review in app['helpful_reviews'] or [None]:
                row = []
                for head in headers:
                    if head in app_fields:
                        row.append(str(app[head]))
                    elif review is None:
                        row.append(' ')
                    else:
                        row.append(str(review[head]))
                csvwriter.writerow(row)

In [30]:
# Paths for the CSV conversion. They are relative to the working directory, like
# the 'data/...' paths used when the app list is read and the JSON is written,
# so this step reads the JSON produced above regardless of the user's home layout.
DATA_DIR = 'data'
json_path = os.path.join(DATA_DIR, 'phr_apps_metadata_and_reviews_dna.json')
recent_csv_path = os.path.join(DATA_DIR, 'appstore_recent_reviews_dna.csv')
helpful_csv_path = os.path.join(DATA_DIR, 'appstore_helpful_reviews_dna.csv')
metadata_csv_path = os.path.join(DATA_DIR, 'appstore_metadata_dna.csv')

In [31]:
# Write the app-level metadata (without reviews) to CSV
app_metadata_csv(json_path, metadata_csv_path)

In [32]:
# Write the recent reviews to CSV, one review per row
recent_reviews_csv(json_path, recent_csv_path)

In [33]:
# Write the helpful reviews to CSV, one review per row
helpful_reviews_csv(json_path, helpful_csv_path)

In [34]:
#OPTIONAL: check the number of rows in each file

# Read the JSON as UTF-8 (the encoding it was written with) and close the file promptly
with open(json_path, encoding='utf-8') as json_file:
    json_data = json_file.read()
jsn = json.loads(json_data)

# Total reviews across all apps, per review list
count = sum(len(app['recent_reviews']) for app in jsn)
print("Total number of recent reviews is:", count)

count = sum(len(app['helpful_reviews']) for app in jsn)
print("Total number of helpful reviews is:", count)

count = len(jsn)
print("Total number of apps is:", count)

Total number of recent reviews is: 535
Total number of helpful reviews is: 535
Total number of apps is: 2
