In [2]:
import json
import os
from collections import OrderedDict

import pandas as pd
import requests

# Read the Searchman API key from the environment so the credential is not
# stored in the notebook. Set SEARCHMAN_API_KEY before starting the kernel.
# NOTE(review): a literal key was previously committed on this line; treat it
# as exposed and rotate it.
searchman_key = os.environ.get("SEARCHMAN_API_KEY", "")
if not searchman_key:
    print("WARNING: SEARCHMAN_API_KEY is not set; Searchman requests will be sent without an API key.")

#### Google Play metadata


In [19]:
# Read the app list without treating the first row as a header, then label
# the two columns explicitly.
df_metadata = pd.read_csv("data/data_googleplay_dna.csv", header=None)
df_metadata.columns = ['name', 'id']
df_metadata

Unnamed: 0,name,id
0,23andMe Health,com.twentythreeandme.app


In [20]:
# Plain Python lists of display names and package ids, in table order.
app_names = df_metadata['name'].tolist()
app_ids = df_metadata['id'].tolist()

In [21]:
# Show the package ids that will be sent to the review API.
app_ids

['com.twentythreeandme.app']

In [28]:
# Use the Searchman API to get up to `count` (default 100) reviews per app.

def get_reviews(names, ids, sortby, count=100, timeout=30):
    """Fetch reviews for each app from the Searchman reviews endpoint.

    Parameters
    ----------
    names : iterable
        App display names; each becomes a key of the returned dict.
    ids : iterable
        App ids sent as the ``appId`` query parameter, paired positionally
        with ``names`` (pairing stops at the shorter of the two).
    sortby : str
        Value of the ``sort`` query parameter (this notebook passes
        "mosthelpful" and "mostrecent").
    count : int, optional
        Value of the ``count`` query parameter. Defaults to 100.
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up. Defaults to 30.

    Returns
    -------
    dict
        Maps each app name to the ``data`` field of that app's JSON response.
        If a name occurs more than once, the last response wins.

    Raises
    ------
    requests.HTTPError
        If a request returns a 4xx/5xx status.
    KeyError
        If a response body has no ``data`` field.
    """
    # NOTE(review): the API key travels in the query string over plain HTTP;
    # switch to https if the service supports it.
    endpoint = "http://api.searchman.io/v1/android/us/app/reviews"

    app_reviews_all = {}
    for app_name, app_id in zip(names, ids):
        print(app_id)  # progress indicator, one line per request
        # Let requests build and URL-encode the query string.
        params = {
            "appId": app_id,
            "sort": sortby,
            "count": count,
            "apiKey": searchman_key,
        }
        r = requests.get(endpoint, params=params, timeout=timeout)
        # Fail on HTTP errors here instead of on a confusing KeyError below.
        r.raise_for_status()
        app_reviews_all[app_name] = r.json()['data']
    return app_reviews_all

In [29]:
# Fetch the same apps under both sort orders: most helpful first, then most recent.
app_reviews_helpful = get_reviews(names=app_names, ids=app_ids, sortby="mosthelpful")
app_reviews_recent = get_reviews(names=app_names, ids=app_ids, sortby="mostrecent")

com.twentythreeandme.app
com.twentythreeandme.app


In [30]:
# Save the raw API payloads as JSON, one file per sort order
# (most recent first, then most helpful).
raw_dumps = [
    ("googleplay_reviews_recent_raw_dna.json", app_reviews_recent),
    ("googleplay_reviews_helpful_raw_dna.json", app_reviews_helpful),
]
for out_path, payload in raw_dumps:
    with open(out_path, "w") as f:
        f.write(json.dumps(payload))

In [31]:
# Inspect the raw "mosthelpful" payload.
# NOTE(review): this renders the entire dict, including reviewer names and ids;
# consider showing a small sample and clearing the output before sharing.
app_reviews_helpful

{'23andMe Health': [{'id': 1548179804630,
   'rating': 3,
   'body': "the learning aspect of the app it's great, but it constantly won't pull up messages sent so that i can read them. some times they don't even pull up on the website as well. it's just a coding mess honestly.",
   'reviewId': 'gp:AOqpTOGNnfT-V6GN59kAhgQ0wDQByTecpodB-M3dJ-cMVnIUMn_gYtillJVLvPVTdb13_S_3YYyPOO9E-bWzZ3E',
   'version': '4.76.2',
   'authorId': '110568310443405189483',
   'author': 'Carrie Rios',
   'authorProfileImageUrl': 'https://lh3.googleusercontent.com/a-/AAuE7mD-5AfkvjKFkPn5LQZduP-_WZSDFkSKZTRTRfEh3Q',
   'timestampEpoch': 1548179804,
   'timestamp': '2019-01-22 17:56:44'},
  {'id': 1548673102638,
   'rating': 3,
   'body': "This app would be awesome if I could also receive my messages through it. I have family members trying to contact me, and I can't see it unless I use the actual website. Please make messages an option on the app as well.",
   'reviewId': 'gp:AOqpTOFIKHB8Ifr3xCcDPJDuHZ4vTwJiCZTqBW

In [32]:
# Aggregate helpful and recent reviews and
# create a unique list of reviews for each app.
#
# Reviews are de-duplicated on their 'id' field. A review present in both
# lists keeps the position of its first occurrence (the "helpful" list comes
# first) but takes its content from the later occurrence ("recent" list).

app_reviews_together = {}
for app in app_names:
    app_reviews_all = app_reviews_helpful[app] + app_reviews_recent[app]
    # A dict keyed on id collapses duplicates while preserving insertion order.
    app_reviews_unique = list({review['id']: review for review in app_reviews_all}.values())
    app_reviews_together[app] = app_reviews_unique

In [35]:
# Flatten the per-app review JSON into one ordered record per review.

app_reviews_tabular = []
for app_name, app_id in zip(df_metadata['name'], df_metadata['id']):
    for review in app_reviews_together[app_name]:
        body = review.get('body')
        row = OrderedDict([
            ('name', app_name),
            ('id', app_id),
            # ('userRatingCount', metadata['userRatingCount']),
            # ('avgUserRating', metadata['avgUserRating']),
            ('title', review.get('title')),
            ('author', review.get('author')),
            ('authorId', review.get('authorId')),
            ('rating', review.get('rating')),
            ('date', review.get('timestamp')),
            ('voteSum', review.get('voteSum')),
            ('voteCount', review.get('voteCount')),
            # Newlines are removed (not replaced) so each review is one line.
            ('content', None if body is None else body.replace('\n', '')),
        ])
        app_reviews_tabular.append(row)

In [36]:
# Build the review table from the flattened records and preview two rows.
df_out = pd.DataFrame(app_reviews_tabular)
df_out.head(2)

Unnamed: 0,name,id,title,author,authorId,rating,date,voteSum,voteCount,content
0,23andMe Health,com.twentythreeandme.app,,Carrie Rios,110568310443405189483,3,2019-01-22 17:56:44,,,"the learning aspect of the app it's great, but..."
1,23andMe Health,com.twentythreeandme.app,,Angie Q,104347726632168158977,3,2019-01-28 10:58:22,,,This app would be awesome if I could also rece...


In [37]:
# Remove reviews with no content.
has_content = df_out['content'].notna()
df_out = df_out[has_content]

In [39]:
# Write the filtered review table to CSV.
# NOTE(review): by default to_csv also writes the DataFrame index as an
# unnamed first column; pass index=False if downstream readers do not expect it.
df_out.to_csv("data/googleplay_reviews_dna.csv")