In [None]:
import json
import pandas as pd
import requests
from collections import OrderedDict

# Searchman API key; get_reviews sends it as the `apiKey` query parameter.
# Left empty here -- presumably it must be filled in before the API cells are run.
searchman_key = ""

#### Google Play metadata


In [None]:
# Load the Google Play metadata table, then display its (rows, columns) shape.
df_metadata = pd.read_csv("data/googleplay_metadata_phr.csv")
df_metadata.shape

In [None]:
# Parallel lists: app_names[i] and app_ids[i] come from the same metadata row.
app_names, app_ids = (list(df_metadata[column]) for column in ('name', 'id'))

In [None]:
# Use Searchman API to get up to `count` (default 100) reviews per app

def get_reviews(names, ids, sortby, count=100):
    """Fetch reviews for every app from the Searchman reviews endpoint.

    One GET request is issued per (name, id) pair. The module-level
    ``searchman_key`` is sent as the ``apiKey`` query parameter.

    Parameters
    ----------
    names : iterable
        App names; used as the keys of the returned dict.
    ids : iterable
        App ids, paired positionally with ``names`` and sent as ``appId``.
    sortby : str
        Value of the ``sort`` query parameter (this notebook passes
        "mosthelpful" and "mostrecent").
    count : int, optional
        Value of the ``count`` query parameter. Defaults to 100.

    Returns
    -------
    dict
        Maps each app name to the ``'data'`` entry of that app's JSON
        response. If two apps share a name, the later one overwrites the
        earlier one.

    Raises
    ------
    KeyError
        If a response's JSON has no ``'data'`` entry.
    """
    endpoint = "http://api.searchman.io/v1/android/us/app/reviews"

    reviews_by_app = {}
    for app_name, app_id in zip(names, ids):
        print(app_id)  # progress indicator: one line per request
        # Let requests build the query string so every value is URL-encoded.
        query = {
            "appId": app_id,
            "sort": sortby,
            "count": count,
            "apiKey": searchman_key,
        }
        response = requests.get(endpoint, params=query)
        payload = json.loads(response.text)
        reviews_by_app[app_name] = payload['data']

    # Callers index the result by app name, so the dict must be returned.
    return reviews_by_app

In [None]:
# Query every app twice: once with sort "mosthelpful", once with sort "mostrecent".
app_reviews_helpful = get_reviews(names=app_names, ids=app_ids, sortby="mosthelpful")
app_reviews_recent = get_reviews(names=app_names, ids=app_ids, sortby="mostrecent")

In [None]:
# Persist the raw API results, one JSON file per sort order.
raw_outputs = (
    ("googleplay_reviews_recent_raw.json", app_reviews_recent),
    ("googleplay_reviews_helpful_raw.json", app_reviews_helpful),
)
for raw_path, raw_reviews in raw_outputs:
    with open(raw_path, "w") as out_file:
        out_file.write(json.dumps(raw_reviews))

In [None]:
# Aggregate helpful and recent reviews and
# create a unique list of reviews for each app.

app_reviews_together = {}
for app in app_names:
    # Helpful reviews first, then recent ones.
    app_reviews_all = app_reviews_helpful[app] + app_reviews_recent[app]
    # Keying by review id keeps exactly one review per id; when an id occurs
    # more than once, the later entry (the "recent" copy) replaces the earlier.
    app_reviews_unique = list({review['id']: review for review in app_reviews_all}.values())
    app_reviews_together[app] = app_reviews_unique

In [None]:
# Flatten the per-app review JSON into one flat record per review.

# Columns copied unchanged from the app's metadata row.
metadata_columns = ('name', 'id', 'userRatingCount', 'avgUserRating')
# (output column, key in the review JSON), listed in output column order.
review_columns = (
    ('title', 'title'),
    ('author', 'author'),
    ('authorId', 'authorId'),
    ('rating', 'rating'),
    ('date', 'timestamp'),
    ('voteSum', 'voteSum'),
    ('voteCount', 'voteCount'),
    ('content', 'body'),
)

app_reviews_tabular = []
for _, app_meta in df_metadata.iterrows():
    for entry in app_reviews_together[app_meta['name']]:
        record = OrderedDict((column, app_meta[column]) for column in metadata_columns)
        # Keys missing from a review become None rather than raising.
        record.update((column, entry.get(key)) for column, key in review_columns)
        text = record['content']
        if text is not None:
            # Strip newlines so each review body stays on a single line.
            record['content'] = text.replace('\n', '')
        app_reviews_tabular.append(record)

In [None]:
# Turn the list of review records into a DataFrame and preview the first two rows.
df_out = pd.DataFrame(app_reviews_tabular)
df_out.head(2)

In [None]:
# Remove reviews with no content
has_content = ~df_out['content'].isna()
df_out = df_out[has_content]

In [None]:
# Write the review table to CSV.
# NOTE(review): no `index=` argument is passed, so pandas' default applies and the
# row index is presumably written as an unnamed first column -- confirm that
# downstream readers expect it.
df_out.to_csv("data/googleplay_reviews_phr.csv")