# Scraping Google Playstore Reviews of GCash

* reference = https://python.plainenglish.io/scraping-storing-google-play-app-reviews-with-python-5640c933c476
* app url = https://play.google.com/store/apps/details?id=com.globe.gcash.android&hl=en&gl=US
* app id = com.globe.gcash.android

#### Importing necessary libraries

In [1]:
#Imports

import pandas as pd
from google_play_scraper import app, Sort, reviews

from pprint import pprint

# for storing in MongoDB
import pymongo
from pymongo import MongoClient

# for keeping track of timing
import datetime as dt
from tzlocal import get_localzone


# for building in wait times
import random
import time

#### Setting up Mongo DB tables

MongoDB creates databases and collections lazily. This means these things won’t actually exist until we start inserting documents (essentially the MongoDB equivalent to the rows of relational database tables) into our collections.

This code block sets up the Mongo Client, creatse a new database for our project, and sets up new collections (essentially the MongoDB equivalent to the tables of relational databases). We’ll store app info in one collection and app reviews in another.


In [6]:
## Set up Mongo client
client = MongoClient(host='localhost', port=27017)

## Database for project
gcash_reviews_db = client['gcash_reviews_db']

## Set up new collection within project db for app info
info_collection = gcash_reviews_db['info_collection']

## Set up new collection within project db for app reviews
review_collection = gcash_reviews_db['review_collection']

#### Setting up App Scraping

In [7]:
app_names = ['GCash']
app_ids = ['com.globe.gcash.android']

In [8]:
## Loop through app IDs to get app info
app_info = []
for i in app_ids:
    info = app(i)
    del info['comments']
    app_info.append(info)

## Pretty print the data for the first app
pprint(app_info[0])

{'adSupported': None,
 'androidVersion': '4.4',
 'androidVersionText': '4.4 and up',
 'appId': 'com.globe.gcash.android',
 'containsAds': False,
 'contentRating': 'Everyone',
 'contentRatingDescription': None,
 'currency': 'USD',
 'description': 'A mobile wallet app that lets you instantly pay bills, buy '
                'load, send money, shop, and more – all in the safety of your '
                'own home!\r\n'
                '\r\n'
                "Enjoy fast and easy payments with GCash! It's a safe, secure "
                'mobile wallet that connects to your mobile number, making it '
                'available anytime, anywhere. Download GCash today so you can '
                'complete your errands without having to break quarantine!\r\n'
                '\r\n'
                '• Open to all networks\r\n'
                '• Zero-rated for Globe and TM\r\n'
                '• Regulated by BSP\r\n'
                '\r\n'
                '<b>SEND MONEY IN REAL TIME</b>\r\n'


In [9]:
## Insert app details into info_collection
info_collection.insert_many(app_info)

<pymongo.results.InsertManyResult at 0x2944ebbed88>

In [10]:
## Query the collection and create DataFrame from the list of dicts
info_df = pd.DataFrame(list(info_collection.find({})))
info_df.head()

Unnamed: 0,_id,title,description,descriptionHTML,summary,summaryHTML,installs,minInstalls,score,ratings,...,adSupported,containsAds,released,updated,version,recentChanges,recentChangesHTML,editorsChoice,appId,url
0,60b75decc5eccca0e375ce54,"GCash - Buy Load, Pay Bills, Send Money",A mobile wallet app that lets you instantly pa...,A mobile wallet app that lets you instantly pa...,"A mobile wallet that lets you shop, send, save...","A mobile wallet that lets you shop, send, save...","10,000,000+",10000000,4.366425,554450,...,,False,"Mar 23, 2012",1621925171,5.41.0,Getting around the GCash app is now easier! We...,Getting around the GCash app is now easier! We...,False,com.globe.gcash.android,https://play.google.com/store/apps/details?id=...


## Scraping App Reviews

In [13]:
## Loop through apps to get reviews
for app_name, app_id in zip(app_names, app_ids):
    
    # Get start time
    start = dt.datetime.now(tz=get_localzone())
    fmt= "%m/%d/%y - %T %p"    
    
    # Print starting output for app
    print('---'*20)
    print('---'*20)    
    print(f'***** {app_name} started at {start.strftime(fmt)}')
    print()
    
    # Empty list for storing reviews
    app_reviews = []
    
    # Number of reviews to scrape per batch
    count = 200
    
    # To keep track of how many batches have been completed
    batch_num = 0
    
    
    # Retrieve reviews (and continuation_token) with reviews function
    rvws, token = reviews(
        app_id,           # found in app's url
        lang='en',        # defaults to 'en'
        country='us',     # defaults to 'us'
        sort=Sort.NEWEST, # start with most recent
        count=count       # batch size
    )
    
    
    # For each review obtained
    for r in rvws:
        r['app_name'] = app_name # add key for app's name
        r['app_id'] = app_id     # add key for app's id
     
    
    # Add the list of review dicts to overall list
    app_reviews.extend(rvws)
    
    # Increase batch count by one
    batch_num +=1 
    print(f'Batch {batch_num} completed.')
    
    # Wait 1 to 5 seconds to start next batch
    time.sleep(random.randint(3,5))
    
    
    
    # Append review IDs to list prior to starting next batch
    pre_review_ids = []
    for rvw in app_reviews:
        pre_review_ids.append(rvw['reviewId'])
    
    
    # Loop through at most max number of batches
    for batch in range(4999):
        rvws, token = reviews( # store continuation_token
            app_id,
            lang='en',
            country='us',
            sort=Sort.NEWEST,
            count=count,
            # using token obtained from previous batch
            continuation_token=token
        )
        
        # Append unique review IDs from current batch to new list
        new_review_ids = []
        for r in rvws:
            new_review_ids.append(r['reviewId'])
            
            # And add keys for name and id to ea review dict
            r['app_name'] = app_name # add key for app's name
            r['app_id'] = app_id     # add key for app's id
     
        # Add the list of review dicts to main app_reviews list
        app_reviews.extend(rvws)
        
        # Increase batch count by one
        batch_num +=1
        
        # Break loop and stop scraping for current app if most recent batch
          # did not add any unique reviews
        all_review_ids = pre_review_ids + new_review_ids
        if len(set(pre_review_ids)) == len(set(all_review_ids)):
            print(f'No reviews left to scrape. Completed {batch_num} batches.\n')
            break
        
        # all_review_ids becomes pre_review_ids to check against 
          # for next batch
        pre_review_ids = all_review_ids
        
        
        # At every 100th batch
        if batch_num%100==0:
            
            # print update on number of batches
            print(f'Batch {batch_num} completed.')
            
            # insert reviews into collection
            review_collection.insert_many(app_reviews)
            
            # print update about num reviews inserted
            store_time = dt.datetime.now(tz=get_localzone())
            print(f"""
            Successfully inserted {len(app_reviews)} {app_name} 
            reviews into collection at {store_time.strftime(fmt)}.\n
            """)
            
            # empty our list for next round of 100 batches
            app_reviews = []
        
        # Wait 1 to 5 seconds to start next batch
        time.sleep(random.randint(3,5))
      
    
    # Print update when max number of batches has been reached
      # OR when last batch didn't add any unique reviews
    print(f'Done scraping {app_name}.')
    print(f'Scraped a total of {len(set(pre_review_ids))} unique reviews.\n')
    
    
    # Insert remaining reviews into collection
    review_collection.insert_many(app_reviews)
    
    # Get end time
    end = dt.datetime.now(tz=get_localzone())
    
    # Print ending output for app
    print(f"""
    Successfully inserted all {app_name} reviews into collection
    at {end.strftime(fmt)}.\n
    """)
    print(f'Time elapsed for {app_name}: {end-start}')
    print('---'*20)
    print('---'*20)
    print('\n')
    
    # Wait 1 to 5 seconds to start scraping next app
    time.sleep(random.randint(1,5))

------------------------------------------------------------
------------------------------------------------------------
***** GCash started at 06/02/21 - 18:36:12 PM

Batch 1 completed.
Batch 100 completed.

            Successfully inserted 20000 GCash 
            reviews into collection at 06/02/21 - 18:44:25 PM.

            
Batch 200 completed.

            Successfully inserted 20000 GCash 
            reviews into collection at 06/02/21 - 18:52:49 PM.

            
Batch 300 completed.

            Successfully inserted 20000 GCash 
            reviews into collection at 06/02/21 - 19:01:38 PM.

            
Batch 400 completed.

            Successfully inserted 20000 GCash 
            reviews into collection at 06/02/21 - 19:09:56 PM.

            
Batch 500 completed.

            Successfully inserted 20000 GCash 
            reviews into collection at 06/02/21 - 19:18:32 PM.

            
Batch 600 completed.

            Successfully inserted 20000 GCash 
            r

In [12]:
len(app_reviews)

6600

In [20]:
app_reviews[3]

{'reviewId': 'gp:AOqpTOFGVLBuwumsJtFw91d8J4zVPx7oNpGHosC2NFSNUeRmypleYbvPSVOPu8CpAx6trlYJxEBI4cXNJH0aAQ',
 'userName': 'Irma Reyes',
 'userImage': 'https://play-lh.googleusercontent.com/a/AATXAJwm4GG5mzMbxFW7rW-naTjloLqSb8_QkKMwHdQ0=mo',
 'content': 'Relevant',
 'score': 5,
 'thumbsUpCount': 0,
 'reviewCreatedVersion': '5.40.0',
 'at': datetime.datetime(2021, 6, 2, 16, 12, 30),
 'replyContent': None,
 'repliedAt': None,
 'app_name': 'GCash',
 'app_id': 'com.globe.gcash.android'}

In [14]:
review_df = pd.DataFrame(list(review_collection.find({})))
review_df.head()

# remove ids
# split date column 
# ask for release dates ng software then tag

# least priority -> tag what language the reviews are 

Unnamed: 0,_id,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,app_name,app_id
0,60b76108c5eccca0e375ce55,gp:AOqpTOHa4x04QC93zb0pxQZ-1Q1GwFeysD-XpcDTW78...,Emmanuel Jim Roldan,https://play-lh.googleusercontent.com/a/AATXAJ...,Good for mobile online deals.,1,0,5.40.0,2021-06-02 18:32:37,,NaT,GCash,com.globe.gcash.android
1,60b76108c5eccca0e375ce56,gp:AOqpTOFOvZwSS0iVvmGbcBKkwFyprROh5KqfcJuI5jg...,Harem Tuazon,https://play-lh.googleusercontent.com/a-/AOh14...,Ilang beses nang naulit na nag load ako nag ba...,1,0,5.41.0,2021-06-02 18:32:14,,NaT,GCash,com.globe.gcash.android
2,60b76108c5eccca0e375ce57,gp:AOqpTOFb35bOoUsvlsEEPVO0UrtxJ2fut8jq5sdiexC...,Fraxilyn Nael,https://play-lh.googleusercontent.com/a-/AOh14...,Its a great experience and convenient,4,0,5.40.0,2021-06-02 18:32:07,,NaT,GCash,com.globe.gcash.android
3,60b76108c5eccca0e375ce58,gp:AOqpTOFRgY4C4LVX_-Cr5D1zxW881WGrlPf01jMWwr4...,Mary rose Manipolo,https://play-lh.googleusercontent.com/a-/AOh14...,Ok na ok sya para sa mga easy transaction lalo...,5,0,5.41.0,2021-06-02 18:31:31,,NaT,GCash,com.globe.gcash.android
4,60b76108c5eccca0e375ce59,gp:AOqpTOH8Znzr1cV9K-1A9ci8IBaQOr-fUkvUhC4P7ZU...,Ferritch Vlog,https://play-lh.googleusercontent.com/a-/AOh14...,very helpful and contented,5,0,5.40.0,2021-06-02 18:31:25,,NaT,GCash,com.globe.gcash.android


In [15]:
review_df.shape

(199298, 13)

In [17]:
review_df.to_csv('gcash_reviews_playstore.csv')

In [20]:
review_df.columns

Index(['_id', 'reviewId', 'userName', 'userImage', 'content', 'score',
       'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent',
       'repliedAt', 'app_name', 'app_id'],
      dtype='object')

In [24]:
review_df.sort_values(by=['at']).head(30)

Unnamed: 0,_id,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,app_name,app_id
198697,60b77333c5eccca0e378d67e,lg:AOqpTOEuRDzBAl8r4_kyXgkmLFXU0n51uFDExjDgogL...,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Works fine.. I like the graphics and layout.. ...,5,4,1.0.1.0,2012-03-26 13:49:59,,NaT,GCash,com.globe.gcash.android
198696,60b77333c5eccca0e378d67d,lg:AOqpTOEtSVLgUNG0zDOcuKw30rV9L22JYj_0luEr52-...,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"""Unknown error occurred"" always popping up! Ne...",1,0,1.0.0.0,2012-03-26 18:49:57,,NaT,GCash,com.globe.gcash.android
198695,60b77333c5eccca0e378d67c,lg:AOqpTOHtVCRQ280_md40DmafRCh73olr3srKIPr2UV6...,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,very convenient to use..,5,0,1.0.1.0,2012-05-08 11:32:34,,NaT,GCash,com.globe.gcash.android
198694,60b77333c5eccca0e378d67b,lg:AOqpTOEfo_fYQ8nkv3Rm0ST063JhAUsdnucRkmUR2XO...,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"It would really be great if you add ""payable t...",4,7,1.0.1.0,2012-05-31 21:53:30,,NaT,GCash,com.globe.gcash.android
198693,60b77333c5eccca0e378d67a,lg:AOqpTOFFSwUPBt4vUzDjb_A0tpfUPkQmXFc8YZmHUeC...,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Its working fine with my motorola droid razr. ...,5,1,1.0.1.0,2012-06-20 21:38:43,Thank you for giving us a 5-star rating! Keep ...,2018-02-23 18:26:01,GCash,com.globe.gcash.android
198692,60b77333c5eccca0e378d679,lg:AOqpTOEqa8V0KOvOM57V83v8t1BmodlxwEReZOjNfmP...,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,love the way you pay!,5,1,1.0.1.0,2012-07-02 22:10:54,,NaT,GCash,com.globe.gcash.android
198691,60b77333c5eccca0e378d678,lg:AOqpTOGmQzUEyeSopSDKec2h1sR2_rzLAcweOxs3Ag_...,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Ill try if itworks,3,0,,2012-07-04 06:34:46,,NaT,GCash,com.globe.gcash.android
198690,60b77333c5eccca0e378d677,lg:AOqpTOEIFp10kTGbXQvLlgkWAEfIIPzEQ4sUTQG1TDx...,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,"I dont know about the others with low ratings,...",5,7,,2012-08-10 09:51:20,,NaT,GCash,com.globe.gcash.android
198689,60b77333c5eccca0e378d676,lg:AOqpTOHM_26Wt43A1_4pAuvUKrhrhiAU32BoNMaRi30...,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,For those QWERTY Android phones (Samsung Galax...,2,0,1.3.0,2012-08-17 15:29:38,,NaT,GCash,com.globe.gcash.android
198688,60b77333c5eccca0e378d675,lg:AOqpTOHN7S4eUAmawWt1MB7SESCeQJQpR5BYV5sJ-Vk...,A Google user,https://play-lh.googleusercontent.com/EGemoI2N...,Godd app! Hope u also create one for windows p...,4,1,1.3.0,2012-08-19 07:44:10,,NaT,GCash,com.globe.gcash.android


In [23]:
review_df['reviewId'].nunique()

198936