### Google Play Scraping

https://pypi.org/project/google-play-scraper/

### Apple Store Scraping

https://pypi.org/project/apple-store-scraper/


### Install libs

```bash
pip install pandas
pip install selenium
pip install parsel
pip install apple-store-scraper
pip install google-play-scraper
pip install nltk
pip install textblob
#pip install translate
pip install -U deep-translator
```

## Getting review data from stores

### Imports, configs and helpers

In [None]:
# Import libs

import json
import pandas as pd
import time
from datetime import datetime
from parsel import Selector
from pprint import pprint
from deep_translator import GoogleTranslator
from datetime import datetime

from google_play_scraper import app as GooglePlayStore, reviews_all as GooglePlayAllReviews, Sort as GooglePlaySort, reviews as GooglePlayReviews
from apple_store_scraper import AppStore as AppleAppStore

In [None]:
# Configs
# Settings shared by the helper, scraping and translation cells below.

# Folder where the review JSON files are written and read back.
export_folder = 'outputs'
# NOTE(review): not referenced by any cell visible in this section.
export_json = True
# Target number of reviews per app; it is also part of the JSON file names,
# so changing it makes previously persisted files invisible to the loaders.
reviewsVolume = 20000
# Seconds to wait between requests (Google Play paging and the Apple scraper).
sleepTime = 10
# NOTE(review): not referenced by any cell visible in this section.
reviewsPerRequest = 100
# When True, a previously saved JSON file is reused instead of scraping again.
persistData = True
# Passed as `after=` to the Apple scraper; the Google Play cell does not use it.
dateFrom = datetime(2021, 12, 31)

# Apps to scrape from Google Play (package id plus language/country of the listing).
google_play_ids = [
    { 'app_name': 'nubank', 'app_id': 'com.nu.production', 'lang': 'pt', 'country': 'br' },
    { 'app_name': 'bb', 'app_id': 'br.com.bb.android', 'lang': 'pt', 'country': 'br' },
    { 'app_name': 'itau', 'app_id': 'com.itau', 'lang': 'pt', 'country': 'br' },
]

# The same apps on the Apple App Store (numeric app id plus storefront country).
apple_store_ids = [
    { 'app_name': 'nubank', 'country': 'br', 'app_id': 814456780 },
    { 'app_name': 'bb', 'country': 'br', 'app_id': 330984271 },
    { 'app_name': 'itau', 'country': 'br', 'app_id': 474505665 },
]

In [None]:
# Helpers

def serializarDatetime(obj):
    """JSON `default=` hook: turn datetime values into ISO-8601 strings.

    Raises:
        TypeError: for any value that is not a datetime, which is how
            `json.dump` expects unsupported types to be reported.
    """
    if not isinstance(obj, datetime):
        raise TypeError(f"Tipo de objeto {type(obj)} não é serializável.")
    return obj.isoformat()

def toEnglish(text):
    """Translate a single Portuguese text to English with GoogleTranslator."""
    translator = GoogleTranslator(source='pt', target='en')
    return translator.translate(text=text)

def toEnglishBatch(texts):
    """Translate a list of Portuguese texts to English, keeping order and length.

    `translate_batch` takes a list of strings and returns a list of
    translations, so the texts are passed as a list rather than being joined
    into one string and split again afterwards.

    Args:
        texts: iterable of review texts. Entries that are not strings, or that
            are blank, are not sent to the translator and are returned as-is.

    Returns:
        A new list with one entry per input text, in the same order.
    """
    texts = list(texts)
    pprint(f"Starting translating of {len(texts)} reviews")

    # Only real, non-blank strings are worth a request; anything else keeps
    # its original value so positions still line up with the source reviews.
    positions = [
        index for index, text in enumerate(texts)
        if isinstance(text, str) and text.strip()
    ]

    translated = list(texts)
    if positions:
        batch = [texts[index] for index in positions]
        results = GoogleTranslator(source='pt', target='en').translate_batch(batch)
        for index, translation in zip(positions, results):
            translated[index] = translation

    pprint(f"Finished translating of {len(translated)} reviews")
    return translated

def saveStoreJson(fileContent, company, store):
    """Write reviews to `<export_folder>/reviews-<company>-<store>-<reviewsVolume>.json`.

    Args:
        fileContent: JSON-serializable data; datetime values are converted by
            `serializarDatetime`.
        company: company label used in the file name.
        store: store label used in the file name.
    """
    import os

    # The export folder may not exist on a fresh checkout; create it so the
    # first run does not fail with FileNotFoundError.
    os.makedirs(export_folder, exist_ok=True)

    file_export = f"{export_folder}/reviews-{company}-{store}-{reviewsVolume}.json"

    with open(file_export, 'w', encoding='utf-8') as arquivo:
        json.dump(fileContent, arquivo, indent=4, ensure_ascii=False, default=serializarDatetime)

def getPersistedReviews(company, store):
    """Load the reviews previously saved for `company` and `store`.

    Reads the JSON file whose name is built from the export folder, the
    company, the store and the configured review volume, and returns its
    parsed content.
    """
    source_path = f"{export_folder}/reviews-{company}-{store}-{reviewsVolume}.json"

    with open(source_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)
    
def checkIfFileExists(company, store):
    """Return True when the persisted review file for `company`/`store` can be opened.

    The file is opened for reading rather than only checked for existence,
    so an unreadable file also counts as missing.
    """
    file_import = f"{export_folder}/reviews-{company}-{store}-{reviewsVolume}.json"

    try:
        with open(file_import, 'r', encoding='utf-8'):
            return True
    except OSError:
        # Only file-system failures mean "not available"; anything else
        # (including KeyboardInterrupt) must propagate.
        return False
    
def mergeListWithObject(list, obj, objKey):
    """Pair each mapping in `obj` with the value at the same position in `list`.

    Returns a new list of dicts; every dict is a copy of the mapping with
    `objKey` set to the paired value. The inputs are not modified. Pairing
    stops at the shorter of the two sequences.
    """
    merged = []
    for record, value in zip(obj, list):
        enriched = {**record}
        enriched[objKey] = value
        merged.append(enriched)
    return merged

def formatGoogleReviews(reviews, company):
    """Tag every Google Play review with its store and company.

    Each review dict is modified in place (keys 'store' and 'company' are
    set) and the same dict objects are returned in a new list.
    """
    tagged = []

    for entry in reviews:
        entry.update({'store': 'Google', 'company': company})
        tagged.append(entry)

    return tagged

def formatAppleReviews(reviews, company):
    """Tag Apple reviews and rename their fields to the Google Play names.

    Each review dict is modified in place: 'store' and 'company' are set,
    and 'date', 'review' and 'rating' are replaced by 'at', 'content' and
    'score'. The same dict objects are returned in a new list.
    """
    # (Apple key, common key) pairs, in the order the new keys are written.
    renamedKeys = (('date', 'at'), ('review', 'content'), ('rating', 'score'))
    formatted = []

    for entry in reviews:
        entry['store'] = 'Apple'
        entry['company'] = company

        # Copy all values first, then drop the old keys.
        for oldKey, newKey in renamedKeys:
            entry[newKey] = entry[oldKey]
        for oldKey, _ in renamedKeys:
            del entry[oldKey]

        formatted.append(entry)

    return formatted

### Getting data from Google and Apple stores

In [None]:
# Apple Store Getting Data
# For every configured Apple app: reuse the persisted JSON file when allowed
# and present, otherwise scrape and persist the raw reviews; then normalize
# the reviews and collect them in resultAS.

resultAS = []

for apple_store_id in apple_store_ids:
    currentCompany = apple_store_id
    currentCompanyAppName = currentCompany["app_name"]
    currentComapnyAppId = currentCompany["app_id"]
    currentCompanyCountry = currentCompany["country"]
    store = "Apple"
    persistDataExists = checkIfFileExists(currentCompanyAppName, store)
    
    pprint(f"Getting reviews for {currentCompanyAppName} in {store} store")

    if persistData and persistDataExists:
        result = getPersistedReviews(currentCompanyAppName, store)
        pprint(f"Found {len(result)} persisted reviews")
    else:
        result = AppleAppStore(
            country=currentCompanyCountry,
            app_name=currentCompanyAppName,
            app_id=currentComapnyAppId,
        )

        result.review(how_many=reviewsVolume, sleep=sleepTime, after=dateFrom)
        # From here on `result` is the list of reviews, not the scraper object.
        result = result.reviews
        # The raw (unformatted) reviews are persisted, so the persisted branch
        # above also feeds raw reviews into formatAppleReviews below.
        saveStoreJson(result, currentCompanyAppName, store)

        pprint(f"Found {len(result)} new reviews")

    formattedResult = formatAppleReviews(result, currentCompanyAppName)    
    resultAS.extend(formattedResult)

pprint(f"Found total of {len(resultAS)} reviews in {len(apple_store_ids)} companies")

In [None]:
# Google Play Getting Data

# Collects the formatted reviews of every Google Play app processed in this cell.
resultGP = []

def recursion(acc, token, firstVerify):
    """Fetch Google Play reviews page by page and return them as one list.

    The name and signature are kept for existing callers, but the pages are
    fetched in a loop instead of by recursion, so a long run cannot hit
    Python's recursion limit. The app id, language and country are read from
    the notebook globals set by the calling loop.

    Args:
        acc: reviews collected so far; returned unchanged when `token` is falsy.
        token: continuation token of the next page. Any truthy value works for
            the first request, because it is not sent when `firstVerify` is True.
        firstVerify: True when the first request must be made without a
            continuation token.

    Returns:
        The list of collected reviews. Fetching stops when the token is falsy,
        when a page comes back empty, or when `reviewsVolume` reviews have been
        collected.
    """
    if not token:
        return acc

    collected = list(acc)
    isFirstRequest = firstVerify

    while token and len(collected) < reviewsVolume:
        # Pause before every request (and only then) to avoid hammering the store.
        time.sleep(sleepTime)

        requestOptions = {
            'lang': currentCompanyLang,  # defaults to 'en'
            'country': currentCompanyCountry,  # defaults to 'us'
            'sort': GooglePlaySort.NEWEST,
        }
        if not isFirstRequest:
            requestOptions['continuation_token'] = token

        batch, token = GooglePlayReviews(currentCompanyAppId, **requestOptions)
        isFirstRequest = False

        # The scraper returns a token object even when no pages are left, so
        # an empty page is the reliable end-of-data signal.
        if not batch:
            break

        collected.extend(batch)

    if len(collected) >= reviewsVolume:
        print('Ending with: ', len(collected), ' items')

    return collected

# For every configured Google Play app: reuse the persisted JSON file when
# allowed and present, otherwise scrape and persist the reviews; then tag the
# reviews and collect them in resultGP.
for google_play_id in google_play_ids:
    currentCompany = google_play_id
    # These names are module-level on purpose: recursion() reads the app id,
    # country and language from them.
    currentCompanyAppName = currentCompany["app_name"]
    currentCompanyAppId = currentCompany["app_id"]
    currentCompanyCountry = currentCompany["country"]
    currentCompanyLang = currentCompany["lang"]
    store = "Google"
    persistDataExists = checkIfFileExists(currentCompanyAppName, store)
    
    pprint(f"Getting reviews for {currentCompanyAppName} in {store} store")

    if persistData and persistDataExists:
        result = getPersistedReviews(currentCompanyAppName, store)
        pprint(f"Found {len(result)} persisted reviews for {currentCompanyAppName} in {store} store")
    else:
        # Start with no reviews; the first True is a placeholder token, the
        # second asks for the first page without a continuation token.
        result = recursion([], True, True)
        saveStoreJson(result, currentCompanyAppName, store)
        pprint(f"Found {len(result)} new reviews for {currentCompanyAppName} in {store} store")

    formattedResult = formatGoogleReviews(result, currentCompanyAppName)    
    resultGP.extend(formattedResult)

pprint(f"Found total of {len(resultGP)} reviews in {len(google_play_ids)} companies")

In [None]:
# Saving Data
# Merge the Apple and Google reviews (Apple first) and persist them as a
# single "multi-company" / "merged" file.
result = resultAS + resultGP
store = "merged"
company = "multi-company"

saveStoreJson(result, company, store)

## Translating reviews

In [None]:
# Translate Google and Apple reviews to English

# Use the reviews gathered in this session when there are any; otherwise
# (scraping cells not run, or nothing found) load the merged file from disk.
dataToTranslate = globals().get('result') or getPersistedReviews('multi-company', 'merged')

def toTranslateList(review):
    """Return the text of a review to translate ('' when it has no content)."""
    return review.get('content') or ''

toDataTranslateList = list(map(toTranslateList, dataToTranslate))

translatedList = toEnglishBatch(toDataTranslateList)

# Merge each translation into the review it came from. The translations were
# built from dataToTranslate (Apple AND Google reviews), so that same list must
# be used here; merging with resultAS alone would drop the Google reviews.
translatedReviews = mergeListWithObject(translatedList, dataToTranslate, "ContentEN")


In [None]:
# Overwriting the merged file with the translated reviews
# Uses the same company/store labels as the earlier save, so the same file is replaced.

store = "merged"
company = "multi-company"

saveStoreJson(translatedReviews, company, store)