<a href="https://colab.research.google.com/github/marcelounb/BB/blob/master/Download_de_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Baixando reviews da Google Play Store

default.py



In [24]:
from enum import IntEnum
import re

class Sort(IntEnum):
    """Integer codes selecting how the review endpoint orders its results.

    A member is substituted into the ``{sort}`` slot of the request payload.
    The declaration order is intentionally left as-is, since iterating the
    enum follows it.
    """

    NEWEST = 2  # default ordering used by reviews()
    RATING = 3
    MOST_RELEVANT = 1

class Start:
    """Default settings shared by the scraper cells of this notebook."""

    # Package name of the app whose reviews are downloaded.
    ID = "br.com.bb.android"

    # Default language and country sent as the hl / gl query parameters.
    LANG = "pt"
    COUNTRY = "br"

    # Default number of reviews requested by reviews().
    COUNT = 199

    # Base address every request URL is built from.
    URL = "https://play.google.com"

    # Not read anywhere in the visible code; presumably a placeholder for a
    # cutoff date -- TODO confirm.
    LAST_UPDATE = None

class Regex:
    """Pre-compiled regular expressions used to pick data out of responses.

    All patterns are raw strings so that regex escapes such as ``\\d`` and
    ``\\s`` are not treated as (invalid) Python string escapes, which raises
    a warning on current Python versions.
    """

    # Any single character that is not a decimal digit.
    NOT_NUMBER = re.compile(r"[^\d]")
    # From "AF_initDataCallback" up to the nearest closing script tag (lazy).
    SCRIPT = re.compile(r"AF_initDataCallback[\s\S]*?<\/script")
    # Captures a "ds:..." key that is terminated by a single quote.
    KEY = re.compile(r"(ds:.*?)'")
    # Captures the text between "return " and the closing "}});</".
    VALUE = re.compile(r"return ([\s\S]*?)}}\);<\/")
    # Captures everything that follows the ")]}'" prefix and two newlines.
    REVIEWS = re.compile(r"\)]}'\n\n([\s\S]+)")

element.py

In [25]:
from datetime import datetime
# from default import Regex

def nested_lookup(source, indexes):
    """Follow a path of keys/indexes into a nested container.

    Args:
        source: Nested structure of subscriptable objects (lists, dicts, ...).
        indexes: Non-empty sequence of keys/indexes, applied left to right.

    Returns:
        The object found at the end of the path.

    Raises:
        IndexError: If ``indexes`` is empty or a list index is out of range.
        KeyError: If a mapping along the path lacks the requested key.
        TypeError: If an object along the path is not subscriptable.
    """
    # The first step is taken unconditionally, so an empty path fails here.
    current = source[indexes[0]]
    for step in indexes[1:]:
        current = current[step]
    return current

class ElementSpec:
    """Describes where one field lives in a nested payload and how to clean it.

    Attributes are set from the constructor arguments:
        extraction_map: Path of keys/indexes handed to ``nested_lookup``.
        post_processor: Optional callable applied to the extracted value.
        post_processor_except_fallback: Value returned when ``post_processor``
            raises.
    """

    def __init__(self, extraction_map, post_processor=None, post_processor_except_fallback=None):
        self.extraction_map = extraction_map
        self.post_processor = post_processor
        self.post_processor_except_fallback = post_processor_except_fallback

    def extract_content(self, source):
        """Extract this field from ``source``.

        Args:
            source: Nested structure to read from.

        Returns:
            The value found at ``extraction_map`` (post-processed when a
            processor is configured), ``None`` when the path does not exist,
            or ``post_processor_except_fallback`` when the processor raises.
        """
        try:
            result = nested_lookup(source, self.extraction_map)
        except (KeyError, IndexError, TypeError):
            # A missing path is expected for optional fields.
            result = None

        if result is not None and self.post_processor is not None:
            try:
                result = self.post_processor(result)
            except Exception:
                # Only ordinary errors fall back; KeyboardInterrupt and
                # SystemExit must still be able to stop the notebook.
                result = self.post_processor_except_fallback

        return result


class ElementSpecs:
    """Field layouts for the payloads handled by this notebook."""

    # Output column name -> where to find the value inside one raw review
    # entry. Key order is significant: it becomes the DataFrame column order.
    Review = {
        "id": ElementSpec([0]),
        "userName": ElementSpec([1, 0]),
        "userImage": ElementSpec([1, 1, 3, 2]),
        "review": ElementSpec([4]),
        "score": ElementSpec([2]),
        "thumbsUpCount": ElementSpec([6]),
        "appVersion": ElementSpec([10]),
        # Timestamps are turned into datetime objects by
        # datetime.fromtimestamp.
        "date": ElementSpec([5, 0], post_processor=datetime.fromtimestamp),
        "reply": ElementSpec([7, 1]),
        "replyDate": ElementSpec([7, 2, 0], post_processor=datetime.fromtimestamp),
    }

payload.py

In [26]:
from abc import ABC, abstractmethod
from urllib.parse import quote
# from default import Start, Sort

class URLFormat(ABC):
    """Abstract base for objects that build a request URL or request body.

    Deriving from ``ABC`` is what makes ``@abstractmethod`` effective: the
    base class itself cannot be instantiated, and subclasses must override
    ``build``.
    """

    @abstractmethod
    def build(self, *args):
        """Return the formatted string/bytes for the given arguments."""
        raise NotImplementedError


class Formats:
    """Namespace exposing ready-made builders for the review request.

    ``Formats.Reviews`` builds the endpoint URL and ``Formats.ReviewPayload``
    builds the URL-encoded POST body.
    """

    class _Reviews(URLFormat):
        """Builds the batchexecute URL for a language/country pair."""

        # Start.URL is filled in once here; lang/country stay as placeholders.
        URL_FORMAT = "{}/_/PlayStoreUi/data/batchexecute?hl={{lang}}&gl={{country}}".format(
            Start.URL
        )

        def build(self, lang, country):
            """Return URL_FORMAT with ``lang`` and ``country`` substituted."""
            url = self.URL_FORMAT.format(lang=lang, country=country)
            return url

    class _ReviewPayload(URLFormat):
        """Builds the ``f.req`` POST body for one page of reviews."""

        PAYLOAD_FORMAT_FOR_FIRST_PAGE = 'f.req=[[["UsvDTd","[null,null,[2,{sort},[{count},null,null],null,[null,{score}]],[\\"{app_id}\\",7]]",null,"generic"]]]'
        PAYLOAD_FORMAT_FOR_PAGINATED_PAGE = 'f.req=[[["UsvDTd","[null,null,[2,{sort},[{count},null,\\"{pagination_token}\\"],null,[null,{score}]],[\\"{app_id}\\",7]]",null,"generic"]]]'

        def build(self, app_id, sort, count, filter_score_with, pagination_token):
            """Return the percent-encoded request body as bytes.

            The first-page template is used when ``pagination_token`` is
            ``None``; otherwise the token is embedded via the paginated
            template.
            """
            fields = {
                "app_id": app_id,
                "sort": sort,
                "count": count,
                "score": filter_score_with,
            }
            if pagination_token is None:
                template = self.PAYLOAD_FORMAT_FOR_FIRST_PAGE
            else:
                template = self.PAYLOAD_FORMAT_FOR_PAGINATED_PAGE
                fields["pagination_token"] = pagination_token

            body = template.format(**fields)
            # "=" stays unescaped so the "f.req=" form key survives quoting.
            return quote(body, safe="=").encode()

    Reviews = _Reviews()
    ReviewPayload = _ReviewPayload()

exceptions.py

In [27]:
class GooglePlayScraperException(Exception):
    """Base class for every error defined by the scraper cells."""


class ContentNotFoundException(GooglePlayScraperException):
    """Not raised in the visible code; presumably signals missing content."""


class InvalidURLError(GooglePlayScraperException):
    """Not raised in the visible code; presumably signals a malformed URL."""


class NotFoundError(GooglePlayScraperException):
    """Raised by the request helper when the server answers HTTP 404."""


class ExtraHTTPError(GooglePlayScraperException):
    """Raised by the request helper for any HTTP error other than 404."""

request.py


In [28]:
# from exceptions import NotFoundError, ExtraHTTPError

try:
    from urllib.error import HTTPError
except ImportError:
    from urllib2 import HTTPError

try:
    from urllib.request import urlopen, Request
except ImportError:
    from urllib2 import urlopen, Request


def _urlopen(obj):
    """Open ``obj`` and return the response body decoded as UTF-8 text.

    Args:
        obj: A URL string or a ``Request`` object accepted by ``urlopen``.

    Returns:
        The full response body as ``str``.

    Raises:
        NotFoundError: The server answered with HTTP 404.
        ExtraHTTPError: The server answered with any other HTTP error status.
    """
    try:
        resp = urlopen(obj)
    except HTTPError as e:
        if e.code == 404:
            raise NotFoundError("App not found(404).")
        else:
            raise ExtraHTTPError(
                "App not found. Status code {} returned.".format(e.code)
            )

    # Release the connection even if reading or decoding fails.
    try:
        return resp.read().decode("UTF-8")
    finally:
        resp.close()


def post(url, data, headers):
    """Send ``data`` to ``url`` with ``headers`` and return the decoded body.

    A ``Request`` carrying ``data`` is built and handed to ``_urlopen``.
    """
    request = Request(url, data=data, headers=headers)
    return _urlopen(request)


def get(url):
    """Fetch ``url`` through ``_urlopen`` and return the decoded body."""
    body = _urlopen(url)
    return body

reviews.py

In [29]:
'''
Codigo adaptado de https://github.com/JoMingyu/google-play-scraper
'''

import json
# from default import Start, Sort, Regex
# from element import ElementSpecs
# from payload import Formats
# from request import post


def fetch_review_items(url, app_id, sort, count, filter_score_with, pagination_token):
    """Request one page of reviews and return its items and the next token.

    Args:
        url: Endpoint URL to POST to.
        app_id: Package name of the app.
        sort: Sort code placed in the payload.
        count: Number of reviews requested for this page.
        filter_score_with: Score to filter by; ``None`` is sent as the
            literal ``"null"``.
        pagination_token: Token of the page to fetch, or ``None`` for the
            first page.

    Returns:
        A 2-tuple ``(review_items, token)`` taken from the first element and
        from the last element of the last element of the decoded payload.

    Raises:
        IndexError: If the response does not match ``Regex.REVIEWS``.
    """
    dom = post(
        url,
        Formats.ReviewPayload.build(
            app_id,
            sort,
            count,
            "null" if filter_score_with is None else filter_score_with,
            pagination_token,
        ),
        {"content-type": "application/x-www-form-urlencoded"},
    )

    match = json.loads(Regex.REVIEWS.findall(dom)[0])

    # The inner payload is itself a JSON string; decode it once and reuse it
    # instead of parsing the same text for each returned value.
    payload = json.loads(match[0][2])

    return payload[0], payload[-1][-1]


def reviews(app_id, lang=Start.LANG, country=Start.COUNTRY, sort=Sort.NEWEST, count=Start.COUNT, filter_score_with=None, continuation_token=None, last_update=None, last_id_review=None):
    """Download reviews of ``app_id`` page by page.

    Args:
        app_id: Package name of the app.
        lang: Language code used in the request URL.
        country: Country code used in the request URL.
        sort: Sort code sent in the payload. The ``last_update`` cutoff stops
            at the first older review, so it only makes sense with newest-first
            ordering.
        count: Maximum number of reviews to collect.
        filter_score_with: Optional score filter forwarded to the request.
        continuation_token: Token of the page to resume from, or ``None`` to
            start at the first page.
        last_update: Optional ``datetime``; collection stops at the first
            review dated before it (that review is not included).
        last_id_review: Unused; kept so existing callers keep working.

    Returns:
        A 2-tuple ``(result, token)`` where ``result`` is a list of dicts keyed
        like ``ElementSpecs.Review`` and ``token`` is the last token returned
        by ``fetch_review_items``.
    """
    # Largest page size requested in a single call.
    max_count_each_fetch = 199

    token = continuation_token
    _count = min(count, max_count_each_fetch)

    url = Formats.Reviews.build(lang=lang, country=country)

    result = []
    early_stop = False

    while True:
        review_items, token = fetch_review_items(
            url, app_id, sort, _count, filter_score_with, token
        )

        # An empty (or missing) page means there is nothing more to read;
        # asking again would never make progress.
        if not review_items:
            break

        for review in review_items:
            review_dict = {
                k: spec.extract_content(review)
                for k, spec in ElementSpecs.Review.items()
            }

            review_date = review_dict['date']
            # A review whose date could not be extracted is kept rather than
            # compared, since None cannot be ordered against a datetime.
            if (
                last_update is not None
                and review_date is not None
                and review_date < last_update
            ):
                early_stop = True
                break

            result.append(review_dict)

        if early_stop:
            break

        remaining_count_of_reviews_to_fetch = count - len(result)

        # "<=" also covers a page that returned more items than requested.
        if remaining_count_of_reviews_to_fetch <= 0:
            break

        # A list in place of the token marks the last page.
        if isinstance(token, list):
            break

        _count = min(remaining_count_of_reviews_to_fetch, max_count_each_fetch)

        # Report the number of reviews actually collected so far.
        print('Download de', len(result), 'reviews ate agora!', app_id)

    return (
        result,
        token
    )

main.py

In [30]:
#!/usr/bin/env python
# -*- coding: utf-8 -*- 
import pandas as pd
# from reviews import reviews
from datetime import datetime


# Download every review of the configured app published on or after the
# cutoff date and export the result to CSV and Excel.
app_id = Start.ID
# Upper bound on the number of reviews; the cutoff date below normally stops
# the download long before this is reached.
count = 5000000
## Change the cutoff date here (format: day-month-year hour:minute:second)
date = '10-11-2020 08:00:00'
### End of the user-editable section
formatter =  '%d-%m-%Y %H:%M:%S'
last_update = datetime.strptime(date, formatter)

result, token = reviews(
    app_id=app_id,
    count=count, 
    last_update=last_update
)

df = pd.DataFrame(result)
df.to_csv('data.csv',index=False)

# The context manager writes and closes the workbook on exit; the explicit
# writer.save() call no longer exists in current pandas releases.
with pd.ExcelWriter("baixado_da_APPStore.xlsx") as writer:
    df.to_excel(writer, index=False)


Download de 199 reviews ate agora! br.com.bb.android
Download de 398 reviews ate agora! br.com.bb.android
Download de 597 reviews ate agora! br.com.bb.android


In [31]:
# Empty frame that only contributes the manual-classification columns, placed
# ahead of the downloaded review columns.
df2 = pd.DataFrame(columns=['Responsável', 'Natureza', 'Classificação', 'Produtos/Serviços', 'Detalhamento', 'Descrição do Problema', 'Produtos/Serviços 2', 'Detalhamento 2', 'Descrição do Problema 2', 'Produtos/Serviços 3', 'Detalhamento 3', 'Descrição do Problema 3'])
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df2 = pd.concat([df2, df])
df2['Responsável'] = 'Modelo_IA'
df2.head(3)

Unnamed: 0,Responsável,Natureza,Classificação,Produtos/Serviços,Detalhamento,Descrição do Problema,Produtos/Serviços 2,Detalhamento 2,Descrição do Problema 2,Produtos/Serviços 3,Detalhamento 3,Descrição do Problema 3,id,userName,userImage,review,score,thumbsUpCount,appVersion,date,reply,replyDate
0,Modelo_IA,,,,,,,,,,,,gp:AOqpTOGgPDnePRz3YJWFDy_O7W7qqwkFUjIKH0wAKr4...,Renara Lima,https://play-lh.googleusercontent.com/a-/AOh14...,amo o Banco BB prático rápido . nunca travou,5.0,0.0,7.30.5.1,2020-11-11 11:35:19,,NaT
1,Modelo_IA,,,,,,,,,,,,gp:AOqpTOE-7G3Q0SnWw4betPFoPWqPe6RC-gDaBi64g6x...,Erica andrade,https://play-lh.googleusercontent.com/a-/AOh14...,"até no momento não tenho nada a reclamar ,apli...",5.0,0.0,7.30.5.1,2020-11-11 11:33:31,,NaT
2,Modelo_IA,,,,,,,,,,,,gp:AOqpTOFFBqUR7nogg4hatbhJX7Vh0Aw2bw3_xp-EXSw...,Angerliane Jordão,https://play-lh.googleusercontent.com/a-/AOh14...,O App apresenta algumas falhas... não consigo ...,2.0,0.0,7.30.5.1,2020-11-11 11:33:19,,NaT
