<a href="https://colab.research.google.com/github/mahima8178/Google-Product-Reviews-Scrape-Analyse/blob/main/Google_Product_Reviews_Scrape%26Analyse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers

In [2]:
pip install stqdm



In [3]:
!pip install apify-client



In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification


In [5]:
# Load the nlptown multilingual sentiment checkpoint together with its tokenizer.
# NOTE(review): tokenizer_sent / model_sent are not referenced by any later cell
# visible here — confirm whether this cell is still needed.
tokenizer_sent = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
model_sent = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

In [6]:
import torch

In [7]:
import requests

In [8]:
%%writefile utils.py
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
import streamlit as st
import nltk

# Hugging Face model identifiers accepted by download_model().
MODEL_NAME_AMAZON = "LiYuan/amazon-review-sentiment-analysis"
MODEL_NAME_TWITTER = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Label names, in chart order, for each model.
# presumably these match the (capitalized) labels the models emit — verify.
AMAZON_ROW_LABELS = ['1 star', '2 stars', '3 stars', '4 stars', '5 stars']
TWITTER_ROW_LABELS = ['Negative', 'Neutral', 'Positive']

# Numeric score for each label above, paired by position
# (AMAZON_ROW_VALUES[i] belongs to AMAZON_ROW_LABELS[i], and likewise for Twitter).
AMAZON_ROW_VALUES = list(range(1,6))
TWITTER_ROW_VALUES = [0, 0.5, 1]


@st.cache_resource
def download_model(model_name: str = MODEL_NAME_AMAZON):
    '''
    Builds a "sentiment-analysis" pipeline for the given Hugging Face model.

    The tokenizer and the sequence-classification model are both loaded with
    from_pretrained(model_name). The pipeline is given the current CUDA device
    when CUDA is available, otherwise device=None.
    '''
    hf_tokenizer = AutoTokenizer.from_pretrained(model_name)

    if torch.cuda.is_available():
        target_device = torch.cuda.current_device()
    else:
        target_device = None

    classifier = AutoModelForSequenceClassification.from_pretrained(model_name)
    return pipeline(
        "sentiment-analysis",
        model=classifier,
        tokenizer=hf_tokenizer,
        device=target_device,
    )

@st.cache_resource
def init():
    '''
    One-time NLTK setup: fetches the 'stopwords' corpus.
    '''
    corpus_name = 'stopwords'
    nltk.download(corpus_name)

Overwriting utils.py


In [9]:
%%writefile scraper.py
import requests, json, datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob
from wordcloud import WordCloud
from nltk.corpus import stopwords
from string import punctuation
import streamlit as st
from apify_client import ApifyClient
import os

# SECURITY: API tokens should not live in source control. The token is read from
# the APIFY_TOKEN environment variable when it is set; the literal below is kept
# only as a backward-compatible fallback and should be rotated and removed.
APIFY_TOKEN = os.environ.get('APIFY_TOKEN', 'apify_api_esQN883jVZCZCBhLP0CvrcQidXHeAh3ukmtI')

def gt(dt_str):
    '''
    Converts an isoformat string to a naive datetime object.

    Accepts "YYYY-MM-DDTHH:MM:SS" with an optional fractional-seconds part and
    an optional trailing "Z", e.g. "2023-05-01T12:30:45.123Z".
    Raises ValueError if the string does not match that layout.
    '''
    # Drop the trailing "Z" first so that timestamps without a fractional part
    # ("...T12:30:45Z") still parse.
    base, _, fraction = dt_str.rstrip("Z").partition(".")
    parsed = datetime.datetime.strptime(base, "%Y-%m-%dT%H:%M:%S")
    if not fraction:
        return parsed
    # The digits after "." are a decimal fraction of a second, not a count of
    # microseconds (".123" is 123 ms), so normalise them to exactly six digits.
    microseconds = int(fraction[:6].ljust(6, "0"), 10)
    return parsed + datetime.timedelta(microseconds=microseconds)

def query_for_usage():
    '''
    Queries Apify for the number of queries used.

    Returns a Markdown-formatted string with the usage figure and the time
    left until the usage cycle ends.
    Raises requests.HTTPError if the API answers with an error status.
    '''
    print('Querying for usage...')
    url = 'https://api.apify.com/v2/users/me/usage/monthly?token=' + APIFY_TOKEN
    # A timeout keeps the app from hanging forever on a stalled connection.
    r = requests.get(url, timeout=30)
    # Fail with a clear HTTP error instead of a KeyError on an error payload.
    r.raise_for_status()
    d = json.loads(r.text)
    date = gt(d['data']['usageCycle']['endAt'])
    # gt() returns a naive datetime parsed from a "Z"-suffixed timestamp
    # (presumably UTC), so compare it against naive UTC "now" rather than
    # local time.
    now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    date_diff = date - now_utc
    return f"**{d['data']['monthlyServiceUsage']['PROXY_SERPS']['quantity']*100} / 50,000** queries used this month. Resets in **{date_diff.days} days, {date_diff.seconds//3600} hours**."

def _normalise_rating(raw):
    '''
    Maps a raw rating onto the 0-1 range.

    Values above 10 are divided by 100, values above 5 by 10, and everything
    else by 5.
    '''
    rating = float(raw)
    if rating > 10:
        return rating / 100
    if rating > 5:
        return rating / 10
    return rating / 5


def _strip_date_prefix(desc):
    '''
    Removes a leading "Nov XX, XXXX" / "Nov X, XXXX" style date from a snippet.

    The caller must guarantee len(desc) > 14 so the fixed indices are in range.
    '''
    # Check case for Nov XX, XXXX
    if desc[12].isalpha() and desc[8:12].isdigit():
        return desc[12:]
    # Check case for Nov X, XXXX
    if desc[11].isalpha() and desc[7:11].isdigit():
        return desc[11:]
    return desc


@st.cache_data
def query_google(query: str, num_of_queries: int, use_json=True):
    '''
    Queries Google for a given query.

    query: search term; " review" is appended before searching.
    num_of_queries: number of results wanted (100 results per page).
    use_json: when True, read results from the local 'data.json' file instead
              of running the Apify google-search-scraper actor.

    Returns a list of descriptions and a float32 array of ratings
    normalised to the 0-1 range.
    '''
    print(f'Searching for {num_of_queries} reviews...')
    if use_json:
        with open('data.json', encoding='utf-8') as f:
            res_ls = json.load(f)
    else:
        client = ApifyClient(APIFY_TOKEN)

        run_input = {
            "queries": f"{query} review",
            "maxPagesPerQuery": num_of_queries // 100,
            "resultsPerPage": 100,
            "countryCode": "",
            "customDataFunction": """async ({ input, $, request, response, html }) => {
                    return {
                    pageTitle: $('title').text(),
                    };
                };""",
        }
        run = client.actor("apify/google-search-scraper").call(run_input=run_input)
        res_ls = list(client.dataset(run["defaultDatasetId"]).iterate_items())

    rating_dataset = []
    desc_dataset = []
    for res in res_ls:
        # res is a dictionary; tolerate pages without organic results
        for row in res.get('organicResults') or []:
            try:
                rating_dataset.append(_normalise_rating(row['productInfo']['rating']))
            except (KeyError, TypeError, ValueError):
                # No usable product rating on this result; skip the rating
                # but still consider its description below.
                pass

            desc = (row.get('description') or '').replace('\xa0', '')
            if len(desc) > 14:
                desc_dataset.append(_strip_date_prefix(desc))

    return desc_dataset, np.array(rating_dataset, dtype=np.float32)


def create_wordcloud(desc_dataset: list):
    '''
    Creates wordclouds from a list of descriptions.

    Every word (with surrounding punctuation stripped) is classified by its
    TextBlob polarity as positive (> 0), negative (< 0) or neutral, and one
    figure is drawn per group.

    Returns a list of three matplotlib figures in the order
    [positive, negative, neutral]. A group with no drawable words yields an
    empty, titled figure instead of raising.
    '''
    pos_words = []
    neg_words = []
    neutral_words = []
    stop_words = set(stopwords.words('english'))

    for desc in desc_dataset:
        for word in desc.split():
            word = word.strip(punctuation)
            if not word:
                continue
            # Compute the polarity once per word.
            polarity = TextBlob(word).sentiment.polarity
            if polarity > 0:
                pos_words.append(word)
            elif polarity < 0:
                neg_words.append(word)
            else:
                neutral_words.append(word)

    figs = []
    groups = [('Positive words', pos_words), ('Negative words', neg_words), ('Neutral words', neutral_words)]
    for title, words in groups:
        try:
            wordcloud = WordCloud(background_color='white',
                                  stopwords=stop_words,
                                  min_font_size=10,
                                  max_words=20).generate(' '.join(words))
        except ValueError:
            # WordCloud raises ValueError when there is nothing left to draw
            # (empty group, or only stopwords). Callers index the result by
            # position, so still emit a figure for this group.
            wordcloud = None

        # Plot the WordCloud image
        fig, ax = plt.subplots(figsize=(15,5))
        ax.set_facecolor('black')
        if wordcloud is not None:
            ax.imshow(wordcloud)
        ax.set_title(title, fontsize=55, color='black', pad=40)
        ax.axis("off")
        plt.show()
        figs.append(fig)
    return figs



def show_ratings(rating_dataset, rating_round=10, plot_type='line'):
    '''
    Shows a chart of the pure ratings.

    rating_dataset: array of ratings, expected in the 0-1 range
    rating_round: 0 = Continuous (0-1), 5 = Integer /5, 10 = Integer /10
    plot_type: 'Line', 'Scatter', 'Both' (case-insensitive) <- For continuous only

    Returns the matplotlib figure.
    '''
    fig, ax = plt.subplots(figsize=(6,6))
    if rating_round in (5, 10):
        # Scale in float64 and round half-to-even, matching Python's round().
        scaled = np.asarray(rating_dataset, dtype=np.float64) * rating_round
        rounded = np.rint(scaled).astype(int)

        # Count occurrences of each rating
        counts = pd.Series(rounded).value_counts()
        counts = counts.reindex(list(range(1, rating_round + 1)))
        counts.plot(ax=ax, kind='bar')

    else:
        x, y = np.unique(rating_dataset, return_counts=True)
        # Compare case-insensitively: the default 'line' previously matched
        # none of the 'Line' / 'Scatter' / 'Both' branches and drew nothing.
        kind = plot_type.lower()
        if kind in ('line', 'both'):
            plt.plot(x, y)
        if kind in ('scatter', 'both'):
            plt.scatter(x, y)
        plt.xlim(0, 1)
    plt.title('Raw Ratings', fontsize=15, color='black')
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)
    plt.show()
    return fig


def eval_sentiment(desc_dataset, model, row_labels, row_values, title):
    '''
    Shows a chart of the sentiment of the descriptions.

    desc_dataset: list of description strings passed to the model in one call
    model: callable whose result loads into a DataFrame with a 'label' column
    row_labels: labels to show on the bar chart, in order
    row_values: numeric score for each entry of row_labels (paired by position)
    title: chart title

    Returns (fig, mean), where mean is the count-weighted average score over
    the descriptions whose label appears in row_labels (0.0 if none do).
    '''
    res = model(desc_dataset)
    labels = pd.DataFrame(res)['label'].str.capitalize()
    fig, ax = plt.subplots(figsize=(8,8))
    counts = labels.value_counts().reindex(row_labels)
    counts.plot(ax=ax, kind='bar')
    plt.title(title, fontsize=25, color='black')
    plt.xticks(fontsize=25)
    plt.yticks(fontsize=25)
    plt.show()

    total = 0
    count = 0
    for label_count, row_val in zip(counts, row_values):
        # reindex() leaves NaN for labels that never occurred
        if not pd.isna(label_count):
            total += row_val * label_count
            count += label_count

    # Divide by the number of descriptions actually scored, so labels outside
    # row_labels do not drag the mean down; also avoids dividing by zero.
    mean = total / count if count else 0.0

    return fig, mean

Overwriting scraper.py


In [10]:
%%writefile main.py
import streamlit as st
import scraper
import utils
import numpy as np

import warnings
warnings.simplefilter("ignore", UserWarning)

### INITIALIZATION ###
# Seed the per-session defaults for the chart options on the first run only;
# later reruns keep whatever is already stored in st.session_state.
# if "load_state" not in st.session_state:
#      st.session_state.load_state = False
if "rating_round" not in st.session_state:
     st.session_state.rating_round = 'Continuous (0 to 1)'
if "plot_type" not in st.session_state:
     st.session_state.plot_type = 'Line'

# Build both sentiment pipelines (download_model is wrapped in
# st.cache_resource in utils.py).
twitter_model = utils.download_model(utils.MODEL_NAME_TWITTER)
amazon_model = utils.download_model(utils.MODEL_NAME_AMAZON)

# Download the NLTK stopwords used when building the word clouds.
utils.init()


### MAIN APP ###
st.title('Review Scraper')
st.caption('Scrapes [Google](https://www.google.com/) for reviews of a product')

st.caption('Powered by [HuggingFace](https://huggingface.co/), [Streamlit](https://streamlit.io/), and [Apify](https://apify.com/)')

# Page-wide CSS: enlarges radio-group labels and defines the .big-font class
# used by the result summaries.
page_css = """
    <style>
div[class*="stRadio"] > label > div[data-testid="stMarkdownContainer"] > p {
    font-size: 17px;

}
.big-font {
    font-size: 18px;
}
    </style>
    """
st.markdown(page_css, unsafe_allow_html=True)

st.warning('Apify has a limit of 50,000 queries a month', icon="⚠️")

# Left column holds the button, right column shows the usage text on demand.
usage_button_col, usage_text_col = st.columns(2)
with usage_button_col:
    query_usage = st.button('See current usage')
if query_usage:
    with usage_text_col:
        st.write(scraper.query_for_usage())


# The submit button is only True on the run triggered by the click. The radio
# widgets below trigger their own reruns, so remember that a search has been
# submitted; otherwise changing a radio would wipe the results.
if "load_state" not in st.session_state:
    st.session_state.load_state = False

form = st.form(key='main_form')
item = form.text_input('Enter an item (product, movie, etc.) to search for reviews: ', value='PS5 Console')
num_of_queries = form.slider('Number of queries: ', min_value=100, max_value=1000, value=100, step=100)
submit = form.form_submit_button('Search')
if submit:
    st.session_state.load_state = True

if st.session_state.load_state:
    st.write(f'Searching for reviews for **{item}**...')
    # query_google is wrapped in st.cache_data, so reruns with the same
    # arguments reuse the cached result.
    desc_dataset, rating_dataset = scraper.query_google(item, num_of_queries, use_json=False) # use_json=True for dummy data
    st.write(f'<span class=big-font>Found `{len(desc_dataset)}` reviews</span>', unsafe_allow_html = True)
    st.write(f'<span class=big-font>Found `{len(rating_dataset)}` ratings</span>', unsafe_allow_html = True)

    if len(rating_dataset) > 0:
        st.markdown("***")
        st.subheader('Raw ratings')

        col1, col2 = st.columns(2)
        with col1:
            # Maps the radio label to the rating_round value show_ratings expects.
            rating_round_d = {'Continuous (0 to 1)': 0, 'Out of 5': 5, 'Out of 10': 10}
            st.session_state.rating_round = st.radio('Rating precision:', list(rating_round_d), index=0)

        with col2:
            plot_types_ls = ['Line', 'Scatter', 'Both']
            st.session_state.plot_type = st.radio('Chart type (for continuous only):', plot_types_ls, index=2)

        fig = scraper.show_ratings(rating_dataset, rating_round=rating_round_d[st.session_state.rating_round], plot_type=st.session_state.plot_type)
        st.pyplot(fig)
        # Ratings are stored on a 0-1 scale; rescale the mean to the chosen precision.
        multiplier = 1 if st.session_state.rating_round == 'Continuous (0 to 1)' else rating_round_d[st.session_state.rating_round]
        st.write(f'<span class=big-font><b>Average <u>raw</u> rating:</b> `{np.mean(rating_dataset) * multiplier: .2f} / {multiplier}`</span>', unsafe_allow_html = True)

    if len(desc_dataset) > 0:
        st.markdown("***")
        st.subheader('Sentiment Analysis')
        col1, col2 = st.columns(2)
        with col1:
            fig, mean = scraper.eval_sentiment(desc_dataset, amazon_model, utils.AMAZON_ROW_LABELS, utils.AMAZON_ROW_VALUES, 'Rating from 1 - 5 stars')
            st.pyplot(fig)
            st.write(f'<span class=big-font><b>Average rating from <u>1 - 5</u>:</b> `{mean: .2f} / 5`</span>', unsafe_allow_html = True)
        with col2:
            fig, mean = scraper.eval_sentiment(desc_dataset, twitter_model, utils.TWITTER_ROW_LABELS, utils.TWITTER_ROW_VALUES, 'Negative / Neutral / Positive')
            st.pyplot(fig)
            st.write(f'<span class=big-font><b>Average rating from <u>0 / 0.5 / 1</u>:</b> `{mean: .2f} / 1`</span>', unsafe_allow_html = True)

        st.markdown("***")
        wc_figures = scraper.create_wordcloud(desc_dataset)
        st.subheader('Word clouds')
        # One column per word cloud: positive, negative, neutral.
        for wc_col, wc_fig in zip(st.columns(3), wc_figures):
            with wc_col:
                st.pyplot(wc_fig)



Overwriting main.py


In [11]:
!streamlit run main.py &>/content/logs.txt &
import urllib
print("Password/Enpoint IP for localtunnel is:",urllib.request.urlopen('https://ipv4.icanhazip.com').read().decode('utf8').strip("\n"))
!npx localtunnel --port 8501 wget -q -O - ipv4.icanhazip.com

Password/Enpoint IP for localtunnel is: 34.125.184.58
[K[?25hnpx: installed 22 in 4.08s
your url is: https://curly-pandas-open.loca.lt
^C
