# Analyze Google Play Store App Reviews using Snowflake Cortex LLMs

In [None]:
# Python Packages
import streamlit as st
import pandas as pd
from google_play_scraper import Sort, reviews_all
import warnings
warnings.filterwarnings("ignore")

# Snowpark Packages
from snowflake.cortex import Complete
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark.functions import col, lit
import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T

# Obtain the Snowpark session that is active in this environment; the cells
# below use it to write the scraped reviews into a Snowflake table.
session = get_active_session()

## Retrieve Google Play Store Reviews

In [None]:
# Display name -> Google Play package id of every app whose reviews are loaded.
# NOTE(review): 'ABB-free@home® Next' and 'ABB-free@home®' map to the same
# package id, so the same reviews are loaded twice under two different names.
# Confirm whether one of the two ids is wrong.
appids = {
    'ABB-free@home® Next':'com.abb.freeathomeflex',
    'ABB-free@home®':'com.abb.freeathomeflex',
    'ABB-Welcome':'com.abb.welcome',
    'Busch-ControlTouch':'de.buschjaeger.controltouch',
    'Drivetune':'com.abb.spider',
    'Smart Sensor Platform':'com.abb.ability.smartsensor'
}

# Collect one frame per app and concatenate once at the end. Starting from an
# empty list on every execution means re-running this cell can never append to
# a `df` left over from an earlier run, and no exception handling is needed to
# detect the first iteration.
review_frames = []
for app_name, app_id in appids.items():
    print(f"Loading rewviews for appid: {app_id}")
    reviews = reviews_all(
        app_id,
        sleep_milliseconds=10,  # defaults to 0
        lang='en',  # defaults to 'en'
        country='us',  # defaults to 'us'
        sort=Sort.NEWEST,  # defaults to Sort.MOST_RELEVANT
    )
    reviews_df = pd.DataFrame(reviews)
    print(f"Loaded {len(reviews_df)} rewviews for appid: {app_id}")
    # Tag every review with the app it belongs to so the apps can be told
    # apart after all frames are merged.
    reviews_df['APP_NAME'] = app_name
    reviews_df['APP_ID'] = app_id
    review_frames.append(reviews_df)

df = pd.concat(review_frames, ignore_index=True)

# The later cells refer to the columns by upper-case names ('CONTENT',
# 'APP_NAME', ...). The loop variable is deliberately not called `col`, which
# is also the name of the imported Snowpark column function.
df.columns = [column_name.upper() for column_name in df.columns]

## Persist reviews in Snowflake table

In [None]:
# Upload the scraped reviews to the ABB_APP_REVIEWS table, replacing whatever
# it held before (the table is created automatically when it does not exist).
app_reviews = session.write_pandas(
    df,
    table_name='ABB_APP_REVIEWS',
    overwrite=True,
    auto_create_table=True,
)

# Keep only reviews scored below 2 whose text is longer than 10 characters.
low_score = col('score') < 2
has_enough_text = F.length('CONTENT') > 10
app_reviews = app_reviews.filter(low_score).filter(has_enough_text)

print('Numer of reviews:', app_reviews.count())
# Trailing expression: lets the notebook render the filtered reviews.
app_reviews

## Analyze Reviews with Cortex

In [None]:
llm_model = 'llama3.1-8b'

# Build the per-row prompt: instruction, the review text, then the required
# output format. The pieces are separated by explicit newlines so the review
# text does not run straight into the instructions that follow it, and the
# example uses quoted keys because the answer is fed to try_parse_json, which
# yields NULL for anything that is not valid JSON.
prompt = F.concat(
    lit(
        "Derive the sentiment (1-5) from the review and list the main "
        "complaint of the review if there is any.\n"
        "The review: "
    ),
    col('CONTENT'),
    lit(
        "\nReturn a JSON like this "
        '{"sentiment": sentiment, "main_complaint": "main_complaint"}.\n'
        "Only return the JSON, no other text."
    ),
)

# Run the LLM once per review and parse its answer. cache_result() stores the
# outcome so the following cells reuse it instead of calling the model again.
llm_answer = F.call_builtin('try_parse_json', Complete(llm_model, prompt))
app_reviews = app_reviews.with_column('LLM_OUTPUT', llm_answer).cache_result()

# Drop rows whose answer could not be parsed as JSON.
app_reviews = app_reviews.filter(col('LLM_OUTPUT').is_not_null())

# Pull the two fields out of the parsed JSON into typed columns.
app_reviews = app_reviews.with_column('LLM_SENTIMENT', col('LLM_OUTPUT')['sentiment'].cast('int'))
app_reviews = app_reviews.with_column('LLM_TOP_COMPLAINT', col('LLM_OUTPUT')['main_complaint'].cast('string'))

# Keep only the columns the charts and summaries below need.
app_reviews = app_reviews[['APP_NAME','CONTENT','SCORE','LLM_SENTIMENT','LLM_TOP_COMPLAINT']]
app_reviews.show()

In [None]:
# Count the reviews per app and LLM-derived sentiment, then draw the counts as
# a bar chart with one colour per app.
review_count = F.count('APP_NAME').alias('COUNT')
plot_df = app_reviews.group_by('APP_NAME', 'LLM_SENTIMENT').agg(review_count)

st.subheader('ABB App Reviews by App')
st.bar_chart(plot_df, x='LLM_SENTIMENT', y='COUNT', color='APP_NAME')

In [None]:
# For every app, hand the extracted sentiments and complaints to the LLM and
# render its summary of the three main reasons for bad reviews.
for key in appids:
    st.subheader(f'Top reasons for bad reviews for App: {key}')
    complaints = app_reviews.filter(col('APP_NAME') == key)[['LLM_SENTIMENT','LLM_TOP_COMPLAINT']].to_pandas()
    if complaints.empty:
        # No review of this app survived the filters. Sending a header-only
        # table to the model would only invite it to make up reasons.
        st.markdown('No matching reviews found for this app.')
        continue
    llm_data = complaints.to_markdown()
    response = Complete(llm_model, f'What are the three main reasons for bad reviews? The review data: {llm_data}')
    st.markdown(response)