# Gemini App Reviews - Sentiment Analysis

## Project Overview
Analisis sentimen dari ulasan aplikasi **Google Gemini** di Play Store menggunakan:
- **Web Scraping** dari Google Play Store
- **Word Frequency Analysis** (menghitung kata yang paling sering muncul)
- **Linear Regression** untuk prediksi rating
- **Logistic Regression** untuk klasifikasi sentimen

---

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re

# Scraping
from google_play_scraper import app, Sort , reviews_all, reviews

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from textblob import TextBlob

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix

# Fetch the NLTK resources this notebook relies on (tokenizer models and
# stop-word lists); packages that are already present are left as they are.
for _nltk_package in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(_nltk_package)

# Marker that every import and download above finished without raising.
print("succes")

succes


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\test\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\test\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\test\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Scraping Google Play reviews

In [26]:
# Play Store package id of the Google Gemini app
APP_ID = 'com.google.android.apps.bard'

# Store-front country codes to query, and the display name for each code
countries = ['us', 'id', 'sg', 'cn', 'gb', 'ca']
country_name = {
    'us': 'United States',
    'id': 'Indonesia',
    'sg': 'Singapore',
    'cn': 'China',
    'gb': 'United Kingdom',
    'ca': 'Canada',
}

# Print the headline app metadata for each country. The lookup is
# best-effort: a failure for one country is reported (with its cause)
# and the loop moves on to the next one.
for country_code in countries:
    try:
        app_info = app(APP_ID, lang='en', country=country_code)

        print(f"\nData for {country_code.upper()}")
        print(f"APP Name : {app_info['title']}")
        print(f"Rating : {app_info['score']:.2f}")
        print(f"Download : {app_info['installs']}")
        print(f"Total Reviews : {app_info['reviews']:,}")

    except Exception as e:
        # Include the exception so a failure can actually be diagnosed.
        print(f"failed to fetch data for {country_code.upper()}: {e}")


Data for US
APP Name : Google Gemini
Rating : 4.54
Download : 1,000,000,000+
Total Reviews : 102,556

Data for ID
APP Name : Google Gemini
Rating : 4.72
Download : 1,000,000,000+
Total Reviews : 104,576

Data for SG
APP Name : Google Gemini
Rating : 4.76
Download : 1,000,000,000+
Total Reviews : 2,618

Data for CN
APP Name : Google Gemini
Rating : 4.55
Download : 1,000,000,000+
Total Reviews : 1,617,858

Data for GB
APP Name : Google Gemini
Rating : 4.53
Download : 1,000,000,000+
Total Reviews : 14,948

Data for CA
APP Name : Google Gemini
Rating : 4.57
Download : 1,000,000,000+
Total Reviews : 9,647


In [27]:
# Scrape the newest reviews of the app for every configured country.
print("may take a few minutes")

# Maximum number of reviews requested per country
REVIEWS_PER_COUNTRY = 2000

all_reviews = []     # review dicts from every country, in scrape order
scraping_stats = {}  # upper-cased country code -> number of reviews scraped

for country_code in countries:
    code = country_code.upper()
    label = country_name.get(country_code, code)
    try:
        # reviews() returns the review dicts plus a continuation token;
        # the token is not used by this cell.
        result, continuation_token = reviews(
            APP_ID,
            lang='en',
            country=country_code,
            sort=Sort.NEWEST,
            count=REVIEWS_PER_COUNTRY,
        )

        # Tag each review with the country it was scraped for
        for review in result:
            review['country'] = code
            review['country_name'] = label

        all_reviews.extend(result)
        scraping_stats[code] = len(result)

        print(f"\n    {label}: {len(result)} reviews")

    except Exception as e:
        # Best-effort: report the failure and continue with the next country
        print(f"Failed to scrape reviews for {code} : {e}")
        scraping_stats[code] = 0

# The total was previously printed without ever being assigned (NameError).
total_scraped_reviews = len(all_reviews)

print("\nScraping Complete")
print(f"Total reviews scraped across all countries: {total_scraped_reviews}")

may take a few minutes

    United States: 2000 reviews

    Indonesia: 2000 reviews

    Singapore: 2000 reviews

    China: 2000 reviews

    United Kingdom: 2000 reviews

    Canada: 2000 reviews

Scraping Complete
Total reviews scraped across all countries: 12000


In [29]:
# Convert the list of scraped review dicts into a DataFrame
# (one row per review, one column per dict key).
df_raw = pd.DataFrame(all_reviews)

# Quick sanity check of what the scrape produced
print(f"available columns from scraping {list(df_raw.columns)}")
print(f"shape {df_raw.shape}")

ävailable columns from scraping ['reviewId', 'userName', 'userImage', 'content', 'score', 'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent', 'repliedAt', 'appVersion', 'country', 'country_name']
shape (12000, 13)


# Feature engineering

In [None]:
# Build the working DataFrame from the raw scrape, keeping only the columns
# used for analysis and renaming them to snake_case.
df = pd.DataFrame()

# basic columns
df['review_id'] = range(1, len(df_raw) + 1)  # sequential 1-based id
df['user_name'] = df_raw['userName']
# Take the review text from the raw 'content' column (previously the
# one-element list ['content'] was assigned instead of the column).
df['review_text'] = df_raw['content']
df['rating'] = df_raw['score']
df['thumbs_up'] = df_raw['thumbsUpCount']
df['review_date'] = df_raw['at']
# The scrape stores the upper-cased country code under 'country';
# df_raw has no 'country_code' column.
df['country_code'] = df_raw['country']
df['country_name'] = df_raw['country_name']

# date features
df['review_date'] = pd.to_datetime(df['review_date'])