In [20]:
# 📦 Data wrangling
import pandas as pd
import numpy as np
import re
import string

# 📊 Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# 🌐 Scraping Google Play Store
from google_play_scraper import reviews, Sort

# 📝 NLP - NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize

# ⚡ NLP - spaCy
import spacy
import contractions 

# 🤖 Machine Learning tools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

In [21]:
# NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [22]:
package_name = 'com.cygames.umamusume'

# --- Scraping ---
# Ambil 2000 review terbaru
newest_reviews, _ = reviews(
    package_name,
    lang='en',
    country='gb',
    sort=Sort.NEWEST,
    count=4000
)

# Ambil 1000 review paling helpful
helpful_reviews, _ = reviews(
    package_name,
    lang='en',
    country='gb',
    sort=Sort.MOST_RELEVANT,
    count=1000
)

# --- Gabungkan hasil scraping ---
all_reviews = newest_reviews + helpful_reviews
df = pd.DataFrame(all_reviews)
# Hapus duplikat (jika ada)
df = df.drop_duplicates(subset='reviewId').reset_index(drop=True)
# Pilih kolom penting
df = df[['reviewId', 'score', 'content', 'at', 'thumbsUpCount']]
# --- Simpan ke CSV ---
df.to_csv("umamusume_reviews_en.csv", index=False, encoding="utf-8")

print(f"✅ Total collected: {len(df)} reviews")

✅ Total collected: 4287 reviews


In [23]:
# Menampilkan lima baris pertama dari DataFrame app_reviews_df
df.head()

Unnamed: 0,reviewId,score,content,at,thumbsUpCount
0,5230c913-debc-4f61-8691-526727473438,5,Why cows have hooves instead of feet? They're ...,2025-09-15 16:52:52,0
1,212b779b-9a6e-417e-b68c-3c56ede54a33,2,"Sure it's a fun game. But, the issue of having...",2025-09-15 16:38:56,5
2,460b2338-3e8d-4586-9b4a-f5406644019f,5,Umazing👍,2025-09-15 13:58:11,0
3,c8454650-0f08-4ae7-a823-dcab5a8e3ae3,5,ABSOLUTE CINEMA,2025-09-15 13:53:56,0
4,59d1083d-b51a-4e12-879a-593cb7f6c308,5,HASIRE HASIRE UMAMUSUME,2025-09-15 13:39:44,0


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4287 entries, 0 to 4286
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   reviewId       4287 non-null   object        
 1   score          4287 non-null   int64         
 2   content        4287 non-null   object        
 3   at             4287 non-null   datetime64[ns]
 4   thumbsUpCount  4287 non-null   int64         
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 167.6+ KB


In [44]:
# 1. Cleaning
def cleaningText(text):
    # hapus link
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    # hapus HTML entities
    text = re.sub(r"&\w+;", ' ', text)
    # hapus karakter non-ASCII kecuali emoji
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # hapus tanda baca kecuali apostrof (biar "i'm" tetap ada)
    text = re.sub(r"[^\w\s']", '', text)
    # ubah newline ke spasi
    text = text.replace("\n", " ")
    # hapus spasi berlebihan
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# 2. Casefolding
def casefoldingText(text):
    return text.lower()

# Fungsi expand contraction
def expand_contractions(text):
    return contractions.fix(text)

# 3. Tokenizing
def tokenizingText(text):
    return word_tokenize(text)

# 4. Stopwords Filtering
stop_words = set(stopwords.words('english'))
custom_stopwords = {
    "time", "one", "get", "make", "go", "play", "really", "still", "even", "well",
    "much", "many", "lot", "thing", "stuff", "something", "anything", "everything", "nothing",
    "also", "maybe", "actually", "probably", "almost", "kinda", "sort", "bit", "very",
    "etc", "like", "just", "pretty", "rather", "quite", "somehow",
    "game", "app", "application", "gameplay", "character", "horse", "girl", "story",
    "graphics", "system", "feature", "update", "dev", "developer", "version",
    "can", "could", "would", "should", "may", "might", "will", "shall", "must",
    "i", "me", "my", "we", "us", "our", "you", "your", "they", "them", "their", "he", "she", "it", "its"
}
stop_words.update(custom_stopwords)
def filteringText(tokens):
    return [word for word in tokens if word not in stop_words]

# 5. Slang Normalization
# Fungsi untuk baca slang dari file txt
def load_slang_dict(filepath="slang.txt"):
    slang_dict = {}
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if "=" in line:  # pastikan format benar
                key, value = line.split("=", 1)
                slang_dict[key.strip().lower()] = value.strip()
    return slang_dict

# Fungsi normalisasi teks
def normalize_text(text, slang_dict):
    # urutkan slang dari yang panjang ke pendek supaya frasa lebih dulu diproses
    for slang in sorted(slang_dict.keys(), key=len, reverse=True):
        pattern = r"\b" + re.escape(slang) + r"\b"
        text = re.sub(pattern, slang_dict[slang], text, flags=re.IGNORECASE)
    return text

slang_dict = load_slang_dict("slang.txt")

# 6. Lemmatization (spaCy)
nlp = spacy.load("en_core_web_sm")

def lemmatizationText(tokens):
    doc = nlp(" ".join(tokens))  # spaCy butuh kalimat
    return [token.lemma_ for token in doc]

# 7. To sentence
def toSentence(list_words):
    return " ".join(word for word in list_words)



In [45]:
def preprocess_review(text, slang_dict):
    text = cleaningText(text)            # 1. Cleaning
    text = casefoldingText(text)         # 2. Casefolding
    text = expand_contractions(text)     # 3. Expand contractions (🔥 di sini)
    text = normalize_text(text, slang_dict)  # 4. Slang normalization
    tokens = tokenizingText(text)        # 5. Tokenizing
    tokens = filteringText(tokens)       # 6. Stopword filtering
    tokens = lemmatizationText(tokens)   # 7. Lemmatization
    return toSentence(tokens)            # 8. To sentence

from tqdm import tqdm
tqdm.pandas()  # aktifkan tqdm untuk pandas

df['text_final'] = df['content'].fillna("").progress_apply(lambda x: preprocess_review(x, slang_dict))

100%|██████████| 4287/4287 [00:28<00:00, 152.95it/s]


In [46]:
import pandas as pd
from collections import Counter

def get_most_common_words(texts, top_n=100):
    all_tokens = []
    for text in texts:
        cleaned = cleaningText(text)          # pakai fungsi cleaning kamu
        lowered = casefoldingText(cleaned)
        tokens = tokenizingText(lowered)
        all_tokens.extend(tokens)

    # hitung frekuensi kata
    counter = Counter(all_tokens)
    return counter.most_common(top_n)


# Ambil kolom review
reviews = df["text_final"].dropna().tolist()

# Cari kata paling sering muncul
common_words = get_most_common_words(reviews, top_n=100)

# Masukkan ke DataFrame supaya rapi
df_freq = pd.DataFrame(common_words, columns=["word", "count"])

# Lihat hasil
df_freq

Unnamed: 0,word,count
0,gacha,840
1,good,640
2,fun,613
3,love,435
4,fix,429
...,...,...
95,always,103
96,2,102
97,cute,102
98,without,101


In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd

# Inisialisasi VADER
sia = SentimentIntensityAnalyzer()

# Update lexicon supaya kata slang yang sudah dinormalisasi punya bobot sentimen
custom_words = {
    "generous": 2.5,   # positif
    "stingy": -2.5,    # negatif
}

sia.lexicon.update(custom_words)

# Contoh fungsi untuk kasih label
def get_sentiment(text):
    score = sia.polarity_scores(text)['compound']  # ambil skor compound
    if score >= 0.05:
        return "positive"
    elif score <= -0.05:
        return "negative"
    else:
        return "neutral"

# Contoh: buat kolom label dari review
df['label'] = df['text_final'].apply(get_sentiment)

In [49]:
df

Unnamed: 0,reviewId,score,content,at,thumbsUpCount,text_final,label
0,5230c913-debc-4f61-8691-526727473438,5,Why cows have hooves instead of feet? They're ...,2025-09-15 16:52:52,0,cow hoof instead foot lactose,neutral
1,212b779b-9a6e-417e-b68c-3c56ede54a33,2,"Sure it's a fun game. But, the issue of having...",2025-09-15 16:38:56,5,sure fun issue little mean premium currency ha...,positive
2,460b2338-3e8d-4586-9b4a-f5406644019f,5,Umazing👍,2025-09-15 13:58:11,0,amazing,positive
3,c8454650-0f08-4ae7-a823-dcab5a8e3ae3,5,ABSOLUTE CINEMA,2025-09-15 13:53:56,0,masterpiece,positive
4,59d1083d-b51a-4e12-879a-593cb7f6c308,5,HASIRE HASIRE UMAMUSUME,2025-09-15 13:39:44,0,hasire hasire umamusume,neutral
...,...,...,...,...,...,...,...
4282,9576a149-f47d-4798-95d0-37af11c358b6,2,The game keeps crashing whenever i click on th...,2025-07-08 15:15:58,0,keeps crash whenever click scout option please...,positive
4283,622f508c-8bee-4e76-84a5-eb1a2d2d3e4c,1,PLEASEEEEE fix the scout feature! Trying to ge...,2025-07-11 08:02:20,9,pleaseeeee fix scout try good career run terri...,positive
4284,ed9e532d-4d55-4104-a51f-7912548843eb,2,The game is fun with charming characters HOWEV...,2025-07-01 19:14:21,0,fun charming character however accelerate sche...,positive
4285,5abfb637-c41e-4c18-9aab-22174deb379a,1,"I'm on a Galaxy s23 Ultra. After reinstall, re...",2025-07-01 02:00:01,1,galaxy s23 ultra reinstall repair clear cache ...,positive
