In [None]:
# 📦 Data wrangling
import pandas as pd
import numpy as np
import re
import string

# 📊 Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# 🌐 Scraping Google Play Store
from google_play_scraper import reviews, Sort

# 📝 NLP - NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize

# ⚡ NLP - spaCy
import spacy

# 🤖 Machine Learning tools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

In [6]:
# NLTK resources
nltk.download('punkt')
# NLTK 3.9+ loads the Punkt tokenizer data from the 'punkt_tab' package
# instead of the pickled 'punkt' models, so word_tokenize() needs it on
# current releases. Downloading both keeps the notebook runnable either way.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...


True

In [None]:
# --- Scraping configuration ---
package_name = 'com.cygames.umamusume'
REVIEW_LANG = 'en'
REVIEW_COUNTRY = 'gb'
NEWEST_REVIEW_COUNT = 4000     # number of newest reviews requested
RELEVANT_REVIEW_COUNT = 1000   # number of most-relevant reviews requested
REVIEWS_CSV_PATH = "umamusume_reviews_en.csv"

# --- Scraping ---
# Request the NEWEST_REVIEW_COUNT most recent reviews.
# Only the first element of the returned pair (the review records) is used.
newest_reviews, _ = reviews(
    package_name,
    lang=REVIEW_LANG,
    country=REVIEW_COUNTRY,
    sort=Sort.NEWEST,
    count=NEWEST_REVIEW_COUNT
)

# Request the RELEVANT_REVIEW_COUNT most relevant reviews.
helpful_reviews, _ = reviews(
    package_name,
    lang=REVIEW_LANG,
    country=REVIEW_COUNTRY,
    sort=Sort.MOST_RELEVANT,
    count=RELEVANT_REVIEW_COUNT
)

# --- Combine both scraping results ---
all_reviews = newest_reviews + helpful_reviews
if not all_reviews:
    # Without this guard an empty scrape fails further down with a KeyError
    # on 'reviewId', which hides the real cause.
    raise RuntimeError(f"No reviews were returned for package {package_name!r}")
df = pd.DataFrame(all_reviews)
# The two requests can return the same review, so keep one row per reviewId
df = df.drop_duplicates(subset='reviewId').reset_index(drop=True)
# Keep only the columns used for the analysis
df = df[['reviewId', 'score', 'content', 'at', 'thumbsUpCount']]
# --- Save to CSV ---
df.to_csv(REVIEWS_CSV_PATH, index=False, encoding="utf-8")

print(f"✅ Total collected: {len(df)} reviews")

✅ Total collected: 4273 reviews


In [27]:
# Show the first five rows of the reviews DataFrame `df`
df.head(n=5)

Unnamed: 0,reviewId,score,content,at,thumbsUpCount
0,8097847a-9292-4f4e-be23-04a82ee69d6e,2,it keeps getting stuck on the loading screen a...,2025-09-13 15:33:17,0
1,f9f85c62-4363-40eb-b70a-edc298cfcfbc,1,update mid september: i dropped a few bucks fo...,2025-09-13 13:52:23,6
2,3b434889-3a41-4e01-a4b7-5ad9a9189b53,5,The best,2025-09-13 12:49:22,0
3,df496990-9ff0-4a22-be36-567eef25a980,5,Goldshi Goldshi Goldshi,2025-09-13 12:42:18,0
4,49a65a02-7d25-4689-8b4f-c6a38446cc3e,1,The game has too many moving parts. Tutorial w...,2025-09-13 12:29:04,1


In [28]:
# Print a summary of `df`: index range, column names, non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4273 entries, 0 to 4272
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   reviewId       4273 non-null   object        
 1   score          4273 non-null   int64         
 2   content        4273 non-null   object        
 3   at             4273 non-null   datetime64[ns]
 4   thumbsUpCount  4273 non-null   int64         
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 167.0+ KB


In [43]:
# 1. Cleaning
def cleaningText(text):
    """Remove digits, punctuation and line breaks from a review string.

    Steps, in order:
      1. delete every run of ASCII digits,
      2. delete every character that is neither a word character nor
         whitespace,
      3. turn newlines into single spaces,
      4. delete any remaining ASCII punctuation (in practice the underscore,
         which step 2 keeps because it counts as a word character),
      5. trim leading/trailing space characters (tabs are left alone).

    Note: because digits are dropped, a token such as "f2p" becomes "fp",
    so slang normalization has to run before this function if its
    dictionary keys contain digits.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. Inner whitespace runs are not collapsed.
    """
    without_digits = re.sub(r'[0-9]+', '', text)
    without_symbols = re.sub(r'[^\w\s]', '', without_digits)
    single_line = ' '.join(without_symbols.split('\n'))
    punctuation_table = str.maketrans('', '', string.punctuation)
    return single_line.translate(punctuation_table).strip(' ')

# 2. Casefolding
def casefoldingText(text):
    """Return ``text`` converted to lower case.

    Args:
        text: The string to convert.

    Returns:
        The lower-cased copy of ``text``.
    """
    lowered = text.lower()
    return lowered

# 3. Tokenizing
def tokenizingText(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``word_tokenize(text)`` returns (the tokens of ``text``).
    """
    tokens = word_tokenize(text)
    return tokens

# 4. Stopwords Filtering
# English stop words from NLTK, stored as a set for fast membership checks.
stop_words = set(stopwords.words('english'))
def filteringText(tokens):
    """Drop every token that appears in ``stop_words``.

    The comparison is an exact, case-sensitive match against the module-level
    ``stop_words`` set.

    Args:
        tokens: An iterable of token strings.

    Returns:
        A new list with the tokens that are not stop words, in their
        original order.
    """
    kept_tokens = []
    for token in tokens:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

# 5. Slang Normalization
# Read the slang mapping from a text file
def load_slang_dict(filepath="slang.txt"):
    """Load a slang-to-replacement mapping from a ``key=value`` text file.

    Every line that contains ``=`` is split on its first ``=``. The left
    part, trimmed and lower-cased, becomes the key; the right part, trimmed,
    becomes the value. Lines without ``=`` (including blank lines) are
    ignored. If a key occurs more than once, the last occurrence wins.

    Args:
        filepath: Path of the mapping file. Defaults to ``"slang.txt"``.

    Returns:
        dict mapping lower-cased slang terms to their replacement text.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    slang_dict = {}
    # "utf-8-sig" reads plain UTF-8 unchanged but also drops a leading
    # byte-order mark; with plain "utf-8" a BOM would be glued onto the
    # first key and that entry would never match.
    with open(filepath, "r", encoding="utf-8-sig") as f:
        for line in f:
            key, separator, value = line.strip().partition("=")
            if not separator:  # no "=" on this line: not a mapping entry
                continue
            slang_dict[key.strip().lower()] = value.strip()
    return slang_dict

# Text normalization function
def normalize_text(text, slang_dict):
    """Replace slang words in ``text`` using ``slang_dict``.

    The text is split on whitespace and each word is looked up
    case-insensitively. A word is first looked up as-is (so keys that contain
    punctuation still match). If that fails, leading and trailing ASCII
    punctuation is set aside, the remaining core is looked up, and the
    punctuation is re-attached around the replacement. This lets "lol!" or
    "(f2p)" be normalized as well. Words without a match are kept unchanged.

    Args:
        text: The input string.
        slang_dict: Mapping of lower-cased slang terms to replacement text.

    Returns:
        The normalized words joined by single spaces (whitespace runs in the
        input are therefore collapsed).
    """
    punctuation = string.punctuation
    normalized = []
    for word in text.split():
        lowered = word.lower()
        if lowered in slang_dict:
            normalized.append(slang_dict[lowered])
            continue

        core = word.strip(punctuation)
        core_key = core.lower()
        if core and core_key in slang_dict:
            # Re-attach whatever punctuation surrounded the slang term.
            leading = word[:len(word) - len(word.lstrip(punctuation))]
            trailing = word[len(word.rstrip(punctuation)):]
            normalized.append(leading + slang_dict[core_key] + trailing)
        else:
            normalized.append(word)
    return " ".join(normalized)

# 🔥 Try it out: load the slang mapping and normalize one sample sentence
slang_dict = load_slang_dict("slang.txt")

sample = "I'm f2p but still got good pulls lol"
normalized_sample = normalize_text(sample, slang_dict)
print(normalized_sample)

I'm free to play but still got good gacha attempts laugh out loud
