In [1]:
# pip install google-play-scraper

In [2]:
# pip install nltk

In [3]:
# pip install sastrawi

In [4]:
# pip install pandas

In [5]:
# Memanggil seluruh library yang diperlukan
from google_play_scraper import Sort, reviews

import pandas as pd
import numpy as np

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Unduh resource NLTK (jika belum)
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Proses Scraping

In [7]:
# mulai scraping
result, continuation_token = reviews(
    'com.strava', #masukkan link setelah kata 'id'
    lang='id', # defaults to 'en'
    country='id', # defaults to 'us'
    sort=Sort.NEWEST, # defaults to Sort.MOST_RELEVANT
    count=2000, # defaults to 100
    filter_score_with=None # defaults to None(means all score)
)

In [8]:
# membuat Dataframe dari hasil scraping
df = pd.DataFrame(np.array(result),columns=['review'])
df = df.join(pd.DataFrame(df.pop('review').tolist()))

In [9]:
# mengekspor dataset
df.to_csv("Review Strava-Google Play Store.csv", index=False, encoding='utf-8')

# Membaca Dataset

In [10]:
# Membaca dataset hasil scraping review
file_path = "Review Strava-Google Play Store.csv"
df = pd.read_csv(file_path)

print("=== Data Asli (5 Baris Pertama) ===")
display(df.head())

=== Data Asli (5 Baris Pertama) ===


Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,252770b8-d91f-4b0e-b711-cccbf065738c,Gilang Setiawann,https://play-lh.googleusercontent.com/a-/ALV-U...,oke,5,0,432.11,2025-10-15 19:38:49,,,432.11
1,90a189e7-dc4a-45ff-be39-973205d418fd,si Paipai,https://play-lh.googleusercontent.com/a/ACg8oc...,tolong buat pihak strava saya tidak langganan ...,1,0,432.11,2025-10-15 15:07:38,,,432.11
2,dfb70f17-5e5f-4e6e-8ec1-31824c559c7a,Sajimin Sajimin,https://play-lh.googleusercontent.com/a-/ALV-U...,keren,5,0,432.11,2025-10-15 07:56:41,,,432.11
3,bda1f1d2-445e-4c15-b9c0-7c3237fbf0f1,Bintang Raspati,https://play-lh.googleusercontent.com/a-/ALV-U...,Sangat membantu buat tracking olahraga! mayan ...,5,0,432.11,2025-10-15 07:05:30,,,432.11
4,c3ca84ad-65c0-4eec-b150-2b2416dafc13,PRO FANI,https://play-lh.googleusercontent.com/a/ACg8oc...,Saya tidak sengaja berlangganan di aplikasi in...,1,0,,2025-10-15 01:08:03,,,


# Preprocessing

In [11]:
# Case Folding
# Mengubah semua huruf menjadi huruf kecil
df['casefolding'] = df['content'].astype(str).str.lower()

print("=== Hasil Case Folding ===")
display(df[['content', 'casefolding']].head())


=== Hasil Case Folding ===


Unnamed: 0,content,casefolding
0,oke,oke
1,tolong buat pihak strava saya tidak langganan ...,tolong buat pihak strava saya tidak langganan ...
2,keren,keren
3,Sangat membantu buat tracking olahraga! mayan ...,sangat membantu buat tracking olahraga! mayan ...
4,Saya tidak sengaja berlangganan di aplikasi in...,saya tidak sengaja berlangganan di aplikasi in...


In [12]:
# Cleansing
# Menghapus karakter selain huruf dan spasi
df['cleaned'] = df['casefolding'].apply(lambda x: re.sub(r'[^a-z\s]', '', x))

In [13]:
# Tokenizing
# Memecah kalimat menjadi kata-kata (token)
df['token'] = df['cleaned'].apply(lambda x: word_tokenize(x))

In [14]:
# Stopward
# Menghapus kata-kata umum (stopword)
stop_words = set(stopwords.words('indonesian'))
df['token_stopword'] = df['token'].apply(lambda tokens: [w for w in tokens if w not in stop_words])

In [16]:
# Stemming
# Mengubah kata ke bentuk dasarnya
factory = StemmerFactory()
stemmer = factory.create_stemmer()

df['stemmed'] = df['token_stopword'].apply(lambda tokens: [stemmer.stem(w) for w in tokens])
df['preprocessed_text'] = df['stemmed'].apply(lambda tokens: ' '.join(tokens))

print("=== Contoh Hasil Preprocessing ===")
display(df[['content', 'preprocessed_text']].head())

=== Contoh Hasil Preprocessing ===


Unnamed: 0,content,preprocessed_text
0,oke,oke
1,tolong buat pihak strava saya tidak langganan ...,tolong strava langgan saldo potong tolong stra...
2,keren,keren
3,Sangat membantu buat tracking olahraga! mayan ...,bantu tracking olahraga mayan akurat easy to u...
4,Saya tidak sengaja berlangganan di aplikasi in...,sengaja langgan aplikasi dana dikembalikansaya...


# Menyimpan Hasil

In [18]:
# Mengganti nama kolom 'content' menjadi 'review'
df = df.rename(columns={'content': 'review'})

# Menyesuaikan semua nama kolom menjadi huruf kecil (lowercase)
# Ini adalah langkah penting yang menyebabkan error sebelumnya
df.columns = [c.lower() for c in df.columns]

# Menentukan kolom yang ingin disimpan DENGAN NAMA YANG SUDAH BENAR (lowercase)
selected_columns = ['username', 'review', 'casefolding', 'preprocessed_text', 'score', 'at', 'appversion']

# Menyimpan hasil preprocessing ke file baru
output_path = "Review_Strava_Preprocessed.csv"
df[selected_columns].to_csv(output_path, index=False)

print(f"✅ File hasil preprocessing berhasil disimpan sebagai: {output_path}")
print("\nKolom yang disimpan:")
# Mencetak list untuk memastikan
for col in selected_columns:
    print(f"- {col}")

✅ File hasil preprocessing berhasil disimpan sebagai: Review_Strava_Preprocessed.csv

Kolom yang disimpan:
- username
- review
- casefolding
- preprocessed_text
- score
- at
- appversion
