In [3]:
import re
import csv
import ast
import string
import swifter
import requests
from tqdm import tqdm
from io import StringIO

import pandas as pd
import plotly.graph_objects as go

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Preprocessing

In [5]:
df = pd.read_csv("../datasets/app_review_data_com.tokopedia.tkpd.csv")
print(df.shape)
df.head()

(17881, 9)


Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,at,reviewCreatedVersion,appVersion,clean_content
0,d92ebb06-3dc9-4ce8-98e0-b4ccf565dc67,UMS HERBAL INDONESIA,Jangan keseringan update dong... Bikin kesel t...,2,0,2025-05-27 02:38:46,3.313.0,3.313.0,jangan keseringan update dong bikin kesel tiap...
1,20a3af68-5e34-4abf-be01-db5f4d042cd5,Siti Sukaesih,Pelayanan dan respon oke mantap,5,0,2025-05-27 02:38:21,3.303.0,3.303.0,pelayanan dan respon oke mantap
2,1f2276f1-b51e-498a-b56c-7f4a5ed1ad04,Dedi Hay,lagi di coba dulu,4,0,2025-05-27 02:34:55,3.313.0,3.313.0,lagi di coba dulu
3,776974d8-3f01-4531-abf2-7cf5ce2b4172,Wahyu Saputra,bagus,5,0,2025-05-27 02:20:59,3.313.0,3.313.0,bagus
4,f534201c-2a5e-4f89-a2ab-5249f4f0131c,Hamda Sakhiyah,saya belanja disini selalu dpt harga murah,5,0,2025-05-27 02:11:54,3.313.0,3.313.0,saya belanja disini selalu dpt harga murah


In [None]:
# 1. Fungsi untuk membersihkan teks dari simbol, angka, URL, mention, hashtag, dsb.
def text_clean(text):
    """
    Membersihkan teks dari karakter yang tidak diperlukan seperti:
    - Mention (@username)
    - Hashtag (#tag)
    - URL/link
    - Angka
    - Newline dan tanda baca

    Args:
        text (str): Kalimat mentah.

    Returns:
        str: Kalimat yang sudah dibersihkan.
    """
    if not isinstance(text, str):
        return ""  # Tangani NaN atau tipe non-string

    text = re.sub(r'@[A-Za-z0-9]+', '', text)       # Hapus mention
    text = re.sub(r'#[A-Za-z0-9]+', '', text)       # Hapus hashtag
    text = re.sub(r'http\S+', '', text)             # Hapus URL
    text = re.sub(r'[0-9]+', '', text)              # Hapus angka
    text = text.replace('\n', ' ')                  # Ganti newline dengan spasi
    text = text.translate(str.maketrans('', '', string.punctuation))  # Hapus tanda baca
    text = text.strip()                             # Hapus spasi di awal/akhir
    return text

# 2. Fungsi untuk mengubah semua huruf menjadi huruf kecil
def text_lowercase(text):
    """
    Mengubah semua huruf dalam teks menjadi huruf kecil.

    Args:
        text (str): Kalimat input.

    Returns:
        str: Kalimat dengan huruf kecil semua.
    """
    return text.lower()

# 3. Fungsi untuk melakukan tokenisasi
def text_tokenize(text):
    """
    Memecah kalimat menjadi token (kata per kata).

    Args:
        text (str): Kalimat.

    Returns:
        list: Daftar token kata.
    """
    return word_tokenize(text)

# 4. Fungsi untuk menghapus stopwords (kata umum yang tidak membawa banyak makna)
def text_filter(text):
    """
    Menghapus stopwords dari daftar token.

    Args:
        text (list): Daftar token kata.

    Returns:
        list: Token yang telah difilter (tanpa stopwords).
    """
    list_stopwords = set(stopwords.words('indonesian'))
    return [word for word in text if word not in list_stopwords]

# 5. Inisialisasi stemmer bahasa Indonesia
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# 6. Fungsi untuk melakukan stemming
def text_stemming(text):
    """
    Melakukan stemming terhadap kata-kata dalam teks.

    Args:
        text (list): Daftar kata/token.

    Returns:
        list: Token yang sudah distem (ke bentuk dasarnya).
    """
    return stemmer.stem(' '.join(text)).split()

In [None]:
# 1. Membersihkan dan menurunkan huruf pada konten ulasan
df['clean_content'] = df['content'].apply(text_clean)
df['clean_content'] = df['clean_content'].apply(text_lowercase)

# 2. Tokenisasi teks
tqdm.pandas(desc='Tokenisasi ulasan')
df['processed_content'] = df['clean_content'].progress_apply(text_tokenize)

# 3. Filter stopwords
tqdm.pandas(desc='Filter stopwords')
df['processed_content'] = df['processed_content'].progress_apply(text_filter)

# 4. Stemming (dalam batch 1000 baris agar lebih efisien)
print("Melakukan stemming...")
for i in tqdm(range(0, len(df), 1000), desc="Stemming"):
    df.iloc[i:i+1000, df.columns.get_loc('processed_content')] = \
        df.iloc[i:i+1000]['processed_content'].swifter.apply(text_stemming)

# 5. Menghapus duplikat berdasarkan clean_content
df.drop_duplicates(subset='clean_content', inplace=True)

# Reset deskripsi tqdm agar tidak mengganggu tqdm berikutnya
tqdm.pandas(desc='')

Tokenisasi ulasan: 100%|██████████| 17881/17881 [00:01<00:00, 12313.75it/s]
Filter stopwords: 100%|██████████| 17881/17881 [00:05<00:00, 3450.33it/s]


Melakukan stemming...


Stemming:   0%|          | 0/18 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Stemming:   6%|▌         | 1/18 [02:46<47:12, 166.60s/it]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Stemming:  11%|█         | 2/18 [04:49<37:38, 141.17s/it]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Stemming:  17%|█▋        | 3/18 [06:31<30:47, 123.19s/it]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Stemming:  22%|██▏       | 4/18 [08:11<26:36, 114.03s/it]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Stemming:  28%|██▊       | 5/18 [09:34<22:13, 102.59s/it]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Stemming:  33%|███▎      | 6/18 [11:08<19:56, 99.73s/it] 

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Stemming:  39%|███▉      | 7/18 [12:33<17:26, 95.13s/it]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Stemming:  44%|████▍     | 8/18 [13:42<14:26, 86.65s/it]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Stemming:  50%|█████     | 9/18 [14:46<11:56, 79.62s/it]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Stemming:  56%|█████▌    | 10/18 [16:02<10:29, 78.63s/it]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Stemming:  61%|██████    | 11/18 [17:15<08:57, 76.79s/it]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Stemming:  67%|██████▋   | 12/18 [18:23<07:23, 73.99s/it]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Stemming:  72%|███████▏  | 13/18 [19:29<05:57, 71.53s/it]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Stemming:  78%|███████▊  | 14/18 [20:37<04:41, 70.48s/it]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Stemming:  83%|████████▎ | 15/18 [21:34<03:19, 66.52s/it]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Stemming:  89%|████████▉ | 16/18 [22:32<02:08, 64.04s/it]

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

Stemming:  94%|█████████▍| 17/18 [23:28<01:01, 61.70s/it]

Pandas Apply:   0%|          | 0/881 [00:00<?, ?it/s]

Stemming: 100%|██████████| 18/18 [24:17<00:00, 80.97s/it]


In [None]:
df.head()

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,at,reviewCreatedVersion,appVersion,clean_content,processed_content
0,d92ebb06-3dc9-4ce8-98e0-b4ccf565dc67,UMS HERBAL INDONESIA,Jangan keseringan update dong... Bikin kesel t...,2,0,2025-05-27 02:38:46,3.313.0,3.313.0,jangan keseringan update dong bikin kesel tiap...,"[sering, update, bikin, kesel, buka, aplikasi,..."
1,20a3af68-5e34-4abf-be01-db5f4d042cd5,Siti Sukaesih,Pelayanan dan respon oke mantap,5,0,2025-05-27 02:38:21,3.303.0,3.303.0,pelayanan dan respon oke mantap,"[layan, respon, oke, mantap]"
2,1f2276f1-b51e-498a-b56c-7f4a5ed1ad04,Dedi Hay,lagi di coba dulu,4,0,2025-05-27 02:34:55,3.313.0,3.313.0,lagi di coba dulu,[coba]
3,776974d8-3f01-4531-abf2-7cf5ce2b4172,Wahyu Saputra,bagus,5,0,2025-05-27 02:20:59,3.313.0,3.313.0,bagus,[bagus]
4,f534201c-2a5e-4f89-a2ab-5249f4f0131c,Hamda Sakhiyah,saya belanja disini selalu dpt harga murah,5,0,2025-05-27 02:11:54,3.313.0,3.313.0,saya belanja disini selalu dpt harga murah,"[belanja, dpt, harga, murah]"


In [None]:
df.to_csv("../datasets/pre_processed_data_com.tokopedia.tkpd.csv", index=False)

In [None]:
try:
    # Ganti nama kolom 'processed_content' menjadi 'reprocessed_content'
    df['reprocessed_content'] = df['processed_content']
    df.drop('processed_content', axis=1, inplace=True)

    # Buat mask untuk mendeteksi baris dengan list kosong
    mask = df['reprocessed_content'].apply(lambda x: isinstance(x, list) and len(x) == 0)
    print(f'Jumlah baris yang mengandung list kosong: {mask.sum()}')

except Exception as e:
    print(f'Error saat mengganti dan mengecek data: {e}')

Jumlah baris yang mengandung list kosong: 92


In [None]:
try:
    # Hapus baris yang mengandung list kosong, lalu reset index
    mask = df['reprocessed_content'].apply(lambda x: isinstance(x, list) and len(x) == 0)
    df = df[~mask].reset_index(drop=True)

    # Periksa kembali apakah masih ada list kosong
    mask = df['reprocessed_content'].apply(lambda x: isinstance(x, list) and len(x) == 0)
    print(f'Sisa baris yang mengandung list kosong setelah dibersihkan: {mask.sum()}')

except Exception as e:
    print(f'Error saat membersihkan data: {e}')

Sisa baris yang mengandung list kosong setelah dibersihkan: 0


In [None]:
df.head()

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,at,reviewCreatedVersion,appVersion,clean_content,reprocessed_content
0,d92ebb06-3dc9-4ce8-98e0-b4ccf565dc67,UMS HERBAL INDONESIA,Jangan keseringan update dong... Bikin kesel t...,2,0,2025-05-27 02:38:46,3.313.0,3.313.0,jangan keseringan update dong bikin kesel tiap...,"[sering, update, bikin, kesel, buka, aplikasi,..."
1,20a3af68-5e34-4abf-be01-db5f4d042cd5,Siti Sukaesih,Pelayanan dan respon oke mantap,5,0,2025-05-27 02:38:21,3.303.0,3.303.0,pelayanan dan respon oke mantap,"[layan, respon, oke, mantap]"
2,1f2276f1-b51e-498a-b56c-7f4a5ed1ad04,Dedi Hay,lagi di coba dulu,4,0,2025-05-27 02:34:55,3.313.0,3.313.0,lagi di coba dulu,[coba]
3,776974d8-3f01-4531-abf2-7cf5ce2b4172,Wahyu Saputra,bagus,5,0,2025-05-27 02:20:59,3.313.0,3.313.0,bagus,[bagus]
4,f534201c-2a5e-4f89-a2ab-5249f4f0131c,Hamda Sakhiyah,saya belanja disini selalu dpt harga murah,5,0,2025-05-27 02:11:54,3.313.0,3.313.0,saya belanja disini selalu dpt harga murah,"[belanja, dpt, harga, murah]"


# Labeling

In [None]:
# Fungsi untuk memuat leksikon dari file TSV di GitHub
def load_lexicon_from_github_tsv(url):
    """
    Mengunduh dan memuat leksikon dari URL file TSV di GitHub.

    Args:
        url (str): URL ke file .tsv yang berisi kolom 'word' dan 'weight'.

    Returns:
        dict: Kamus kata dan bobotnya.
    """
    response = requests.get(url)
    response.raise_for_status()
    tsv_content = response.text

    lexicon = {}
    f = StringIO(tsv_content)
    reader = csv.DictReader(f, delimiter='\t')
    for row in reader:
        word = row['word'].strip()
        weight = int(row['weight'].strip())
        lexicon[word] = weight
    return lexicon

# URL leksikon positif dan negatif dari repositori InSet (Fajri91)
url_positive = 'https://raw.githubusercontent.com/fajri91/InSet/master/positive.tsv'
url_negative = 'https://raw.githubusercontent.com/fajri91/InSet/master/negative.tsv'

# Memuat leksikon positif dan negatif
lexicon_positive = load_lexicon_from_github_tsv(url_positive)
lexicon_negative = load_lexicon_from_github_tsv(url_negative)

# Konversi semua bobot negatif ke nilai negatif absolut
lexicon_negative = {k: -abs(v) for k, v in lexicon_negative.items()}

# Gabungkan leksikon positif dan negatif menjadi satu
lexicon = {**lexicon_positive, **lexicon_negative}

# Fungsi untuk menghitung skor dan polaritas sentimen dari teks
def lexicon_polarity_indonesia(text):
    """
    Menentukan polaritas dan skor sentimen berdasarkan leksikon Bahasa Indonesia.

    Args:
        text (list): Token kata hasil preprocessing.

    Returns:
        tuple: (skor_sentimen, polaritas) — polaritas: 'positive', 'negative', atau 'neutral'.
    """
    score = 0
    for word in text:
        if word in lexicon:
            score += lexicon[word]

    if score > 0:
        polarity = 'positive'
    elif score < 0:
        polarity = 'negative'
    else:
        polarity = 'neutral'
    return score, polarity

# Terapkan analisis sentimen ke kolom 'reprocessed_content'
results = df['reprocessed_content'].apply(lexicon_polarity_indonesia)

# Pisahkan hasil menjadi dua kolom: skor dan polaritas
results = list(zip(*results))
df['sentiment_score'] = results[0]
df['sentiment_polarity'] = results[1]

# Tampilkan jumlah masing-masing polaritas
print(df['sentiment_polarity'].value_counts())

sentiment_polarity
negative    8752
positive    3475
neutral     1624
Name: count, dtype: int64


In [None]:
df.head()

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,at,reviewCreatedVersion,appVersion,clean_content,reprocessed_content,sentiment_score,sentiment_polarity
0,d92ebb06-3dc9-4ce8-98e0-b4ccf565dc67,UMS HERBAL INDONESIA,Jangan keseringan update dong... Bikin kesel t...,2,0,2025-05-27 02:38:46,3.313.0,3.313.0,jangan keseringan update dong bikin kesel tiap...,"[sering, update, bikin, kesel, buka, aplikasi,...",-10,negative
1,20a3af68-5e34-4abf-be01-db5f4d042cd5,Siti Sukaesih,Pelayanan dan respon oke mantap,5,0,2025-05-27 02:38:21,3.303.0,3.303.0,pelayanan dan respon oke mantap,"[layan, respon, oke, mantap]",7,positive
2,1f2276f1-b51e-498a-b56c-7f4a5ed1ad04,Dedi Hay,lagi di coba dulu,4,0,2025-05-27 02:34:55,3.313.0,3.313.0,lagi di coba dulu,[coba],-1,negative
3,776974d8-3f01-4531-abf2-7cf5ce2b4172,Wahyu Saputra,bagus,5,0,2025-05-27 02:20:59,3.313.0,3.313.0,bagus,[bagus],-4,negative
4,f534201c-2a5e-4f89-a2ab-5249f4f0131c,Hamda Sakhiyah,saya belanja disini selalu dpt harga murah,5,0,2025-05-27 02:11:54,3.313.0,3.313.0,saya belanja disini selalu dpt harga murah,"[belanja, dpt, harga, murah]",-2,negative


In [None]:
df.to_csv("../datasets/analysis_result_com.tokopedia.tkpd.csv", index=False)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13851 entries, 0 to 13850
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   reviewId              13851 non-null  object
 1   userName              13851 non-null  object
 2   content               13851 non-null  object
 3   score                 13851 non-null  int64 
 4   thumbsUpCount         13851 non-null  int64 
 5   at                    13851 non-null  object
 6   reviewCreatedVersion  10658 non-null  object
 7   appVersion            10658 non-null  object
 8   clean_content         13851 non-null  object
 9   reprocessed_content   13851 non-null  object
 10  sentiment_score       13851 non-null  int64 
 11  sentiment_polarity    13851 non-null  object
dtypes: int64(3), object(9)
memory usage: 1.3+ MB


In [None]:
# Hitung proporsi masing-masing kategori sentimen
relative_size = df['sentiment_polarity'].value_counts(normalize=True)

# Tentukan warna untuk tiap kategori sentimen (dapat disesuaikan)
colors = ['#08306b', '#08519c', '#2171b5']  # Warna biru tua → muda

# Buat Donut Chart menggunakan Plotly
fig = go.Figure(data=[go.Pie(
    labels=relative_size.index,           # Label: positive, negative, neutral
    values=relative_size.values,          # Proporsi tiap kategori
    marker=dict(colors=colors),           # Warna per kategori
    hole=0.3                              # Hole 30% untuk efek Donut Chart
)])

# Update tampilan layout chart
fig.update_layout(
    title_text='Distribusi Polaritas Sentimen',  # Judul grafik
    template='plotly_dark',                      # Tema gelap (bisa ganti 'plotly_white')
    legend_title_text='Kategori Sentimen'        # Judul legenda
)

# Tampilkan grafik
fig.show()