In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from collections import defaultdict

# NLP
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import emoji
from nltk.corpus import stopwords
from deep_translator import GoogleTranslator

# Viz
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

[nltk_data] Downloading package punkt to C:\Users\Oriza
[nltk_data]     Sativa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the labelled Play Store reviews; the file is semicolon-delimited.
playstore_csv = "C:/Kuliah/Bangkit/Capstone/Dataset/Tambahan/label/playstore.csv"
df = pd.read_csv(playstore_csv, sep=';')

# Keep only the three columns the rest of the notebook works with.
kept_columns = ["at", "text_data", 'label']
df = df[kept_columns]

# Tokens whose repeated runs are collapsed by repeatcharClean(): punctuation
# (including a few multi-character sequences) followed by a-z and A-Z.
# The order matters because the tokens are processed one after another.
character = ['.',',',';',':','-,','...','?','!','!!!','(',')','[',']','{','}','<','>','"','/','\'','#','-','@',
             'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',
             'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']


# remove repeated characters
def repeatcharClean(text):
    """Collapse every run of three or more identical tokens into a single one.

    Each entry of the module-level ``character`` list is handled in list
    order. A run of exactly two (e.g. the "oo" in "good") is left alone.

    The previous implementation replaced runs of exactly 5, then 4, then 3
    characters with ``str.replace``. That left two characters behind for
    some run lengths (6, 9, 10, ...) while collapsing others (3, 4, 5, 7, 8)
    to one. A single ``{3,}`` regex pass per token gives the same result for
    every run length.

    Args:
        text: the string to clean.

    Returns:
        The string with long runs collapsed.
    """
    for token in character:
        run_of_three_or_more = '(?:' + re.escape(token) + '){3,}'
        # A callable replacement keeps the token literal (no backslash
        # processing of the replacement text).
        text = re.sub(run_of_three_or_more, lambda _match, keep=token: keep, text)
    return text

def clean_review(text):
    """Normalise one raw review string.

    Steps, in order: lower-case, newlines to spaces, strip emoji and text
    emoticons, strip links and @usernames, unwrap hashtags, drop everything
    except letters and ``, . ? !``, collapse repeated characters, translate,
    and squeeze multiple spaces.

    Args:
        text: the raw review. Missing values (anything for which
            ``pd.notna`` is false) are returned unchanged.

    Returns:
        The cleaned string, or the original missing value.
    """
    if pd.notna(text):

        # lower-case the text
        text = text.lower()
        # turn line breaks into spaces
        text = re.sub(r'\n', ' ', text)
        # remove emoji: demojize() rewrites them as :name: which is then stripped
        text = emoji.demojize(text)
        text = re.sub(':[A-Za-z_-]+:', ' ', text) # delete emoji
        # remove text emoticons (":)", ";p", "xd", ...).
        # The lookarounds stop the pattern from eating letters inside ordinary
        # words: without them "expired" became "e ired" (the "xp" matched) and
        # "harga:per" lost its "p". An emoticon glued to the end of a word
        # ("mantap:d") is still removed because only the x-form needs a
        # non-alphanumeric character in front of it.
        text = re.sub(r"(?:(?<![A-Za-z0-9])[xX]|[;:])'?[dDpPvVoO3)(](?![A-Za-z0-9])", ' ', text)
        # remove links
        text = re.sub(r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})", "", text)
        # remove @usernames
        text = re.sub(r"@[^\s]+[\s]?", ' ', text)
        # remove the '#' of hashtags but keep the tag text
        text = re.sub(r'#(\S+)', r'\1', text)
        # remove digits and every symbol other than , . ? !
        text = re.sub('[^a-zA-Z,.?!]+',' ',text)
        # collapse repeated characters
        text = repeatcharClean(text)
        # translate
        # NOTE(review): translate() is not defined in the visible cells --
        # presumably a helper around GoogleTranslator from another cell; confirm
        # it exists before running on a fresh kernel.
        text = translate(text)
        # squeeze runs of spaces into one
        text = re.sub('[ ]+',' ',text)
    return text

def preprocess_v1(df):
    """Clean the ``text_data`` column and drop reviews that end up empty.

    Works on a copy, so the caller's frame is not modified. The original
    index labels are kept (no reset).

    Args:
        df: DataFrame with a ``text_data`` column of raw review strings.

    Returns:
        A new DataFrame whose ``text_data`` holds the output of
        ``clean_review``; rows whose cleaned text is missing, ``''`` or
        ``' '`` are removed.
    """
    df_pp = df.copy()
    cleaned = df_pp['text_data'].map(clean_review)

    # Assign the result back explicitly. The earlier
    # `df_pp.text_data.replace(..., inplace=True)` was a chained in-place call
    # on a column accessor; under pandas Copy-on-Write it acts on a temporary
    # and leaves df_pp unchanged, so empty reviews would never be dropped.
    df_pp['text_data'] = cleaned.replace(['', ' '], np.nan)

    # Delete empty rows
    return df_pp.dropna(subset=['text_data'])

# Run the cleaning pipeline on the loaded reviews. `df` itself is left
# untouched because preprocess_v1 works on a copy.
df_v1 = preprocess_v1(df)
df_v1


In [3]:
def tokenized(data):
    """Tokenize every entry of ``data`` with NLTK's ``word_tokenize``.

    Args:
        data: a pandas Series of strings.

    Returns:
        A Series aligned with ``data`` holding one token list per entry.
    """
    tokenizer = nltk.word_tokenize
    token_lists = data.apply(tokenizer)
    return token_lists

In [4]:
# Add a 'tokenized' column holding the token list of each cleaned review.
df_v1['tokenized'] = tokenized(df_v1['text_data'])
# Preview the first rows.
df_v1.head()

Unnamed: 0,at,text_data,label,tokenized
0,2023-11-18 04:48:36,kenapa ribet sekali ingin mengganti alamat sur...,Negative,"[kenapa, ribet, sekali, ingin, mengganti, alam..."
1,2023-11-14 22:34:19,"kecewa dengan aplikasi bukalapak, bintang lah,...",Negative,"[kecewa, dengan, aplikasi, bukalapak, ,, binta..."
2,2023-11-14 07:55:02,biasanya ada daftar harga langsung token list...,Negative,"[biasanya, ada, daftar, harga, langsung, token..."
3,2023-11-06 15:57:04,jagan beli pulsa di bualapak..karena bukan top...,Negative,"[jagan, beli, pulsa, di, bualapak, .., karena,..."
4,2023-11-12 01:30:38,"pembelian sering dibatalkan sendiri, keteranga...",Negative,"[pembelian, sering, dibatalkan, sendiri, ,, ke..."


In [11]:
from nltk.corpus import stopwords

def remove_stopwords(data):
    """Drop stopword tokens from every token list in ``data``.

    The stopword set is NLTK's Indonesian list plus a few project-specific
    additions (fillers and app/brand names).

    Args:
        data: a pandas Series whose entries are lists of tokens.

    Returns:
        A Series of token lists with the stopwords removed; token order is
        preserved.
    """
    # NOTE(review): 'barang yang' is two words, so it can never equal a single
    # token and has no effect here.
    extra_stop_words = ['yang','ya','gue','nya','saja','barang yang','kalau','sih','kayak','tetapi','memang','deh','tokopedia','bukalapak','genshin','ml','tokped','bagu']
    blocked = set(stopwords.words('indonesian')) | set(extra_stop_words)

    def keep_content_tokens(tokens):
        return [token for token in tokens if token not in blocked]

    return data.apply(keep_content_tokens)


In [12]:
# Fetch the NLTK stopwords corpus that remove_stopwords() reads through
# stopwords.words('indonesian').
nltk.download('stopwords')
from nltk.corpus import stopwords

# Store the stopword-filtered token lists in a new 'no_stopwords' column,
# then display the frame.
df_v1['no_stopwords'] = remove_stopwords(df_v1['tokenized'])
df_v1

[nltk_data] Downloading package stopwords to C:\Users\Oriza
[nltk_data]     Sativa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,at,text_data,label,tokenized,no_stopwords
0,2023-11-18 04:48:36,kenapa ribet sekali ingin mengganti alamat sur...,Negative,"[kenapa, ribet, sekali, ingin, mengganti, alam...","[ribet, mengganti, alamat, surel, aktif, buka,..."
1,2023-11-14 22:34:19,"kecewa dengan aplikasi bukalapak, bintang lah,...",Negative,"[kecewa, dengan, aplikasi, bukalapak, ,, binta...","[kecewa, aplikasi, ,, bintang, ,, udah, x, tra..."
2,2023-11-14 07:55:02,biasanya ada daftar harga langsung token list...,Negative,"[biasanya, ada, daftar, harga, langsung, token...","[daftar, harga, langsung, token, listrik, harg..."
3,2023-11-06 15:57:04,jagan beli pulsa di bualapak..karena bukan top...,Negative,"[jagan, beli, pulsa, di, bualapak, .., karena,...","[jagan, beli, pulsa, bualapak, .., topup, .., ..."
4,2023-11-12 01:30:38,"pembelian sering dibatalkan sendiri, keteranga...",Negative,"[pembelian, sering, dibatalkan, sendiri, ,, ke...","[pembelian, dibatalkan, ,, keterangannya, pemb..."
...,...,...,...,...,...
28465,2020-05-18 10:57:31,"para costumer gi yang terhormat,,tolong dong k...",Negative,"[para, costumer, gi, yang, terhormat, ,, ,tolo...","[costumer, gi, terhormat, ,, ,tolong, order, t..."
28466,2019-04-03 14:17:34,membantu banget . kadang greget sih di bagian ...,Negative,"[membantu, banget, ., kadang, greget, sih, di,...","[membantu, banget, ., kadang, greget, voucher,..."
28467,2022-03-10 12:33:52,kenapa saya tidak menerima kode otp untuk akti...,Negative,"[kenapa, saya, tidak, menerima, kode, otp, unt...","[menerima, kode, otp, aktivasi, gopay, nomor, ..."
28468,2019-02-18 11:02:38,ship sangat membantu. raport saya di nilai pel...,Positive,"[ship, sangat, membantu, ., raport, saya, di, ...","[ship, membantu, ., raport, nilai, pelit, ngga..."


In [13]:
# Glue each filtered token list back into one space-separated string.
df_v1['ready'] = df_v1['no_stopwords'].apply(' '.join)
df_v1.head()

Unnamed: 0,at,text_data,label,tokenized,no_stopwords,ready
0,2023-11-18 04:48:36,kenapa ribet sekali ingin mengganti alamat sur...,Negative,"[kenapa, ribet, sekali, ingin, mengganti, alam...","[ribet, mengganti, alamat, surel, aktif, buka,...",ribet mengganti alamat surel aktif buka bantua...
1,2023-11-14 22:34:19,"kecewa dengan aplikasi bukalapak, bintang lah,...",Negative,"[kecewa, dengan, aplikasi, bukalapak, ,, binta...","[kecewa, aplikasi, ,, bintang, ,, udah, x, tra...","kecewa aplikasi , bintang , udah x transaksi p..."
2,2023-11-14 07:55:02,biasanya ada daftar harga langsung token list...,Negative,"[biasanya, ada, daftar, harga, langsung, token...","[daftar, harga, langsung, token, listrik, harg...",daftar harga langsung token listrik harga tota...
3,2023-11-06 15:57:04,jagan beli pulsa di bualapak..karena bukan top...,Negative,"[jagan, beli, pulsa, di, bualapak, .., karena,...","[jagan, beli, pulsa, bualapak, .., topup, .., ...",jagan beli pulsa bualapak .. topup .. transfer...
4,2023-11-12 01:30:38,"pembelian sering dibatalkan sendiri, keteranga...",Negative,"[pembelian, sering, dibatalkan, sendiri, ,, ke...","[pembelian, dibatalkan, ,, keterangannya, pemb...","pembelian dibatalkan , keterangannya pembeli y..."


In [14]:
# Drop the intermediate columns, expose the cleaned text as 'title', and fix
# the column order to at / title / label.
df_v1 = (
    df_v1
    .drop(['text_data', 'tokenized', 'no_stopwords'], axis=1)
    .rename(columns={"ready": "title"})
    .reindex(columns=['at', 'title', 'label'])
)
df_v1

Unnamed: 0,at,title,label
0,2023-11-18 04:48:36,ribet mengganti alamat surel aktif buka bantua...,Negative
1,2023-11-14 22:34:19,"kecewa aplikasi , bintang , udah x transaksi p...",Negative
2,2023-11-14 07:55:02,daftar harga langsung token listrik harga tota...,Negative
3,2023-11-06 15:57:04,jagan beli pulsa bualapak .. topup .. transfer...,Negative
4,2023-11-12 01:30:38,"pembelian dibatalkan , keterangannya pembeli y...",Negative
...,...,...,...
28465,2020-05-18 10:57:31,"costumer gi terhormat , ,tolong order tuh bene...",Negative
28466,2019-04-03 14:17:34,membantu banget . kadang greget voucher motorn...,Negative
28467,2022-03-10 12:33:52,"menerima kode otp aktivasi gopay nomor , aplik...",Negative
28468,2019-02-18 11:02:38,"ship membantu . raport nilai pelit nggak tip ,...",Positive


In [15]:
# Slang ("alay") lexicon: the two columns are named here as
# 'original' and 'replacement'.
alay_dict = pd.read_csv(
    'new_kamusalay.csv',
    names=['original', 'replacement'],
    encoding='latin-1',
)

# Lookup table from each 'original' entry to its 'replacement'.
alay_dict_map = {
    slang: formal
    for slang, formal in zip(alay_dict['original'], alay_dict['replacement'])
}

def normalize_alay(text):
    """Swap every word found in ``alay_dict_map`` for its mapped replacement.

    The text is split on single spaces; words without an entry in the map
    are kept as they are, and the result is re-joined with single spaces.
    """
    words = text.split(' ')
    normalized = (alay_dict_map.get(word, word) for word in words)
    return ' '.join(normalized)

In [16]:
# Replace slang words in every title using the alay lexicon, then display the
# frame. This overwrites 'title' in place, so re-running the cell applies the
# mapping again to already-normalised text.
df_v1['title'] = df_v1['title'].apply(normalize_alay)
df_v1

Unnamed: 0,at,title,label
0,2023-11-18 04:48:36,ribet mengganti alamat surel aktif buka bantua...,Negative
1,2023-11-14 22:34:19,"kecewa aplikasi , bintang , sudah x transaksi ...",Negative
2,2023-11-14 07:55:02,daftar harga langsung token listrik harga tota...,Negative
3,2023-11-06 15:57:04,jangan beli pulsa bualapak .. top up .. transf...,Negative
4,2023-11-12 01:30:38,"pembelian dibatalkan , keterangannya pembeli y...",Negative
...,...,...,...
28465,2020-05-18 10:57:31,"costumer lagi terhormat , ,tolong order itu be...",Negative
28466,2019-04-03 14:17:34,membantu banget . kadang greget kupon motornya...,Negative
28467,2022-03-10 12:33:52,"menerima kode otp aktivasi gopay nomor , aplik...",Negative
28468,2019-02-18 11:02:38,ship membantu . raport nilai pelit enggak tip ...,Positive


In [17]:
# Downsample the 'Negative' class: keep at most NEGATIVE_CAP of its rows (the
# first ones in frame order) and every row with any other label.
# 10471 equals the 'Positive' count printed by the label-count cell below.
NEGATIVE_CAP = 10471

is_negative = df_v1['label'] == 'Negative'
negative_rows = df_v1[is_negative]
other_rows = df_v1[~is_negative]

df_v1 = pd.concat([negative_rows.head(NEGATIVE_CAP), other_rows])


In [18]:
# Count the rows per class in the 'label' column
# (the values compared elsewhere in this notebook are capitalised, e.g. 'Negative').
label_counts = df_v1['label'].value_counts()

# Display the counts
print(label_counts)


label
Negative    10471
Positive    10471
Name: count, dtype: int64


In [None]:
# Persist the processed frame as a semicolon-delimited CSV. No `index=` is
# passed, so the DataFrame index is written as the first column (to_csv default).
df_v1.to_csv("C:/Kuliah/Bangkit/Capstone/Dataset/Tambahan/play_stopword.csv",sep=';')

In [None]:
import csv

# Count distinct field values in the semicolon-delimited label file.
vocabulary = set()  # set membership keeps the "already seen" check cheap
i = 0  # number of distinct values added so far

with open("C:/Kuliah/Bangkit/Capstone/Dataset/Tambahan/label_playstore.csv", 'r', encoding='utf-8', newline='') as file:
    reader = csv.reader(file, delimiter=';')
    for row in reader:
        for word in row:
            # NOTE(review): the first already-seen value stops processing of
            # the rest of this row; if every field should be inspected this
            # wants `continue` instead of `break` -- confirm intent.
            if word in vocabulary:
                break
            vocabulary.add(word)
            i += 1

print(i)


In [None]:
# Longest field (in characters) seen for each column position.
lengths = {}
# NOTE(review): this file is parsed with delimiter=';' in the vocabulary cell,
# but as tab-separated ('excel-tab') here -- confirm which one is intended.
with open("C:/Kuliah/Bangkit/Capstone/Dataset/Tambahan/label_playstore.csv",'r', encoding='utf-8', newline='') as csvfile:
    test = csv.reader(csvfile, dialect='excel-tab')
    for row in test:
        for colno, col in enumerate(row):
            longest_so_far = lengths.setdefault(colno, 0)
            lengths[colno] = max(len(col), longest_so_far)
# Only the entry for the last column position visited is printed.
print(lengths[colno])

In [None]:
# Read the file again (tab-separated dialect) and transpose it: zip(*rows)
# turns the row lists into one tuple per column position. zip stops at the
# shortest row, so fields beyond that length are not included.
with open("C:/Kuliah/Bangkit/Capstone/Dataset/Tambahan/label_playstore.csv",'r', encoding='utf-8', newline='') as csvfile:
    test = csv.reader(csvfile, dialect='excel-tab')
    columns = list(zip(*test))

In [None]:
# Print the maximum value of each column tuple. The fields are strings, so
# max() picks the lexicographically greatest one, not the longest.
# NOTE(review): if the longest field was intended, use max(col, key=len).
for col in columns:
    print(max(col)) 