In [1]:
# Load data ke dalam data frame 
import pandas as pd

# Spesifikasi encoding diperlukan karena data tidak menggunakan UTF-8
df = pd.read_csv('scraping_skingame-neg.csv', encoding='latin-1')

df.head()

Unnamed: 0,Review,Product,Rating,Username,Date
0,"barang sampai di saya, plastik seal nya udh ru...",Skin Game Daily Kind Moisturizer,bintang 1,e***a,Lebih dari 1 tahun lalu
1,"Bau nya ko beda ya, bikin mual, aneh aja ngga ...",Skin Game Daily Kind Facial Wash,bintang 1,shopi,Lebih dari 1 tahun lalu
2,No review found,Skin Game Theory of Everything,bintang 1,Rachmat,8 bulan lalu
3,No review found,Skin Game Acne Warrior,bintang 1,K***n,2024-01-27
4,Tutup botol mampet,Skin Game Acne Warrior,bintang 1,b***u,Lebih dari 1 tahun lalu


In [2]:
# Drop kolom yang tidak terpakai
df = df.drop(df.iloc[:,1:], axis=1)

# Cek data teratas
df.head()

Unnamed: 0,Review
0,"barang sampai di saya, plastik seal nya udh ru..."
1,"Bau nya ko beda ya, bikin mual, aneh aja ngga ..."
2,No review found
3,No review found
4,Tutup botol mampet


PREPROCESSING

In [3]:
# PROSES CASE FOLDING (menjadikan semua huruf di kolom Review menjadi huruf kecil)
df['Review'] = df['Review'].str.lower()

print('Case Folding Result: \n')
df.head()

Case Folding Result: 



Unnamed: 0,Review
0,"barang sampai di saya, plastik seal nya udh ru..."
1,"bau nya ko beda ya, bikin mual, aneh aja ngga ..."
2,no review found
3,no review found
4,tutup botol mampet


In [4]:
# import hasil case folding menjadi csv
df.to_csv("casefolding_skingame.csv")

In [5]:
# CLEANING
import re # regex library

def remove_tweet_special(text):
    # remove tab, new line, and back slice
    text = text.replace('\\t'," ").replace('\\n', " ").replace('\\u', " ").replace('\\',"")
    # remove non ASCII (emoticon, chinese word, etc)
    text = text.encode('ascii', 'replace').decode('ascii')
    # remove mention, link, hashtag
    text = ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", text).split())
    # remove incomplete URL
    return text.replace("http://", " ").replace("https://", " ")

df['Review'] = df['Review'].apply(remove_tweet_special)

# remove number
def remove_number(text):
    return  re.sub(r"\d+", "", text)

df['Review'] = df['Review'].apply(remove_number)

# remove punctuation and replace with space
def remove_punctuation(text):
    return re.sub(r'[.,]', ' ', text)

df['Review'] = df['Review'].apply(remove_punctuation)

# remove punctuation
def remove_symbol(text):
    # Menghapus simbol-simbol tidak standar dan menggantinya dengan spasi
    cleaned_text = re.sub(r'[^\w\s]', ' ', text)
    
    # Menghapus multiple whitespace
    cleaned_text = re.sub('\s+', ' ', cleaned_text).strip()
    
    return cleaned_text

df['Review'] = df['Review'].apply(remove_symbol)

# Menentukan ambang batas panjang string acak
threshold_length = 20

# Fungsi untuk menghapus string acak berdasarkan panjangnya
def hapus_string_acak_dengan_panjang(text):
    # Memeriksa panjang string dan menghapus jika melebihi ambang batas
    return ' '.join(word for word in text.split() if len(word) <= threshold_length)

# Menggunakan fungsi untuk menghapus string acak pada kolom 'Text'
df['Review'] = df['Review'].apply(hapus_string_acak_dengan_panjang)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    return text.strip()

df['Review'] = df['Review'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    return re.sub('\s+',' ',text)

df['Review'] = df['Review'].apply(remove_whitespace_multiple)

# remove single char
def remove_single_char(text):
    return re.sub(r"\b[a-zA-Z]\b", "", text)

df['Review'] = df['Review'].apply(remove_single_char)

def remove_laughter(text):
    laughter_patterns = r'\b((ha)+h*|(he)+h*|(hi)+h*|(wk)+w*k*|(eh)+e*|(ah)+a*|(ih)+i*|(kw)+k*w*|(hem)+m*)\b'
    return re.sub(laughter_patterns, '', text, flags=re.IGNORECASE)

df['Review'] = df['Review'].apply(remove_laughter)

print('Cleaning Result : \n') 
print(df.head())

Cleaning Result : 

                                              Review
0  barang sampai di saya plastik seal nya udh rus...
1  bau nya ko beda ya bikin mual aneh aja ngga ky...
2                                    no review found
3                                    no review found
4                                 tutup botol mampet


In [6]:
# import hasil cleaning menjadi csv
df.to_csv("cleaning_skingame.csv")

In [7]:
# REMOVE DUPLICATE
df = df.drop_duplicates()
df = df.reset_index(drop=True)
# Menghapus baris yang kosong
df = df.dropna(subset=['Review'])
# Menghapus baris yang hanya berisi spasi atau whitespace
df = df[df['Review'].str.strip() != '']
df.head()

Unnamed: 0,Review
0,barang sampai di saya plastik seal nya udh rus...
1,bau nya ko beda ya bikin mual aneh aja ngga ky...
2,no review found
3,tutup botol mampet
4,pump macet


In [8]:
# import hasil remove duplicate menjadi csv
df.to_csv("removedup_skingame.csv")

In [9]:
# NORMALIZATION
import pandas as pd
import re

slang_dictionary = pd.read_csv('../colloquial-indonesian-lexicon2.csv')
slang_dict = pd.Series(slang_dictionary['formal'].values,index=slang_dictionary['slang']).to_dict()

slang_dictionary.head()

Unnamed: 0,slang,formal,In-dictionary,context,category1,category2,category3
0,woww,wow,1.0,wow,elongasi,0,0
1,aminn,amin,1.0,Selamat ulang tahun kakak tulus semoga panjang...,elongasi,0,0
2,met,selamat,1.0,Met hari netaas kak!? Wish you all the best @t...,abreviasi,0,0
3,netaas,menetas,1.0,Met hari netaas kak!? Wish you all the best @t...,afiksasi,elongasi,0
4,keberpa,keberapa,0.0,Birthday yg keberpa kak?,abreviasi,0,0


In [10]:
# Normalisasi kata menggunakan kamus colloquial-indonesian-lexicon2.csv
def Slangwords(text, slang_dict):
    for word in text.split():
        if word in slang_dict.keys():
            # menambahkan \b untuk menandakan batas kata di sekitar kata slang
            text = re.sub(r'\b{}\b'.format(re.escape(word)), slang_dict[word], text)
    
    text = re.sub('@[\w]+', '', text)
    return text

df['Review_norm'] = df['Review'].apply(lambda x: Slangwords(x, slang_dict))
print(df.head())

                                              Review  \
0  barang sampai di saya plastik seal nya udh rus...   
1  bau nya ko beda ya bikin mual aneh aja ngga ky...   
2                                    no review found   
3                                 tutup botol mampet   
4                                         pump macet   

                                         Review_norm  
0  barang sampai di saya plastik seal nya sudah r...  
1  bau nya kok beda ya bikin mual aneh saja engga...  
2                                    no review found  
3                                 tutup botol mampet  
4                                         pump macet  


In [11]:
# import hasil replace slang words menjadi csv
df.to_csv("normalization_skingame.csv")

In [12]:
# PROSES TOKENIZING (word_tokenize() untuk memecah string kedalam tokens)

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Tokenizing
# NLTK word tokenize 
def word_tokenize_wrapper(text):
    return word_tokenize(text)

df['review_tokens'] = df['Review_norm'].apply(word_tokenize_wrapper)

print('Tokenizing Result : \n') 
print(df.head())

Tokenizing Result : 

                                              Review  \
0  barang sampai di saya plastik seal nya udh rus...   
1  bau nya ko beda ya bikin mual aneh aja ngga ky...   
2                                    no review found   
3                                 tutup botol mampet   
4                                         pump macet   

                                         Review_norm  \
0  barang sampai di saya plastik seal nya sudah r...   
1  bau nya kok beda ya bikin mual aneh saja engga...   
2                                    no review found   
3                                 tutup botol mampet   
4                                         pump macet   

                                       review_tokens  
0  [barang, sampai, di, saya, plastik, seal, nya,...  
1  [bau, nya, kok, beda, ya, bikin, mual, aneh, s...  
2                                [no, review, found]  
3                             [tutup, botol, mampet]  
4                            

In [13]:
# import hasil tokenizing menjadi csv
df.to_csv("tokenizing_skingame.csv")

In [14]:
# PROSES FILTERING (Stopword Removal)
from nltk.corpus import stopwords

# get stopword indonesia from NLTK stopword
list_stopwords = stopwords.words('indonesian')

# convert list to dictionary
list_stopwords = set(list_stopwords)

# remove stopword pada list token
def stopwords_removal(words):
    return [word for word in words if word not in list_stopwords]

df['review_tokens_SR'] = df['review_tokens'].apply(stopwords_removal) 

print(df.head())

                                              Review  \
0  barang sampai di saya plastik seal nya udh rus...   
1  bau nya ko beda ya bikin mual aneh aja ngga ky...   
2                                    no review found   
3                                 tutup botol mampet   
4                                         pump macet   

                                         Review_norm  \
0  barang sampai di saya plastik seal nya sudah r...   
1  bau nya kok beda ya bikin mual aneh saja engga...   
2                                    no review found   
3                                 tutup botol mampet   
4                                         pump macet   

                                       review_tokens  \
0  [barang, sampai, di, saya, plastik, seal, nya,...   
1  [bau, nya, kok, beda, ya, bikin, mual, aneh, s...   
2                                [no, review, found]   
3                             [tutup, botol, mampet]   
4                                      [pump, 

In [15]:
# import hasil stopword removal menjadi csv
df.to_csv("stopword_skingame.csv")

In [16]:
# PROSES LEMMATIZATION 
# import Sastrawi package
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter

# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)

term_dict = {}

for document in df['review_tokens_SR']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))
print("------------------------")

for term in term_dict:
    term_dict[term] = stemmed_wrapper(term)
    print(term,":" ,term_dict[term])
    
print(term_dict)
print("------------------------")


# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

df['review_tokens_stemmed'] = df['review_tokens_SR'].swifter.apply(get_stemmed_term)
print(df['review_tokens_stemmed'])
print(df.head())

1452
------------------------
barang : barang
plastik : plastik
seal : seal
nya : nya
rusak : rusak
dikasih : kasih
tape : tape
bening : bening
biar : biar
tutup : tutup
kebuka : buka
maaf : maaf
ya : ya
beli : beli
loh : loh
bayar : bayar
lo : lo
gratisan : gratis
tolong : tolong
diperhatikan : perhati
quality : quality
control : control
bau : bau
beda : beda
bikin : bikin
mual : mual
aneh : aneh
kayak : kayak
repurchase : repurchase
fash : fash
wash : wash
skingame : skingame
kaget : kaget
banget : banget
sih : sih
sumpah : sumpah
no : no
review : review
found : found
botol : botol
mampet : mampet
pump : pump
macet : macet
breakout : breakout
pakai : pakai
selangnya : selang
sependek : pendek
hilang : hilang
kebawah : bawah
wtf : wtf
annoying : annoying
pakenya : pakenya
coba : coba
cari : cari
tube : tube
ajalah : aja
kalo : kalo
packaging : packaging
bermasalah : masalah
mulu : mulu
sayang : sayang
formulanya : formula
sebagus : bagus
packagingnya : packagingnya
big : big
deh : deh

Pandas Apply: 100%|██████████| 450/450 [00:00<00:00, 50031.46it/s]

0      [barang, plastik, seal, nya, rusak, kasih, sea...
1      [bau, nya, beda, ya, bikin, mual, aneh, kayak,...
2                                    [no, review, found]
3                                 [tutup, botol, mampet]
4                                          [pump, macet]
                             ...                        
446                 [bagus, ngebersihin, ngecilin, pori]
447                          [cocok, nyembuhin, jerawat]
448    [cocok, kulit, kombinasi, cepat, resap, kemas,...
449    [skin, type, time, coba, toner, affordable, bi...
450    [cocok, kulit, kombinasi, bikin, kulit, kenyal...
Name: review_tokens_stemmed, Length: 450, dtype: object
                                              Review  \
0  barang sampai di saya plastik seal nya udh rus...   
1  bau nya ko beda ya bikin mual aneh aja ngga ky...   
2                                    no review found   
3                                 tutup botol mampet   
4                                    




In [17]:
# import hasil preprocessing menjadi csv
df.to_csv("lemma_skingame.csv")